ROCm support (#3462)

Summary: * add hipify at configure time * ROCm specific code paths behind USE_ROCM guards * support for wavefront 32 (Navi) and 64 (MI) * use builtins to match inline PTX * support C API on ROCm * support Python API on ROCm --------- Pull Request resolved: #3462 Reviewed By: asadoughi Differential Revision: D60431193 Pulled By: ramilbakhshyiev fbshipit-source-id: ac82d5ecb38f995c467e100ed583d5178ae489ee
facebookresearch · Jul 31, 2024 · b670cb1 · b670cb1
1 parent 677e73f
commit b670cb1
Show file tree

Hide file tree

Showing 55 changed files with 1,799 additions and 863 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -23,7 +23,13 @@ cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR)
 set(FAISS_LANGUAGES CXX)
 
 if(FAISS_ENABLE_GPU)
-  list(APPEND FAISS_LANGUAGES CUDA)
+  # if ROCm install detected, assume ROCm/HIP is GPU device
+  if (EXISTS /opt/rocm)
+    set(USE_ROCM TRUE)
+    list(APPEND FAISS_LANGUAGES HIP)
+  else()
+    list(APPEND FAISS_LANGUAGES CUDA)
+  endif()
 endif()
 
 if(FAISS_ENABLE_RAFT)
@@ -58,8 +64,17 @@ option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
 option(FAISS_ENABLE_C_API "Build C API." OFF)
 
 if(FAISS_ENABLE_GPU)
-  set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
-  enable_language(CUDA)
+  if(USE_ROCM)
+    enable_language(HIP)
+    add_definitions(-DUSE_ROCM)
+    find_package(HIP REQUIRED)
+    find_package(hipBLAS REQUIRED)
+    set(GPU_EXT_PREFIX "hip")
+  else ()
+    set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
+    enable_language(CUDA)
+    set(GPU_EXT_PREFIX "cu")
+  endif()
 endif()
 
 if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
@@ -69,7 +84,11 @@ if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
 add_subdirectory(faiss)
 
 if(FAISS_ENABLE_GPU)
-  add_subdirectory(faiss/gpu)
+  if(USE_ROCM)
+    add_subdirectory(faiss/gpu-rocm)
+  else()
+    add_subdirectory(faiss/gpu)
+  endif()
 endif()
 
 if(FAISS_ENABLE_PYTHON)
@@ -90,6 +109,10 @@ if(BUILD_TESTING)
   add_subdirectory(tests)
 
   if(FAISS_ENABLE_GPU)
-    add_subdirectory(faiss/gpu/test)
+    if(USE_ROCM)
+      add_subdirectory(faiss/gpu-rocm/test)
+    else()
+      add_subdirectory(faiss/gpu/test)
+    endif()
   endif()
 endif()
diff --git a/c_api/CMakeLists.txt b/c_api/CMakeLists.txt
@@ -56,5 +56,9 @@ add_executable(example_c EXCLUDE_FROM_ALL example_c.c)
 target_link_libraries(example_c PRIVATE faiss_c)
 
 if(FAISS_ENABLE_GPU)
-  add_subdirectory(gpu)
+  if(USE_ROCM)
+    add_subdirectory(gpu-rocm)
+  else ()
+    add_subdirectory(gpu)
+  endif()
 endif()
diff --git a/c_api/gpu/CMakeLists.txt b/c_api/gpu/CMakeLists.txt
@@ -15,8 +15,14 @@ target_sources(faiss_c PRIVATE
 file(GLOB FAISS_C_API_GPU_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.h")
 faiss_install_headers("${FAISS_C_API_GPU_HEADERS}" c_api/gpu)
 
+if (USE_ROCM)
+find_package(HIP REQUIRED)
+find_package(hipBLAS REQUIRED)
+target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas)
+else()
 find_package(CUDAToolkit REQUIRED)
 target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
+endif()
 
 add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
 target_link_libraries(example_gpu_c PRIVATE faiss_c)
diff --git a/faiss/gpu/CMakeLists.txt b/faiss/gpu/CMakeLists.txt
@@ -197,6 +197,10 @@ function(generate_ivf_interleaved_code)
     "64|2048|8"
   )
 
+  if (USE_ROCM)
+     list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
+  endif()
+
   # Traverse through the Cartesian product of X and Y
   foreach(sub_codec ${SUB_CODEC_TYPE})
   foreach(metric_type ${SUB_METRIC_TYPE})
@@ -210,10 +214,10 @@ function(generate_ivf_interleaved_code)
     set(filename "template_${sub_codec}_${metric_type}_${sub_threads}_${sub_num_warp_q}_${sub_num_thread_q}")
     # Remove illegal characters from filename
     string(REGEX REPLACE "[^A-Za-z0-9_]" "" filename ${filename})
-    set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.cu")
+    set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${GPU_EXT_PREFIX}")
 
     # Read the template file
-    file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.cu" template_content)
+    file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.${GPU_EXT_PREFIX}" template_content)
 
     # Replace the placeholders
     string(REPLACE "SUB_CODEC_TYPE" "${sub_codec}" template_content "${template_content}")
@@ -290,6 +294,10 @@ if(FAISS_ENABLE_RAFT)
   target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1)
 endif()
 
+if (USE_ROCM)
+  list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
+endif()
+
 # Export FAISS_GPU_HEADERS variable to parent scope.
 set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE)
 
@@ -305,21 +313,26 @@ foreach(header ${FAISS_GPU_HEADERS})
   )
 endforeach()
 
-# Prepares a host linker script and enables host linker to support
-# very large device object files.
-# This is what CUDA 11.5+ `nvcc -hls=gen-lcs -aug-hls` would generate
-file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
-[=[
-SECTIONS
-{
-.nvFatBinSegment : { *(.nvFatBinSegment) }
-__nv_relfatbin : { *(__nv_relfatbin) }
-.nv_fatbin : { *(.nv_fatbin) }
-}
-]=]
-)
-target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
+if (USE_ROCM)
+  target_link_libraries(faiss_gpu PRIVATE $<$<BOOL:${USE_ROCM}>:hip::host> $<$<BOOL:${USE_ROCM}>:roc::hipblas>)
+  target_compile_options(faiss_gpu PRIVATE)
+else()
+  # Prepares a host linker script and enables host linker to support
+  # very large device object files.
+  # This is what CUDA 11.5+ `nvcc -hls=gen-lcs -aug-hls` would generate
+  file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
+  [=[
+  SECTIONS
+  {
+    .nvFatBinSegment : { *(.nvFatBinSegment) }
+    __nv_relfatbin : { *(__nv_relfatbin) }
+    .nv_fatbin : { *(.nv_fatbin) }
+  }
+  ]=]
+  )
+  target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
 
-find_package(CUDAToolkit REQUIRED)
-target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
-target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
+  find_package(CUDAToolkit REQUIRED)
+  target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
+  target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
+endif()
diff --git a/faiss/gpu/GpuFaissAssert.h b/faiss/gpu/GpuFaissAssert.h
@@ -15,7 +15,7 @@
 /// Assertions
 ///
 
-#ifdef __CUDA_ARCH__
+#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
 #define GPU_FAISS_ASSERT(X) assert(X)
 #define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
 #define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)

diff --git a/faiss/gpu/StandardGpuResources.cpp b/faiss/gpu/StandardGpuResources.cpp
@@ -363,11 +363,20 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
             prop.major,
             prop.minor);
 
+#if USE_ROCM
+    // Our code is pre-built with and expects warpSize == 32 or 64, validate
+    // that
+    FAISS_ASSERT_FMT(
+            prop.warpSize == 32 || prop.warpSize == 64,
+            "Device id %d does not have expected warpSize of 32 or 64",
+            device);
+#else
     // Our code is pre-built with and expects warpSize == 32, validate that
     FAISS_ASSERT_FMT(
             prop.warpSize == 32,
             "Device id %d does not have expected warpSize of 32",
             device);
+#endif
 
     // Create streams
     cudaStream_t defaultStream = nullptr;