Skip to content

Commit

Permalink
ROCm support (#3462)
Browse files Browse the repository at this point in the history
Summary:
* add hipify at configure time
* ROCm specific code paths behind USE_ROCM guards
* support for wavefront 32 (Navi) and 64 (MI)
* use builtins to match inline PTX
* support C API on ROCm
* support Python API on ROCm

---------

Pull Request resolved: #3462

Reviewed By: asadoughi

Differential Revision: D60431193

Pulled By: ramilbakhshyiev

fbshipit-source-id: ac82d5ecb38f995c467e100ed583d5178ae489ee
  • Loading branch information
iotamudelta authored and facebook-github-bot committed Jul 31, 2024
1 parent 677e73f commit b670cb1
Show file tree
Hide file tree
Showing 55 changed files with 1,799 additions and 863 deletions.
33 changes: 28 additions & 5 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,13 @@ cmake_minimum_required(VERSION 3.24.0 FATAL_ERROR)
set(FAISS_LANGUAGES CXX)

if(FAISS_ENABLE_GPU)
list(APPEND FAISS_LANGUAGES CUDA)
# if ROCm install detected, assume ROCm/HIP is GPU device
if (EXISTS /opt/rocm)
set(USE_ROCM TRUE)
list(APPEND FAISS_LANGUAGES HIP)
else()
list(APPEND FAISS_LANGUAGES CUDA)
endif()
endif()

if(FAISS_ENABLE_RAFT)
Expand Down Expand Up @@ -58,8 +64,17 @@ option(FAISS_ENABLE_PYTHON "Build Python extension." ON)
option(FAISS_ENABLE_C_API "Build C API." OFF)

if(FAISS_ENABLE_GPU)
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
enable_language(CUDA)
if(USE_ROCM)
enable_language(HIP)
add_definitions(-DUSE_ROCM)
find_package(HIP REQUIRED)
find_package(hipBLAS REQUIRED)
set(GPU_EXT_PREFIX "hip")
else ()
set(CMAKE_CUDA_HOST_COMPILER ${CMAKE_CXX_COMPILER})
enable_language(CUDA)
set(GPU_EXT_PREFIX "cu")
endif()
endif()

if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
Expand All @@ -69,7 +84,11 @@ if(FAISS_ENABLE_RAFT AND NOT TARGET raft::raft)
add_subdirectory(faiss)

if(FAISS_ENABLE_GPU)
add_subdirectory(faiss/gpu)
if(USE_ROCM)
add_subdirectory(faiss/gpu-rocm)
else()
add_subdirectory(faiss/gpu)
endif()
endif()

if(FAISS_ENABLE_PYTHON)
Expand All @@ -90,6 +109,10 @@ if(BUILD_TESTING)
add_subdirectory(tests)

if(FAISS_ENABLE_GPU)
add_subdirectory(faiss/gpu/test)
if(USE_ROCM)
add_subdirectory(faiss/gpu-rocm/test)
else()
add_subdirectory(faiss/gpu/test)
endif()
endif()
endif()
6 changes: 5 additions & 1 deletion c_api/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -56,5 +56,9 @@ add_executable(example_c EXCLUDE_FROM_ALL example_c.c)
target_link_libraries(example_c PRIVATE faiss_c)

if(FAISS_ENABLE_GPU)
add_subdirectory(gpu)
if(USE_ROCM)
add_subdirectory(gpu-rocm)
else ()
add_subdirectory(gpu)
endif()
endif()
6 changes: 6 additions & 0 deletions c_api/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,14 @@ target_sources(faiss_c PRIVATE
file(GLOB FAISS_C_API_GPU_HEADERS RELATIVE ${CMAKE_CURRENT_SOURCE_DIR} "*.h")
faiss_install_headers("${FAISS_C_API_GPU_HEADERS}" c_api/gpu)

if (USE_ROCM)
find_package(HIP REQUIRED)
find_package(hipBLAS REQUIRED)
target_link_libraries(faiss_c PUBLIC hip::host roc::hipblas)
else()
find_package(CUDAToolkit REQUIRED)
target_link_libraries(faiss_c PUBLIC CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass>)
endif()

add_executable(example_gpu_c EXCLUDE_FROM_ALL example_gpu_c.c)
target_link_libraries(example_gpu_c PRIVATE faiss_c)
51 changes: 32 additions & 19 deletions faiss/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,10 @@ function(generate_ivf_interleaved_code)
"64|2048|8"
)

if (USE_ROCM)
list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
endif()

# Traverse through the Cartesian product of X and Y
foreach(sub_codec ${SUB_CODEC_TYPE})
foreach(metric_type ${SUB_METRIC_TYPE})
Expand All @@ -210,10 +214,10 @@ function(generate_ivf_interleaved_code)
set(filename "template_${sub_codec}_${metric_type}_${sub_threads}_${sub_num_warp_q}_${sub_num_thread_q}")
# Remove illegal characters from filename
string(REGEX REPLACE "[^A-Za-z0-9_]" "" filename ${filename})
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.cu")
set(output_file "${CMAKE_CURRENT_BINARY_DIR}/${filename}.${GPU_EXT_PREFIX}")

# Read the template file
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.cu" template_content)
file(READ "${CMAKE_CURRENT_SOURCE_DIR}/impl/scan/IVFInterleavedScanKernelTemplate.${GPU_EXT_PREFIX}" template_content)

# Replace the placeholders
string(REPLACE "SUB_CODEC_TYPE" "${sub_codec}" template_content "${template_content}")
Expand Down Expand Up @@ -290,6 +294,10 @@ if(FAISS_ENABLE_RAFT)
target_compile_definitions(faiss_gpu PUBLIC USE_NVIDIA_RAFT=1)
endif()

if (USE_ROCM)
list(TRANSFORM FAISS_GPU_SRC REPLACE cu$ hip)
endif()

# Export FAISS_GPU_HEADERS variable to parent scope.
set(FAISS_GPU_HEADERS ${FAISS_GPU_HEADERS} PARENT_SCOPE)

Expand All @@ -305,21 +313,26 @@ foreach(header ${FAISS_GPU_HEADERS})
)
endforeach()

# Prepares a host linker script and enables host linker to support
# very large device object files.
# This is what CUDA 11.5+ `nvcc -hls=gen-lcs -aug-hls` would generate
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
[=[
SECTIONS
{
.nvFatBinSegment : { *(.nvFatBinSegment) }
__nv_relfatbin : { *(__nv_relfatbin) }
.nv_fatbin : { *(.nv_fatbin) }
}
]=]
)
target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")
if (USE_ROCM)
target_link_libraries(faiss_gpu PRIVATE $<$<BOOL:${USE_ROCM}>:hip::host> $<$<BOOL:${USE_ROCM}>:roc::hipblas>)
target_compile_options(faiss_gpu PRIVATE)
else()
# Prepares a host linker script and enables host linker to support
# very large device object files.
# This is what CUDA 11.5+ `nvcc -hls=gen-lcs -aug-hls` would generate
file(WRITE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld"
[=[
SECTIONS
{
.nvFatBinSegment : { *(.nvFatBinSegment) }
__nv_relfatbin : { *(__nv_relfatbin) }
.nv_fatbin : { *(.nv_fatbin) }
}
]=]
)
target_link_options(faiss_gpu PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/fatbin.ld")

find_package(CUDAToolkit REQUIRED)
target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
find_package(CUDAToolkit REQUIRED)
target_link_libraries(faiss_gpu PRIVATE CUDA::cudart CUDA::cublas $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::raft> $<$<BOOL:${FAISS_ENABLE_RAFT}>:raft::compiled> $<$<BOOL:${FAISS_ENABLE_RAFT}>:nvidia::cutlass::cutlass> $<$<BOOL:${FAISS_ENABLE_RAFT}>:OpenMP::OpenMP_CXX>)
target_compile_options(faiss_gpu PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:-Xfatbin=-compress-all --expt-extended-lambda --expt-relaxed-constexpr $<$<BOOL:${FAISS_ENABLE_RAFT}>:-Xcompiler=${OpenMP_CXX_FLAGS}>>)
endif()
2 changes: 1 addition & 1 deletion faiss/gpu/GpuFaissAssert.h
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
/// Assertions
///

#ifdef __CUDA_ARCH__
#if defined(__CUDA_ARCH__) || defined(USE_ROCM)
#define GPU_FAISS_ASSERT(X) assert(X)
#define GPU_FAISS_ASSERT_MSG(X, MSG) assert(X)
#define GPU_FAISS_ASSERT_FMT(X, FMT, ...) assert(X)
Expand Down
9 changes: 9 additions & 0 deletions faiss/gpu/StandardGpuResources.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -363,11 +363,20 @@ void StandardGpuResourcesImpl::initializeForDevice(int device) {
prop.major,
prop.minor);

#if USE_ROCM
// Our code is pre-built with and expects warpSize == 32 or 64, validate
// that
FAISS_ASSERT_FMT(
prop.warpSize == 32 || prop.warpSize == 64,
"Device id %d does not have expected warpSize of 32 or 64",
device);
#else
// Our code is pre-built with and expects warpSize == 32, validate that
FAISS_ASSERT_FMT(
prop.warpSize == 32,
"Device id %d does not have expected warpSize of 32",
device);
#endif

// Create streams
cudaStream_t defaultStream = nullptr;
Expand Down
Loading

0 comments on commit b670cb1

Please sign in to comment.