diff --git a/.github/workflows/linux-build.yml b/.github/workflows/linux-build.yml index 293a3d3e861b..6c547b35f548 100644 --- a/.github/workflows/linux-build.yml +++ b/.github/workflows/linux-build.yml @@ -61,6 +61,7 @@ jobs: VELOX_DEPENDENCY_SOURCE: SYSTEM simdjson_SOURCE: BUNDLED xsimd_SOURCE: BUNDLED + CUDA_VERSION: "11.8" steps: - uses: actions/checkout@v4 @@ -69,6 +70,15 @@ jobs: # it doesn't work run: git config --global --add safe.directory /__w/velox/velox + - name: Install Dependencies + run: | + # Allows to install arbitrary cuda-version whithout needing to update + # docker container before. It simplifies testing new/different versions + if ! yum list installed cuda-nvcc-$(echo ${CUDA_VERSION} | tr '.' '-') 1>/dev/null; then + source scripts/setup-centos8.sh + install_cuda ${CUDA_VERSION} + fi + - uses: assignUser/stash/restore@v1 with: path: '${{ env.CCACHE_DIR }}' @@ -81,6 +91,10 @@ jobs: - name: Make Release Build env: MAKEFLAGS: 'NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4' + CUDA_ARCHITECTURES: 60 + CUDA_COMPILER: /usr/local/cuda-${CUDA_VERSION}/bin/nvcc + # Without that, nvcc picks /usr/bin/c++ which is GCC 8 + CUDA_FLAGS: "-ccbin /opt/rh/gcc-toolset-9/root/usr/bin" run: | EXTRA_CMAKE_FLAGS=( "-DVELOX_ENABLE_BENCHMARKS=ON" @@ -92,6 +106,7 @@ jobs: "-DVELOX_ENABLE_ABFS=ON" "-DVELOX_ENABLE_SUBSTRAIT=ON" "-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON" + "-DVELOX_ENABLE_GPU=ON" ) make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}" diff --git a/CMakeLists.txt b/CMakeLists.txt index 53aaf4391775..23a05ef6a37d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -361,6 +361,10 @@ if(${VELOX_ENABLE_GPU}) enable_language(CUDA) # Determine CUDA_ARCHITECTURES automatically. cmake_policy(SET CMP0104 NEW) + if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES) + # it will fail later in the build otherwise + message(FATAL_ERROR "-DCMAKE_CUDA_ARCHITECTURES= must be set") + endif() if(CMAKE_BUILD_TYPE MATCHES Debug) add_compile_options("$<$:-G>") endif() diff --git a/Makefile b/Makefile index 2bab08bce1ab..3e8ff17397bd 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,18 @@ ifdef AZURESDK_ROOT_DIR CMAKE_FLAGS += -DAZURESDK_ROOT_DIR=$(AZURESDK_ROOT_DIR) endif +ifdef CUDA_ARCHITECTURES +CMAKE_FLAGS += -DCMAKE_CUDA_ARCHITECTURES="$(CUDA_ARCHITECTURES)" +endif + +ifdef CUDA_COMPILER +CMAKE_FLAGS += -DCMAKE_CUDA_COMPILER="$(CUDA_COMPILER)" +endif + +ifdef CUDA_FLAGS +CMAKE_FLAGS += -DCMAKE_CUDA_FLAGS="$(CUDA_FLAGS)" +endif + # Use Ninja if available. If Ninja is used, pass through parallelism control flags. USE_NINJA ?= 1 ifeq ($(USE_NINJA), 1) @@ -108,6 +120,14 @@ minimal: #: Minimal build $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_BUILD_MINIMAL=ON" $(MAKE) build BUILD_DIR=release +gpu: #: Build with GPU support + $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON" + $(MAKE) build BUILD_DIR=release + +gpu_debug: #: Build with debugging symbols and GPU support + $(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON" + $(MAKE) build BUILD_DIR=debug + dwio: #: Minimal build with dwio enabled. $(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} \ -DVELOX_BUILD_MINIMAL_WITH_DWIO=ON" diff --git a/scripts/setup-centos8.sh b/scripts/setup-centos8.sh index 0731dafc76c1..1a1157af5089 100755 --- a/scripts/setup-centos8.sh +++ b/scripts/setup-centos8.sh @@ -185,6 +185,12 @@ function install_duckdb { fi } +function install_cuda { + # See https://developer.nvidia.com/cuda-downloads + dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo + yum install -y cuda-nvcc-$(echo $1 | tr '.' '-') cuda-cudart-devel-$(echo $1 | tr '.' '-') +} + function install_velox_deps { run_and_time install_velox_deps_from_dnf run_and_time install_conda diff --git a/scripts/setup-ubuntu.sh b/scripts/setup-ubuntu.sh index d85c9320ed6f..822027ee34f6 100755 --- a/scripts/setup-ubuntu.sh +++ b/scripts/setup-ubuntu.sh @@ -141,6 +141,16 @@ function install_conda { bash Miniconda3-latest-Linux-$ARCH.sh -b -p $MINICONDA_PATH } +function install_cuda { + # See https://developer.nvidia.com/cuda-downloads + if ! dpkg -l cuda-keyring 1>/dev/null; then + wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb + $SUDO dpkg -i cuda-keyring_1.1-1_all.deb + rm cuda-keyring_1.1-1_all.deb + $SUDO apt update + fi + $SUDO apt install -y cuda-nvcc-$(echo $1 | tr '.' '-') cuda-cudart-dev-$(echo $1 | tr '.' '-') +} function install_velox_deps { run_and_time install_velox_deps_from_apt diff --git a/velox/common/memory/MemoryArbitrator.h b/velox/common/memory/MemoryArbitrator.h index 10376f34c675..2fa76317dcd2 100644 --- a/velox/common/memory/MemoryArbitrator.h +++ b/velox/common/memory/MemoryArbitrator.h @@ -450,3 +450,15 @@ void testingRunArbitration( uint64_t targetBytes = 0, bool allowSpill = true); } // namespace facebook::velox::memory + +#if FMT_VERSION < 100100 +template <> +struct fmt::formatter + : formatter { + auto format( + facebook::velox::memory::MemoryArbitrator::Stats s, + format_context& ctx) { + return formatter::format(s.toString(), ctx); + } +}; +#endif diff --git a/velox/experimental/gpu/tests/CMakeLists.txt b/velox/experimental/gpu/tests/CMakeLists.txt index 8eaca86e7b95..4dc31e422fbf 100644 --- a/velox/experimental/gpu/tests/CMakeLists.txt +++ b/velox/experimental/gpu/tests/CMakeLists.txt @@ -14,5 +14,3 @@ add_executable(velox_gpu_hash_table_test HashTableTest.cu) target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags) -set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES - native) diff --git a/velox/experimental/gpu/tests/HashTableTest.cu b/velox/experimental/gpu/tests/HashTableTest.cu index d1d5ac23ef25..6de1378d93a6 100644 --- a/velox/experimental/gpu/tests/HashTableTest.cu +++ b/velox/experimental/gpu/tests/HashTableTest.cu @@ -35,7 +35,7 @@ namespace { constexpr int kBlockSize = 256; -__device__ uint32_t jenkinsRevMix32(uint32_t key) { +[[maybe_unused]] __device__ uint32_t jenkinsRevMix32(uint32_t key) { key += (key << 12); // key *= (1 + (1 << 12)) key ^= (key >> 22); key += (key << 4); // key *= (1 + (1 << 4)) @@ -298,7 +298,7 @@ __global__ void probe( j = (j + sizeof(uint32_t)) & tableSizeMask; cmpMask = 0xffffffff; } - end: + end:; } } @@ -627,7 +627,7 @@ __global__ void probePartitioned( j = (j + sizeof(uint32_t)) & tableSizeMask; cmpMask = 0xffffffff; } - end: + end:; } } diff --git a/velox/experimental/wave/common/CMakeLists.txt b/velox/experimental/wave/common/CMakeLists.txt index 205d945331ae..a33fffffba35 100644 --- a/velox/experimental/wave/common/CMakeLists.txt +++ b/velox/experimental/wave/common/CMakeLists.txt @@ -15,8 +15,6 @@ add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp Type.cpp) -set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native) - target_link_libraries(velox_wave_common velox_exception velox_common_base velox_type) diff --git a/velox/experimental/wave/common/Cuda.cu b/velox/experimental/wave/common/Cuda.cu index 48a1e6804029..10e716065a24 100644 --- a/velox/experimental/wave/common/Cuda.cu +++ b/velox/experimental/wave/common/Cuda.cu @@ -170,8 +170,8 @@ struct EventImpl { Event::Event(bool withTime) : hasTiming_(withTime) { event_ = std::make_unique(); - CUDA_CHECK( - cudaEventCreate(&event_->event, withTime ? 0 : cudaEventDisableTiming)); + CUDA_CHECK(cudaEventCreateWithFlags( + &event_->event, withTime ? 0 : cudaEventDisableTiming)); } Event::~Event() {} diff --git a/velox/experimental/wave/common/tests/BlockTest.cpp b/velox/experimental/wave/common/tests/BlockTest.cpp index 6c6d8b20f2f4..012010233e71 100644 --- a/velox/experimental/wave/common/tests/BlockTest.cpp +++ b/velox/experimental/wave/common/tests/BlockTest.cpp @@ -59,7 +59,7 @@ TEST_F(BlockTest, boolToIndices) { std::vector referenceIndices(kNumFlags); std::vector referenceSizes(kNumBlocks); uint8_t* flags = flagsBuffer->as(); - for (auto i = 0; i < kNumFlags; ++i) { + for (auto i = 0ul; i < kNumFlags; ++i) { if ((i >> 8) % 17 == 0) { flags[i] = 0; } else if ((i >> 8) % 23 == 0) { diff --git a/velox/experimental/wave/common/tests/BlockTest.cu b/velox/experimental/wave/common/tests/BlockTest.cu index bfc35e7132f0..6b32b9880f12 100644 --- a/velox/experimental/wave/common/tests/BlockTest.cu +++ b/velox/experimental/wave/common/tests/BlockTest.cu @@ -11,7 +11,7 @@ __global__ void boolToIndices( int32_t** indices, int32_t* sizes, int64_t* times) { - extern __shared__ __align__(alignof(ScanAlgorithm::TempStorage)) char smem[]; + extern __shared__ char smem[]; int32_t idx = blockIdx.x; // Start cycle timer clock_t start = clock(); @@ -42,8 +42,7 @@ void BlockTestStream::testBoolToIndices( } __global__ void sum64(int64_t* numbers, int64_t* results) { - extern __shared__ __align__( - alignof(cub::BlockReduce::TempStorage)) char smem[]; + extern __shared__ char smem[]; int32_t idx = blockIdx.x; blockSum<256>( [&]() { return numbers[idx * 256 + threadIdx.x]; }, smem, results); diff --git a/velox/experimental/wave/common/tests/CMakeLists.txt b/velox/experimental/wave/common/tests/CMakeLists.txt index 159261e72a80..f9d2a3305eec 100644 --- a/velox/experimental/wave/common/tests/CMakeLists.txt +++ b/velox/experimental/wave/common/tests/CMakeLists.txt @@ -15,9 +15,6 @@ add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu BlockTest.cpp BlockTest.cu) -set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES - native) - add_test(velox_wave_common_test velox_wave_common_test) target_link_libraries( diff --git a/velox/experimental/wave/dwio/CMakeLists.txt b/velox/experimental/wave/dwio/CMakeLists.txt index ea7356914813..109bc63bb10a 100644 --- a/velox/experimental/wave/dwio/CMakeLists.txt +++ b/velox/experimental/wave/dwio/CMakeLists.txt @@ -16,3 +16,5 @@ add_subdirectory(decode) add_library(velox_wave_dwio ColumnReader.cpp FormatData.cpp ReadStream.cpp StructColumnReader.cpp) + +target_link_libraries(velox_wave_dwio Folly::folly fmt::fmt xsimd) diff --git a/velox/experimental/wave/dwio/decode/CMakeLists.txt b/velox/experimental/wave/dwio/decode/CMakeLists.txt index cb6bedda4d49..412ba83c8bd3 100644 --- a/velox/experimental/wave/dwio/decode/CMakeLists.txt +++ b/velox/experimental/wave/dwio/decode/CMakeLists.txt @@ -16,6 +16,4 @@ add_subdirectory(tests) add_library(velox_wave_decode GpuDecoder.cu) -set_target_properties(velox_wave_decode PROPERTIES CUDA_ARCHITECTURES native) - target_link_libraries(velox_wave_decode velox_wave_common) diff --git a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh index 401ceb83e8d1..62c23facac15 100644 --- a/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh +++ b/velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh @@ -170,8 +170,7 @@ __device__ int scatterIndices( int32_t end, int32_t* indices) { typedef cub::BlockScan BlockScan; - extern __shared__ __align__( - alignof(typename BlockScan::TempStorage)) char smem[]; + extern __shared__ char smem[]; auto* scanStorage = reinterpret_cast(smem); int numMatch; bool match; @@ -198,8 +197,7 @@ __device__ int scatterIndices( int32_t end, int32_t* indices) { typedef cub::BlockScan BlockScan; - extern __shared__ __align__( - alignof(typename BlockScan::TempStorage)) char smem[]; + extern __shared__ char smem[]; auto* scanStorage = reinterpret_cast(smem); constexpr int kPerThread = 8; int numMatch[kPerThread]; @@ -411,8 +409,7 @@ __device__ void decodeMainlyConstant(GpuDecode& plan) { template __device__ T sum(const U* values, int size) { using Reduce = cub::BlockReduce; - extern __shared__ __align__( - alignof(typename Reduce::TempStorage)) char smem[]; + extern __shared__ char smem[]; auto* reduceStorage = reinterpret_cast(smem); T total = 0; for (int i = 0; i < size; i += kBlockSize) { @@ -452,8 +449,7 @@ __device__ int upperBound(const T* data, int size, T target) { template __device__ void decodeRle(GpuDecode::Rle& op) { using BlockScan = cub::BlockScan; - extern __shared__ __align__( - alignof(typename BlockScan::TempStorage)) char smem[]; + extern __shared__ char smem[]; auto* scanStorage = reinterpret_cast(smem); static_assert(sizeof(*scanStorage) >= sizeof(int32_t) * kBlockSize); diff --git a/velox/experimental/wave/dwio/decode/tests/CMakeLists.txt b/velox/experimental/wave/dwio/decode/tests/CMakeLists.txt index aac1f7e4e0ec..566d22c4ce7e 100644 --- a/velox/experimental/wave/dwio/decode/tests/CMakeLists.txt +++ b/velox/experimental/wave/dwio/decode/tests/CMakeLists.txt @@ -14,10 +14,7 @@ add_executable(velox_wave_decode_test GpuDecoderTest.cu) -set_target_properties(velox_wave_decode_test PROPERTIES CUDA_ARCHITECTURES - native) - -add_test(velox_wave_common_test velox_wave_common_test) +add_test(velox_wave_decode_test velox_wave_decode_test) target_link_libraries( velox_wave_decode_test diff --git a/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu b/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu index 4f391df7a10f..46e11cbc2a25 100644 --- a/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu +++ b/velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu @@ -131,6 +131,10 @@ void makeBitpackDict( class GpuDecoderTest : public ::testing::Test { protected: void SetUp() override { + if (int device; cudaGetDevice(&device) != cudaSuccess) { + GTEST_SKIP() << "No CUDA detected, skipping all tests"; + } + CUDA_CHECK_FATAL(cudaEventCreate(&startEvent_)); CUDA_CHECK_FATAL(cudaEventCreate(&stopEvent_)); } @@ -614,6 +618,11 @@ int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); folly::Init init{&argc, &argv}; + if (int device; cudaGetDevice(&device) != cudaSuccess) { + std::cerr << "No CUDA detected, skipping all tests" << std::endl; + return 0; + } + cudaDeviceProp prop; CUDA_CHECK_FATAL(cudaGetDeviceProperties(&prop, FLAGS_device_id)); printf("Running on device: %s\n", prop.name); diff --git a/velox/experimental/wave/exec/CMakeLists.txt b/velox/experimental/wave/exec/CMakeLists.txt index 815a80ac3dce..c31e2946b7a4 100644 --- a/velox/experimental/wave/exec/CMakeLists.txt +++ b/velox/experimental/wave/exec/CMakeLists.txt @@ -14,6 +14,8 @@ add_library(velox_wave_stream OperandSet.cpp Wave.cpp) +target_link_libraries(velox_wave_stream Folly::folly fmt::fmt xsimd) + add_library( velox_wave_exec Aggregation.cpp @@ -29,8 +31,6 @@ add_library( WaveHiveDataSource.cpp WaveSplitReader.cpp) -set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native) - target_link_libraries( velox_wave_exec velox_wave_vector diff --git a/velox/experimental/wave/exec/tests/CMakeLists.txt b/velox/experimental/wave/exec/tests/CMakeLists.txt index f06c3b41e8d4..c6e8f81145e8 100644 --- a/velox/experimental/wave/exec/tests/CMakeLists.txt +++ b/velox/experimental/wave/exec/tests/CMakeLists.txt @@ -17,8 +17,6 @@ add_subdirectory(utils) add_executable(velox_wave_exec_test FilterProjectTest.cpp TableScanTest.cpp Main.cpp) -set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES native) - add_test(velox_wave_exec_test velox_wave_exec_test) target_link_libraries( diff --git a/velox/experimental/wave/exec/tests/TableScanTest.cpp b/velox/experimental/wave/exec/tests/TableScanTest.cpp index cd1d9d081937..0e6bbebdbd41 100644 --- a/velox/experimental/wave/exec/tests/TableScanTest.cpp +++ b/velox/experimental/wave/exec/tests/TableScanTest.cpp @@ -34,6 +34,9 @@ using namespace facebook::velox::exec::test; class TableScanTest : public virtual HiveConnectorTestBase { protected: void SetUp() override { + if (int device; cudaGetDevice(&device) != cudaSuccess) { + GTEST_SKIP() << "No CUDA detected, skipping all tests"; + } HiveConnectorTestBase::SetUp(); wave::registerWave(); wave::WaveHiveDataSource::registerConnector(); diff --git a/velox/experimental/wave/vector/tests/CMakeLists.txt b/velox/experimental/wave/vector/tests/CMakeLists.txt index 2d4201c4c4cc..5423006f52c8 100644 --- a/velox/experimental/wave/vector/tests/CMakeLists.txt +++ b/velox/experimental/wave/vector/tests/CMakeLists.txt @@ -14,9 +14,6 @@ add_executable(velox_wave_vector_test VectorTest.cpp) -set_target_properties(velox_wave_vector_test PROPERTIES CUDA_ARCHITECTURES - native) - add_test(veloxwave__vector_test velox_wave_vector_test) target_link_libraries(