Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Linux build on GPU on Github Actions #9335

Closed
wants to merge 19 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 15 additions & 0 deletions .github/workflows/linux-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ jobs:
VELOX_DEPENDENCY_SOURCE: SYSTEM
simdjson_SOURCE: BUNDLED
xsimd_SOURCE: BUNDLED
CUDA_VERSION: "11.8"
steps:
- uses: actions/checkout@v4

Expand All @@ -69,6 +70,15 @@ jobs:
# it doesn't work
run: git config --global --add safe.directory /__w/velox/velox

- name: Install Dependencies
run: |
# Allows to install arbitrary cuda-version whithout needing to update
# docker container before. It simplifies testing new/different versions
if ! yum list installed cuda-nvcc-$(echo ${CUDA_VERSION} | tr '.' '-') 1>/dev/null; then
source scripts/setup-centos8.sh
install_cuda ${CUDA_VERSION}
fi

- uses: assignUser/stash/restore@v1
with:
path: '${{ env.CCACHE_DIR }}'
Expand All @@ -81,6 +91,10 @@ jobs:
- name: Make Release Build
env:
MAKEFLAGS: 'NUM_THREADS=8 MAX_HIGH_MEM_JOBS=4 MAX_LINK_JOBS=4'
CUDA_ARCHITECTURES: 60
CUDA_COMPILER: /usr/local/cuda-${CUDA_VERSION}/bin/nvcc
# Without that, nvcc picks /usr/bin/c++ which is GCC 8
CUDA_FLAGS: "-ccbin /opt/rh/gcc-toolset-9/root/usr/bin"
run: |
EXTRA_CMAKE_FLAGS=(
"-DVELOX_ENABLE_BENCHMARKS=ON"
Expand All @@ -92,6 +106,7 @@ jobs:
"-DVELOX_ENABLE_ABFS=ON"
"-DVELOX_ENABLE_SUBSTRAIT=ON"
"-DVELOX_ENABLE_REMOTE_FUNCTIONS=ON"
"-DVELOX_ENABLE_GPU=ON"
)
make release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS[*]}"

Expand Down
4 changes: 4 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,10 @@ if(${VELOX_ENABLE_GPU})
enable_language(CUDA)
# Determine CUDA_ARCHITECTURES automatically.
cmake_policy(SET CMP0104 NEW)
if(NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
# it will fail later in the build otherwise
message(FATAL_ERROR "-DCMAKE_CUDA_ARCHITECTURES= must be set")
endif()
if(CMAKE_BUILD_TYPE MATCHES Debug)
add_compile_options("$<$<COMPILE_LANGUAGE:CUDA>:-G>")
endif()
Expand Down
20 changes: 20 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,18 @@ ifdef AZURESDK_ROOT_DIR
CMAKE_FLAGS += -DAZURESDK_ROOT_DIR=$(AZURESDK_ROOT_DIR)
endif

ifdef CUDA_ARCHITECTURES
CMAKE_FLAGS += -DCMAKE_CUDA_ARCHITECTURES="$(CUDA_ARCHITECTURES)"
endif

ifdef CUDA_COMPILER
CMAKE_FLAGS += -DCMAKE_CUDA_COMPILER="$(CUDA_COMPILER)"
endif

ifdef CUDA_FLAGS
CMAKE_FLAGS += -DCMAKE_CUDA_FLAGS="$(CUDA_FLAGS)"
endif

# Use Ninja if available. If Ninja is used, pass through parallelism control flags.
USE_NINJA ?= 1
ifeq ($(USE_NINJA), 1)
Expand Down Expand Up @@ -108,6 +120,14 @@ minimal: #: Minimal build
$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_BUILD_MINIMAL=ON"
$(MAKE) build BUILD_DIR=release

gpu: #: Build with GPU support
$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON"
$(MAKE) build BUILD_DIR=release

gpu_debug: #: Build with debugging symbols and GPU support
$(MAKE) cmake BUILD_DIR=debug BUILD_TYPE=debug EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} -DVELOX_ENABLE_GPU=ON"
$(MAKE) build BUILD_DIR=debug

dwio: #: Minimal build with dwio enabled.
$(MAKE) cmake BUILD_DIR=release BUILD_TYPE=release EXTRA_CMAKE_FLAGS="${EXTRA_CMAKE_FLAGS} \
-DVELOX_BUILD_MINIMAL_WITH_DWIO=ON"
Expand Down
6 changes: 6 additions & 0 deletions scripts/setup-centos8.sh
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,12 @@ function install_duckdb {
fi
}

function install_cuda {
# See https://developer.nvidia.com/cuda-downloads
dnf config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel8/x86_64/cuda-rhel8.repo
yum install -y cuda-nvcc-$(echo $1 | tr '.' '-') cuda-cudart-devel-$(echo $1 | tr '.' '-')
}

function install_velox_deps {
run_and_time install_velox_deps_from_dnf
run_and_time install_conda
Expand Down
10 changes: 10 additions & 0 deletions scripts/setup-ubuntu.sh
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,16 @@ function install_conda {
bash Miniconda3-latest-Linux-$ARCH.sh -b -p $MINICONDA_PATH
}

function install_cuda {
# See https://developer.nvidia.com/cuda-downloads
if ! dpkg -l cuda-keyring 1>/dev/null; then
wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb
$SUDO dpkg -i cuda-keyring_1.1-1_all.deb
rm cuda-keyring_1.1-1_all.deb
$SUDO apt update
fi
$SUDO apt install -y cuda-nvcc-$(echo $1 | tr '.' '-') cuda-cudart-dev-$(echo $1 | tr '.' '-')
}

function install_velox_deps {
run_and_time install_velox_deps_from_apt
Expand Down
12 changes: 12 additions & 0 deletions velox/common/memory/MemoryArbitrator.h
Original file line number Diff line number Diff line change
Expand Up @@ -450,3 +450,15 @@ void testingRunArbitration(
uint64_t targetBytes = 0,
bool allowSpill = true);
} // namespace facebook::velox::memory

#if FMT_VERSION < 100100
template <>
struct fmt::formatter<facebook::velox::memory::MemoryArbitrator::Stats>
: formatter<std::string> {
auto format(
facebook::velox::memory::MemoryArbitrator::Stats s,
format_context& ctx) {
return formatter<std::string>::format(s.toString(), ctx);
}
};
#endif
2 changes: 0 additions & 2 deletions velox/experimental/gpu/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,5 +14,3 @@

add_executable(velox_gpu_hash_table_test HashTableTest.cu)
target_link_libraries(velox_gpu_hash_table_test Folly::folly gflags::gflags)
set_target_properties(velox_gpu_hash_table_test PROPERTIES CUDA_ARCHITECTURES
native)
6 changes: 3 additions & 3 deletions velox/experimental/gpu/tests/HashTableTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ namespace {

constexpr int kBlockSize = 256;

__device__ uint32_t jenkinsRevMix32(uint32_t key) {
[[maybe_unused]] __device__ uint32_t jenkinsRevMix32(uint32_t key) {
key += (key << 12); // key *= (1 + (1 << 12))
key ^= (key >> 22);
key += (key << 4); // key *= (1 + (1 << 4))
Expand Down Expand Up @@ -298,7 +298,7 @@ __global__ void probe<true>(
j = (j + sizeof(uint32_t)) & tableSizeMask;
cmpMask = 0xffffffff;
}
end:
end:;
}
}

Expand Down Expand Up @@ -627,7 +627,7 @@ __global__ void probePartitioned<true>(
j = (j + sizeof(uint32_t)) & tableSizeMask;
cmpMask = 0xffffffff;
}
end:
end:;
}
}

Expand Down
2 changes: 0 additions & 2 deletions velox/experimental/wave/common/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,6 @@
add_library(velox_wave_common GpuArena.cpp Buffer.cpp Cuda.cu Exception.cpp
Type.cpp)

set_target_properties(velox_wave_common PROPERTIES CUDA_ARCHITECTURES native)

target_link_libraries(velox_wave_common velox_exception velox_common_base
velox_type)

Expand Down
4 changes: 2 additions & 2 deletions velox/experimental/wave/common/Cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,8 @@ struct EventImpl {

Event::Event(bool withTime) : hasTiming_(withTime) {
event_ = std::make_unique<EventImpl>();
CUDA_CHECK(
cudaEventCreate(&event_->event, withTime ? 0 : cudaEventDisableTiming));
CUDA_CHECK(cudaEventCreateWithFlags(
&event_->event, withTime ? 0 : cudaEventDisableTiming));
}

Event::~Event() {}
Expand Down
2 changes: 1 addition & 1 deletion velox/experimental/wave/common/tests/BlockTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ TEST_F(BlockTest, boolToIndices) {
std::vector<int32_t> referenceIndices(kNumFlags);
std::vector<int32_t> referenceSizes(kNumBlocks);
uint8_t* flags = flagsBuffer->as<uint8_t>();
for (auto i = 0; i < kNumFlags; ++i) {
for (auto i = 0ul; i < kNumFlags; ++i) {
if ((i >> 8) % 17 == 0) {
flags[i] = 0;
} else if ((i >> 8) % 23 == 0) {
Expand Down
5 changes: 2 additions & 3 deletions velox/experimental/wave/common/tests/BlockTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ __global__ void boolToIndices(
int32_t** indices,
int32_t* sizes,
int64_t* times) {
extern __shared__ __align__(alignof(ScanAlgorithm::TempStorage)) char smem[];
extern __shared__ char smem[];
int32_t idx = blockIdx.x;
// Start cycle timer
clock_t start = clock();
Expand Down Expand Up @@ -42,8 +42,7 @@ void BlockTestStream::testBoolToIndices(
}

__global__ void sum64(int64_t* numbers, int64_t* results) {
extern __shared__ __align__(
alignof(cub::BlockReduce<int64_t, 256>::TempStorage)) char smem[];
extern __shared__ char smem[];
int32_t idx = blockIdx.x;
blockSum<256>(
[&]() { return numbers[idx * 256 + threadIdx.x]; }, smem, results);
Expand Down
3 changes: 0 additions & 3 deletions velox/experimental/wave/common/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,6 @@
add_executable(velox_wave_common_test GpuArenaTest.cpp CudaTest.cpp CudaTest.cu
BlockTest.cpp BlockTest.cu)

set_target_properties(velox_wave_common_test PROPERTIES CUDA_ARCHITECTURES
native)

add_test(velox_wave_common_test velox_wave_common_test)

target_link_libraries(
Expand Down
2 changes: 2 additions & 0 deletions velox/experimental/wave/dwio/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,3 +16,5 @@ add_subdirectory(decode)

add_library(velox_wave_dwio ColumnReader.cpp FormatData.cpp ReadStream.cpp
StructColumnReader.cpp)

target_link_libraries(velox_wave_dwio Folly::folly fmt::fmt xsimd)
2 changes: 0 additions & 2 deletions velox/experimental/wave/dwio/decode/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,4 @@ add_subdirectory(tests)

add_library(velox_wave_decode GpuDecoder.cu)

set_target_properties(velox_wave_decode PROPERTIES CUDA_ARCHITECTURES native)

target_link_libraries(velox_wave_decode velox_wave_common)
12 changes: 4 additions & 8 deletions velox/experimental/wave/dwio/decode/GpuDecoder-inl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -170,8 +170,7 @@ __device__ int scatterIndices(
int32_t end,
int32_t* indices) {
typedef cub::BlockScan<int32_t, kBlockSize> BlockScan;
extern __shared__ __align__(
alignof(typename BlockScan::TempStorage)) char smem[];
extern __shared__ char smem[];
auto* scanStorage = reinterpret_cast<typename BlockScan::TempStorage*>(smem);
int numMatch;
bool match;
Expand All @@ -198,8 +197,7 @@ __device__ int scatterIndices(
int32_t end,
int32_t* indices) {
typedef cub::BlockScan<int32_t, kBlockSize> BlockScan;
extern __shared__ __align__(
alignof(typename BlockScan::TempStorage)) char smem[];
extern __shared__ char smem[];
auto* scanStorage = reinterpret_cast<typename BlockScan::TempStorage*>(smem);
constexpr int kPerThread = 8;
int numMatch[kPerThread];
Expand Down Expand Up @@ -411,8 +409,7 @@ __device__ void decodeMainlyConstant(GpuDecode& plan) {
template <int kBlockSize, typename T, typename U>
__device__ T sum(const U* values, int size) {
using Reduce = cub::BlockReduce<T, kBlockSize>;
extern __shared__ __align__(
alignof(typename Reduce::TempStorage)) char smem[];
extern __shared__ char smem[];
auto* reduceStorage = reinterpret_cast<typename Reduce::TempStorage*>(smem);
T total = 0;
for (int i = 0; i < size; i += kBlockSize) {
Expand Down Expand Up @@ -452,8 +449,7 @@ __device__ int upperBound(const T* data, int size, T target) {
template <int kBlockSize, typename T>
__device__ void decodeRle(GpuDecode::Rle& op) {
using BlockScan = cub::BlockScan<int32_t, kBlockSize>;
extern __shared__ __align__(
alignof(typename BlockScan::TempStorage)) char smem[];
extern __shared__ char smem[];
auto* scanStorage = reinterpret_cast<typename BlockScan::TempStorage*>(smem);

static_assert(sizeof(*scanStorage) >= sizeof(int32_t) * kBlockSize);
Expand Down
5 changes: 1 addition & 4 deletions velox/experimental/wave/dwio/decode/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,7 @@

add_executable(velox_wave_decode_test GpuDecoderTest.cu)

set_target_properties(velox_wave_decode_test PROPERTIES CUDA_ARCHITECTURES
native)

add_test(velox_wave_common_test velox_wave_common_test)
add_test(velox_wave_decode_test velox_wave_decode_test)

target_link_libraries(
velox_wave_decode_test
Expand Down
9 changes: 9 additions & 0 deletions velox/experimental/wave/dwio/decode/tests/GpuDecoderTest.cu
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,10 @@ void makeBitpackDict(
class GpuDecoderTest : public ::testing::Test {
protected:
void SetUp() override {
if (int device; cudaGetDevice(&device) != cudaSuccess) {
GTEST_SKIP() << "No CUDA detected, skipping all tests";
}

CUDA_CHECK_FATAL(cudaEventCreate(&startEvent_));
CUDA_CHECK_FATAL(cudaEventCreate(&stopEvent_));
}
Expand Down Expand Up @@ -614,6 +618,11 @@ int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
folly::Init init{&argc, &argv};

if (int device; cudaGetDevice(&device) != cudaSuccess) {
std::cerr << "No CUDA detected, skipping all tests" << std::endl;
return 0;
}

cudaDeviceProp prop;
CUDA_CHECK_FATAL(cudaGetDeviceProperties(&prop, FLAGS_device_id));
printf("Running on device: %s\n", prop.name);
Expand Down
4 changes: 2 additions & 2 deletions velox/experimental/wave/exec/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@

add_library(velox_wave_stream OperandSet.cpp Wave.cpp)

target_link_libraries(velox_wave_stream Folly::folly fmt::fmt xsimd)

add_library(
velox_wave_exec
Aggregation.cpp
Expand All @@ -29,8 +31,6 @@ add_library(
WaveHiveDataSource.cpp
WaveSplitReader.cpp)

set_target_properties(velox_wave_exec PROPERTIES CUDA_ARCHITECTURES native)

target_link_libraries(
velox_wave_exec
velox_wave_vector
Expand Down
2 changes: 0 additions & 2 deletions velox/experimental/wave/exec/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@ add_subdirectory(utils)
add_executable(velox_wave_exec_test FilterProjectTest.cpp TableScanTest.cpp
Main.cpp)

set_target_properties(velox_wave_exec_test PROPERTIES CUDA_ARCHITECTURES native)

add_test(velox_wave_exec_test velox_wave_exec_test)

target_link_libraries(
Expand Down
3 changes: 3 additions & 0 deletions velox/experimental/wave/exec/tests/TableScanTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,9 @@ using namespace facebook::velox::exec::test;
class TableScanTest : public virtual HiveConnectorTestBase {
protected:
void SetUp() override {
if (int device; cudaGetDevice(&device) != cudaSuccess) {
GTEST_SKIP() << "No CUDA detected, skipping all tests";
}
HiveConnectorTestBase::SetUp();
wave::registerWave();
wave::WaveHiveDataSource::registerConnector();
Expand Down
3 changes: 0 additions & 3 deletions velox/experimental/wave/vector/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,9 +14,6 @@

add_executable(velox_wave_vector_test VectorTest.cpp)

set_target_properties(velox_wave_vector_test PROPERTIES CUDA_ARCHITECTURES
native)

add_test(veloxwave__vector_test velox_wave_vector_test)

target_link_libraries(
Expand Down
Loading