From 4d75fbac93b7b0ea1c4884a85d9b135f3a72d0fc Mon Sep 17 00:00:00 2001 From: ImmanuelSegol <3ditds@gmail.com> Date: Wed, 14 Feb 2024 22:28:17 +0000 Subject: [PATCH 01/11] issue with init_optimized_poseidon_constants --- .../c++/multi-gpu-poseidon/CMakeLists.txt | 25 ++++ examples/c++/multi-gpu-poseidon/README.md | 2 + examples/c++/multi-gpu-poseidon/compile.sh | 9 ++ examples/c++/multi-gpu-poseidon/example.cu | 119 ++++++++++++++++++ examples/c++/multi-gpu-poseidon/run.sh | 2 + 5 files changed, 157 insertions(+) create mode 100644 examples/c++/multi-gpu-poseidon/CMakeLists.txt create mode 100644 examples/c++/multi-gpu-poseidon/README.md create mode 100755 examples/c++/multi-gpu-poseidon/compile.sh create mode 100644 examples/c++/multi-gpu-poseidon/example.cu create mode 100755 examples/c++/multi-gpu-poseidon/run.sh diff --git a/examples/c++/multi-gpu-poseidon/CMakeLists.txt b/examples/c++/multi-gpu-poseidon/CMakeLists.txt new file mode 100644 index 000000000..424b3e9bc --- /dev/null +++ b/examples/c++/multi-gpu-poseidon/CMakeLists.txt @@ -0,0 +1,25 @@ +cmake_minimum_required(VERSION 3.18) +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CUDA_STANDARD 17) +set(CMAKE_CUDA_STANDARD_REQUIRED TRUE) +set(CMAKE_CXX_STANDARD_REQUIRED TRUE) +if (${CMAKE_VERSION} VERSION_LESS "3.24.0") + set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH}) +else() + set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed +endif () +project(icicle LANGUAGES CUDA CXX) + +set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr") +set(CMAKE_CUDA_FLAGS_RELEASE "") +set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0") +# change the path to your Icicle location +include_directories("../../../icicle") +add_executable( + example + example.cu +) +find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ ) +target_link_libraries(example ${NVML_LIBRARY}) +set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON) + diff --git a/examples/c++/multi-gpu-poseidon/README.md b/examples/c++/multi-gpu-poseidon/README.md new file mode 100644 index 000000000..28ecc8a53 --- /dev/null +++ b/examples/c++/multi-gpu-poseidon/README.md @@ -0,0 +1,2 @@ +# Muliple GPU on a single host + diff --git a/examples/c++/multi-gpu-poseidon/compile.sh b/examples/c++/multi-gpu-poseidon/compile.sh new file mode 100755 index 000000000..36c1ddac1 --- /dev/null +++ b/examples/c++/multi-gpu-poseidon/compile.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +# Exit immediately on error +set -e + +rm -rf build +mkdir -p build +cmake -S . -B build +cmake --build build diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu new file mode 100644 index 000000000..91d35416e --- /dev/null +++ b/examples/c++/multi-gpu-poseidon/example.cu @@ -0,0 +1,119 @@ +#include +#include +#include + +// select the curve (only 2 available so far) +#define CURVE_ID 2 +#include "appUtils/poseidon/poseidon.cu" + +using namespace poseidon; +using namespace curve_config; + +void setCudaDevice(const unsigned device_id) { + // Example function to set the CUDA device + std::cout << "Setting CUDA device to " << device_id << std::endl; + // cudaSetDevice(device_id); +} + +// function that a thread will execute +void processData(const device_context::DeviceContext ctx, const std::vector& inputData, std::vector& outputData) { + // Simulate some processing + PoseidonConstants column_constants; + int size_col = 11; + // init_optimized_poseidon_constants(ctx, &column_constants); + init_optimized_poseidon_constants(size_col, ctx, &column_constants); + PoseidonConfig column_config = default_poseidon_config(size_col+1); + column_config.are_inputs_on_device = true; + column_config.are_outputs_on_device = true; + + for (int num : inputData) { + outputData.push_back(num * 2); // Example operation + } +} + +void checkCudaError(cudaError_t error) { + if (error != cudaSuccess) { + std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl; + // Handle the error, e.g., exit the program or throw an exception. + } +} + +int main() { + const uint32_t size_col=11; + const unsigned size_partition = 1024; // size_row / nof_partitions; + // layers is allocated only for one partition, need to resuse for different partitions + const uint32_t size_layers = size_col * size_partition; // size_col * size_row + // Input data for each thread + std::vector inputData1 = {1, 2, 3, 4}; + std::vector inputData2 = {5, 6, 7, 8}; + + // Output data for each thread + std::vector outputData1, outputData2; + + + // Multiple devices are supported by device context + + // setCudaDevice(device_id); + cudaStream_t stream0, stream1; + cudaError_t err; + err = cudaStreamCreate(&stream0); + checkCudaError(err); + err = cudaStreamCreate(&stream1); + checkCudaError(err); + + device_context::DeviceContext ctx0 = device_context::DeviceContext{ + (cudaStream_t&)stream0, // SP: simulate different device as stream + 0, // device_id + 0, // mempool + }; + device_context::DeviceContext ctx1 = device_context::DeviceContext{ + (cudaStream_t&)stream1, // SP: simulate different device as stream + 0, // device_id + 0, // mempool + }; + + // Allocate and initialize memory for the layers + scalar_t* layers0 = static_cast(malloc(size_layers * sizeof(scalar_t))); + if (layers0 == nullptr) { + std::cerr << "Memory allocation for 'layers' failed." << std::endl; + } + scalar_t s = scalar_t::zero(); + for (unsigned i = 0; i < size_col*size_partition ; i++) { + layers0[i] = s; + s = s + scalar_t::one(); + } + scalar_t* layers1 = static_cast(malloc(size_layers * sizeof(scalar_t))); + if (layers1 == nullptr) { + std::cerr << "Memory allocation for 'layers' failed." << std::endl; + } + s = scalar_t::zero() + scalar_t::one(); + for (unsigned i = 0; i < size_col*size_partition ; i++) { + layers1[i] = s; + s = s + scalar_t::one(); + } + + + + // Start threads + std::thread thread1(processData, ctx0, std::ref(inputData1), std::ref(outputData1)); + std::thread thread2(processData, ctx1, std::ref(inputData2), std::ref(outputData2)); + + // Wait for the threads to finish + thread1.join(); + thread2.join(); + + // Process the output data (example: print the data) + std::cout << "Output Data from Thread 1: "; + for (int num : outputData1) { + std::cout << num << " "; + } + std::cout << std::endl; + + std::cout << "Output Data from Thread 2: "; + for (int num : outputData2) { + std::cout << num << " "; + } + std::cout << std::endl; + + return 0; +} diff --git a/examples/c++/multi-gpu-poseidon/run.sh b/examples/c++/multi-gpu-poseidon/run.sh new file mode 100755 index 000000000..6e3fc976d --- /dev/null +++ b/examples/c++/multi-gpu-poseidon/run.sh @@ -0,0 +1,2 @@ +#!/bin/bash +./build/example From 481f144dc80dabadb9cc7c7192bce0f94347ebd8 Mon Sep 17 00:00:00 2001 From: ImmanuelSegol <3ditds@gmail.com> Date: Thu, 15 Feb 2024 15:11:20 +0000 Subject: [PATCH 02/11] debug --- examples/c++/multi-gpu-poseidon/example.cu | 57 ++++++++++------------ icicle/appUtils/poseidon/poseidon.cuh | 1 + 2 files changed, 27 insertions(+), 31 deletions(-) diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu index 91d35416e..953952922 100644 --- a/examples/c++/multi-gpu-poseidon/example.cu +++ b/examples/c++/multi-gpu-poseidon/example.cu @@ -16,11 +16,9 @@ void setCudaDevice(const unsigned device_id) { } // function that a thread will execute -void processData(const device_context::DeviceContext ctx, const std::vector& inputData, std::vector& outputData) { - // Simulate some processing +void processData(device_context::DeviceContext ctx, const std::vector& inputData, std::vector& outputData) { PoseidonConstants column_constants; int size_col = 11; - // init_optimized_poseidon_constants(ctx, &column_constants); init_optimized_poseidon_constants(size_col, ctx, &column_constants); PoseidonConfig column_config = default_poseidon_config(size_col+1); column_config.are_inputs_on_device = true; @@ -61,36 +59,33 @@ int main() { err = cudaStreamCreate(&stream1); checkCudaError(err); - device_context::DeviceContext ctx0 = device_context::DeviceContext{ - (cudaStream_t&)stream0, // SP: simulate different device as stream - 0, // device_id - 0, // mempool - }; - device_context::DeviceContext ctx1 = device_context::DeviceContext{ - (cudaStream_t&)stream1, // SP: simulate different device as stream - 0, // device_id - 0, // mempool - }; + device_context::DeviceContext ctx0 = device_context::get_default_device_context(); + ctx0.device_id=0; + device_context::DeviceContext ctx1 = device_context::get_default_device_context(); + ctx1.device_id=1; + + + // Allocate and initialize memory for the layers - scalar_t* layers0 = static_cast(malloc(size_layers * sizeof(scalar_t))); - if (layers0 == nullptr) { - std::cerr << "Memory allocation for 'layers' failed." << std::endl; - } - scalar_t s = scalar_t::zero(); - for (unsigned i = 0; i < size_col*size_partition ; i++) { - layers0[i] = s; - s = s + scalar_t::one(); - } - scalar_t* layers1 = static_cast(malloc(size_layers * sizeof(scalar_t))); - if (layers1 == nullptr) { - std::cerr << "Memory allocation for 'layers' failed." << std::endl; - } - s = scalar_t::zero() + scalar_t::one(); - for (unsigned i = 0; i < size_col*size_partition ; i++) { - layers1[i] = s; - s = s + scalar_t::one(); - } + // scalar_t* layers0 = static_cast(malloc(size_layers * sizeof(scalar_t))); + // if (layers0 == nullptr) { + // std::cerr << "Memory allocation for 'layers' failed." << std::endl; + // } + // scalar_t s = scalar_t::zero(); + // for (unsigned i = 0; i < size_col*size_partition ; i++) { + // layers0[i] = s; + // s = s + scalar_t::one(); + // } + // scalar_t* layers1 = static_cast(malloc(size_layers * sizeof(scalar_t))); + // if (layers1 == nullptr) { + // std::cerr << "Memory allocation for 'layers' failed." << std::endl; + // } + // s = scalar_t::zero() + scalar_t::one(); + // for (unsigned i = 0; i < size_col*size_partition ; i++) { + // layers1[i] = s; + // s = s + scalar_t::one(); + // } diff --git a/icicle/appUtils/poseidon/poseidon.cuh b/icicle/appUtils/poseidon/poseidon.cuh index 5541405f3..b51dfc366 100644 --- a/icicle/appUtils/poseidon/poseidon.cuh +++ b/icicle/appUtils/poseidon/poseidon.cuh @@ -119,6 +119,7 @@ namespace poseidon { /** * Loads pre-calculated optimized constants, moves them to the device */ + // Stas: I have an issue with the number of argumnets template cudaError_t init_optimized_poseidon_constants(device_context::DeviceContext& ctx, PoseidonConstants* constants); From 29675bb40d27ffa91f436b10e96a487dcb4ac281 Mon Sep 17 00:00:00 2001 From: ImmanuelSegol <3ditds@gmail.com> Date: Thu, 15 Feb 2024 16:45:33 +0000 Subject: [PATCH 03/11] executes without errors --- examples/c++/multi-gpu-poseidon/example.cu | 117 ++++++++------------- 1 file changed, 44 insertions(+), 73 deletions(-) diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu index 953952922..e0a3790de 100644 --- a/examples/c++/multi-gpu-poseidon/example.cu +++ b/examples/c++/multi-gpu-poseidon/example.cu @@ -2,33 +2,13 @@ #include #include -// select the curve (only 2 available so far) +// select the curve #define CURVE_ID 2 #include "appUtils/poseidon/poseidon.cu" using namespace poseidon; using namespace curve_config; -void setCudaDevice(const unsigned device_id) { - // Example function to set the CUDA device - std::cout << "Setting CUDA device to " << device_id << std::endl; - // cudaSetDevice(device_id); -} - -// function that a thread will execute -void processData(device_context::DeviceContext ctx, const std::vector& inputData, std::vector& outputData) { - PoseidonConstants column_constants; - int size_col = 11; - init_optimized_poseidon_constants(size_col, ctx, &column_constants); - PoseidonConfig column_config = default_poseidon_config(size_col+1); - column_config.are_inputs_on_device = true; - column_config.are_outputs_on_device = true; - - for (int num : inputData) { - outputData.push_back(num * 2); // Example operation - } -} - void checkCudaError(cudaError_t error) { if (error != cudaSuccess) { std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl; @@ -36,78 +16,69 @@ void checkCudaError(cudaError_t error) { } } +// these global varibales go into template calls +const int size_col = 11; + +// this function executes the Poseidon thread +void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes) { + PoseidonConstants column_constants; + init_optimized_poseidon_constants(size_col, ctx, &column_constants); + PoseidonConfig column_config = default_poseidon_config(size_col+1); + cudaError_t err = poseidon_hash(layers, column_hashes, (size_t) size_partition, column_constants, column_config); + checkCudaError(err); +} + int main() { const uint32_t size_col=11; - const unsigned size_partition = 1024; // size_row / nof_partitions; + const unsigned size_row = (1<<30); + const unsigned nof_partitions = 64; + const unsigned size_partition = size_row / nof_partitions; // layers is allocated only for one partition, need to resuse for different partitions const uint32_t size_layers = size_col * size_partition; // size_col * size_row - // Input data for each thread - std::vector inputData1 = {1, 2, 3, 4}; - std::vector inputData2 = {5, 6, 7, 8}; - - // Output data for each thread - std::vector outputData1, outputData2; - - - // Multiple devices are supported by device context - - // setCudaDevice(device_id); - cudaStream_t stream0, stream1; - cudaError_t err; - err = cudaStreamCreate(&stream0); - checkCudaError(err); - err = cudaStreamCreate(&stream1); - checkCudaError(err); + + // Key: multiple devices are supported by device context device_context::DeviceContext ctx0 = device_context::get_default_device_context(); ctx0.device_id=0; device_context::DeviceContext ctx1 = device_context::get_default_device_context(); ctx1.device_id=1; - - + // Allocate and initialize memory for the layers and hashes + scalar_t* layers0 = static_cast(malloc(size_layers * sizeof(scalar_t))); + if (layers0 == nullptr) { + std::cerr << "Memory allocation for 'layers' failed." << std::endl; + } + scalar_t s = scalar_t::zero(); + for (unsigned i = 0; i < size_col*size_partition ; i++) { + layers0[i] = s; + s = s + scalar_t::one(); + } + scalar_t* layers1 = static_cast(malloc(size_layers * sizeof(scalar_t))); + if (layers1 == nullptr) { + std::cerr << "Memory allocation for 'layers' failed." << std::endl; + } + s = scalar_t::zero() + scalar_t::one(); + for (unsigned i = 0; i < size_col*size_partition ; i++) { + layers1[i] = s; + s = s + scalar_t::one(); + } - // Allocate and initialize memory for the layers - // scalar_t* layers0 = static_cast(malloc(size_layers * sizeof(scalar_t))); - // if (layers0 == nullptr) { - // std::cerr << "Memory allocation for 'layers' failed." << std::endl; - // } - // scalar_t s = scalar_t::zero(); - // for (unsigned i = 0; i < size_col*size_partition ; i++) { - // layers0[i] = s; - // s = s + scalar_t::one(); - // } - // scalar_t* layers1 = static_cast(malloc(size_layers * sizeof(scalar_t))); - // if (layers1 == nullptr) { - // std::cerr << "Memory allocation for 'layers' failed." << std::endl; - // } - // s = scalar_t::zero() + scalar_t::one(); - // for (unsigned i = 0; i < size_col*size_partition ; i++) { - // layers1[i] = s; - // s = s + scalar_t::one(); - // } - - + scalar_t* column_hash0 = static_cast(malloc(size_partition * sizeof(scalar_t))); + scalar_t* column_hash1 = static_cast(malloc(size_partition * sizeof(scalar_t))); // Start threads - std::thread thread1(processData, ctx0, std::ref(inputData1), std::ref(outputData1)); - std::thread thread2(processData, ctx1, std::ref(inputData2), std::ref(outputData2)); + std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0); + std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1); // Wait for the threads to finish + thread0.join(); thread1.join(); - thread2.join(); // Process the output data (example: print the data) - std::cout << "Output Data from Thread 1: "; - for (int num : outputData1) { - std::cout << num << " "; - } + std::cout << "Output Data from Thread 0: "; std::cout << std::endl; - std::cout << "Output Data from Thread 2: "; - for (int num : outputData2) { - std::cout << num << " "; - } + std::cout << "Output Data from Thread 1: "; std::cout << std::endl; return 0; From 418c3d844b3f9942d54662f96a727b79a918e0a2 Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Thu, 15 Feb 2024 22:10:15 +0000 Subject: [PATCH 04/11] completed example --- examples/c++/multi-gpu-poseidon/example.cu | 93 ++++++++++++++++------ 1 file changed, 68 insertions(+), 25 deletions(-) diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu index e0a3790de..a6865958a 100644 --- a/examples/c++/multi-gpu-poseidon/example.cu +++ b/examples/c++/multi-gpu-poseidon/example.cu @@ -1,6 +1,8 @@ #include #include -#include +#include + +#include // select the curve #define CURVE_ID 2 @@ -16,26 +18,53 @@ void checkCudaError(cudaError_t error) { } } -// these global varibales go into template calls +// these global constants go into template calls const int size_col = 11; // this function executes the Poseidon thread -void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes) { - PoseidonConstants column_constants; - init_optimized_poseidon_constants(size_col, ctx, &column_constants); - PoseidonConfig column_config = default_poseidon_config(size_col+1); - cudaError_t err = poseidon_hash(layers, column_hashes, (size_t) size_partition, column_constants, column_config); +void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants * constants) { + PoseidonConfig column_config = { + ctx, // ctx + false, // are_inputes_on_device + false, // are_outputs_on_device + false, // input_is_a_state + false, // aligned + false, // loop_state + false, // is_async + }; + cudaError_t err = poseidon_hash(layers, column_hashes, (size_t) size_partition, *constants, column_config); checkCudaError(err); } +using FpMilliseconds = std::chrono::duration; +#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now(); +#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count()); + + +#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \ + std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \ + exit(EXIT_FAILURE); \ +} + int main() { - const uint32_t size_col=11; - const unsigned size_row = (1<<30); + const unsigned size_row = (1<<26); const unsigned nof_partitions = 64; const unsigned size_partition = size_row / nof_partitions; // layers is allocated only for one partition, need to resuse for different partitions - const uint32_t size_layers = size_col * size_partition; // size_col * size_row + const uint32_t size_layers = size_col * size_partition; + nvmlInit(); + unsigned int deviceCount; + nvmlDeviceGetCount(&deviceCount); + std::cout << "Available GPUs: " << deviceCount << std::endl; + for (unsigned int i = 0; i < deviceCount; ++i) { + nvmlDevice_t device; + char name[NVML_DEVICE_NAME_BUFFER_SIZE]; + nvmlDeviceGetHandleByIndex(i, &device); + nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE); + std::cout << "Device ID: " << i << ", Type: " << name << std::endl; + } + // Key: multiple devices are supported by device context device_context::DeviceContext ctx0 = device_context::get_default_device_context(); @@ -43,20 +72,16 @@ int main() { device_context::DeviceContext ctx1 = device_context::get_default_device_context(); ctx1.device_id=1; - // Allocate and initialize memory for the layers and hashes + std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl; scalar_t* layers0 = static_cast(malloc(size_layers * sizeof(scalar_t))); - if (layers0 == nullptr) { - std::cerr << "Memory allocation for 'layers' failed." << std::endl; - } + CHECK_ALLOC(layers0); scalar_t s = scalar_t::zero(); for (unsigned i = 0; i < size_col*size_partition ; i++) { layers0[i] = s; s = s + scalar_t::one(); } scalar_t* layers1 = static_cast(malloc(size_layers * sizeof(scalar_t))); - if (layers1 == nullptr) { - std::cerr << "Memory allocation for 'layers' failed." << std::endl; - } + CHECK_ALLOC(layers1); s = scalar_t::zero() + scalar_t::one(); for (unsigned i = 0; i < size_col*size_partition ; i++) { layers1[i] = s; @@ -64,22 +89,40 @@ int main() { } scalar_t* column_hash0 = static_cast(malloc(size_partition * sizeof(scalar_t))); + CHECK_ALLOC(column_hash0); scalar_t* column_hash1 = static_cast(malloc(size_partition * sizeof(scalar_t))); + CHECK_ALLOC(column_hash1); + + PoseidonConstants column_constants0, column_constants1; + init_optimized_poseidon_constants(size_col, ctx0, &column_constants0); + init_optimized_poseidon_constants(size_col, ctx1, &column_constants1); - // Start threads - std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0); - std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1); + std::cout << "Parallel execution of Poseidon threads" << std::endl; + START_TIMER(parallel); + std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0); + std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1); // Wait for the threads to finish thread0.join(); thread1.join(); - - // Process the output data (example: print the data) + END_TIMER(parallel,"2 GPUs"); std::cout << "Output Data from Thread 0: "; - std::cout << std::endl; - + std::cout << column_hash0[0] << std::endl; std::cout << "Output Data from Thread 1: "; - std::cout << std::endl; + std::cout << column_hash1[0] << std::endl; + + std::cout << "Sequential execution of Poseidon threads" << std::endl; + START_TIMER(sequential); + std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0); + thread2.join(); + std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0); + thread3.join(); + END_TIMER(sequential,"1 GPU"); + std::cout << "Output Data from Thread 2: "; + std::cout << column_hash0[0] << std::endl; + std::cout << "Output Data from Thread 3: "; + std::cout << column_hash1[0] << std::endl; + nvmlShutdown(); return 0; } From 8dd52306dc9fc45cea00650ef4fe123a126a03ce Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Thu, 15 Feb 2024 23:07:33 +0000 Subject: [PATCH 05/11] update README --- examples/c++/multi-gpu-poseidon/README.md | 51 ++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/examples/c++/multi-gpu-poseidon/README.md b/examples/c++/multi-gpu-poseidon/README.md index 28ecc8a53..4fbae3338 100644 --- a/examples/c++/multi-gpu-poseidon/README.md +++ b/examples/c++/multi-gpu-poseidon/README.md @@ -1,2 +1,51 @@ -# Muliple GPU on a single host +# Icicle example: using muliple GPU to hash large dataset + +## Best-Practices + +This example builds on [single GPU Poseidon example](../poseidon/README.md) so we recommend to run it first. + +## Key-Takeaway + +Use `device_context::DeviceContext` variable to select GPU to use. +Use C++ threads to compute `Icicle` primitives on different GPUs in parallel. + +## Concise Usage Explanation + +1. Include c++ threads + +```c++ +#include +``` + +2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id. + +```c++ +void threadPoseidon(device_context::DeviceContext ctx, ...) {...} +``` + +3. Initialize device contexts for different GPUs + +```c++ +device_context::DeviceContext ctx0 = device_context::get_default_device_context(); +ctx0.device_id=0; +device_context::DeviceContext ctx1 = device_context::get_default_device_context(); +ctx1.device_id=1; +``` +4. Finally, spawn the threads and wait for their completion + +```c++ +std::thread thread0(threadPoseidon, ctx0, ...); +std::thread thread1(threadPoseidon, ctx1, ...); +thread0.join(); +thread1.join(); +``` + +## What's in the example + +This is a **toy** example executing the first step of the Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseison hashes for each column of $11 \times 2^{30}$ matrix. + +1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions` +2. Hash two partitions in parallel on two GPUs +3. Hash two partitions in series on one GPU +4. Compare execution times From 319358427fc8b707d53c5266ee758997da1f4d6e Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Fri, 16 Feb 2024 16:35:04 +0000 Subject: [PATCH 06/11] cudaSetDevice in the thread function --- examples/c++/multi-gpu-poseidon/example.cu | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu index a6865958a..2db13629b 100644 --- a/examples/c++/multi-gpu-poseidon/example.cu +++ b/examples/c++/multi-gpu-poseidon/example.cu @@ -7,6 +7,7 @@ // select the curve #define CURVE_ID 2 #include "appUtils/poseidon/poseidon.cu" +#include "utils/error_handler.cuh" using namespace poseidon; using namespace curve_config; @@ -23,6 +24,12 @@ const int size_col = 11; // this function executes the Poseidon thread void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants * constants) { + cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id)); + if (err_result != cudaSuccess) { + std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl; + return; + } + // CHK_IF_RETURN(); I can't use it in a standard thread function PoseidonConfig column_config = { ctx, // ctx false, // are_inputes_on_device @@ -30,7 +37,7 @@ void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, false, // input_is_a_state false, // aligned false, // loop_state - false, // is_async + true, // is_async }; cudaError_t err = poseidon_hash(layers, column_hashes, (size_t) size_partition, *constants, column_config); checkCudaError(err); From 6681be549acfc9bcc8f4bb4ccfdc390519d45317 Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Fri, 16 Feb 2024 19:43:58 +0000 Subject: [PATCH 07/11] fixed on-device memory issue --- examples/c++/multi-gpu-poseidon/example.cu | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu index 2db13629b..2d4bfbdfc 100644 --- a/examples/c++/multi-gpu-poseidon/example.cu +++ b/examples/c++/multi-gpu-poseidon/example.cu @@ -37,7 +37,7 @@ void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, false, // input_is_a_state false, // aligned false, // loop_state - true, // is_async + false, // is_async }; cudaError_t err = poseidon_hash(layers, column_hashes, (size_t) size_partition, *constants, column_config); checkCudaError(err); @@ -102,6 +102,11 @@ int main() { PoseidonConstants column_constants0, column_constants1; init_optimized_poseidon_constants(size_col, ctx0, &column_constants0); + cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id)); + if (err_result != cudaSuccess) { + std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl; + return; + } init_optimized_poseidon_constants(size_col, ctx1, &column_constants1); std::cout << "Parallel execution of Poseidon threads" << std::endl; From 518a3ad9b65d494371c30aeecc8f191eaae74d80 Mon Sep 17 00:00:00 2001 From: Stas Polonsky Date: Sat, 17 Feb 2024 00:18:21 +0000 Subject: [PATCH 08/11] ready for PR --- examples/c++/multi-gpu-poseidon/example.cu | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu index 2d4bfbdfc..6e5d1d56a 100644 --- a/examples/c++/multi-gpu-poseidon/example.cu +++ b/examples/c++/multi-gpu-poseidon/example.cu @@ -54,7 +54,7 @@ using FpMilliseconds = std::chrono::duration Date: Tue, 20 Feb 2024 19:14:25 -0500 Subject: [PATCH 09/11] fixed spelling --- examples/c++/multi-gpu-poseidon/README.md | 3 ++- examples/c++/multi-gpu-poseidon/example.cu | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/examples/c++/multi-gpu-poseidon/README.md b/examples/c++/multi-gpu-poseidon/README.md index 4fbae3338..5fb110fe6 100644 --- a/examples/c++/multi-gpu-poseidon/README.md +++ b/examples/c++/multi-gpu-poseidon/README.md @@ -1,4 +1,4 @@ -# Icicle example: using muliple GPU to hash large dataset +# Icicle example: using multiple GPU to hash large dataset ## Best-Practices @@ -31,6 +31,7 @@ ctx0.device_id=0; device_context::DeviceContext ctx1 = device_context::get_default_device_context(); ctx1.device_id=1; ``` + 4. Finally, spawn the threads and wait for their completion ```c++ diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu index 6e5d1d56a..edb12f9b1 100644 --- a/examples/c++/multi-gpu-poseidon/example.cu +++ b/examples/c++/multi-gpu-poseidon/example.cu @@ -57,7 +57,7 @@ int main() { const unsigned size_row = (1<<30); const unsigned nof_partitions = 64; const unsigned size_partition = size_row / nof_partitions; - // layers is allocated only for one partition, need to resuse for different partitions + // layers is allocated only for one partition, need to reuse for different partitions const uint32_t size_layers = size_col * size_partition; nvmlInit(); From 82d1ff47695d0b585d32f2932977efbf753e8e00 Mon Sep 17 00:00:00 2001 From: stas Date: Tue, 20 Feb 2024 20:40:45 -0500 Subject: [PATCH 10/11] fixed spelling in poseidon.cuh --- icicle/appUtils/poseidon/poseidon.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/icicle/appUtils/poseidon/poseidon.cuh b/icicle/appUtils/poseidon/poseidon.cuh index 18fcf77e8..6eb439567 100644 --- a/icicle/appUtils/poseidon/poseidon.cuh +++ b/icicle/appUtils/poseidon/poseidon.cuh @@ -119,7 +119,7 @@ namespace poseidon { /** * Loads pre-calculated optimized constants, moves them to the device */ - // Stas: I have an issue with the number of argumnets + // Stas: I have an issue with the number of arguments template cudaError_t init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants* constants); From c046fd17c6744a80f6be3dbda5abacc33b93fd10 Mon Sep 17 00:00:00 2001 From: stas Date: Tue, 20 Feb 2024 20:43:59 -0500 Subject: [PATCH 11/11] removed my comments from poseidon.cuh --- icicle/appUtils/poseidon/poseidon.cuh | 1 - 1 file changed, 1 deletion(-) diff --git a/icicle/appUtils/poseidon/poseidon.cuh b/icicle/appUtils/poseidon/poseidon.cuh index 6eb439567..b139dc9a3 100644 --- a/icicle/appUtils/poseidon/poseidon.cuh +++ b/icicle/appUtils/poseidon/poseidon.cuh @@ -119,7 +119,6 @@ namespace poseidon { /** * Loads pre-calculated optimized constants, moves them to the device */ - // Stas: I have an issue with the number of arguments template cudaError_t init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants* constants);