ingonyama-zk · svpolonsky · Feb 21, 2024 · Feb 14, 2024 · Feb 15, 2024 · Feb 15, 2024
diff --git a/examples/c++/multi-gpu-poseidon/CMakeLists.txt b/examples/c++/multi-gpu-poseidon/CMakeLists.txt
@@ -0,0 +1,25 @@
+# Build script for the multi-GPU Poseidon example: compiles example.cu into the
+# `example` executable and links it against the NVML library.
+cmake_minimum_required(VERSION 3.18)
+# Require C++17 for both the host (CXX) and the CUDA compilation.
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+# The target GPU architectures are chosen here, before project() enables the CUDA language.
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    # CMake older than 3.24: take the architectures from the CUDA_ARCH variable
+    # (presumably passed on the command line, e.g. -DCUDA_ARCH=80 - it is not set in this file)
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # the `native` value is available from CMake 3.24 on: build for the GPUs of the build machine
+endif ()
+project(icicle LANGUAGES CUDA CXX)
+
+# Flags passed to the CUDA compiler for every build type
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+# NOTE(review): this empties the Release-specific CUDA flags - confirm that dropping CMake's defaults is intended
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+# Debug builds: host (-g) and device (-G) debug information, optimizations off
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+# Icicle sources are included by relative path; change the path to your Icicle location
+include_directories("../../../icicle")
+add_executable(
+  example
+  example.cu
+)
+# Locate the NVML library (nvidia-ml); the CUDA toolkit stubs directory is given as an extra search path
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
diff --git a/examples/c++/multi-gpu-poseidon/README.md b/examples/c++/multi-gpu-poseidon/README.md
@@ -0,0 +1,52 @@
+# Icicle example: using multiple GPUs to hash a large dataset
+
+## Best-Practices
+
+This example builds on the [single GPU Poseidon example](../poseidon/README.md), so we recommend running that example first.
+
+## Key-Takeaway
+
+Use a `device_context::DeviceContext` variable to select which GPU to use.
+Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.
+
+## Concise Usage Explanation
+
+1. Include the C++ thread support header
+
+```c++
+#include <thread>
+```
+
+2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.
+
+```c++
+void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
+```
+
+3. Initialize device contexts for different GPUs
+
+```c++
+device_context::DeviceContext ctx0 = device_context::get_default_device_context();
+ctx0.device_id=0;
+device_context::DeviceContext ctx1 = device_context::get_default_device_context();
+ctx1.device_id=1;
+``` 
+
+4. Finally, spawn the threads and wait for their completion
+
+```c++
+std::thread thread0(threadPoseidon, ctx0, ...);
+std::thread thread1(threadPoseidon, ctx1, ...);
+thread0.join();
+thread1.join();
+```
+
+## What's in the example
+
+This is a **toy** example executing the first step of Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseidon hashes, one per column of an $11 \times 2^{30}$ matrix.
+
+1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions`
+2. Hash two partitions in parallel on two GPUs
+3. Hash two partitions in series on one GPU
+4. Compare execution times
+
diff --git a/examples/c++/multi-gpu-poseidon/compile.sh b/examples/c++/multi-gpu-poseidon/compile.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Configure and build the example from scratch into the `build` directory
+# located next to this script.
+
+# Exit immediately on error
+set -e
+
+# Work from the script's own directory, so that `rm -rf build` and the relative
+# CMake paths below never act on a `build` directory in the caller's working directory.
+cd "$(dirname "${BASH_SOURCE[0]}")"
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
@@ -0,0 +1,148 @@
+#include <iostream>
+#include <thread>
+#include <chrono>
+
+#include <nvml.h>
+
+// select the curve
+#define CURVE_ID 2
+#include "appUtils/poseidon/poseidon.cu"
+#include "utils/error_handler.cuh"
+
+using namespace poseidon;
+using namespace curve_config;
+
+/// Reports a failed CUDA status on stderr; a successful status is a no-op.
+/// The error is only logged: the function returns normally in both cases,
+/// so a caller that must stop on failure has to do that itself.
+void checkCudaError(cudaError_t error) {
+    if (error == cudaSuccess) {
+        return;
+    }
+    std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
+}
+
+// Column size of the example; kept as a compile-time global because it is
+// used inside template arguments (size_col+1)
+constexpr int size_col{11};
+
+/**
+ * Thread entry point: selects the GPU given by `ctx` and runs one Poseidon
+ * hashing pass on it.
+ *
+ * @param ctx            device context; `ctx.device_id` is the GPU this thread switches to
+ * @param size_partition number of hashes requested from `poseidon_hash`
+ * @param layers         input elements (configured below as not residing on the device)
+ * @param column_hashes  output buffer for the hashes (configured below as not residing on the device)
+ * @param constants      Poseidon constants; dereferenced and passed to `poseidon_hash`
+ *
+ * Nothing is returned to the spawning thread: failures are only printed to stderr.
+ */
+void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
+    // CHK_IF_RETURN() is not used here: per the original author's note it cannot
+    // be used in a standard thread function, so the status is checked by hand.
+    const cudaError_t bind_status = CHK_STICKY(cudaSetDevice(ctx.device_id));
+    if (bind_status != cudaSuccess) {
+        std::cerr << "CUDA error: " << cudaGetErrorString(bind_status) << std::endl;
+        return;
+    }
+
+    PoseidonConfig column_config = {
+        ctx,   // ctx
+        false, // are_inputs_on_device
+        false, // are_outputs_on_device
+        false, // input_is_a_state
+        false, // aligned
+        false, // loop_state
+        false, // is_async
+    };
+
+    // Hash the partition and report (but do not propagate) a failure.
+    checkCudaError(
+        poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, static_cast<size_t>(size_partition), *constants, column_config));
+}
+
+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+
+#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
+    std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
+    exit(EXIT_FAILURE); \
+}
+
+/**
+ * Example driver: hashes two partitions with Poseidon, first in parallel on
+ * GPU 0 and GPU 1, then one after the other on GPU 0, and prints the elapsed
+ * time of both runs together with the first hash of each output buffer.
+ *
+ * @return 0 on success; EXIT_FAILURE when NVML cannot be initialized, when
+ *         fewer than two GPUs are reported, or when the second GPU cannot be selected.
+ */
+int main() {
+    const unsigned size_row = (1<<30);
+    const unsigned nof_partitions = 64;
+    const unsigned size_partition = size_row / nof_partitions;
+    // layers is allocated only for one partition, need to reuse for different partitions
+    const uint32_t size_layers = size_col * size_partition;
+
+    const nvmlReturn_t nvml_status = nvmlInit();
+    if (nvml_status != NVML_SUCCESS) {
+        std::cerr << "NVML error: " << nvmlErrorString(nvml_status) << std::endl;
+        return EXIT_FAILURE;
+    }
+    // Zero-initialized so that a failed query reads as "no GPUs" instead of an indeterminate value
+    unsigned int deviceCount = 0;
+    nvmlDeviceGetCount(&deviceCount);
+    std::cout << "Available GPUs: " << deviceCount << std::endl;
+
+    for (unsigned int i = 0; i < deviceCount; ++i) {
+        nvmlDevice_t device;
+        nvmlMemory_t memory = {};
+        char name[NVML_DEVICE_NAME_BUFFER_SIZE] = "";
+        nvmlDeviceGetHandleByIndex(i, &device);
+        nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
+        nvmlDeviceGetMemoryInfo(device, &memory);
+        std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/"  << memory.free/1024/1024 << std::endl;
+    }
+
+    // The example addresses devices 0 and 1 explicitly, so stop early when they cannot both exist
+    if (deviceCount < 2) {
+        std::cerr << "This example requires at least 2 GPUs, found " << deviceCount << std::endl;
+        nvmlShutdown();
+        return EXIT_FAILURE;
+    }
+
+    const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
+    std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
+
+    //===============================================================================
+    // Key: multiple devices are supported by device context
+    //===============================================================================
+
+    device_context::DeviceContext ctx0 = device_context::get_default_device_context();
+    ctx0.device_id=0;
+    device_context::DeviceContext ctx1 = device_context::get_default_device_context();
+    ctx1.device_id=1;
+
+    std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
+    scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    CHECK_ALLOC(layers0);
+    // layers0 holds the sequence 0, 1, 2, ...
+    scalar_t s = scalar_t::zero();
+    for (uint32_t i = 0; i < size_layers; i++) {
+        layers0[i] = s;
+        s = s + scalar_t::one();
+    }
+    scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    CHECK_ALLOC(layers1);
+    // layers1 holds the sequence 1, 2, 3, ... so the two partitions hash different data
+    s = scalar_t::zero() + scalar_t::one();
+    for (uint32_t i = 0; i < size_layers; i++) {
+        layers1[i] = s;
+        s = s + scalar_t::one();
+    }
+
+    scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+    CHECK_ALLOC(column_hash0);
+    scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+    CHECK_ALLOC(column_hash1);
+
+    // Releases every host buffer allocated above; used on the error path and at normal exit
+    const auto release_host_buffers = [&]() {
+        free(layers0);
+        free(layers1);
+        free(column_hash0);
+        free(column_hash1);
+    };
+
+    // One set of Poseidon constants per GPU. The first set is initialized before any
+    // cudaSetDevice call in this thread; the second one after switching to ctx1's device.
+    PoseidonConstants<scalar_t> column_constants0, column_constants1;
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
+    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx1.device_id));
+    if (err_result != cudaSuccess) {
+        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
+        release_host_buffers();
+        nvmlShutdown();
+        return EXIT_FAILURE;
+    }
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
+
+    std::cout << "Parallel execution of Poseidon threads" << std::endl;
+    START_TIMER(parallel);
+    std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+    std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
+
+    // Wait for the threads to finish
+    thread0.join();
+    thread1.join();
+    END_TIMER(parallel,"2 GPUs");
+    std::cout << "Output Data from Thread 0: ";
+    std::cout << column_hash0[0] << std::endl;
+    std::cout << "Output Data from Thread 1: ";
+    std::cout << column_hash1[0] << std::endl;
+
+    // Same two partitions, but both on GPU 0 and strictly one after the other:
+    // each thread is joined before the next one is started.
+    std::cout << "Sequential execution of Poseidon threads" << std::endl;
+    START_TIMER(sequential);
+    std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+    thread2.join();
+    std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
+    thread3.join();
+    END_TIMER(sequential,"1 GPU");
+    std::cout << "Output Data from Thread 2: ";
+    std::cout << column_hash0[0] << std::endl;
+    std::cout << "Output Data from Thread 3: ";
+    std::cout << column_hash1[0] << std::endl;
+
+    release_host_buffers();
+    nvmlShutdown();
+    return 0;
+}
diff --git a/examples/c++/multi-gpu-poseidon/run.sh b/examples/c++/multi-gpu-poseidon/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+# Run the example binary from the `build` directory next to this script.
+# The path is resolved relative to the script, so it works from any working directory.
+"$(dirname "${BASH_SOURCE[0]}")/build/example"