Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Temp/stas/muli gpu example #381

Merged
merged 13 commits into from
Feb 21, 2024
25 changes: 25 additions & 0 deletions examples/c++/multi-gpu-poseidon/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
cmake_minimum_required(VERSION 3.18)
set(CMAKE_CXX_STANDARD 17)
set(CMAKE_CUDA_STANDARD 17)
set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
else()
set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
endif ()
project(icicle LANGUAGES CUDA CXX)

set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
set(CMAKE_CUDA_FLAGS_RELEASE "")
set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
# change the path to your Icicle location
include_directories("../../../icicle")
add_executable(
example
example.cu
)
find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
target_link_libraries(example ${NVML_LIBRARY})
set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)

52 changes: 52 additions & 0 deletions examples/c++/multi-gpu-poseidon/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Icicle example: using multiple GPU to hash large dataset

## Best-Practices

This example builds on [single GPU Poseidon example](../poseidon/README.md) so we recommend to run it first.

## Key-Takeaway

Use `device_context::DeviceContext` variable to select GPU to use.
Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.

## Concise Usage Explanation

1. Include c++ threads

```c++
#include <thread>
```

2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.

```c++
void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
```

3. Initialize device contexts for different GPUs

```c++
device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;
```

4. Finally, spawn the threads and wait for their completion

```c++
std::thread thread0(threadPoseidon, ctx0, ...);
std::thread thread1(threadPoseidon, ctx1, ...);
thread0.join();
thread1.join();
```

## What's in the example

This is a **toy** example executing the first step of the Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseison hashes for each column of $11 \times 2^{30}$ matrix.

1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions`
2. Hash two partitions in parallel on two GPUs
3. Hash two partitions in series on one GPU
4. Compare execution times

9 changes: 9 additions & 0 deletions examples/c++/multi-gpu-poseidon/compile.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash

# Exit immediately on error
set -e

rm -rf build
mkdir -p build
cmake -S . -B build
cmake --build build
148 changes: 148 additions & 0 deletions examples/c++/multi-gpu-poseidon/example.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
#include <iostream>
#include <thread>
#include <chrono>

#include <nvml.h>

// select the curve
#define CURVE_ID 2
#include "appUtils/poseidon/poseidon.cu"
#include "utils/error_handler.cuh"

using namespace poseidon;
using namespace curve_config;

void checkCudaError(cudaError_t error) {
if (error != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
// Handle the error, e.g., exit the program or throw an exception.
}
}

// these global constants go into template calls
const int size_col = 11;

// this function executes the Poseidon thread
void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
// CHK_IF_RETURN(); I can't use it in a standard thread function
PoseidonConfig column_config = {
ctx, // ctx
false, // are_inputes_on_device
false, // are_outputs_on_device
false, // input_is_a_state
false, // aligned
false, // loop_state
false, // is_async
};
cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
checkCudaError(err);
}

using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());


#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
exit(EXIT_FAILURE); \
}

int main() {
const unsigned size_row = (1<<30);
const unsigned nof_partitions = 64;
const unsigned size_partition = size_row / nof_partitions;
// layers is allocated only for one partition, need to reuse for different partitions
const uint32_t size_layers = size_col * size_partition;

nvmlInit();
unsigned int deviceCount;
nvmlDeviceGetCount(&deviceCount);
std::cout << "Available GPUs: " << deviceCount << std::endl;

for (unsigned int i = 0; i < deviceCount; ++i) {
nvmlDevice_t device;
nvmlMemory_t memory;
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
nvmlDeviceGetHandleByIndex(i, &device);
nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
nvmlDeviceGetMemoryInfo(device, &memory);
std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/" << memory.free/1024/1024 << std::endl;
}

const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
std::cout << "Required Memory (MiB) " << memory_partition << std::endl;

//===============================================================================
// Key: multiple devices are supported by device context
//===============================================================================

device_context::DeviceContext ctx0 = device_context::get_default_device_context();
ctx0.device_id=0;
device_context::DeviceContext ctx1 = device_context::get_default_device_context();
ctx1.device_id=1;

std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers0);
scalar_t s = scalar_t::zero();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers0[i] = s;
s = s + scalar_t::one();
}
scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
CHECK_ALLOC(layers1);
s = scalar_t::zero() + scalar_t::one();
for (unsigned i = 0; i < size_col*size_partition ; i++) {
layers1[i] = s;
s = s + scalar_t::one();
}

scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash0);
scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
CHECK_ALLOC(column_hash1);

PoseidonConstants<scalar_t> column_constants0, column_constants1;
init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
cudaError_t err_result = CHK_STICKY(cudaSetDevice(ctx1.device_id));
if (err_result != cudaSuccess) {
std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
return;
}
init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);

std::cout << "Parallel execution of Poseidon threads" << std::endl;
START_TIMER(parallel);
std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);

// Wait for the threads to finish
thread0.join();
thread1.join();
END_TIMER(parallel,"2 GPUs");
std::cout << "Output Data from Thread 0: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 1: ";
std::cout << column_hash1[0] << std::endl;

std::cout << "Sequential execution of Poseidon threads" << std::endl;
START_TIMER(sequential);
std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
thread2.join();
std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
thread3.join();
END_TIMER(sequential,"1 GPU");
std::cout << "Output Data from Thread 2: ";
std::cout << column_hash0[0] << std::endl;
std::cout << "Output Data from Thread 3: ";
std::cout << column_hash1[0] << std::endl;

nvmlShutdown();
return 0;
}
2 changes: 2 additions & 0 deletions examples/c++/multi-gpu-poseidon/run.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
#!/bin/bash
./build/example
Loading