From 4d75fbac93b7b0ea1c4884a85d9b135f3a72d0fc Mon Sep 17 00:00:00 2001
From: ImmanuelSegol <3ditds@gmail.com>
Date: Wed, 14 Feb 2024 22:28:17 +0000
Subject: [PATCH 01/11] issue with init_optimized_poseidon_constants

---
 .../c++/multi-gpu-poseidon/CMakeLists.txt     |  25 ++++
 examples/c++/multi-gpu-poseidon/README.md     |   2 +
 examples/c++/multi-gpu-poseidon/compile.sh    |   9 ++
 examples/c++/multi-gpu-poseidon/example.cu    | 119 ++++++++++++++++++
 examples/c++/multi-gpu-poseidon/run.sh        |   2 +
 5 files changed, 157 insertions(+)
 create mode 100644 examples/c++/multi-gpu-poseidon/CMakeLists.txt
 create mode 100644 examples/c++/multi-gpu-poseidon/README.md
 create mode 100755 examples/c++/multi-gpu-poseidon/compile.sh
 create mode 100644 examples/c++/multi-gpu-poseidon/example.cu
 create mode 100755 examples/c++/multi-gpu-poseidon/run.sh
diff --git a/examples/c++/multi-gpu-poseidon/CMakeLists.txt b/examples/c++/multi-gpu-poseidon/CMakeLists.txt
new file mode 100644
index 000000000..424b3e9bc
--- /dev/null
+++ b/examples/c++/multi-gpu-poseidon/CMakeLists.txt
@@ -0,0 +1,25 @@
+cmake_minimum_required(VERSION 3.18)
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CUDA_STANDARD 17)
+set(CMAKE_CUDA_STANDARD_REQUIRED TRUE)
+set(CMAKE_CXX_STANDARD_REQUIRED TRUE)
+if (${CMAKE_VERSION} VERSION_LESS "3.24.0")
+    set(CMAKE_CUDA_ARCHITECTURES ${CUDA_ARCH})
+else()
+    set(CMAKE_CUDA_ARCHITECTURES native) # on 3.24+, on earlier it is ignored, and the target is not passed
+endif ()
+project(icicle LANGUAGES CUDA CXX)
+
+set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr")
+set(CMAKE_CUDA_FLAGS_RELEASE "")
+set(CMAKE_CUDA_FLAGS_DEBUG "${CMAKE_CUDA_FLAGS_DEBUG} -g -G -O0")
+# change the path to your Icicle location
+include_directories("../../../icicle")
+add_executable(
+  example
+  example.cu
+)
+find_library(NVML_LIBRARY nvidia-ml PATHS /usr/local/cuda/targets/x86_64-linux/lib/stubs/ )
+target_link_libraries(example ${NVML_LIBRARY})
+set_target_properties(example PROPERTIES CUDA_SEPARABLE_COMPILATION ON)
+
diff --git a/examples/c++/multi-gpu-poseidon/README.md b/examples/c++/multi-gpu-poseidon/README.md
new file mode 100644
index 000000000..28ecc8a53
--- /dev/null
+++ b/examples/c++/multi-gpu-poseidon/README.md
@@ -0,0 +1,2 @@
+# Muliple GPU on a single host
+
diff --git a/examples/c++/multi-gpu-poseidon/compile.sh b/examples/c++/multi-gpu-poseidon/compile.sh
new file mode 100755
index 000000000..36c1ddac1
--- /dev/null
+++ b/examples/c++/multi-gpu-poseidon/compile.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Exit immediately on error
+set -e
+
+rm -rf build
+mkdir -p build
+cmake -S . -B build
+cmake --build build
diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
new file mode 100644
index 000000000..91d35416e
--- /dev/null
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -0,0 +1,119 @@
+#include <iostream>
+#include <thread>
+#include <vector>
+
+// select the curve (only 2 available so far)
+#define CURVE_ID 2
+#include "appUtils/poseidon/poseidon.cu"
+
+using namespace poseidon;
+using namespace curve_config;
+
+void setCudaDevice(const unsigned device_id) {
+    // Example function to set the CUDA device
+    std::cout << "Setting CUDA device to " << device_id << std::endl;
+    // cudaSetDevice(device_id);
+}
+
+// function that a thread will execute
+void processData(const device_context::DeviceContext ctx, const std::vector<int>& inputData, std::vector<int>& outputData) {
+    // Simulate some processing
+    PoseidonConstants<scalar_t> column_constants;
+    int size_col = 11;
+    // init_optimized_poseidon_constants<scalar_t>(ctx, &column_constants);
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx, &column_constants);
+    PoseidonConfig column_config = default_poseidon_config<scalar_t>(size_col+1);
+    column_config.are_inputs_on_device = true;
+    column_config.are_outputs_on_device = true;
+
+    for (int num : inputData) {
+        outputData.push_back(num * 2); // Example operation
+    }
+}
+
+void checkCudaError(cudaError_t error) {
+    if (error != cudaSuccess) {
+        std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
+        // Handle the error, e.g., exit the program or throw an exception.
+    }
+}
+
+int main() {
+    const uint32_t size_col=11;
+    const unsigned size_partition = 1024; // size_row / nof_partitions;
+    // layers is allocated only for one partition, need to resuse for different partitions
+    const uint32_t size_layers = size_col * size_partition; // size_col * size_row
+    // Input data for each thread
+    std::vector<int> inputData1 = {1, 2, 3, 4};
+    std::vector<int> inputData2 = {5, 6, 7, 8};
+
+    // Output data for each thread
+    std::vector<int> outputData1, outputData2;
+
+
+    // Multiple devices are supported by device context
+
+    // setCudaDevice(device_id);
+    cudaStream_t stream0, stream1;
+    cudaError_t err;
+    err = cudaStreamCreate(&stream0);
+    checkCudaError(err);
+    err = cudaStreamCreate(&stream1);
+    checkCudaError(err);
+
+    device_context::DeviceContext ctx0 = device_context::DeviceContext{
+      (cudaStream_t&)stream0, // SP: simulate different device as stream
+      0,                             // device_id
+      0,                             // mempool
+    };
+    device_context::DeviceContext ctx1 = device_context::DeviceContext{
+      (cudaStream_t&)stream1, // SP: simulate different device as stream
+      0,                             // device_id
+      0,                             // mempool
+    };
+
+    // Allocate and initialize memory for the layers
+    scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    if (layers0 == nullptr) {
+        std::cerr << "Memory allocation for 'layers' failed." << std::endl;
+    }
+    scalar_t s = scalar_t::zero();
+    for (unsigned i = 0; i < size_col*size_partition ; i++) {
+        layers0[i] = s;
+        s = s + scalar_t::one();
+    }
+    scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    if (layers1 == nullptr) {
+        std::cerr << "Memory allocation for 'layers' failed." << std::endl;
+    }
+    s = scalar_t::zero() + scalar_t::one();
+    for (unsigned i = 0; i < size_col*size_partition ; i++) {
+        layers1[i] = s;
+        s = s + scalar_t::one();
+    }
+
+
+
+    // Start threads
+    std::thread thread1(processData, ctx0, std::ref(inputData1), std::ref(outputData1));
+    std::thread thread2(processData, ctx1, std::ref(inputData2), std::ref(outputData2));
+
+    // Wait for the threads to finish
+    thread1.join();
+    thread2.join();
+
+    // Process the output data (example: print the data)
+    std::cout << "Output Data from Thread 1: ";
+    for (int num : outputData1) {
+        std::cout << num << " ";
+    }
+    std::cout << std::endl;
+
+    std::cout << "Output Data from Thread 2: ";
+    for (int num : outputData2) {
+        std::cout << num << " ";
+    }
+    std::cout << std::endl;
+
+    return 0;
+}
diff --git a/examples/c++/multi-gpu-poseidon/run.sh b/examples/c++/multi-gpu-poseidon/run.sh
new file mode 100755
index 000000000..6e3fc976d
--- /dev/null
+++ b/examples/c++/multi-gpu-poseidon/run.sh
@@ -0,0 +1,2 @@
+#!/bin/bash
+./build/example

From 481f144dc80dabadb9cc7c7192bce0f94347ebd8 Mon Sep 17 00:00:00 2001
From: ImmanuelSegol <3ditds@gmail.com>
Date: Thu, 15 Feb 2024 15:11:20 +0000
Subject: [PATCH 02/11] debug

---
 examples/c++/multi-gpu-poseidon/example.cu | 57 ++++++++++------------
 icicle/appUtils/poseidon/poseidon.cuh      |  1 +
 2 files changed, 27 insertions(+), 31 deletions(-)

diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
index 91d35416e..953952922 100644
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -16,11 +16,9 @@ void setCudaDevice(const unsigned device_id) {
 }
 
 // function that a thread will execute
-void processData(const device_context::DeviceContext ctx, const std::vector<int>& inputData, std::vector<int>& outputData) {
-    // Simulate some processing
+void processData(device_context::DeviceContext ctx, const std::vector<int>& inputData, std::vector<int>& outputData) {
     PoseidonConstants<scalar_t> column_constants;
     int size_col = 11;
-    // init_optimized_poseidon_constants<scalar_t>(ctx, &column_constants);
     init_optimized_poseidon_constants<scalar_t>(size_col, ctx, &column_constants);
     PoseidonConfig column_config = default_poseidon_config<scalar_t>(size_col+1);
     column_config.are_inputs_on_device = true;
@@ -61,36 +59,33 @@ int main() {
     err = cudaStreamCreate(&stream1);
     checkCudaError(err);
 
-    device_context::DeviceContext ctx0 = device_context::DeviceContext{
-      (cudaStream_t&)stream0, // SP: simulate different device as stream
-      0,                             // device_id
-      0,                             // mempool
-    };
-    device_context::DeviceContext ctx1 = device_context::DeviceContext{
-      (cudaStream_t&)stream1, // SP: simulate different device as stream
-      0,                             // device_id
-      0,                             // mempool
-    };
+    device_context::DeviceContext ctx0 = device_context::get_default_device_context();
+    ctx0.device_id=0;
+    device_context::DeviceContext ctx1 = device_context::get_default_device_context();
+    ctx1.device_id=1;
+
+    
+    
 
     // Allocate and initialize memory for the layers
-    scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
-    if (layers0 == nullptr) {
-        std::cerr << "Memory allocation for 'layers' failed." << std::endl;
-    }
-    scalar_t s = scalar_t::zero();
-    for (unsigned i = 0; i < size_col*size_partition ; i++) {
-        layers0[i] = s;
-        s = s + scalar_t::one();
-    }
-    scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
-    if (layers1 == nullptr) {
-        std::cerr << "Memory allocation for 'layers' failed." << std::endl;
-    }
-    s = scalar_t::zero() + scalar_t::one();
-    for (unsigned i = 0; i < size_col*size_partition ; i++) {
-        layers1[i] = s;
-        s = s + scalar_t::one();
-    }
+    // scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    // if (layers0 == nullptr) {
+    //     std::cerr << "Memory allocation for 'layers' failed." << std::endl;
+    // }
+    // scalar_t s = scalar_t::zero();
+    // for (unsigned i = 0; i < size_col*size_partition ; i++) {
+    //     layers0[i] = s;
+    //     s = s + scalar_t::one();
+    // }
+    // scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    // if (layers1 == nullptr) {
+    //     std::cerr << "Memory allocation for 'layers' failed." << std::endl;
+    // }
+    // s = scalar_t::zero() + scalar_t::one();
+    // for (unsigned i = 0; i < size_col*size_partition ; i++) {
+    //     layers1[i] = s;
+    //     s = s + scalar_t::one();
+    // }
 
 
 
diff --git a/icicle/appUtils/poseidon/poseidon.cuh b/icicle/appUtils/poseidon/poseidon.cuh
index 5541405f3..b51dfc366 100644
--- a/icicle/appUtils/poseidon/poseidon.cuh
+++ b/icicle/appUtils/poseidon/poseidon.cuh
@@ -119,6 +119,7 @@ namespace poseidon {
   /**
    * Loads pre-calculated optimized constants, moves them to the device
    */
+  // Stas: I have an issue with the number of argumnets
   template <typename S>
   cudaError_t init_optimized_poseidon_constants(device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);
 

From 29675bb40d27ffa91f436b10e96a487dcb4ac281 Mon Sep 17 00:00:00 2001
From: ImmanuelSegol <3ditds@gmail.com>
Date: Thu, 15 Feb 2024 16:45:33 +0000
Subject: [PATCH 03/11] executes without errors

---
 examples/c++/multi-gpu-poseidon/example.cu | 117 ++++++++-------------
 1 file changed, 44 insertions(+), 73 deletions(-)

diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
index 953952922..e0a3790de 100644
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -2,33 +2,13 @@
 #include <thread>
 #include <vector>
 
-// select the curve (only 2 available so far)
+// select the curve
 #define CURVE_ID 2
 #include "appUtils/poseidon/poseidon.cu"
 
 using namespace poseidon;
 using namespace curve_config;
 
-void setCudaDevice(const unsigned device_id) {
-    // Example function to set the CUDA device
-    std::cout << "Setting CUDA device to " << device_id << std::endl;
-    // cudaSetDevice(device_id);
-}
-
-// function that a thread will execute
-void processData(device_context::DeviceContext ctx, const std::vector<int>& inputData, std::vector<int>& outputData) {
-    PoseidonConstants<scalar_t> column_constants;
-    int size_col = 11;
-    init_optimized_poseidon_constants<scalar_t>(size_col, ctx, &column_constants);
-    PoseidonConfig column_config = default_poseidon_config<scalar_t>(size_col+1);
-    column_config.are_inputs_on_device = true;
-    column_config.are_outputs_on_device = true;
-
-    for (int num : inputData) {
-        outputData.push_back(num * 2); // Example operation
-    }
-}
-
 void checkCudaError(cudaError_t error) {
     if (error != cudaSuccess) {
         std::cerr << "CUDA error: " << cudaGetErrorString(error) << std::endl;
@@ -36,78 +16,69 @@ void checkCudaError(cudaError_t error) {
     }
 }
 
+// these global varibales go into template calls
+const int size_col = 11;
+
+// this function executes the Poseidon thread
+void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes) {
+    PoseidonConstants<scalar_t> column_constants;
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx, &column_constants);
+    PoseidonConfig column_config = default_poseidon_config<scalar_t>(size_col+1);
+    cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, column_constants, column_config);
+    checkCudaError(err);
+}
+
 int main() {
     const uint32_t size_col=11;
-    const unsigned size_partition = 1024; // size_row / nof_partitions;
+    const unsigned size_row = (1<<30);
+    const unsigned nof_partitions = 64;
+    const unsigned size_partition = size_row / nof_partitions;
     // layers is allocated only for one partition, need to resuse for different partitions
     const uint32_t size_layers = size_col * size_partition; // size_col * size_row
-    // Input data for each thread
-    std::vector<int> inputData1 = {1, 2, 3, 4};
-    std::vector<int> inputData2 = {5, 6, 7, 8};
-
-    // Output data for each thread
-    std::vector<int> outputData1, outputData2;
-
-
-    // Multiple devices are supported by device context
-
-    // setCudaDevice(device_id);
-    cudaStream_t stream0, stream1;
-    cudaError_t err;
-    err = cudaStreamCreate(&stream0);
-    checkCudaError(err);
-    err = cudaStreamCreate(&stream1);
-    checkCudaError(err);
+    
+    // Key: multiple devices are supported by device context
 
     device_context::DeviceContext ctx0 = device_context::get_default_device_context();
     ctx0.device_id=0;
     device_context::DeviceContext ctx1 = device_context::get_default_device_context();
     ctx1.device_id=1;
-
-    
     
+    // Allocate and initialize memory for the layers and hashes
+    scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    if (layers0 == nullptr) {
+        std::cerr << "Memory allocation for 'layers' failed." << std::endl;
+    }
+    scalar_t s = scalar_t::zero();
+    for (unsigned i = 0; i < size_col*size_partition ; i++) {
+        layers0[i] = s;
+        s = s + scalar_t::one();
+    }
+    scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
+    if (layers1 == nullptr) {
+        std::cerr << "Memory allocation for 'layers' failed." << std::endl;
+    }
+    s = scalar_t::zero() + scalar_t::one();
+    for (unsigned i = 0; i < size_col*size_partition ; i++) {
+        layers1[i] = s;
+        s = s + scalar_t::one();
+    }
 
-    // Allocate and initialize memory for the layers
-    // scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
-    // if (layers0 == nullptr) {
-    //     std::cerr << "Memory allocation for 'layers' failed." << std::endl;
-    // }
-    // scalar_t s = scalar_t::zero();
-    // for (unsigned i = 0; i < size_col*size_partition ; i++) {
-    //     layers0[i] = s;
-    //     s = s + scalar_t::one();
-    // }
-    // scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
-    // if (layers1 == nullptr) {
-    //     std::cerr << "Memory allocation for 'layers' failed." << std::endl;
-    // }
-    // s = scalar_t::zero() + scalar_t::one();
-    // for (unsigned i = 0; i < size_col*size_partition ; i++) {
-    //     layers1[i] = s;
-    //     s = s + scalar_t::one();
-    // }
-
-
+    scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+    scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
 
     // Start threads
-    std::thread thread1(processData, ctx0, std::ref(inputData1), std::ref(outputData1));
-    std::thread thread2(processData, ctx1, std::ref(inputData2), std::ref(outputData2));
+    std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0);
+    std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1);
 
     // Wait for the threads to finish
+    thread0.join();
     thread1.join();
-    thread2.join();
 
     // Process the output data (example: print the data)
-    std::cout << "Output Data from Thread 1: ";
-    for (int num : outputData1) {
-        std::cout << num << " ";
-    }
+    std::cout << "Output Data from Thread 0: ";
     std::cout << std::endl;
 
-    std::cout << "Output Data from Thread 2: ";
-    for (int num : outputData2) {
-        std::cout << num << " ";
-    }
+    std::cout << "Output Data from Thread 1: ";
     std::cout << std::endl;
 
     return 0;

From 418c3d844b3f9942d54662f96a727b79a918e0a2 Mon Sep 17 00:00:00 2001
From: Stas Polonsky <stas@ingonyama.com>
Date: Thu, 15 Feb 2024 22:10:15 +0000
Subject: [PATCH 04/11] completed example

---
 examples/c++/multi-gpu-poseidon/example.cu | 93 ++++++++++++++++------
 1 file changed, 68 insertions(+), 25 deletions(-)

diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
index e0a3790de..a6865958a 100644
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -1,6 +1,8 @@
 #include <iostream>
 #include <thread>
-#include <vector>
+#include <chrono>
+
+#include <nvml.h>
 
 // select the curve
 #define CURVE_ID 2
@@ -16,26 +18,53 @@ void checkCudaError(cudaError_t error) {
     }
 }
 
-// these global varibales go into template calls
+// these global constants go into template calls
 const int size_col = 11;
 
 // this function executes the Poseidon thread
-void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes) {
-    PoseidonConstants<scalar_t> column_constants;
-    init_optimized_poseidon_constants<scalar_t>(size_col, ctx, &column_constants);
-    PoseidonConfig column_config = default_poseidon_config<scalar_t>(size_col+1);
-    cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, column_constants, column_config);
+void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
+    PoseidonConfig column_config = {
+        ctx,   // ctx
+        false, // are_inputes_on_device
+        false, // are_outputs_on_device
+        false, // input_is_a_state
+        false, // aligned
+        false, // loop_state
+        false, // is_async
+        };
+    cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
     checkCudaError(err);
 }
 
+using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::period>;
+#define START_TIMER(timer) auto timer##_start = std::chrono::high_resolution_clock::now();
+#define END_TIMER(timer, msg) printf("%s: %.0f ms\n", msg, FpMilliseconds(std::chrono::high_resolution_clock::now() - timer##_start).count());
+
+
+#define CHECK_ALLOC(ptr) if ((ptr) == nullptr) { \
+    std::cerr << "Memory allocation for '" #ptr "' failed." << std::endl; \
+    exit(EXIT_FAILURE); \
+}
+
 int main() {
-    const uint32_t size_col=11;
-    const unsigned size_row = (1<<30);
+    const unsigned size_row = (1<<26);
     const unsigned nof_partitions = 64;
     const unsigned size_partition = size_row / nof_partitions;
     // layers is allocated only for one partition, need to resuse for different partitions
-    const uint32_t size_layers = size_col * size_partition; // size_col * size_row
+    const uint32_t size_layers = size_col * size_partition;
     
+    nvmlInit();
+    unsigned int deviceCount;
+    nvmlDeviceGetCount(&deviceCount);
+    std::cout << "Available GPUs: " << deviceCount << std::endl;
+    for (unsigned int i = 0; i < deviceCount; ++i) {
+        nvmlDevice_t device;
+        char name[NVML_DEVICE_NAME_BUFFER_SIZE];
+        nvmlDeviceGetHandleByIndex(i, &device);
+        nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
+        std::cout << "Device ID: " << i << ", Type: " << name << std::endl;
+    }
+
     // Key: multiple devices are supported by device context
 
     device_context::DeviceContext ctx0 = device_context::get_default_device_context();
@@ -43,20 +72,16 @@ int main() {
     device_context::DeviceContext ctx1 = device_context::get_default_device_context();
     ctx1.device_id=1;
     
-    // Allocate and initialize memory for the layers and hashes
+    std::cout << "Allocate and initialize the memory for layers and hashes" << std::endl;
     scalar_t* layers0 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
-    if (layers0 == nullptr) {
-        std::cerr << "Memory allocation for 'layers' failed." << std::endl;
-    }
+    CHECK_ALLOC(layers0);
     scalar_t s = scalar_t::zero();
     for (unsigned i = 0; i < size_col*size_partition ; i++) {
         layers0[i] = s;
         s = s + scalar_t::one();
     }
     scalar_t* layers1 = static_cast<scalar_t*>(malloc(size_layers * sizeof(scalar_t)));
-    if (layers1 == nullptr) {
-        std::cerr << "Memory allocation for 'layers' failed." << std::endl;
-    }
+    CHECK_ALLOC(layers1);
     s = scalar_t::zero() + scalar_t::one();
     for (unsigned i = 0; i < size_col*size_partition ; i++) {
         layers1[i] = s;
@@ -64,22 +89,40 @@ int main() {
     }
 
     scalar_t* column_hash0 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+    CHECK_ALLOC(column_hash0);
     scalar_t* column_hash1 = static_cast<scalar_t*>(malloc(size_partition * sizeof(scalar_t)));
+    CHECK_ALLOC(column_hash1);
+
+    PoseidonConstants<scalar_t> column_constants0, column_constants1;
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
+    init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
 
-    // Start threads
-    std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0);
-    std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1);
+    std::cout << "Parallel execution of Poseidon threads" << std::endl;
+    START_TIMER(parallel);
+    std::thread thread0(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+    std::thread thread1(threadPoseidon, ctx1, size_partition, layers1, column_hash1, &column_constants1);
 
     // Wait for the threads to finish
     thread0.join();
     thread1.join();
-
-    // Process the output data (example: print the data)
+    END_TIMER(parallel,"2 GPUs");
     std::cout << "Output Data from Thread 0: ";
-    std::cout << std::endl;
-
+    std::cout << column_hash0[0] << std::endl;
     std::cout << "Output Data from Thread 1: ";
-    std::cout << std::endl;
+    std::cout << column_hash1[0] << std::endl;
+
+    std::cout << "Sequential execution of Poseidon threads" << std::endl;
+    START_TIMER(sequential);
+    std::thread thread2(threadPoseidon, ctx0, size_partition, layers0, column_hash0, &column_constants0);
+    thread2.join();
+    std::thread thread3(threadPoseidon, ctx0, size_partition, layers1, column_hash1, &column_constants0);
+    thread3.join();
+    END_TIMER(sequential,"1 GPU");
+    std::cout << "Output Data from Thread 2: ";
+    std::cout << column_hash0[0] << std::endl;
+    std::cout << "Output Data from Thread 3: ";
+    std::cout << column_hash1[0] << std::endl;
 
+    nvmlShutdown();
     return 0;
 }

From 8dd52306dc9fc45cea00650ef4fe123a126a03ce Mon Sep 17 00:00:00 2001
From: Stas Polonsky <stas@ingonyama.com>
Date: Thu, 15 Feb 2024 23:07:33 +0000
Subject: [PATCH 05/11] update README

---
 examples/c++/multi-gpu-poseidon/README.md | 51 ++++++++++++++++++++++-
 1 file changed, 50 insertions(+), 1 deletion(-)

diff --git a/examples/c++/multi-gpu-poseidon/README.md b/examples/c++/multi-gpu-poseidon/README.md
index 28ecc8a53..4fbae3338 100644
--- a/examples/c++/multi-gpu-poseidon/README.md
+++ b/examples/c++/multi-gpu-poseidon/README.md
@@ -1,2 +1,51 @@
-# Muliple GPU on a single host
+# Icicle example: using muliple GPU to hash large dataset
+
+## Best-Practices
+
+This example builds on [single GPU Poseidon example](../poseidon/README.md) so we recommend to run it first.
+
+## Key-Takeaway
+
+Use `device_context::DeviceContext` variable to select GPU to use. 
+Use C++ threads to compute `Icicle` primitives on different GPUs in parallel.
+
+## Concise Usage Explanation
+
+1. Include c++ threads
+
+```c++
+#include <thread>
+```
+
+2. Define a __thread function__. Importantly, device context `ctx` will hold the GPU id.
+
+```c++
+void threadPoseidon(device_context::DeviceContext ctx, ...) {...}
+```
+
+3. Initialize device contexts for different GPUs
+
+```c++
+device_context::DeviceContext ctx0 = device_context::get_default_device_context();
+ctx0.device_id=0;
+device_context::DeviceContext ctx1 = device_context::get_default_device_context();
+ctx1.device_id=1;
+``` 
+4. Finally, spawn the threads and wait for their completion
+
+```c++
+std::thread thread0(threadPoseidon, ctx0, ...);
+std::thread thread1(threadPoseidon, ctx1, ...);
+thread0.join();
+thread1.join();
+```
+
+## What's in the example
+
+This is a **toy** example executing the first step of the Filecoin's Pre-Commit 2 phase: compute $2^{30}$ Poseison hashes for each column of $11 \times 2^{30}$ matrix.
+
+1. Define the size of the example: $2^{30}$ won't fit on a typical machine, so we partition the problem into `nof_partitions`
+2. Hash two partitions in parallel on two GPUs
+3. Hash two partitions in series on one GPU
+4. Compare execution times
 

From 319358427fc8b707d53c5266ee758997da1f4d6e Mon Sep 17 00:00:00 2001
From: Stas Polonsky <stas@ingonyama.com>
Date: Fri, 16 Feb 2024 16:35:04 +0000
Subject: [PATCH 06/11] cudaSetDevice in the thread function

---
 examples/c++/multi-gpu-poseidon/example.cu | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
index a6865958a..2db13629b 100644
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -7,6 +7,7 @@
 // select the curve
 #define CURVE_ID 2
 #include "appUtils/poseidon/poseidon.cu"
+#include "utils/error_handler.cuh"
 
 using namespace poseidon;
 using namespace curve_config;
@@ -23,6 +24,12 @@ const int size_col = 11;
 
 // this function executes the Poseidon thread
 void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition, scalar_t * layers, scalar_t * column_hashes, PoseidonConstants<scalar_t> * constants) {
+    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx.device_id));
+    if (err_result != cudaSuccess) {
+        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
+        return; 
+    }
+    // CHK_IF_RETURN(); I can't use it in a standard thread function
     PoseidonConfig column_config = {
         ctx,   // ctx
         false, // are_inputes_on_device
@@ -30,7 +37,7 @@ void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition,
         false, // input_is_a_state
         false, // aligned
         false, // loop_state
-        false, // is_async
+        true, // is_async
         };
     cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
     checkCudaError(err);

From 6681be549acfc9bcc8f4bb4ccfdc390519d45317 Mon Sep 17 00:00:00 2001
From: Stas Polonsky <stas@ingonyama.com>
Date: Fri, 16 Feb 2024 19:43:58 +0000
Subject: [PATCH 07/11] fixed on-device memory issue

---
 examples/c++/multi-gpu-poseidon/example.cu | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
index 2db13629b..2d4bfbdfc 100644
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -37,7 +37,7 @@ void threadPoseidon(device_context::DeviceContext ctx, unsigned size_partition,
         false, // input_is_a_state
         false, // aligned
         false, // loop_state
-        true, // is_async
+        false, // is_async
         };
     cudaError_t err = poseidon_hash<scalar_t, size_col+1>(layers, column_hashes, (size_t) size_partition, *constants, column_config);
     checkCudaError(err);
@@ -102,6 +102,11 @@ int main() {
 
     PoseidonConstants<scalar_t> column_constants0, column_constants1;
     init_optimized_poseidon_constants<scalar_t>(size_col, ctx0, &column_constants0);
+    cudaError_t err_result =  CHK_STICKY(cudaSetDevice(ctx1.device_id));
+    if (err_result != cudaSuccess) {
+        std::cerr << "CUDA error: " << cudaGetErrorString(err_result) << std::endl;
+        return; 
+    }
     init_optimized_poseidon_constants<scalar_t>(size_col, ctx1, &column_constants1);
 
     std::cout << "Parallel execution of Poseidon threads" << std::endl;

From 518a3ad9b65d494371c30aeecc8f191eaae74d80 Mon Sep 17 00:00:00 2001
From: Stas Polonsky <stas@ingonyama.com>
Date: Sat, 17 Feb 2024 00:18:21 +0000
Subject: [PATCH 08/11] ready for PR

---
 examples/c++/multi-gpu-poseidon/example.cu | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
index 2d4bfbdfc..6e5d1d56a 100644
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -54,7 +54,7 @@ using FpMilliseconds = std::chrono::duration<float, std::chrono::milliseconds::p
 }
 
 int main() {
-    const unsigned size_row = (1<<26);
+    const unsigned size_row = (1<<30);
     const unsigned nof_partitions = 64;
     const unsigned size_partition = size_row / nof_partitions;
     // layers is allocated only for one partition, need to resuse for different partitions
@@ -64,15 +64,23 @@ int main() {
     unsigned int deviceCount;
     nvmlDeviceGetCount(&deviceCount);
     std::cout << "Available GPUs: " << deviceCount << std::endl;
+
     for (unsigned int i = 0; i < deviceCount; ++i) {
         nvmlDevice_t device;
+        nvmlMemory_t memory;
         char name[NVML_DEVICE_NAME_BUFFER_SIZE];
         nvmlDeviceGetHandleByIndex(i, &device);
         nvmlDeviceGetName(device, name, NVML_DEVICE_NAME_BUFFER_SIZE);
-        std::cout << "Device ID: " << i << ", Type: " << name << std::endl;
+        nvmlDeviceGetMemoryInfo(device, &memory);
+        std::cout << "Device ID: " << i << ", Type: " << name << ", Memory Total/Free (MiB) " << memory.total/1024/1024 << "/"  << memory.free/1024/1024 << std::endl;
     }
 
+    const unsigned memory_partition = sizeof(scalar_t)*(size_col+1)*size_partition/1024/1024;
+    std::cout << "Required Memory (MiB) " << memory_partition << std::endl;
+
+    //===============================================================================
     // Key: multiple devices are supported by device context
+    //===============================================================================
 
     device_context::DeviceContext ctx0 = device_context::get_default_device_context();
     ctx0.device_id=0;

From b1af193f6fc72e3a089f92bf92fbb9fe048277c9 Mon Sep 17 00:00:00 2001
From: stas <spolonsky@icloud.com>
Date: Tue, 20 Feb 2024 19:14:25 -0500
Subject: [PATCH 09/11] fixed spelling

---
 examples/c++/multi-gpu-poseidon/README.md  | 3 ++-
 examples/c++/multi-gpu-poseidon/example.cu | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/examples/c++/multi-gpu-poseidon/README.md b/examples/c++/multi-gpu-poseidon/README.md
index 4fbae3338..5fb110fe6 100644
--- a/examples/c++/multi-gpu-poseidon/README.md
+++ b/examples/c++/multi-gpu-poseidon/README.md
@@ -1,4 +1,4 @@
-# Icicle example: using muliple GPU to hash large dataset
+# Icicle example: using multiple GPU to hash large dataset
 
 ## Best-Practices
 
@@ -31,6 +31,7 @@ ctx0.device_id=0;
 device_context::DeviceContext ctx1 = device_context::get_default_device_context();
 ctx1.device_id=1;
 ``` 
+
 4. Finally, spawn the threads and wait for their completion
 
 ```c++
diff --git a/examples/c++/multi-gpu-poseidon/example.cu b/examples/c++/multi-gpu-poseidon/example.cu
index 6e5d1d56a..edb12f9b1 100644
--- a/examples/c++/multi-gpu-poseidon/example.cu
+++ b/examples/c++/multi-gpu-poseidon/example.cu
@@ -57,7 +57,7 @@ int main() {
     const unsigned size_row = (1<<30);
     const unsigned nof_partitions = 64;
     const unsigned size_partition = size_row / nof_partitions;
-    // layers is allocated only for one partition, need to resuse for different partitions
+    // layers is allocated only for one partition, need to reuse for different partitions
     const uint32_t size_layers = size_col * size_partition;
     
     nvmlInit();

From 82d1ff47695d0b585d32f2932977efbf753e8e00 Mon Sep 17 00:00:00 2001
From: stas <spolonsky@icloud.com>
Date: Tue, 20 Feb 2024 20:40:45 -0500
Subject: [PATCH 10/11] fixed spelling in poseidon.cuh

---
 icicle/appUtils/poseidon/poseidon.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/icicle/appUtils/poseidon/poseidon.cuh b/icicle/appUtils/poseidon/poseidon.cuh
index 18fcf77e8..6eb439567 100644
--- a/icicle/appUtils/poseidon/poseidon.cuh
+++ b/icicle/appUtils/poseidon/poseidon.cuh
@@ -119,7 +119,7 @@ namespace poseidon {
   /**
    * Loads pre-calculated optimized constants, moves them to the device
    */
-  // Stas: I have an issue with the number of argumnets
+  // Stas: I have an issue with the number of arguments
   template <typename S>
   cudaError_t
   init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);

From c046fd17c6744a80f6be3dbda5abacc33b93fd10 Mon Sep 17 00:00:00 2001
From: stas <spolonsky@icloud.com>
Date: Tue, 20 Feb 2024 20:43:59 -0500
Subject: [PATCH 11/11] removed my comments from poseidon.cuh

---
 icicle/appUtils/poseidon/poseidon.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/icicle/appUtils/poseidon/poseidon.cuh b/icicle/appUtils/poseidon/poseidon.cuh
index 6eb439567..b139dc9a3 100644
--- a/icicle/appUtils/poseidon/poseidon.cuh
+++ b/icicle/appUtils/poseidon/poseidon.cuh
@@ -119,7 +119,6 @@ namespace poseidon {
   /**
    * Loads pre-calculated optimized constants, moves them to the device
    */
-  // Stas: I have an issue with the number of arguments
   template <typename S>
   cudaError_t
   init_optimized_poseidon_constants(int arity, device_context::DeviceContext& ctx, PoseidonConstants<S>* constants);