diff --git a/.mailmap b/.mailmap index c62951189..e1fe3d4c4 100644 --- a/.mailmap +++ b/.mailmap @@ -6,6 +6,8 @@ Edward Atkin EdAtkin <35494466+EdAtkin@users.noreply.githu # Kamil Skwarczynski Kamil Skwarczynski Kamil <45295406+KSkwarczynski@users.noreply.github.com> +Kamil Skwarczynski Kamil Skwarczynski +Kamil Skwarczynski Kamil Skwarczynski # Henry Wallace Henry Wallace henry-israel diff --git a/cmake/Modules/CUDASamples.cmake b/cmake/Modules/CUDASamples.cmake index 389236180..6729d5466 100644 --- a/cmake/Modules/CUDASamples.cmake +++ b/cmake/Modules/CUDASamples.cmake @@ -31,3 +31,52 @@ endif() cmessage(STATUS "Using the following CUDA samples paths: ${CMAKE_CUDA_SAMPLES_PATH}") target_include_directories(MaCh3CompilerOptions INTERFACE ${CMAKE_CUDA_SAMPLES_PATH}) + + +# KS: Perform fancy CUDA Benchmarking +DefineEnabledRequiredSwitch(MaCh3_GPU_BENCHMARK FALSE) +if(MaCh3_GPU_BENCHMARK) + cmessage(STATUS "Building CUDA Benchmark") + + # KS: Define directories to iterate over, might be useful to expand + set(CUDA_SAMPLES_DIRS + "deviceQuery" + "bandwidthTest" + ) + + # KS: Iterate over each directory + foreach(sample_dir ${CUDA_SAMPLES_DIRS}) + # Define source and destination directories + set(SRC_DIR "${CMAKE_CUDA_SAMPLES_PATH}/../Samples/1_Utilities/${sample_dir}") + set(DST_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark/") + + # CW: Copy over the provided nvidia utility + # CW: Often we can't write to the CUDA install directory, so let's build it here + file(COPY ${SRC_DIR} DESTINATION ${DST_DIR}) + + # KS: Change directory to copied sample + set(SAMPLE_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark/${sample_dir}") + + # Modify Makefile path + set(MAKEFILE_PATH "${SAMPLE_DIR}/Makefile") + + # CW: Patch the litle hard-coded NVIDIA makefile + execute_process( + COMMAND sed -i "s,../../../Common,${CMAKE_CUDA_SAMPLES_PATH},g" ${MAKEFILE_PATH} + RESULT_VARIABLE SED_RESULT + ) + + # Add custom target to run make + add_custom_target(run_${sample_dir} ALL + COMMAND make + WORKING_DIRECTORY ${SAMPLE_DIR} + ) + + # Add custom target to run sample + add_custom_target(run_${sample_dir}_exec ALL + COMMAND ./${sample_dir} + WORKING_DIRECTORY ${SAMPLE_DIR} + DEPENDS run_${sample_dir} + ) + endforeach(sample_dir) +endif() diff --git a/manager/CMakeLists.txt b/manager/CMakeLists.txt index 31e30823f..b612c3dec 100644 --- a/manager/CMakeLists.txt +++ b/manager/CMakeLists.txt @@ -5,6 +5,7 @@ set(HEADERS MaCh3Logger.h Monitor.h MaCh3Exception.h + gpuUtils.cuh ) add_library(Manager SHARED diff --git a/manager/Monitor.cpp b/manager/Monitor.cpp index 282d84a4b..6fc175740 100644 --- a/manager/Monitor.cpp +++ b/manager/Monitor.cpp @@ -101,10 +101,10 @@ void GetCPUInfo(){ MACH3LOG_INFO("{}", TerminalToString("cat /proc/cpuinfo | grep -m 1 MHz")); //KS: Below code is convoluted because I mostly work on English based Linux but sometimes on Polish based Linux, this ensures it works on both. We can add support for other languages if needed MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i Archit")); - MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1d'")); - MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1i'")); - MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L2'")); - MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L3'")); + MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1d |L1d:'")); + MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1i |L1i:'")); + MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L2 |L2:'")); + MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L3 |L3:'")); MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'Thread.* per core:|Wątków na rdzeń:'")); MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E '^CPU(:|\\(s\\)):?\\s+[0-9]+'")); @@ -127,6 +127,8 @@ void GetGPUInfo(){ MACH3LOG_INFO("Total VRAM: {} MB", TerminalToString("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits")); // Print Driver Version MACH3LOG_INFO("Driver Version: {}", TerminalToString("nvidia-smi --query-gpu=driver_version --format=csv,noheader")); + // Print N GPU thread + MACH3LOG_INFO("Currently used GPU has: {} threads", GetNumGPUThreads()); #endif return; } diff --git a/manager/Monitor.h b/manager/Monitor.h index 1ef82e07d..06def70ba 100644 --- a/manager/Monitor.h +++ b/manager/Monitor.h @@ -19,6 +19,9 @@ #include "samplePDF/Structs.h" #include "manager/YamlHelper.h" +#ifdef CUDA +#include "manager/gpuUtils.cuh" +#endif namespace MaCh3Utils { /// @brief KS: Prints welcome message with MaCh3 logo diff --git a/manager/gpuUtils.cu b/manager/gpuUtils.cu index 6fc7d2afc..f6e34d606 100644 --- a/manager/gpuUtils.cu +++ b/manager/gpuUtils.cu @@ -1,34 +1,9 @@ -// C i/o for printf and others -#include -#include - -// CUDA specifics - -#include - -#ifdef CUDA_ERROR_CHECK -#include -#include -#endif - -// Define the macros -#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__) -#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__) - -/// KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now -#define _BlockSize_ 1024 - -// CUDA_ERROR_CHECK is now defined in the makefile instead -//#define CUDA_ERROR_CHECK - -// ************************************************** -// ERROR CHECKING ROUTINES -// Also exist in helper_cuda.h -// ************************************************** +// MaCh3 includes +#include "manager/gpuUtils.cuh" // ************************************************** -/// @brief Check for a safe call on GPU -inline void __cudaSafeCall( cudaError err, const char *file, const int line ) { +// Check for a safe call on GPU +void __cudaSafeCall( cudaError err, const char *file, const int line ) { // ************************************************** #ifdef CUDA_ERROR_CHECK if (cudaSuccess != err) { @@ -40,8 +15,8 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) { } // ************************************************** -/// @brief Check if there's been an error -inline void __cudaCheckError( const char *file, const int line ) { +// Check if there's been an error +void __cudaCheckError( const char *file, const int line ) { // ************************************************** #ifdef CUDA_ERROR_CHECK cudaError err = cudaGetLastError(); @@ -66,8 +41,8 @@ inline void __cudaCheckError( const char *file, const int line ) { // ******************************************* // ******************************************* -/// @brief KS: Get some fancy info about VRAM usage -inline void checkGpuMem() { +// KS: Get some fancy info about VRAM usage +void checkGpuMem() { // ******************************************* float free_m, total_m,used_m; @@ -84,8 +59,8 @@ inline void checkGpuMem() { } // ******************************************* -/// @brief KS: Get some fancy info about GPU -inline void PrintNdevices() { +// KS: Get some fancy info about GPU +void PrintNdevices() { // ******************************************* int nDevices; @@ -102,8 +77,8 @@ inline void PrintNdevices() { // ******************************************* -/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour. -inline void ResetDevice() { +// KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour. +void ResetDevice() { // ******************************************* cudaDeviceReset(); @@ -113,7 +88,7 @@ inline void ResetDevice() { // ******************************************* /// @brief Only useful if using multiple GPU -inline void SetDevice(const int deviceId) { +void SetDevice(const int deviceId) { // ******************************************* // Check if the device ID is valid @@ -131,8 +106,8 @@ inline void SetDevice(const int deviceId) { } // ******************************************* -/// @brief Get number of GPU threads for currently used GPU -inline void GetNumGPUThreads(const int Device = 0) { +// Get number of GPU threads for currently used GPU +int GetNumGPUThreads(const int Device) { // ******************************************* int deviceCount; @@ -149,5 +124,5 @@ inline void GetNumGPUThreads(const int Device = 0) { // Define the number of threads per block int nThreadsBlocks = (deviceProp.multiProcessorCount * deviceProp.maxThreadsPerMultiProcessor); - printf("Currently used GPU has : %i threads \n", nThreadsBlocks); + return nThreadsBlocks; } diff --git a/manager/gpuUtils.cuh b/manager/gpuUtils.cuh new file mode 100644 index 000000000..c9d7a3418 --- /dev/null +++ b/manager/gpuUtils.cuh @@ -0,0 +1,59 @@ +#pragma once + +// C i/o for printf and others +#include +#include + +// CUDA specifics + +#include + +#ifdef CUDA_ERROR_CHECK +#include +#include +#endif + +// Define the macros +#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__) +#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__) + +/// KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now +#define _BlockSize_ 1024 + +//KS: TODO +// There is plenty of useful stuff here https://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp +// We might want to port some of these utilities, for example having bool if there is unified memory etc. + +// CUDA_ERROR_CHECK is now defined in the makefile instead +//#define CUDA_ERROR_CHECK + +// ************************************************** +// ERROR CHECKING ROUTINES +// Also exist in helper_cuda.h +// ************************************************** + +/// @brief Check for a safe call on GPU +void __cudaSafeCall( cudaError err, const char *file, const int line ); + +/// @brief Check if there's been an error +void __cudaCheckError( const char *file, const int line ); + +// ******************************************* +// Utils +// ******************************************* + +// ******************************************* +/// @brief KS: Get some fancy info about VRAM usage +void checkGpuMem(); + +/// @brief KS: Get some fancy info about GPU +void PrintNdevices(); + +/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour. +void ResetDevice(); + +/// @brief KS: Only useful if using multiple GPU +void SetDevice(const int deviceId); + +/// @brief KS: Get number of GPU threads for currently used GPU +int GetNumGPUThreads(const int Device = 0); diff --git a/mcmc/CMakeLists.txt b/mcmc/CMakeLists.txt index 56e3cfb84..2356ae449 100644 --- a/mcmc/CMakeLists.txt +++ b/mcmc/CMakeLists.txt @@ -7,6 +7,7 @@ set(HEADERS MCMCProcessor.h SampleSummary.h StatisticalUtils.h + gpuMCMCProcessorUtils.cuh ) add_library(MCMC SHARED diff --git a/mcmc/MCMCProcessor.cpp b/mcmc/MCMCProcessor.cpp index 81e0ad5bc..b7f270c2b 100644 --- a/mcmc/MCMCProcessor.cpp +++ b/mcmc/MCMCProcessor.cpp @@ -2,43 +2,6 @@ #include "TChain.h" -//Only if GPU is enabled -#ifdef CUDA -extern void InitGPU_AutoCorr( - float **ParStep_gpu, - float **NumeratorSum_gpu, - float **ParamSums_gpu, - float **DenomSum_gpu, - int n_Entries, - int n_Pars, - const int n_Lags); - -extern void CopyToGPU_AutoCorr( - float *ParStep_cpu, - float *NumeratorSum_cpu, - float *ParamSums_cpu, - float *DenomSum_cpu, - - float *ParStep_gpu, - float *NumeratorSum_gpu, - float *ParamSums_gpu, - float *DenomSum_gpu); - -extern void RunGPU_AutoCorr( - float *ParStep_gpu, - float *ParamSums_gpu, - float *NumeratorSum_gpu, - float *DenomSum_gpu, - float *NumeratorSum_cpu, - float *DenomSum_cpu); - -extern void CleanupGPU_AutoCorr( - float *ParStep_gpu, - float *NumeratorSum_gpu, - float *ParamSums_gpu, - float *DenomSum_gpu); -#endif - // **************************** MCMCProcessor::MCMCProcessor(const std::string &InputFile, bool MakePostfitCorr) : Chain(nullptr), StepCut(""), MakeCorr(MakePostfitCorr), MadePostfit(false) { diff --git a/mcmc/MCMCProcessor.h b/mcmc/MCMCProcessor.h index 1aeffb20a..852a4c7ad 100644 --- a/mcmc/MCMCProcessor.h +++ b/mcmc/MCMCProcessor.h @@ -37,6 +37,11 @@ // MaCh3 includes #include "mcmc/StatisticalUtils.h" +//Only if GPU is enabled +#ifdef CUDA +#include "mcmc/gpuMCMCProcessorUtils.cuh" +#endif + //KS: Joy of forward declaration https://gieseanw.wordpress.com/2018/02/25/the-joys-of-forward-declarations-results-from-the-real-world/ class TChain; diff --git a/mcmc/gpuMCMCProcessorUtils.cu b/mcmc/gpuMCMCProcessorUtils.cu index d7023d107..53e7c39b9 100644 --- a/mcmc/gpuMCMCProcessorUtils.cu +++ b/mcmc/gpuMCMCProcessorUtils.cu @@ -1,12 +1,4 @@ -// MaCh3 utils for processing/diagnostic MCMC -// Written by Kamil Skwarczynski -// -// Contains code to run on CUDA GPUs. Right now only can calculate autocorrelations -// Potential extensions: -// -Covariance matrix calculations and other matrix operations -// -Effective Sample Size evaluation - -#include "manager/gpuUtils.cu" +#include "mcmc/gpuMCMCProcessorUtils.cuh" // ****************************************** // CONSTANTS diff --git a/mcmc/gpuMCMCProcessorUtils.cuh b/mcmc/gpuMCMCProcessorUtils.cuh new file mode 100644 index 000000000..5c75f894e --- /dev/null +++ b/mcmc/gpuMCMCProcessorUtils.cuh @@ -0,0 +1,61 @@ +#pragma once +// MaCh3 utils for processing/diagnostic MCMC +// Written by Kamil Skwarczynski +// +// Contains code to run on CUDA GPUs. Right now only can calculate autocorrelations +// Potential extensions: +// -Covariance matrix calculations and other matrix operations +// -Effective Sample Size evaluation + +#include "manager/gpuUtils.cuh" + +// ******************************************* +// INITIALISE GPU +// ******************************************* + +/// @brief KS: Initialiser, here we allocate memory for variables and copy constants +__host__ void InitGPU_AutoCorr( + float **ParStep_gpu, + float **NumeratorSum_gpu, + float **ParamSums_gpu, + float **DenomSum_gpu, + + int n_Entries, + int n_Pars, + const int n_Lags); + +/// @brief KS: Copy necessary variables from CPU to GPU +__host__ void CopyToGPU_AutoCorr( + float *ParStep_cpu, + float *NumeratorSum_cpu, + float *ParamSums_cpu, + float *DenomSum_cpu, + + float *ParStep_gpu, + float *NumeratorSum_gpu, + float *ParamSums_gpu, + float *DenomSum_gpu); + + +/// @brief Eval autocorrelations based on Box and Jenkins +__global__ void EvalOnGPU_AutoCorr( + const float* __restrict__ ParStep_gpu, + const float* __restrict__ ParamSums_gpu, + float* NumeratorSum_gpu, + float* DenomSum_gpu); + +/// @brief KS: This call the main kernel responsible for calculating LagL and later copy results back to CPU +__host__ void RunGPU_AutoCorr( + float* ParStep_gpu, + float* ParamSums_gpu, + float* NumeratorSum_gpu, + float* DenomSum_gpu, + float* NumeratorSum_cpu, + float* DenomSum_cpu); + +/// @brief KS: free memory on gpu +__host__ void CleanupGPU_AutoCorr( + float *ParStep_gpu, + float *NumeratorSum_gpu, + float *ParamSums_gpu, + float *DenomSum_gpu); diff --git a/samplePDF/kdeGpu.cu b/samplePDF/kdeGpu.cu index d76b79ce8..81ed665da 100644 --- a/samplePDF/kdeGpu.cu +++ b/samplePDF/kdeGpu.cu @@ -1,5 +1,5 @@ //MaCh3 included -#include "manager/gpuUtils.cu" +#include "manager/gpuUtils.cuh" // -*- c++ -*- #include "stdio.h" diff --git a/samplePDF/kdeGpu2.cu b/samplePDF/kdeGpu2.cu index e904f52d2..55bf3e18d 100644 --- a/samplePDF/kdeGpu2.cu +++ b/samplePDF/kdeGpu2.cu @@ -4,7 +4,7 @@ #define MULTI_DATA 10 //MaCh3 included -#include "manager/gpuUtils.cu" +#include "manager/gpuUtils.cuh" #include "stdio.h" diff --git a/splines/gpuSplineUtils.cu b/splines/gpuSplineUtils.cu index 7ab0043cc..e34e38121 100644 --- a/splines/gpuSplineUtils.cu +++ b/splines/gpuSplineUtils.cu @@ -6,7 +6,7 @@ // Called from samplePDF/samplePDFND.cpp -> splines/SplineMonolith.cpp -> splines/gpuSplineUtils.cu //MaCh3 included -#include "manager/gpuUtils.cu" +#include "manager/gpuUtils.cuh" #include "splines/SplineCommon.h" // Hard code the number of splines