Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

CUDA Benchmark #77

Merged
merged 3 commits into from
Jul 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .mailmap
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ Edward Atkin <edhuthwaite@aol.com> EdAtkin <35494466+EdAtkin@users.noreply.githu

# Kamil Skwarczynski
Kamil Skwarczynski <skwarczynskikamil@gmail.com> Kamil <45295406+KSkwarczynski@users.noreply.github.com>
Kamil Skwarczynski <skwarczynskikamil@gmail.com> Kamil Skwarczynski <kamilskw@bg12105.int.ets1.calculquebec.ca>
Kamil Skwarczynski <skwarczynskikamil@gmail.com> Kamil Skwarczynski <kamilskw@cdr293.int.cedar.computecanada.ca>

# Henry Wallace
Henry Wallace <henryisrael08@gmail.com> henry-israel <henryisrael08@gmail.com>
Expand Down
49 changes: 49 additions & 0 deletions cmake/Modules/CUDASamples.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -31,3 +31,52 @@ endif()

cmessage(STATUS "Using the following CUDA samples paths: ${CMAKE_CUDA_SAMPLES_PATH}")
target_include_directories(MaCh3CompilerOptions INTERFACE ${CMAKE_CUDA_SAMPLES_PATH})


# KS: Perform fancy CUDA Benchmarking
#
# When MaCh3_GPU_BENCHMARK is enabled, copies selected NVIDIA CUDA sample
# utilities into the build tree, patches their hard-coded Makefile include
# path, builds them, and runs them as part of the default build so the
# configure/build log contains GPU diagnostics.
DefineEnabledRequiredSwitch(MaCh3_GPU_BENCHMARK FALSE)
if(MaCh3_GPU_BENCHMARK)
  cmessage(STATUS "Building CUDA Benchmark")

  # KS: Define directories to iterate over, might be useful to expand
  set(CUDA_SAMPLES_DIRS
    "deviceQuery"
    "bandwidthTest"
  )

  # KS: Iterate over each sample directory
  foreach(sample_dir IN LISTS CUDA_SAMPLES_DIRS)
    # Define source and destination directories
    set(SRC_DIR "${CMAKE_CUDA_SAMPLES_PATH}/../Samples/1_Utilities/${sample_dir}")
    set(DST_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark")

    # CW: Copy over the provided nvidia utility
    # CW: Often we can't write to the CUDA install directory, so let's build it here
    file(COPY "${SRC_DIR}" DESTINATION "${DST_DIR}")

    # KS: Work in the copied sample from now on
    set(SAMPLE_DIR "${CMAKE_BINARY_DIR}/GPU_Benchmark/${sample_dir}")

    # The Makefile shipped with the sample
    set(MAKEFILE_PATH "${SAMPLE_DIR}/Makefile")

    # CW: Patch the little hard-coded NVIDIA makefile.
    # Done with CMake string replacement instead of `sed -i` so it is
    # portable (BSD/macOS sed has different -i semantics) and the previous
    # unchecked SED_RESULT is no longer needed.
    file(READ "${MAKEFILE_PATH}" MAKEFILE_CONTENTS)
    string(REPLACE "../../../Common" "${CMAKE_CUDA_SAMPLES_PATH}" MAKEFILE_CONTENTS "${MAKEFILE_CONTENTS}")
    file(WRITE "${MAKEFILE_PATH}" "${MAKEFILE_CONTENTS}")

    # Build the sample with its own (patched) Makefile as part of `all`
    add_custom_target(run_${sample_dir} ALL
      COMMAND make
      WORKING_DIRECTORY "${SAMPLE_DIR}"
      VERBATIM
    )

    # Run the freshly built sample once it has been compiled
    add_custom_target(run_${sample_dir}_exec ALL
      COMMAND ./${sample_dir}
      WORKING_DIRECTORY "${SAMPLE_DIR}"
      DEPENDS run_${sample_dir}
      VERBATIM
    )
  endforeach()
endif()
1 change: 1 addition & 0 deletions manager/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@ set(HEADERS
MaCh3Logger.h
Monitor.h
MaCh3Exception.h
gpuUtils.cuh
)

add_library(Manager SHARED
Expand Down
10 changes: 6 additions & 4 deletions manager/Monitor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -101,10 +101,10 @@ void GetCPUInfo(){
MACH3LOG_INFO("{}", TerminalToString("cat /proc/cpuinfo | grep -m 1 MHz"));
//KS: Below code is convoluted because I mostly work on English based Linux but sometimes on Polish based Linux, this ensures it works on both. We can add support for other languages if needed
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i Archit"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1d'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L1i'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L2'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -i 'Cache L3'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1d |L1d:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L1i |L1i:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L2 |L2:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'L3 |L3:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E 'Thread.* per core:|Wątków na rdzeń:'"));
MACH3LOG_INFO("{}", TerminalToString("lscpu | grep -m 1 -E '^CPU(:|\\(s\\)):?\\s+[0-9]+'"));

Expand All @@ -127,6 +127,8 @@ void GetGPUInfo(){
MACH3LOG_INFO("Total VRAM: {} MB", TerminalToString("nvidia-smi --query-gpu=memory.total --format=csv,noheader,nounits"));
// Print Driver Version
MACH3LOG_INFO("Driver Version: {}", TerminalToString("nvidia-smi --query-gpu=driver_version --format=csv,noheader"));
// Print N GPU thread
MACH3LOG_INFO("Currently used GPU has: {} threads", GetNumGPUThreads());
#endif
return;
}
Expand Down
3 changes: 3 additions & 0 deletions manager/Monitor.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include "samplePDF/Structs.h"
#include "manager/YamlHelper.h"

#ifdef CUDA
#include "manager/gpuUtils.cuh"
#endif

namespace MaCh3Utils {
/// @brief KS: Prints welcome message with MaCh3 logo
Expand Down
57 changes: 16 additions & 41 deletions manager/gpuUtils.cu
Original file line number Diff line number Diff line change
@@ -1,34 +1,9 @@
// C i/o for printf and others
#include <stdio.h>
#include <vector>

// CUDA specifics

#include <cuda_runtime.h>

#ifdef CUDA_ERROR_CHECK
#include <helper_functions.h>
#include <helper_cuda.h>
#endif

// Define the macros
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)

/// KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now
#define _BlockSize_ 1024

// CUDA_ERROR_CHECK is now defined in the makefile instead
//#define CUDA_ERROR_CHECK

// **************************************************
// ERROR CHECKING ROUTINES
// Also exist in helper_cuda.h
// **************************************************
// MaCh3 includes
#include "manager/gpuUtils.cuh"

// **************************************************
/// @brief Check for a safe call on GPU
inline void __cudaSafeCall( cudaError err, const char *file, const int line ) {
// Check for a safe call on GPU
void __cudaSafeCall( cudaError err, const char *file, const int line ) {
// **************************************************
#ifdef CUDA_ERROR_CHECK
if (cudaSuccess != err) {
Expand All @@ -40,8 +15,8 @@ inline void __cudaSafeCall( cudaError err, const char *file, const int line ) {
}

// **************************************************
/// @brief Check if there's been an error
inline void __cudaCheckError( const char *file, const int line ) {
// Check if there's been an error
void __cudaCheckError( const char *file, const int line ) {
// **************************************************
#ifdef CUDA_ERROR_CHECK
cudaError err = cudaGetLastError();
Expand All @@ -66,8 +41,8 @@ inline void __cudaCheckError( const char *file, const int line ) {
// *******************************************

// *******************************************
/// @brief KS: Get some fancy info about VRAM usage
inline void checkGpuMem() {
// KS: Get some fancy info about VRAM usage
void checkGpuMem() {
// *******************************************

float free_m, total_m,used_m;
Expand All @@ -84,8 +59,8 @@ inline void checkGpuMem() {
}

// *******************************************
/// @brief KS: Get some fancy info about GPU
inline void PrintNdevices() {
// KS: Get some fancy info about GPU
void PrintNdevices() {
// *******************************************

int nDevices;
Expand All @@ -102,8 +77,8 @@ inline void PrintNdevices() {


// *******************************************
/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
inline void ResetDevice() {
// KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
void ResetDevice() {
// *******************************************

cudaDeviceReset();
Expand All @@ -113,7 +88,7 @@ inline void ResetDevice() {

// *******************************************
/// @brief Only useful if using multiple GPU
inline void SetDevice(const int deviceId) {
void SetDevice(const int deviceId) {
// *******************************************

// Check if the device ID is valid
Expand All @@ -131,8 +106,8 @@ inline void SetDevice(const int deviceId) {
}

// *******************************************
/// @brief Get number of GPU threads for currently used GPU
inline void GetNumGPUThreads(const int Device = 0) {
// Get number of GPU threads for currently used GPU
int GetNumGPUThreads(const int Device) {
// *******************************************

int deviceCount;
Expand All @@ -149,5 +124,5 @@ inline void GetNumGPUThreads(const int Device = 0) {
// Define the number of threads per block
int nThreadsBlocks = (deviceProp.multiProcessorCount * deviceProp.maxThreadsPerMultiProcessor);

printf("Currently used GPU has : %i threads \n", nThreadsBlocks);
return nThreadsBlocks;
}
59 changes: 59 additions & 0 deletions manager/gpuUtils.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
#pragma once

// MaCh3 GPU utility declarations: error-checking macros and small
// device-query/management helpers implemented in manager/gpuUtils.cu.

// C i/o for printf and others
#include <stdio.h>
#include <vector>

// CUDA specifics

#include <cuda_runtime.h>

#ifdef CUDA_ERROR_CHECK
#include <helper_functions.h>
#include <helper_cuda.h>
#endif

// Define the macros
// These wrap the checker functions below with the caller's file/line so
// failures report where the CUDA call was made, not where it was checked.
#define CudaSafeCall(err) __cudaSafeCall(err, __FILE__, __LINE__)
#define CudaCheckError() __cudaCheckError(__FILE__, __LINE__)

/// KS: Need it for shared memory, there is way to use dynamic shared memory but I am lazy right now
#define _BlockSize_ 1024

//KS: TODO
// There is plenty of useful stuff here https://github.com/NVIDIA/cuda-samples/blob/master/Samples/1_Utilities/deviceQuery/deviceQuery.cpp
// We might want to port some of these utilities, for example having bool if there is unified memory etc.

// CUDA_ERROR_CHECK is now defined in the makefile instead
//#define CUDA_ERROR_CHECK

// **************************************************
// ERROR CHECKING ROUTINES
// Also exist in helper_cuda.h
// **************************************************
// NOTE(review): double-underscore identifiers are reserved in C++; renaming
// __cudaSafeCall/__cudaCheckError would touch all callers — flagging only.

/// @brief Check for a safe call on GPU
/// @param err CUDA status returned by the call being checked
/// @param file Source file of the call site (supplied by CudaSafeCall)
/// @param line Source line of the call site (supplied by CudaSafeCall)
void __cudaSafeCall( cudaError err, const char *file, const int line );

/// @brief Check if there's been an error
/// @param file Source file of the check site (supplied by CudaCheckError)
/// @param line Source line of the check site (supplied by CudaCheckError)
void __cudaCheckError( const char *file, const int line );

// *******************************************
// Utils
// *******************************************

// *******************************************
/// @brief KS: Get some fancy info about VRAM usage
void checkGpuMem();

/// @brief KS: Get some fancy info about GPU
void PrintNdevices();

/// @brief KS: Completely clean GPU, this is time consuming and may lead to unexpected behaviour.
void ResetDevice();

/// @brief KS: Only useful if using multiple GPU
/// @param deviceId Index of the CUDA device to make current
void SetDevice(const int deviceId);

/// @brief KS: Get number of GPU threads for currently used GPU
/// @param Device Index of the CUDA device to query (defaults to device 0)
/// @return Number of threads (multiprocessor count times max threads per multiprocessor)
int GetNumGPUThreads(const int Device = 0);
1 change: 1 addition & 0 deletions mcmc/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ set(HEADERS
MCMCProcessor.h
SampleSummary.h
StatisticalUtils.h
gpuMCMCProcessorUtils.cuh
)

add_library(MCMC SHARED
Expand Down
37 changes: 0 additions & 37 deletions mcmc/MCMCProcessor.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2,43 +2,6 @@

#include "TChain.h"

//Only if GPU is enabled
#ifdef CUDA
extern void InitGPU_AutoCorr(
float **ParStep_gpu,
float **NumeratorSum_gpu,
float **ParamSums_gpu,
float **DenomSum_gpu,
int n_Entries,
int n_Pars,
const int n_Lags);

extern void CopyToGPU_AutoCorr(
float *ParStep_cpu,
float *NumeratorSum_cpu,
float *ParamSums_cpu,
float *DenomSum_cpu,

float *ParStep_gpu,
float *NumeratorSum_gpu,
float *ParamSums_gpu,
float *DenomSum_gpu);

extern void RunGPU_AutoCorr(
float *ParStep_gpu,
float *ParamSums_gpu,
float *NumeratorSum_gpu,
float *DenomSum_gpu,
float *NumeratorSum_cpu,
float *DenomSum_cpu);

extern void CleanupGPU_AutoCorr(
float *ParStep_gpu,
float *NumeratorSum_gpu,
float *ParamSums_gpu,
float *DenomSum_gpu);
#endif

// ****************************
MCMCProcessor::MCMCProcessor(const std::string &InputFile, bool MakePostfitCorr) :
Chain(nullptr), StepCut(""), MakeCorr(MakePostfitCorr), MadePostfit(false) {
Expand Down
5 changes: 5 additions & 0 deletions mcmc/MCMCProcessor.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@
// MaCh3 includes
#include "mcmc/StatisticalUtils.h"

//Only if GPU is enabled
#ifdef CUDA
#include "mcmc/gpuMCMCProcessorUtils.cuh"
#endif

//KS: Joy of forward declaration https://gieseanw.wordpress.com/2018/02/25/the-joys-of-forward-declarations-results-from-the-real-world/
class TChain;

Expand Down
10 changes: 1 addition & 9 deletions mcmc/gpuMCMCProcessorUtils.cu
Original file line number Diff line number Diff line change
@@ -1,12 +1,4 @@
// MaCh3 utils for processing/diagnostic MCMC
// Written by Kamil Skwarczynski
//
// Contains code to run on CUDA GPUs. Right now only can calculate autocorrelations
// Potential extensions:
// -Covariance matrix calculations and other matrix operations
// -Effective Sample Size evaluation

#include "manager/gpuUtils.cu"
#include "mcmc/gpuMCMCProcessorUtils.cuh"

// ******************************************
// CONSTANTS
Expand Down
Loading
Loading