From 59d01477a33c92312adca6a5da421e1c5856d9ee Mon Sep 17 00:00:00 2001 From: psychocrypt Date: Wed, 31 Jul 2019 10:30:56 +0200 Subject: [PATCH] AMD: optimize VEGA auto suggestion - increase the work size to 16 for cryptonight_gpu for vega GPUs --- xmrstak/backend/amd/autoAdjust.hpp | 40 ++++++++++++++++-------------- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp index ee7168f60..858c03844 100644 --- a/xmrstak/backend/amd/autoAdjust.hpp +++ b/xmrstak/backend/amd/autoAdjust.hpp @@ -110,6 +110,21 @@ class autoAdjust } } + // check if cryptonight_monero_v8 is selected for the user or dev pool + bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()); + + // true for all cryptonight_heavy derivates since we check the user and dev pool + bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end(); + + // true for cryptonight_gpu as main user pool algorithm + bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; + + bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r; + + bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow; + + // 8 threads per block (this is a good value for the most gpus) + uint32_t default_workSize = 8; size_t minFreeMem = 128u * byteToMiB; /* 1000 is a magic selected limit, the reason is that more than 2GiB memory * sowing down the memory performance because of TLB cache misses @@ -130,6 +145,9 @@ class autoAdjust * to avoid out of memory errors */ maxThreads = 2024u; + + if(useCryptonight_gpu) + default_workSize = 16u; } // NVIDIA optimizations @@ -142,19 +160,6 @@ class autoAdjust minFreeMem = 512u * byteToMiB; } - // check if cryptonight_monero_v8 is selected for the user or dev pool - bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end()); - - // true for all cryptonight_heavy derivates since we check the user and dev pool - bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end(); - - // true for cryptonight_gpu as main user pool algorithm - bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu; - - bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r; - - bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow; - // set strided index to default ctx.stridedIndex = 1; @@ -203,11 +208,11 @@ class autoAdjust size_t perThread = hashMemSize + 240u; size_t maxIntensity = memPerThread / perThread; size_t possibleIntensity = std::min(maxThreads, maxIntensity); - // map intensity to a multiple of the compute unit count, 8 is the number of threads per work group - size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8; + // map intensity to a multiple of the compute unit count, default_workSize is the number of threads per work group + size_t intensity = (possibleIntensity / (default_workSize * ctx.computeUnits)) * ctx.computeUnits * default_workSize; // in the case we use two threads per gpu we can be relax and need no multiple of the number of compute units if(numThreads == 2) - intensity = (possibleIntensity / 8) * 8; + intensity = (possibleIntensity / default_workSize) * default_workSize; //If the intensity is 0, then it's because the multiple of the unit count is greater than intensity if(intensity == 0) @@ -225,9 +230,8 @@ class autoAdjust conf += " // gpu: " + ctx.name + std::string(" compute units: ") + std::to_string(ctx.computeUnits) + "\n"; conf += " // memory:" + std::to_string(memPerThread / byteToMiB) + "|" + std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n"; - // set 8 threads per block (this is a good value for the most gpus) conf += std::string(" { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" + - " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" + + " \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(default_workSize) + ",\n" + " \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n" " \"unroll\" : " + std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +