From 59d01477a33c92312adca6a5da421e1c5856d9ee Mon Sep 17 00:00:00 2001
From: psychocrypt <psychocryptHPC@gmail.com>
Date: Wed, 31 Jul 2019 10:30:56 +0200
Subject: [PATCH] AMD: optimize VEGA auto suggestion

- increase the work size to 16 for cryptonight_gpu for vega GPUs
---
 xmrstak/backend/amd/autoAdjust.hpp | 40 ++++++++++++++++--------------
 1 file changed, 22 insertions(+), 18 deletions(-)

diff --git a/xmrstak/backend/amd/autoAdjust.hpp b/xmrstak/backend/amd/autoAdjust.hpp
index ee7168f60..858c03844 100644
--- a/xmrstak/backend/amd/autoAdjust.hpp
+++ b/xmrstak/backend/amd/autoAdjust.hpp
@@ -110,6 +110,21 @@ class autoAdjust
 				}
 			}
 
+			// check if cryptonight_monero_v8 is selected for the user or dev pool
+			bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end());
+
+			// true for all cryptonight_heavy derivatives since we check the user and dev pool
+			bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();
+
+			// true for cryptonight_gpu as main user pool algorithm
+			bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;
+
+			bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r;
+
+			bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow;
+
+			// 8 threads per block (this is a good value for most GPUs)
+			uint32_t default_workSize = 8;
 			size_t minFreeMem = 128u * byteToMiB;
 			/* 1000 is a magic selected limit, the reason is that more than 2GiB memory
 			 * sowing down the memory performance because of TLB cache misses
@@ -130,6 +145,9 @@ class autoAdjust
 				 * to avoid out of memory errors
 				 */
 				maxThreads = 2024u;
+
+				if(useCryptonight_gpu)
+					default_workSize = 16u;
 			}
 
 			// NVIDIA optimizations
@@ -142,19 +160,6 @@ class autoAdjust
 				minFreeMem = 512u * byteToMiB;
 			}
 
-			// check if cryptonight_monero_v8 is selected for the user or dev pool
-			bool useCryptonight_v8 = (std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_monero_v8) != neededAlgorithms.end());
-
-			// true for all cryptonight_heavy derivates since we check the user and dev pool
-			bool useCryptonight_heavy = std::find(neededAlgorithms.begin(), neededAlgorithms.end(), cryptonight_heavy) != neededAlgorithms.end();
-
-			// true for cryptonight_gpu as main user pool algorithm
-			bool useCryptonight_gpu = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_gpu;
-
-			bool useCryptonight_r = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r;
-
-			bool useCryptonight_r_wow = ::jconf::inst()->GetCurrentCoinSelection().GetDescription(1).GetMiningAlgo() == cryptonight_r_wow;
-
 			// set strided index to default
 			ctx.stridedIndex = 1;
 
@@ -203,11 +208,11 @@ class autoAdjust
 			size_t perThread = hashMemSize + 240u;
 			size_t maxIntensity = memPerThread / perThread;
 			size_t possibleIntensity = std::min(maxThreads, maxIntensity);
-			// map intensity to a multiple of the compute unit count, 8 is the number of threads per work group
-			size_t intensity = (possibleIntensity / (8 * ctx.computeUnits)) * ctx.computeUnits * 8;
+			// map intensity to a multiple of the compute unit count, default_workSize is the number of threads per work group
+			size_t intensity = (possibleIntensity / (default_workSize * ctx.computeUnits)) * ctx.computeUnits * default_workSize;
 			// in the case we use two threads per gpu we can be relax and need no multiple of the number of compute units
 			if(numThreads == 2)
-				intensity = (possibleIntensity / 8) * 8;
+				intensity = (possibleIntensity / default_workSize) * default_workSize;
 
 			//If the intensity is 0, then it's because the multiple of the unit count is greater than intensity
 			if(intensity == 0)
@@ -225,9 +230,8 @@ class autoAdjust
 					conf += "  // gpu: " + ctx.name + std::string("  compute units: ") + std::to_string(ctx.computeUnits) + "\n";
 					conf += "  // memory:" + std::to_string(memPerThread / byteToMiB) + "|" +
 							std::to_string(ctx.maxMemPerAlloc / byteToMiB) + "|" + std::to_string(maxAvailableFreeMem / byteToMiB) + " MiB (used per thread|max per alloc|total free)\n";
-					// set 8 threads per block (this is a good value for the most gpus)
 					conf += std::string("  { \"index\" : ") + std::to_string(ctx.deviceIdx) + ",\n" +
-							"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(8) + ",\n" +
+							"    \"intensity\" : " + std::to_string(intensity) + ", \"worksize\" : " + std::to_string(default_workSize) + ",\n" +
 							"    \"affine_to_cpu\" : false, \"strided_index\" : " + std::to_string(ctx.stridedIndex) + ", \"mem_chunk\" : 2,\n"
 																													   "    \"unroll\" : " +
 							std::to_string(numUnroll) + ", \"comp_mode\" : true, \"interleave\" : " + std::to_string(ctx.interleave) + "\n" +