From d1e20b4c5e8175aa3aafb9b784b985c1ea8e79bc Mon Sep 17 00:00:00 2001
From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com>
Date: Tue, 28 Jul 2020 14:21:44 -0700
Subject: [PATCH] Improve 4P2H topology on Rome (#243)

1. Use bi-directional rings
2. GPU search is sorted by PCI device ID to get consistent results
---
 src/graph/connect.cc |  3 +-
 src/graph/search.cc  | 76 +++++++++++++++++++++++++-------------------
 2 files changed, 44 insertions(+), 35 deletions(-)

diff --git a/src/graph/connect.cc b/src/graph/connect.cc
index 105d83cee..0dfde31a0 100644
--- a/src/graph/connect.cc
+++ b/src/graph/connect.cc
@@ -289,9 +289,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl
   memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int));
   memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int));
 
-  int nc = 0;
+  int nc = nChannels*2;
   if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*3;
-  else if (comm->topo->nodes[NET].count != 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = nChannels*4;
   int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels()));
 
   // Duplication should be complete now
diff --git a/src/graph/search.cc b/src/graph/search.cc
index 878a1d66b..4d4c23a0c 100644
--- a/src/graph/search.cc
+++ b/src/graph/search.cc
@@ -783,7 +783,7 @@ static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *ngpu, int
 }
 
 static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int *gpu1, int *gpu2, int ex1, int ex2) {
-  int n, m;
+  int n, m, idx, gid;
   int ngpus = system->nodes[GPU].count;
   *gpu1 = -1; *gpu2 = -1;
   int c1, c2;
@@ -794,6 +794,7 @@ static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int
     if (system->nodes[GPU].nodes[n].paths[CPU][c1].count != 2) continue;
     struct ncclTopoNode* node = system->nodes[GPU].nodes+n;
     if (node->paths[GPU] == NULL) continue;
+    idx = -1; gid = 0;
     for (m = 0; m < ngpus; m++) {
       if (system->nodes[GPU].nodes[m].gpu.dev == ex2) continue;
       if (system->nodes[GPU].nodes[m].paths[CPU][c2].count != 2) continue;
@@ -802,13 +803,18 @@ static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int
         if (link->remNode->gpu.dev == system->nodes[GPU].nodes[m].gpu.dev) break;
       }
       if (!link->remNode) continue;
-      if (link->type == LINK_NVL) break;
+      if (link->type == LINK_NVL) {
+        if (idx == -1 || (idx != -1 && system->nodes[GPU].nodes[m].id < gid)) {
+          idx = m;
+          gid = system->nodes[GPU].nodes[m].id;
+        }
+      }
     }
-    if (m < ngpus) break;
+    if (idx != -1) break;
   }
   if (n < ngpus) {
     *gpu1 = system->nodes[GPU].nodes[n].gpu.dev;
-    *gpu2 = system->nodes[GPU].nodes[m].gpu.dev;
+    *gpu2 = system->nodes[GPU].nodes[idx].gpu.dev;
     return true;
   }
   return false;
@@ -842,7 +848,7 @@ static bool validate4P1H(struct ncclTopoSystem* system, int *hive) {
 }
 
 static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
-  static const char *ringBase = "6 7 4 5 1 0 3 2";
+  static const char *ringBase = "6 7 4 5 1 0 3 2|7 6 2 3 0 1 5 4";
   static char ringRemap[64];
   int id[8], dist[8];
   int i;
@@ -850,8 +856,8 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) {
   *str = 0;
   int ngpus = system->nodes[GPU].count;
   int ncpus = system->nodes[CPU].count;
-  // 8 GPUs and 4 numa nodes on multi node only
-  if (ngpus != 8 || ncpus != 4 || !system->nodes[NET].count)
+  // 8 GPUs and 4 numa nodes only
+  if (ngpus != 8 || ncpus != 4)
     return ncclSuccess;
   // only valid on Rome
   int arch, vendor, model;
@@ -942,34 +948,38 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph
     }
     graph->speedIntra = graph->speedInter = system->maxWidth;
     if (system->nodes[NET].count) {
-      int *intra, *used;
-      graph->nChannels = system->nodes[NET].count;
-      NCCLCHECK(ncclCalloc(&intra, ngpus));
-      NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
-      for (int n = 0; n < system->nodes[NET].count; n++) {
-        graph->inter[n*2] = graph->inter[n*2+1] = n;
-        // do not change ring order for 4P2H on Rome
-        if (system->type == RCCL_TOPO_4P2H_ROME) continue;
-        struct ncclTopoNode* net = system->nodes[NET].nodes+n;
-        struct ncclTopoLinkList* paths = net->paths[GPU];
-        // find the first unsed GPU that is closest to NIC
-        int f, m;
-        for (f = 0; f < ngpus; f++) {
-          int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
-          if(j >= n) break;
-        }
-        for (int i = 0; i < ngpus; i++) {
-          int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
-          if (j < n) continue;
-          if (paths[i].count < paths[f].count) f = i;
+      // do not change ring order for multi node 4P2H on Rome
+      if (system->type == RCCL_TOPO_4P2H_ROME) {
+        for (int n = 0; n < graph->nChannels; n++)
+          graph->inter[n*2] = graph->inter[n*2+1] = n%system->nodes[NET].count;
+      } else {
+        int *intra, *used;
+        graph->nChannels = system->nodes[NET].count;
+        NCCLCHECK(ncclCalloc(&intra, ngpus));
+        NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count));
+        for (int n = 0; n < system->nodes[NET].count; n++) {
+          graph->inter[n*2] = graph->inter[n*2+1] = n;
+          struct ncclTopoNode* net = system->nodes[NET].nodes+n;
+          struct ncclTopoLinkList* paths = net->paths[GPU];
+          // find the first unsed GPU that is closest to NIC
+          int f, m;
+          for (f = 0; f < ngpus; f++) {
+            int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break;
+            if(j >= n) break;
+          }
+          for (int i = 0; i < ngpus; i++) {
+            int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break;
+            if (j < n) continue;
+            if (paths[i].count < paths[f].count) f = i;
+          }
+          for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
+          used[n] = graph->intra[n*ngpus+m];
+          for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
+          for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
         }
-        for (m = 0; m<ngpus; m++) if (graph->intra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break;
-        used[n] = graph->intra[n*ngpus+m];
-        for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)];
-        for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i];
+        free(used);
+        free(intra);
       }
-      free(used);
-      free(intra);
     }
     if (graph->nChannels) return ncclSuccess;
   }