From d1e20b4c5e8175aa3aafb9b784b985c1ea8e79bc Mon Sep 17 00:00:00 2001 From: Wenkai Du <43822138+wenkaidu@users.noreply.github.com> Date: Tue, 28 Jul 2020 14:21:44 -0700 Subject: [PATCH] Improve 4P2H topology on Rome (#243) 1. Use bi-directional rings 2. GPU search is sorted by PCI device ID to get consistent results --- src/graph/connect.cc | 3 +- src/graph/search.cc | 76 +++++++++++++++++++++++++------------------- 2 files changed, 44 insertions(+), 35 deletions(-) diff --git a/src/graph/connect.cc b/src/graph/connect.cc index 105d83cee..0dfde31a0 100644 --- a/src/graph/connect.cc +++ b/src/graph/connect.cc @@ -289,9 +289,8 @@ ncclResult_t ncclTopoPostset(struct ncclComm* comm, int* firstRanks, struct nccl memcpy(ringPrev+nChannels*nranks, ringPrev, nChannels*nranks*sizeof(int)); memcpy(ringNext+nChannels*nranks, ringNext, nChannels*nranks*sizeof(int)); - int nc = 0; + int nc = nChannels*2; if (comm->topo->nodes[NET].count == 0 && comm->topo->type == RCCL_TOPO_CR8G) nc = nChannels*3; - else if (comm->topo->nodes[NET].count != 0 && comm->topo->type == RCCL_TOPO_4P2H_ROME) nc = nChannels*4; int end = std::min((int)ncclMaxNchannels(), std::max(nc, ncclMinNchannels())); // Duplication should be complete now diff --git a/src/graph/search.cc b/src/graph/search.cc index 878a1d66b..4d4c23a0c 100644 --- a/src/graph/search.cc +++ b/src/graph/search.cc @@ -783,7 +783,7 @@ static bool getGpuNetCount(struct ncclTopoSystem* system, int id, int *ngpu, int } static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int *gpu1, int *gpu2, int ex1, int ex2) { - int n, m; + int n, m, idx, gid; int ngpus = system->nodes[GPU].count; *gpu1 = -1; *gpu2 = -1; int c1, c2; @@ -794,6 +794,7 @@ static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int if (system->nodes[GPU].nodes[n].paths[CPU][c1].count != 2) continue; struct ncclTopoNode* node = system->nodes[GPU].nodes+n; if (node->paths[GPU] == NULL) continue; + idx = -1; gid = 0; for (m = 0; m < ngpus; m++) { if (system->nodes[GPU].nodes[m].gpu.dev == ex2) continue; if (system->nodes[GPU].nodes[m].paths[CPU][c2].count != 2) continue; @@ -802,13 +803,18 @@ static bool findGpuByXGMI(struct ncclTopoSystem* system, int cpu1, int cpu2, int if (link->remNode->gpu.dev == system->nodes[GPU].nodes[m].gpu.dev) break; } if (!link->remNode) continue; - if (link->type == LINK_NVL) break; + if (link->type == LINK_NVL) { + if (idx == -1 || (idx != -1 && system->nodes[GPU].nodes[m].id < gid)) { + idx = m; + gid = system->nodes[GPU].nodes[m].id; + } + } } - if (m < ngpus) break; + if (idx != -1) break; } if (n < ngpus) { *gpu1 = system->nodes[GPU].nodes[n].gpu.dev; - *gpu2 = system->nodes[GPU].nodes[m].gpu.dev; + *gpu2 = system->nodes[GPU].nodes[idx].gpu.dev; return true; } return false; @@ -842,7 +848,7 @@ static bool validate4P1H(struct ncclTopoSystem* system, int *hive) { } static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) { - static const char *ringBase = "6 7 4 5 1 0 3 2"; + static const char *ringBase = "6 7 4 5 1 0 3 2|7 6 2 3 0 1 5 4"; static char ringRemap[64]; int id[8], dist[8]; int i; @@ -850,8 +856,8 @@ static ncclResult_t parseRome4P2H(struct ncclTopoSystem* system, char **str) { *str = 0; int ngpus = system->nodes[GPU].count; int ncpus = system->nodes[CPU].count; - // 8 GPUs and 4 numa nodes on multi node only - if (ngpus != 8 || ncpus != 4 || !system->nodes[NET].count) + // 8 GPUs and 4 numa nodes only + if (ngpus != 8 || ncpus != 4) return ncclSuccess; // only valid on Rome int arch, vendor, model; @@ -942,34 +948,38 @@ ncclResult_t ncclTopoCompute(ncclTopoSystem* system, struct ncclTopoGraph* graph } graph->speedIntra = graph->speedInter = system->maxWidth; if (system->nodes[NET].count) { - int *intra, *used; - graph->nChannels = system->nodes[NET].count; - NCCLCHECK(ncclCalloc(&intra, ngpus)); - NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count)); - for (int n = 0; n < system->nodes[NET].count; n++) { - graph->inter[n*2] = graph->inter[n*2+1] = n; - // do not change ring order for 4P2H on Rome - if (system->type == RCCL_TOPO_4P2H_ROME) continue; - struct ncclTopoNode* net = system->nodes[NET].nodes+n; - struct ncclTopoLinkList* paths = net->paths[GPU]; - // find the first unsed GPU that is closest to NIC - int f, m; - for (f = 0; f < ngpus; f++) { - int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break; - if(j >= n) break; - } - for (int i = 0; i < ngpus; i++) { - int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break; - if (j < n) continue; - if (paths[i].count < paths[f].count) f = i; + // do not change ring order for multi node 4P2H on Rome + if (system->type == RCCL_TOPO_4P2H_ROME) { + for (int n = 0; n < graph->nChannels; n++) + graph->inter[n*2] = graph->inter[n*2+1] = n%system->nodes[NET].count; + } else { + int *intra, *used; + graph->nChannels = system->nodes[NET].count; + NCCLCHECK(ncclCalloc(&intra, ngpus)); + NCCLCHECK(ncclCalloc(&used,system->nodes[NET].count)); + for (int n = 0; n < system->nodes[NET].count; n++) { + graph->inter[n*2] = graph->inter[n*2+1] = n; + struct ncclTopoNode* net = system->nodes[NET].nodes+n; + struct ncclTopoLinkList* paths = net->paths[GPU]; + // find the first unsed GPU that is closest to NIC + int f, m; + for (f = 0; f < ngpus; f++) { + int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[f].gpu.rank) break; + if(j >= n) break; + } + for (int i = 0; i < ngpus; i++) { + int j = 0; for (j = 0; j < n; j++) if(used[j] == system->nodes[GPU].nodes[i].gpu.rank) break; + if (j < n) continue; + if (paths[i].count < paths[f].count) f = i; + } + for (m = 0; mintra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break; + used[n] = graph->intra[n*ngpus+m]; + for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)]; + for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i]; } - for (m = 0; mintra[n*ngpus+m] == system->nodes[GPU].nodes[f].gpu.rank) break; - used[n] = graph->intra[n*ngpus+m]; - for (int i = 0; i < ngpus; i++) intra[i] = graph->intra[n*ngpus+((i+m)%ngpus)]; - for (int i = 0; i < ngpus; i++) graph->intra[n*ngpus+i] = intra[i]; + free(used); + free(intra); } - free(used); - free(intra); } if (graph->nChannels) return ncclSuccess; }