Skip to content

Commit

Permalink
rccl-prim-test: add all-to-all benchmark (#185)
Browse files Browse the repository at this point in the history
For gfx908, support simple detection of ring topology.
Call ReduceOrCopyMulti directly from kernel.
Also simplify code by removing kernel start synchronization option
which has no effect on throughput measurements.
  • Loading branch information
wenkaidu authored Mar 16, 2020
1 parent b9fb0cd commit ebc823e
Show file tree
Hide file tree
Showing 2 changed files with 170 additions and 150 deletions.
57 changes: 1 addition & 56 deletions tools/rccl-prim-test/copy_kernel.h
Original file line number Diff line number Diff line change
Expand Up @@ -186,7 +186,7 @@ __device__ int ptrAlign128(T* ptr) { return (uint64_t)ptr % alignof(Pack128); }

// Try to limit consecutive load/stores to 8.
// Use UNROLL 8 when we have a single source and a single destination, 4 otherwise
#define AUTOUNROLL (UNROLL*(4/(MINDSTS+MINSRCS)))
#define AUTOUNROLL UNROLL

template<int UNROLL, class FUNC, typename T, int MINSRCS, int MAXSRCS, int MINDSTS, int MAXDSTS>
__device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
Expand Down Expand Up @@ -252,59 +252,4 @@ __device__ void ReduceOrCopyMulti(const int tid, const int nthreads,
ReduceCopyMulti<FUNC, T, MINSRCS, MAXSRCS, MINDSTS, MAXDSTS>(tid, nthreads, nsrcs, srcs, ndsts, dsts, offset, Nrem);
}

// Assumptions:
// - there is exactly 1 block
// - THREADS is the number of producer threads
// - this function is called by all producer threads
template<int UNROLL, int THREADS, typename T>
__device__ void Copy(volatile T * __restrict__ const dest,
const volatile T * __restrict__ const src, const int N) {
const T* srcs[2];
T* dsts[2];
srcs[0] = (const T*)src;
dsts[0] = (T*)dest;
ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
1, srcs, 1, dsts, N);
}

template<int UNROLL, int THREADS, typename T>
__device__ void DoubleCopy(volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1,
const volatile T * __restrict__ const src, const int N) {
const T* srcs[2];
T* dsts[2];
srcs[0] = (const T*)src;
dsts[0] = (T*)dest0;
dsts[1] = (T*)dest1;
ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
1, srcs, 2, dsts, N);
}

template<int UNROLL, int THREADS, typename T>
__device__ void Reduce(volatile T * __restrict__ const dest,
const volatile T * __restrict__ const src0,
const volatile T * __restrict__ const src1, const int N) {
const T* srcs[2];
T* dsts[2];
srcs[0] = (const T*)src0;
srcs[1] = (const T*)src1;
dsts[0] = (T*)dest;
ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
2, srcs, 1, dsts, N);
}

template<int UNROLL, int THREADS, typename T>
__device__ void ReduceCopy(volatile T * __restrict__ const dest0,
volatile T * __restrict__ const dest1,
const volatile T * __restrict__ const src0,
const volatile T * __restrict__ const src1, const int N) {
const T* srcs[2];
T* dsts[2];
srcs[0] = (const T*)src0;
srcs[1] = (const T*)src1;
dsts[0] = (T*)dest0;
dsts[1] = (T*)dest1;
ReduceOrCopyMulti<UNROLL, FuncPassA<T>, T, 1, 2, 1, 2>(threadIdx.x, THREADS,
2, srcs, 2, dsts, N);
}
#endif // COPY_KERNEL_H_
Loading

0 comments on commit ebc823e

Please sign in to comment.