From 871ca22b6150a3aec5d06f7d20b6f738fcb49ba2 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Fri, 16 Apr 2021 09:19:52 -0700 Subject: [PATCH 1/7] Optimize string gather performance for large strings --- cpp/benchmarks/string/copy_benchmark.cpp | 3 + cpp/include/cudf/strings/detail/gather.cuh | 161 +++++++++++++++++++-- 2 files changed, 151 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/string/copy_benchmark.cpp b/cpp/benchmarks/string/copy_benchmark.cpp index b49bc878ca7..23a70215015 100644 --- a/cpp/benchmarks/string/copy_benchmark.cpp +++ b/cpp/benchmarks/string/copy_benchmark.cpp @@ -74,6 +74,9 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) int const max_rowlen = 1 << 13; int const len_mult = 4; generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + + // Benchmark for very small strings + b->Args({67108864, 2}); } #define COPY_BENCHMARK_DEFINE(name) \ diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 86f79881408..ffff35f5d6f 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -34,6 +34,139 @@ namespace cudf { namespace strings { namespace detail { +// Strategy 1: String-parallel +// This strategy assigns strings to warps so that each warp can cooperatively copy from the input +// location of the string to the corresponding output location. Large datatype (uint4) is used for +// stores. This strategy is best suited for large strings. + +// Helper function for loading 16B from a potentially unaligned memory location to registers. +__forceinline__ __device__ uint4 load_uint4(const char* ptr) +{ + unsigned int* aligned_ptr = (unsigned int*)((size_t)ptr & ~(3)); + uint4 regs = {0, 0, 0, 0}; + + regs.x = aligned_ptr[0]; + regs.y = aligned_ptr[1]; + regs.z = aligned_ptr[2]; + regs.w = aligned_ptr[3]; + uint tail = aligned_ptr[4]; + + unsigned int shift = ((size_t)ptr & 3) * 8; + + regs.x = __funnelshift_r(regs.x, regs.y, shift); + regs.y = __funnelshift_r(regs.y, regs.z, shift); + regs.z = __funnelshift_r(regs.z, regs.w, shift); + regs.w = __funnelshift_r(regs.w, tail, shift); + + return regs; +} + +template +__global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, + char* out_chars, + cudf::device_span const out_offsets, + MapIterator string_indices, + size_type total_out_strings) +{ + int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int global_warp_id = global_thread_id / 32; + int warp_lane = global_thread_id % 32; + int nwarps = gridDim.x * blockDim.x / 32; + + size_t alignment_offset = reinterpret_cast(out_chars) & 15; + uint4* out_chars_aligned = reinterpret_cast(out_chars - alignment_offset); + + for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { + auto out_start = out_offsets[istring]; + auto out_end = out_offsets[istring + 1]; + + // This check is necessary because string_indices[istring] may be out of bound. + if (out_start == out_end) continue; + + const char* in_start = strings_begin[string_indices[istring]].data(); + + int32_t out_start_aligned = (out_start + alignment_offset + 15) / 16 * 16 - alignment_offset; + int32_t out_end_aligned = (out_end + alignment_offset) / 16 * 16 - alignment_offset; + + for (size_type ichar = out_start_aligned + warp_lane * 16; ichar < out_end_aligned; + ichar += 32 * 16) { + *(out_chars_aligned + (ichar + alignment_offset) / 16) = + load_uint4(in_start + ichar - out_start); + } + + if (out_end_aligned <= out_start_aligned) { + int32_t ichar = out_start + warp_lane; + if (ichar < out_end) { out_chars[ichar] = in_start[warp_lane]; } + } else { + if (out_start + warp_lane < out_start_aligned) { + out_chars[out_start + warp_lane] = in_start[warp_lane]; + } + + int32_t ichar = out_end_aligned + warp_lane; + if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; } + } + } +} + +// Strategy 2: Char-parallel +// This strategy assigns characters to threads, and uses binary search for getting the string +// index. To improve the binary search performance, fixed number of strings per threadblock is +// used. This strategy is best suited for small strings. +constexpr static int strings_per_threadblock = 32; + +// Binary search `value` in `offsets` of length `nelements`. Require `nelements` to be less than or +// equal to `strings_per_threadblock`. Require `strings_per_threadblock` to be an exponential of 2. +__forceinline__ __device__ size_type binary_search(int32_t* offsets, + int32_t value, + size_type nelements) +{ + size_type idx = 0; +#pragma unroll + for (size_type i = strings_per_threadblock / 2; i > 0; i /= 2) { + if (idx + i < nelements && offsets[idx + i] <= value) idx += i; + } + return idx; +} + +template +__global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, + char* out_chars, + cudf::device_span const out_offsets, + MapIterator string_indices, + size_type total_out_strings) +{ + __shared__ int32_t out_offsets_threadblock[strings_per_threadblock + 1]; + + // Current thread block will process output strings starting at `begin_out_string_idx`. + size_type begin_out_string_idx = blockIdx.x * strings_per_threadblock; + + // Number of strings to be processed by the current threadblock. + size_type strings_current_threadblock = + min(strings_per_threadblock, total_out_strings - begin_out_string_idx); + + if (strings_current_threadblock <= 0) return; + + // Collectively load offsets of strings processed by the current thread block. + for (size_type idx = threadIdx.x; idx <= strings_current_threadblock; idx += blockDim.x) { + out_offsets_threadblock[idx] = out_offsets[idx + begin_out_string_idx]; + } + __syncthreads(); + + for (int32_t out_ibyte = threadIdx.x + out_offsets_threadblock[0]; + out_ibyte < out_offsets_threadblock[strings_current_threadblock]; + out_ibyte += blockDim.x) { + // binary search for the string index corresponding to out_ibyte + size_type string_idx = + binary_search(out_offsets_threadblock, out_ibyte, strings_current_threadblock); + + // calculate which character to load within the string + int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx]; + + size_type in_string_idx = string_indices[begin_out_string_idx + string_idx]; + out_chars[out_ibyte] = strings_begin[in_string_idx].data()[icharacter]; + } +} + /** * @brief Returns a new chars column using the specified indices to select * strings from the input iterator. @@ -68,20 +201,22 @@ std::unique_ptr gather_chars(StringIterator strings_begin, auto chars_column = create_chars_child_column(output_count, chars_bytes, stream, mr); auto const d_chars = chars_column->mutable_view().template data(); - auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { - auto const out_row = - thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx)); - auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index - auto const d_str = strings_begin[row_idx]; // get row's string - auto const offset = out_idx - *out_row; // get string's char - return d_str.data()[offset]; - }; + size_type average_string_length = chars_bytes / output_count; - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - d_chars, - gather_chars_fn); + if (average_string_length > 32) { + gather_chars_fn_string_parallel<<(output_count) + 3) / 4, 65536), + 128, + 0, + stream.value()>>>( + strings_begin, d_chars, offsets, map_begin, output_count); + } else { + gather_chars_fn_char_parallel<<<(output_count + strings_per_threadblock - 1) / + strings_per_threadblock, + 128, + 0, + stream.value()>>>( + strings_begin, d_chars, offsets, map_begin, output_count); + } return chars_column; } From 50391620ce787ba5c1bb6dc07999d91ec254bb27 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 19 Apr 2021 14:21:11 -0700 Subject: [PATCH 2/7] Add more documentation and address reviewer's feedback --- cpp/include/cudf/strings/detail/gather.cuh | 79 +++++++++++++--------- 1 file changed, 46 insertions(+), 33 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index ffff35f5d6f..5bf736004ec 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -42,16 +42,13 @@ namespace detail { // Helper function for loading 16B from a potentially unaligned memory location to registers. __forceinline__ __device__ uint4 load_uint4(const char* ptr) { - unsigned int* aligned_ptr = (unsigned int*)((size_t)ptr & ~(3)); - uint4 regs = {0, 0, 0, 0}; + auto const offset = reinterpret_cast(ptr) % 4; + auto const* aligned_ptr = reinterpret_cast(ptr - offset); + auto const shift = offset * 8; - regs.x = aligned_ptr[0]; - regs.y = aligned_ptr[1]; - regs.z = aligned_ptr[2]; - regs.w = aligned_ptr[3]; - uint tail = aligned_ptr[4]; - - unsigned int shift = ((size_t)ptr & 3) * 8; + uint4 regs = {aligned_ptr[0], aligned_ptr[1], aligned_ptr[2], aligned_ptr[3]}; + uint tail = 0; + if (shift) tail = aligned_ptr[4]; regs.x = __funnelshift_r(regs.x, regs.y, shift); regs.y = __funnelshift_r(regs.y, regs.z, shift); @@ -68,40 +65,56 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, MapIterator string_indices, size_type total_out_strings) { + constexpr size_t datatype_size = sizeof(uint4); + constexpr size_t threads_per_warp = 32; + int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int global_warp_id = global_thread_id / 32; - int warp_lane = global_thread_id % 32; - int nwarps = gridDim.x * blockDim.x / 32; + int global_warp_id = global_thread_id / threads_per_warp; + int warp_lane = global_thread_id % threads_per_warp; + int nwarps = gridDim.x * blockDim.x / threads_per_warp; - size_t alignment_offset = reinterpret_cast(out_chars) & 15; - uint4* out_chars_aligned = reinterpret_cast(out_chars - alignment_offset); + auto const alignment_offset = reinterpret_cast(out_chars) % datatype_size; + uint4* out_chars_aligned = reinterpret_cast(out_chars - alignment_offset); for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { - auto out_start = out_offsets[istring]; - auto out_end = out_offsets[istring + 1]; + auto const out_start = out_offsets[istring]; + auto const out_end = out_offsets[istring + 1]; // This check is necessary because string_indices[istring] may be out of bound. if (out_start == out_end) continue; const char* in_start = strings_begin[string_indices[istring]].data(); - int32_t out_start_aligned = (out_start + alignment_offset + 15) / 16 * 16 - alignment_offset; - int32_t out_end_aligned = (out_end + alignment_offset) / 16 * 16 - alignment_offset; - - for (size_type ichar = out_start_aligned + warp_lane * 16; ichar < out_end_aligned; - ichar += 32 * 16) { - *(out_chars_aligned + (ichar + alignment_offset) / 16) = + // Both `out_start_aligned` and `out_end_aligned` are indices into `out_chars`. + // `out_start_aligned` is the first 16B aligned memory location after `out_start`. + // `out_end_aligned` is the last 16B aligned memory location before `out_end`. Characters + // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. + int32_t out_start_aligned = + (out_start + alignment_offset + datatype_size - 1) / datatype_size * datatype_size - + alignment_offset; + int32_t out_end_aligned = + (out_end + alignment_offset) / datatype_size * datatype_size - alignment_offset; + + for (size_type ichar = out_start_aligned + warp_lane * datatype_size; ichar < out_end_aligned; + ichar += threads_per_warp * datatype_size) { + *(out_chars_aligned + (ichar + alignment_offset) / datatype_size) = load_uint4(in_start + ichar - out_start); } + // Tail logic: copy characters of the current string outside `[out_start_aligned, + // out_end_aligned)`. if (out_end_aligned <= out_start_aligned) { + // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the + // entire string. Note that for 16B alignment, the maximum number of characters in this string + // is less than 32, so for each thread in the warp, copying one byte is enough. int32_t ichar = out_start + warp_lane; if (ichar < out_end) { out_chars[ichar] = in_start[warp_lane]; } } else { + // Copy characters in range `[out_start, out_start_aligned)`. if (out_start + warp_lane < out_start_aligned) { out_chars[out_start + warp_lane] = in_start[warp_lane]; } - + // Copy characters in range `[out_end_aligned, out_end)`. int32_t ichar = out_end_aligned + warp_lane; if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; } } @@ -112,10 +125,10 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, // This strategy assigns characters to threads, and uses binary search for getting the string // index. To improve the binary search performance, fixed number of strings per threadblock is // used. This strategy is best suited for small strings. -constexpr static int strings_per_threadblock = 32; // Binary search `value` in `offsets` of length `nelements`. Require `nelements` to be less than or // equal to `strings_per_threadblock`. Require `strings_per_threadblock` to be an exponential of 2. +template __forceinline__ __device__ size_type binary_search(int32_t* offsets, int32_t value, size_type nelements) @@ -128,7 +141,7 @@ __forceinline__ __device__ size_type binary_search(int32_t* offsets, return idx; } -template +template __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, char* out_chars, cudf::device_span const out_offsets, @@ -156,8 +169,8 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, out_ibyte < out_offsets_threadblock[strings_current_threadblock]; out_ibyte += blockDim.x) { // binary search for the string index corresponding to out_ibyte - size_type string_idx = - binary_search(out_offsets_threadblock, out_ibyte, strings_current_threadblock); + size_type string_idx = binary_search( + out_offsets_threadblock, out_ibyte, strings_current_threadblock); // calculate which character to load within the string int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx]; @@ -210,12 +223,12 @@ std::unique_ptr gather_chars(StringIterator strings_begin, stream.value()>>>( strings_begin, d_chars, offsets, map_begin, output_count); } else { - gather_chars_fn_char_parallel<<<(output_count + strings_per_threadblock - 1) / - strings_per_threadblock, - 128, - 0, - stream.value()>>>( - strings_begin, d_chars, offsets, map_begin, output_count); + constexpr int strings_per_threadblock = 32; + gather_chars_fn_char_parallel + <<<(output_count + strings_per_threadblock - 1) / strings_per_threadblock, + 128, + 0, + stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count); } return chars_column; From 9a195da73da9d15feba34c2a768f0bc54ba5000b Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 19 Apr 2021 17:34:39 -0700 Subject: [PATCH 3/7] Use thrust::upper_bound instead of customized binary search kernel --- cpp/include/cudf/strings/detail/gather.cuh | 23 ++++++---------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 5bf736004ec..1be15e4db6e 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -126,21 +126,6 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, // index. To improve the binary search performance, fixed number of strings per threadblock is // used. This strategy is best suited for small strings. -// Binary search `value` in `offsets` of length `nelements`. Require `nelements` to be less than or -// equal to `strings_per_threadblock`. Require `strings_per_threadblock` to be an exponential of 2. -template -__forceinline__ __device__ size_type binary_search(int32_t* offsets, - int32_t value, - size_type nelements) -{ - size_type idx = 0; -#pragma unroll - for (size_type i = strings_per_threadblock / 2; i > 0; i /= 2) { - if (idx + i < nelements && offsets[idx + i] <= value) idx += i; - } - return idx; -} - template __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, char* out_chars, @@ -169,8 +154,12 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, out_ibyte < out_offsets_threadblock[strings_current_threadblock]; out_ibyte += blockDim.x) { // binary search for the string index corresponding to out_ibyte - size_type string_idx = binary_search( - out_offsets_threadblock, out_ibyte, strings_current_threadblock); + auto const string_idx_iter = + thrust::prev(thrust::upper_bound(thrust::seq, + out_offsets_threadblock, + out_offsets_threadblock + strings_current_threadblock, + out_ibyte)); + size_type string_idx = thrust::distance(out_offsets_threadblock, string_idx_iter); // calculate which character to load within the string int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx]; From da0b5254e3eccfc9aed0ad2803e1ba19d522c9ee Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Tue, 20 Apr 2021 09:04:26 -0700 Subject: [PATCH 4/7] Use cudf's definition of warp size --- cpp/include/cudf/strings/detail/gather.cuh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 1be15e4db6e..0ec24c4690d 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -65,13 +66,12 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, MapIterator string_indices, size_type total_out_strings) { - constexpr size_t datatype_size = sizeof(uint4); - constexpr size_t threads_per_warp = 32; + constexpr size_t datatype_size = sizeof(uint4); int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; - int global_warp_id = global_thread_id / threads_per_warp; - int warp_lane = global_thread_id % threads_per_warp; - int nwarps = gridDim.x * blockDim.x / threads_per_warp; + int global_warp_id = global_thread_id / cudf::detail::warp_size; + int warp_lane = global_thread_id % cudf::detail::warp_size; + int nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; auto const alignment_offset = reinterpret_cast(out_chars) % datatype_size; uint4* out_chars_aligned = reinterpret_cast(out_chars - alignment_offset); @@ -96,7 +96,7 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, (out_end + alignment_offset) / datatype_size * datatype_size - alignment_offset; for (size_type ichar = out_start_aligned + warp_lane * datatype_size; ichar < out_end_aligned; - ichar += threads_per_warp * datatype_size) { + ichar += cudf::detail::warp_size * datatype_size) { *(out_chars_aligned + (ichar + alignment_offset) / datatype_size) = load_uint4(in_start + ichar - out_start); } From a30e76eee4f6a05e982fca913dc5c6ff345bedd9 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 10 May 2021 18:26:05 -0700 Subject: [PATCH 5/7] Address reviewer's comment --- cpp/include/cudf/strings/detail/gather.cuh | 25 ++++++++++++---------- 1 file changed, 14 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 0ec24c4690d..950a66bb0dd 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -35,11 +35,6 @@ namespace cudf { namespace strings { namespace detail { -// Strategy 1: String-parallel -// This strategy assigns strings to warps so that each warp can cooperatively copy from the input -// location of the string to the corresponding output location. Large datatype (uint4) is used for -// stores. This strategy is best suited for large strings. - // Helper function for loading 16B from a potentially unaligned memory location to registers. __forceinline__ __device__ uint4 load_uint4(const char* ptr) { @@ -59,6 +54,10 @@ __forceinline__ __device__ uint4 load_uint4(const char* ptr) return regs; } +// Strategy 1: String-parallel +// This strategy assigns strings to warps so that each warp can cooperatively copy from the input +// location of the string to the corresponding output location. Large datatype (uint4) is used for +// stores. This strategy is best suited for large strings. template __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, char* out_chars, @@ -203,19 +202,23 @@ std::unique_ptr gather_chars(StringIterator strings_begin, auto chars_column = create_chars_child_column(output_count, chars_bytes, stream, mr); auto const d_chars = chars_column->mutable_view().template data(); + constexpr int warps_per_threadblock = 4; + size_type average_string_length = chars_bytes / output_count; if (average_string_length > 32) { - gather_chars_fn_string_parallel<<(output_count) + 3) / 4, 65536), - 128, - 0, - stream.value()>>>( - strings_begin, d_chars, offsets, map_begin, output_count); + constexpr int max_threadblocks = 65536; + gather_chars_fn_string_parallel<<< + min((static_cast(output_count) + warps_per_threadblock - 1) / warps_per_threadblock, + max_threadblocks), + warps_per_threadblock * cudf::detail::warp_size, + 0, + stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count); } else { constexpr int strings_per_threadblock = 32; gather_chars_fn_char_parallel <<<(output_count + strings_per_threadblock - 1) / strings_per_threadblock, - 128, + warps_per_threadblock * cudf::detail::warp_size, 0, stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count); } From 93dd1a3142b1370f3cccce19cc15e52368a442de Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 10 May 2021 18:50:03 -0700 Subject: [PATCH 6/7] Avoid reading beyond string boundaries --- cpp/include/cudf/strings/detail/gather.cuh | 33 +++++++++++++--------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 950a66bb0dd..a0a40b34819 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -65,14 +65,15 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, MapIterator string_indices, size_type total_out_strings) { - constexpr size_t datatype_size = sizeof(uint4); + constexpr size_t out_datatype_size = sizeof(uint4); + constexpr size_t in_datatype_size = sizeof(uint); int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; int global_warp_id = global_thread_id / cudf::detail::warp_size; int warp_lane = global_thread_id % cudf::detail::warp_size; int nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; - auto const alignment_offset = reinterpret_cast(out_chars) % datatype_size; + auto const alignment_offset = reinterpret_cast(out_chars) % out_datatype_size; uint4* out_chars_aligned = reinterpret_cast(out_chars - alignment_offset); for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { @@ -85,18 +86,23 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, const char* in_start = strings_begin[string_indices[istring]].data(); // Both `out_start_aligned` and `out_end_aligned` are indices into `out_chars`. - // `out_start_aligned` is the first 16B aligned memory location after `out_start`. - // `out_end_aligned` is the last 16B aligned memory location before `out_end`. Characters + // `out_start_aligned` is the first 16B aligned memory location after `out_start + 4`. + // `out_end_aligned` is the last 16B aligned memory location before `out_end - 4`. Characters // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. + // `out_start + 4` and `out_end - 4` are used instead of `out_start` and `out_end` to avoid + // `load_uint4` reading beyond string boundaries. int32_t out_start_aligned = - (out_start + alignment_offset + datatype_size - 1) / datatype_size * datatype_size - + (out_start + in_datatype_size + alignment_offset + out_datatype_size - 1) / + out_datatype_size * out_datatype_size - alignment_offset; int32_t out_end_aligned = - (out_end + alignment_offset) / datatype_size * datatype_size - alignment_offset; + (out_end - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - + alignment_offset; - for (size_type ichar = out_start_aligned + warp_lane * datatype_size; ichar < out_end_aligned; - ichar += cudf::detail::warp_size * datatype_size) { - *(out_chars_aligned + (ichar + alignment_offset) / datatype_size) = + for (size_type ichar = out_start_aligned + warp_lane * out_datatype_size; + ichar < out_end_aligned; + ichar += cudf::detail::warp_size * out_datatype_size) { + *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = load_uint4(in_start + ichar - out_start); } @@ -104,10 +110,11 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, // out_end_aligned)`. if (out_end_aligned <= out_start_aligned) { // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the - // entire string. Note that for 16B alignment, the maximum number of characters in this string - // is less than 32, so for each thread in the warp, copying one byte is enough. - int32_t ichar = out_start + warp_lane; - if (ichar < out_end) { out_chars[ichar] = in_start[warp_lane]; } + // entire string. + for (int32_t ichar = out_start + warp_lane; ichar < out_end; + ichar += cudf::detail::warp_size) { + out_chars[ichar] = in_start[ichar - out_start]; + } } else { // Copy characters in range `[out_start, out_start_aligned)`. if (out_start + warp_lane < out_start_aligned) { From e87da49f5b24310dd6dd40fd041afaffec94ddf5 Mon Sep 17 00:00:00 2001 From: Hao Gao Date: Mon, 24 May 2021 10:29:32 -0700 Subject: [PATCH 7/7] Improve documentation --- cpp/include/cudf/strings/detail/gather.cuh | 48 +++++++++++++++++----- 1 file changed, 37 insertions(+), 11 deletions(-) diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index a0a40b34819..dcd17245ee6 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -54,10 +54,22 @@ __forceinline__ __device__ uint4 load_uint4(const char* ptr) return regs; } -// Strategy 1: String-parallel -// This strategy assigns strings to warps so that each warp can cooperatively copy from the input -// location of the string to the corresponding output location. Large datatype (uint4) is used for -// stores. This strategy is best suited for large strings. +/** + * @brief Gather characters from the input iterator, with string parallel strategy. + * + * This strategy assigns strings to warps so that each warp can cooperatively copy from the input + * location of the string to the corresponding output location. Large datatype (uint4) is used for + * stores. This strategy is best suited for large strings. + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances. + * @param out_chars Output buffer for gathered characters. + * @param out_offsets The offset values associated with the output buffer. + * @param string_indices Start of index iterator. + * @param total_out_strings Number of output strings to be gathered. + */ template __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, char* out_chars, @@ -127,11 +139,22 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, } } -// Strategy 2: Char-parallel -// This strategy assigns characters to threads, and uses binary search for getting the string -// index. To improve the binary search performance, fixed number of strings per threadblock is -// used. This strategy is best suited for small strings. - +/** + * @brief Gather characters from the input iterator, with char parallel strategy. + * + * This strategy assigns characters to threads, and uses binary search for getting the string + * index. To improve the binary search performance, fixed number of strings per threadblock is + * used. This strategy is best suited for small strings. + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances. + * @param out_chars Output buffer for gathered characters. + * @param out_offsets The offset values associated with the output buffer. + * @param string_indices Start of index iterator. + * @param total_out_strings Number of output strings to be gathered. + */ template __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, char* out_chars, @@ -185,7 +208,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, * @tparam StringIterator Iterator should produce `string_view` objects. * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. * - * @param strings_begin Start of the iterator to retrieve `string_view` instances + * @param strings_begin Start of the iterator to retrieve `string_view` instances. * @param map_begin Start of index iterator. * @param map_end End of index iterator. * @param offsets The offset values to be associated with the output chars column. @@ -210,10 +233,13 @@ std::unique_ptr gather_chars(StringIterator strings_begin, auto const d_chars = chars_column->mutable_view().template data(); constexpr int warps_per_threadblock = 4; + // String parallel strategy will be used if average string length is above this threshold. + // Otherwise, char parallel strategy will be used. + constexpr size_type string_parallel_threshold = 32; size_type average_string_length = chars_bytes / output_count; - if (average_string_length > 32) { + if (average_string_length > string_parallel_threshold) { constexpr int max_threadblocks = 65536; gather_chars_fn_string_parallel<<< min((static_cast(output_count) + warps_per_threadblock - 1) / warps_per_threadblock,