From 871ca22b6150a3aec5d06f7d20b6f738fcb49ba2 Mon Sep 17 00:00:00 2001
From: Hao Gao <haog@nvidia.com>
Date: Fri, 16 Apr 2021 09:19:52 -0700
Subject: [PATCH 1/7] Optimize string gather performance for large strings

---
 cpp/benchmarks/string/copy_benchmark.cpp   |   3 +
 cpp/include/cudf/strings/detail/gather.cuh | 161 +++++++++++++++++++--
 2 files changed, 151 insertions(+), 13 deletions(-)
diff --git a/cpp/benchmarks/string/copy_benchmark.cpp b/cpp/benchmarks/string/copy_benchmark.cpp
index b49bc878ca7..23a70215015 100644
--- a/cpp/benchmarks/string/copy_benchmark.cpp
+++ b/cpp/benchmarks/string/copy_benchmark.cpp
@@ -74,6 +74,9 @@ static void generate_bench_args(benchmark::internal::Benchmark* b)
   int const max_rowlen = 1 << 13;
   int const len_mult   = 4;
   generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult);
+
+  // Benchmark for very small strings
+  b->Args({67108864, 2});
 }
 
 #define COPY_BENCHMARK_DEFINE(name)                           \
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 86f79881408..ffff35f5d6f 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -34,6 +34,139 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
+// Strategy 1: String-parallel
+// This strategy assigns strings to warps so that each warp can cooperatively copy from the input
+// location of the string to the corresponding output location. Large datatype (uint4) is used for
+// stores. This strategy is best suited for large strings.
+
+// Helper function for loading 16B from a potentially unaligned memory location to registers.
+__forceinline__ __device__ uint4 load_uint4(const char* ptr)
+{
+  unsigned int* aligned_ptr = (unsigned int*)((size_t)ptr & ~(3));
+  uint4 regs                = {0, 0, 0, 0};
+
+  regs.x    = aligned_ptr[0];
+  regs.y    = aligned_ptr[1];
+  regs.z    = aligned_ptr[2];
+  regs.w    = aligned_ptr[3];
+  uint tail = aligned_ptr[4];
+
+  unsigned int shift = ((size_t)ptr & 3) * 8;
+
+  regs.x = __funnelshift_r(regs.x, regs.y, shift);
+  regs.y = __funnelshift_r(regs.y, regs.z, shift);
+  regs.z = __funnelshift_r(regs.z, regs.w, shift);
+  regs.w = __funnelshift_r(regs.w, tail, shift);
+
+  return regs;
+}
+
+template <typename StringIterator, typename MapIterator>
+__global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
+                                                char* out_chars,
+                                                cudf::device_span<int32_t const> const out_offsets,
+                                                MapIterator string_indices,
+                                                size_type total_out_strings)
+{
+  int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
+  int global_warp_id   = global_thread_id / 32;
+  int warp_lane        = global_thread_id % 32;
+  int nwarps           = gridDim.x * blockDim.x / 32;
+
+  size_t alignment_offset  = reinterpret_cast<size_t>(out_chars) & 15;
+  uint4* out_chars_aligned = reinterpret_cast<uint4*>(out_chars - alignment_offset);
+
+  for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
+    auto out_start = out_offsets[istring];
+    auto out_end   = out_offsets[istring + 1];
+
+    // This check is necessary because string_indices[istring] may be out of bound.
+    if (out_start == out_end) continue;
+
+    const char* in_start = strings_begin[string_indices[istring]].data();
+
+    int32_t out_start_aligned = (out_start + alignment_offset + 15) / 16 * 16 - alignment_offset;
+    int32_t out_end_aligned   = (out_end + alignment_offset) / 16 * 16 - alignment_offset;
+
+    for (size_type ichar = out_start_aligned + warp_lane * 16; ichar < out_end_aligned;
+         ichar += 32 * 16) {
+      *(out_chars_aligned + (ichar + alignment_offset) / 16) =
+        load_uint4(in_start + ichar - out_start);
+    }
+
+    if (out_end_aligned <= out_start_aligned) {
+      int32_t ichar = out_start + warp_lane;
+      if (ichar < out_end) { out_chars[ichar] = in_start[warp_lane]; }
+    } else {
+      if (out_start + warp_lane < out_start_aligned) {
+        out_chars[out_start + warp_lane] = in_start[warp_lane];
+      }
+
+      int32_t ichar = out_end_aligned + warp_lane;
+      if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; }
+    }
+  }
+}
+
+// Strategy 2: Char-parallel
+// This strategy assigns characters to threads, and uses binary search for getting the string
+// index. To improve the binary search performance, fixed number of strings per threadblock is
+// used. This strategy is best suited for small strings.
+constexpr static int strings_per_threadblock = 32;
+
+// Binary search `value` in `offsets` of length `nelements`. Require `nelements` to be less than or
+// equal to `strings_per_threadblock`. Require `strings_per_threadblock` to be an exponential of 2.
+__forceinline__ __device__ size_type binary_search(int32_t* offsets,
+                                                   int32_t value,
+                                                   size_type nelements)
+{
+  size_type idx = 0;
+#pragma unroll
+  for (size_type i = strings_per_threadblock / 2; i > 0; i /= 2) {
+    if (idx + i < nelements && offsets[idx + i] <= value) idx += i;
+  }
+  return idx;
+}
+
+template <typename StringIterator, typename MapIterator>
+__global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
+                                              char* out_chars,
+                                              cudf::device_span<int32_t const> const out_offsets,
+                                              MapIterator string_indices,
+                                              size_type total_out_strings)
+{
+  __shared__ int32_t out_offsets_threadblock[strings_per_threadblock + 1];
+
+  // Current thread block will process output strings starting at `begin_out_string_idx`.
+  size_type begin_out_string_idx = blockIdx.x * strings_per_threadblock;
+
+  // Number of strings to be processed by the current threadblock.
+  size_type strings_current_threadblock =
+    min(strings_per_threadblock, total_out_strings - begin_out_string_idx);
+
+  if (strings_current_threadblock <= 0) return;
+
+  // Collectively load offsets of strings processed by the current thread block.
+  for (size_type idx = threadIdx.x; idx <= strings_current_threadblock; idx += blockDim.x) {
+    out_offsets_threadblock[idx] = out_offsets[idx + begin_out_string_idx];
+  }
+  __syncthreads();
+
+  for (int32_t out_ibyte = threadIdx.x + out_offsets_threadblock[0];
+       out_ibyte < out_offsets_threadblock[strings_current_threadblock];
+       out_ibyte += blockDim.x) {
+    // binary search for the string index corresponding to out_ibyte
+    size_type string_idx =
+      binary_search(out_offsets_threadblock, out_ibyte, strings_current_threadblock);
+
+    // calculate which character to load within the string
+    int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx];
+
+    size_type in_string_idx = string_indices[begin_out_string_idx + string_idx];
+    out_chars[out_ibyte]    = strings_begin[in_string_idx].data()[icharacter];
+  }
+}
+
 /**
  * @brief Returns a new chars column using the specified indices to select
  * strings from the input iterator.
@@ -68,20 +201,22 @@ std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
   auto chars_column  = create_chars_child_column(output_count, chars_bytes, stream, mr);
   auto const d_chars = chars_column->mutable_view().template data<char>();
 
-  auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char {
-    auto const out_row =
-      thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx));
-    auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)];  // get row index
-    auto const d_str   = strings_begin[row_idx];                                 // get row's string
-    auto const offset  = out_idx - *out_row;  // get string's char
-    return d_str.data()[offset];
-  };
+  size_type average_string_length = chars_bytes / output_count;
 
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::make_counting_iterator<size_type>(0),
-                    thrust::make_counting_iterator<size_type>(chars_bytes),
-                    d_chars,
-                    gather_chars_fn);
+  if (average_string_length > 32) {
+    gather_chars_fn_string_parallel<<<min((static_cast<int>(output_count) + 3) / 4, 65536),
+                                      128,
+                                      0,
+                                      stream.value()>>>(
+      strings_begin, d_chars, offsets, map_begin, output_count);
+  } else {
+    gather_chars_fn_char_parallel<<<(output_count + strings_per_threadblock - 1) /
+                                      strings_per_threadblock,
+                                    128,
+                                    0,
+                                    stream.value()>>>(
+      strings_begin, d_chars, offsets, map_begin, output_count);
+  }
 
   return chars_column;
 }

From 50391620ce787ba5c1bb6dc07999d91ec254bb27 Mon Sep 17 00:00:00 2001
From: Hao Gao <haog@nvidia.com>
Date: Mon, 19 Apr 2021 14:21:11 -0700
Subject: [PATCH 2/7] Add more documentation and address reviewer's feedback

---
 cpp/include/cudf/strings/detail/gather.cuh | 79 +++++++++++++---------
 1 file changed, 46 insertions(+), 33 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index ffff35f5d6f..5bf736004ec 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -42,16 +42,13 @@ namespace detail {
 // Helper function for loading 16B from a potentially unaligned memory location to registers.
 __forceinline__ __device__ uint4 load_uint4(const char* ptr)
 {
-  unsigned int* aligned_ptr = (unsigned int*)((size_t)ptr & ~(3));
-  uint4 regs                = {0, 0, 0, 0};
+  auto const offset       = reinterpret_cast<std::uintptr_t>(ptr) % 4;
+  auto const* aligned_ptr = reinterpret_cast<unsigned int const*>(ptr - offset);
+  auto const shift        = offset * 8;
 
-  regs.x    = aligned_ptr[0];
-  regs.y    = aligned_ptr[1];
-  regs.z    = aligned_ptr[2];
-  regs.w    = aligned_ptr[3];
-  uint tail = aligned_ptr[4];
-
-  unsigned int shift = ((size_t)ptr & 3) * 8;
+  uint4 regs = {aligned_ptr[0], aligned_ptr[1], aligned_ptr[2], aligned_ptr[3]};
+  uint tail  = 0;
+  if (shift) tail = aligned_ptr[4];
 
   regs.x = __funnelshift_r(regs.x, regs.y, shift);
   regs.y = __funnelshift_r(regs.y, regs.z, shift);
@@ -68,40 +65,56 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
                                                 MapIterator string_indices,
                                                 size_type total_out_strings)
 {
+  constexpr size_t datatype_size    = sizeof(uint4);
+  constexpr size_t threads_per_warp = 32;
+
   int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  int global_warp_id   = global_thread_id / 32;
-  int warp_lane        = global_thread_id % 32;
-  int nwarps           = gridDim.x * blockDim.x / 32;
+  int global_warp_id   = global_thread_id / threads_per_warp;
+  int warp_lane        = global_thread_id % threads_per_warp;
+  int nwarps           = gridDim.x * blockDim.x / threads_per_warp;
 
-  size_t alignment_offset  = reinterpret_cast<size_t>(out_chars) & 15;
-  uint4* out_chars_aligned = reinterpret_cast<uint4*>(out_chars - alignment_offset);
+  auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % datatype_size;
+  uint4* out_chars_aligned    = reinterpret_cast<uint4*>(out_chars - alignment_offset);
 
   for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
-    auto out_start = out_offsets[istring];
-    auto out_end   = out_offsets[istring + 1];
+    auto const out_start = out_offsets[istring];
+    auto const out_end   = out_offsets[istring + 1];
 
     // This check is necessary because string_indices[istring] may be out of bound.
     if (out_start == out_end) continue;
 
     const char* in_start = strings_begin[string_indices[istring]].data();
 
-    int32_t out_start_aligned = (out_start + alignment_offset + 15) / 16 * 16 - alignment_offset;
-    int32_t out_end_aligned   = (out_end + alignment_offset) / 16 * 16 - alignment_offset;
-
-    for (size_type ichar = out_start_aligned + warp_lane * 16; ichar < out_end_aligned;
-         ichar += 32 * 16) {
-      *(out_chars_aligned + (ichar + alignment_offset) / 16) =
+    // Both `out_start_aligned` and `out_end_aligned` are indices into `out_chars`.
+    // `out_start_aligned` is the first 16B aligned memory location after `out_start`.
+    // `out_end_aligned` is the last 16B aligned memory location before `out_end`. Characters
+    // between `[out_start_aligned, out_end_aligned)` will be copied using uint4.
+    int32_t out_start_aligned =
+      (out_start + alignment_offset + datatype_size - 1) / datatype_size * datatype_size -
+      alignment_offset;
+    int32_t out_end_aligned =
+      (out_end + alignment_offset) / datatype_size * datatype_size - alignment_offset;
+
+    for (size_type ichar = out_start_aligned + warp_lane * datatype_size; ichar < out_end_aligned;
+         ichar += threads_per_warp * datatype_size) {
+      *(out_chars_aligned + (ichar + alignment_offset) / datatype_size) =
         load_uint4(in_start + ichar - out_start);
     }
 
+    // Tail logic: copy characters of the current string outside `[out_start_aligned,
+    // out_end_aligned)`.
     if (out_end_aligned <= out_start_aligned) {
+      // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the
+      // entire string. Note that for 16B alignment, the maximum number of characters in this string
+      // is less than 32, so for each thread in the warp, copying one byte is enough.
       int32_t ichar = out_start + warp_lane;
       if (ichar < out_end) { out_chars[ichar] = in_start[warp_lane]; }
     } else {
+      // Copy characters in range `[out_start, out_start_aligned)`.
       if (out_start + warp_lane < out_start_aligned) {
         out_chars[out_start + warp_lane] = in_start[warp_lane];
       }
-
+      // Copy characters in range `[out_end_aligned, out_end)`.
       int32_t ichar = out_end_aligned + warp_lane;
       if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; }
     }
@@ -112,10 +125,10 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
 // This strategy assigns characters to threads, and uses binary search for getting the string
 // index. To improve the binary search performance, fixed number of strings per threadblock is
 // used. This strategy is best suited for small strings.
-constexpr static int strings_per_threadblock = 32;
 
 // Binary search `value` in `offsets` of length `nelements`. Require `nelements` to be less than or
 // equal to `strings_per_threadblock`. Require `strings_per_threadblock` to be an exponential of 2.
+template <int strings_per_threadblock>
 __forceinline__ __device__ size_type binary_search(int32_t* offsets,
                                                    int32_t value,
                                                    size_type nelements)
@@ -128,7 +141,7 @@ __forceinline__ __device__ size_type binary_search(int32_t* offsets,
   return idx;
 }
 
-template <typename StringIterator, typename MapIterator>
+template <int strings_per_threadblock, typename StringIterator, typename MapIterator>
 __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
                                               char* out_chars,
                                               cudf::device_span<int32_t const> const out_offsets,
@@ -156,8 +169,8 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
        out_ibyte < out_offsets_threadblock[strings_current_threadblock];
        out_ibyte += blockDim.x) {
     // binary search for the string index corresponding to out_ibyte
-    size_type string_idx =
-      binary_search(out_offsets_threadblock, out_ibyte, strings_current_threadblock);
+    size_type string_idx = binary_search<strings_per_threadblock>(
+      out_offsets_threadblock, out_ibyte, strings_current_threadblock);
 
     // calculate which character to load within the string
     int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx];
@@ -210,12 +223,12 @@ std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
                                       stream.value()>>>(
       strings_begin, d_chars, offsets, map_begin, output_count);
   } else {
-    gather_chars_fn_char_parallel<<<(output_count + strings_per_threadblock - 1) /
-                                      strings_per_threadblock,
-                                    128,
-                                    0,
-                                    stream.value()>>>(
-      strings_begin, d_chars, offsets, map_begin, output_count);
+    constexpr int strings_per_threadblock = 32;
+    gather_chars_fn_char_parallel<strings_per_threadblock>
+      <<<(output_count + strings_per_threadblock - 1) / strings_per_threadblock,
+         128,
+         0,
+         stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count);
   }
 
   return chars_column;

From 9a195da73da9d15feba34c2a768f0bc54ba5000b Mon Sep 17 00:00:00 2001
From: Hao Gao <haog@nvidia.com>
Date: Mon, 19 Apr 2021 17:34:39 -0700
Subject: [PATCH 3/7] Use thrust::upper_bound instead of customized binary
 search kernel

---
 cpp/include/cudf/strings/detail/gather.cuh | 23 ++++++----------------
 1 file changed, 6 insertions(+), 17 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 5bf736004ec..1be15e4db6e 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -126,21 +126,6 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
 // index. To improve the binary search performance, fixed number of strings per threadblock is
 // used. This strategy is best suited for small strings.
 
-// Binary search `value` in `offsets` of length `nelements`. Require `nelements` to be less than or
-// equal to `strings_per_threadblock`. Require `strings_per_threadblock` to be an exponential of 2.
-template <int strings_per_threadblock>
-__forceinline__ __device__ size_type binary_search(int32_t* offsets,
-                                                   int32_t value,
-                                                   size_type nelements)
-{
-  size_type idx = 0;
-#pragma unroll
-  for (size_type i = strings_per_threadblock / 2; i > 0; i /= 2) {
-    if (idx + i < nelements && offsets[idx + i] <= value) idx += i;
-  }
-  return idx;
-}
-
 template <int strings_per_threadblock, typename StringIterator, typename MapIterator>
 __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
                                               char* out_chars,
@@ -169,8 +154,12 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
        out_ibyte < out_offsets_threadblock[strings_current_threadblock];
        out_ibyte += blockDim.x) {
     // binary search for the string index corresponding to out_ibyte
-    size_type string_idx = binary_search<strings_per_threadblock>(
-      out_offsets_threadblock, out_ibyte, strings_current_threadblock);
+    auto const string_idx_iter =
+      thrust::prev(thrust::upper_bound(thrust::seq,
+                                       out_offsets_threadblock,
+                                       out_offsets_threadblock + strings_current_threadblock,
+                                       out_ibyte));
+    size_type string_idx = thrust::distance(out_offsets_threadblock, string_idx_iter);
 
     // calculate which character to load within the string
     int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx];

From da0b5254e3eccfc9aed0ad2803e1ba19d522c9ee Mon Sep 17 00:00:00 2001
From: Hao Gao <haog@nvidia.com>
Date: Tue, 20 Apr 2021 09:04:26 -0700
Subject: [PATCH 4/7] Use cudf's definition of warp size

---
 cpp/include/cudf/strings/detail/gather.cuh | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 1be15e4db6e..0ec24c4690d 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -18,6 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/span.hpp>
@@ -65,13 +66,12 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
                                                 MapIterator string_indices,
                                                 size_type total_out_strings)
 {
-  constexpr size_t datatype_size    = sizeof(uint4);
-  constexpr size_t threads_per_warp = 32;
+  constexpr size_t datatype_size = sizeof(uint4);
 
   int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
-  int global_warp_id   = global_thread_id / threads_per_warp;
-  int warp_lane        = global_thread_id % threads_per_warp;
-  int nwarps           = gridDim.x * blockDim.x / threads_per_warp;
+  int global_warp_id   = global_thread_id / cudf::detail::warp_size;
+  int warp_lane        = global_thread_id % cudf::detail::warp_size;
+  int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
 
   auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % datatype_size;
   uint4* out_chars_aligned    = reinterpret_cast<uint4*>(out_chars - alignment_offset);
@@ -96,7 +96,7 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
       (out_end + alignment_offset) / datatype_size * datatype_size - alignment_offset;
 
     for (size_type ichar = out_start_aligned + warp_lane * datatype_size; ichar < out_end_aligned;
-         ichar += threads_per_warp * datatype_size) {
+         ichar += cudf::detail::warp_size * datatype_size) {
       *(out_chars_aligned + (ichar + alignment_offset) / datatype_size) =
         load_uint4(in_start + ichar - out_start);
     }

From a30e76eee4f6a05e982fca913dc5c6ff345bedd9 Mon Sep 17 00:00:00 2001
From: Hao Gao <haog@nvidia.com>
Date: Mon, 10 May 2021 18:26:05 -0700
Subject: [PATCH 5/7] Address reviewer's comment

---
 cpp/include/cudf/strings/detail/gather.cuh | 25 ++++++++++++----------
 1 file changed, 14 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 0ec24c4690d..950a66bb0dd 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -35,11 +35,6 @@ namespace cudf {
 namespace strings {
 namespace detail {
 
-// Strategy 1: String-parallel
-// This strategy assigns strings to warps so that each warp can cooperatively copy from the input
-// location of the string to the corresponding output location. Large datatype (uint4) is used for
-// stores. This strategy is best suited for large strings.
-
 // Helper function for loading 16B from a potentially unaligned memory location to registers.
 __forceinline__ __device__ uint4 load_uint4(const char* ptr)
 {
@@ -59,6 +54,10 @@ __forceinline__ __device__ uint4 load_uint4(const char* ptr)
   return regs;
 }
 
+// Strategy 1: String-parallel
+// This strategy assigns strings to warps so that each warp can cooperatively copy from the input
+// location of the string to the corresponding output location. Large datatype (uint4) is used for
+// stores. This strategy is best suited for large strings.
 template <typename StringIterator, typename MapIterator>
 __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
                                                 char* out_chars,
@@ -203,19 +202,23 @@ std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
   auto chars_column  = create_chars_child_column(output_count, chars_bytes, stream, mr);
   auto const d_chars = chars_column->mutable_view().template data<char>();
 
+  constexpr int warps_per_threadblock = 4;
+
   size_type average_string_length = chars_bytes / output_count;
 
   if (average_string_length > 32) {
-    gather_chars_fn_string_parallel<<<min((static_cast<int>(output_count) + 3) / 4, 65536),
-                                      128,
-                                      0,
-                                      stream.value()>>>(
-      strings_begin, d_chars, offsets, map_begin, output_count);
+    constexpr int max_threadblocks = 65536;
+    gather_chars_fn_string_parallel<<<
+      min((static_cast<int>(output_count) + warps_per_threadblock - 1) / warps_per_threadblock,
+          max_threadblocks),
+      warps_per_threadblock * cudf::detail::warp_size,
+      0,
+      stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count);
   } else {
     constexpr int strings_per_threadblock = 32;
     gather_chars_fn_char_parallel<strings_per_threadblock>
       <<<(output_count + strings_per_threadblock - 1) / strings_per_threadblock,
-         128,
+         warps_per_threadblock * cudf::detail::warp_size,
          0,
          stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count);
   }

From 93dd1a3142b1370f3cccce19cc15e52368a442de Mon Sep 17 00:00:00 2001
From: Hao Gao <haog@nvidia.com>
Date: Mon, 10 May 2021 18:50:03 -0700
Subject: [PATCH 6/7] Avoid reading beyond string boundaries

---
 cpp/include/cudf/strings/detail/gather.cuh | 33 +++++++++++++---------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index 950a66bb0dd..a0a40b34819 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -65,14 +65,15 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
                                                 MapIterator string_indices,
                                                 size_type total_out_strings)
 {
-  constexpr size_t datatype_size = sizeof(uint4);
+  constexpr size_t out_datatype_size = sizeof(uint4);
+  constexpr size_t in_datatype_size  = sizeof(uint);
 
   int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x;
   int global_warp_id   = global_thread_id / cudf::detail::warp_size;
   int warp_lane        = global_thread_id % cudf::detail::warp_size;
   int nwarps           = gridDim.x * blockDim.x / cudf::detail::warp_size;
 
-  auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % datatype_size;
+  auto const alignment_offset = reinterpret_cast<std::uintptr_t>(out_chars) % out_datatype_size;
   uint4* out_chars_aligned    = reinterpret_cast<uint4*>(out_chars - alignment_offset);
 
   for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) {
@@ -85,18 +86,23 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
     const char* in_start = strings_begin[string_indices[istring]].data();
 
     // Both `out_start_aligned` and `out_end_aligned` are indices into `out_chars`.
-    // `out_start_aligned` is the first 16B aligned memory location after `out_start`.
-    // `out_end_aligned` is the last 16B aligned memory location before `out_end`. Characters
+    // `out_start_aligned` is the first 16B aligned memory location after `out_start + 4`.
+    // `out_end_aligned` is the last 16B aligned memory location before `out_end - 4`. Characters
     // between `[out_start_aligned, out_end_aligned)` will be copied using uint4.
+    // `out_start + 4` and `out_end - 4` are used instead of `out_start` and `out_end` to avoid
+    // `load_uint4` reading beyond string boundaries.
     int32_t out_start_aligned =
-      (out_start + alignment_offset + datatype_size - 1) / datatype_size * datatype_size -
+      (out_start + in_datatype_size + alignment_offset + out_datatype_size - 1) /
+        out_datatype_size * out_datatype_size -
       alignment_offset;
     int32_t out_end_aligned =
-      (out_end + alignment_offset) / datatype_size * datatype_size - alignment_offset;
+      (out_end - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size -
+      alignment_offset;
 
-    for (size_type ichar = out_start_aligned + warp_lane * datatype_size; ichar < out_end_aligned;
-         ichar += cudf::detail::warp_size * datatype_size) {
-      *(out_chars_aligned + (ichar + alignment_offset) / datatype_size) =
+    for (size_type ichar = out_start_aligned + warp_lane * out_datatype_size;
+         ichar < out_end_aligned;
+         ichar += cudf::detail::warp_size * out_datatype_size) {
+      *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) =
         load_uint4(in_start + ichar - out_start);
     }
 
@@ -104,10 +110,11 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
     // out_end_aligned)`.
     if (out_end_aligned <= out_start_aligned) {
       // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the
-      // entire string. Note that for 16B alignment, the maximum number of characters in this string
-      // is less than 32, so for each thread in the warp, copying one byte is enough.
-      int32_t ichar = out_start + warp_lane;
-      if (ichar < out_end) { out_chars[ichar] = in_start[warp_lane]; }
+      // entire string.
+      for (int32_t ichar = out_start + warp_lane; ichar < out_end;
+           ichar += cudf::detail::warp_size) {
+        out_chars[ichar] = in_start[ichar - out_start];
+      }
     } else {
       // Copy characters in range `[out_start, out_start_aligned)`.
       if (out_start + warp_lane < out_start_aligned) {

From e87da49f5b24310dd6dd40fd041afaffec94ddf5 Mon Sep 17 00:00:00 2001
From: Hao Gao <haog@nvidia.com>
Date: Mon, 24 May 2021 10:29:32 -0700
Subject: [PATCH 7/7] Improve documentation

---
 cpp/include/cudf/strings/detail/gather.cuh | 48 +++++++++++++++++-----
 1 file changed, 37 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
index a0a40b34819..dcd17245ee6 100644
--- a/cpp/include/cudf/strings/detail/gather.cuh
+++ b/cpp/include/cudf/strings/detail/gather.cuh
@@ -54,10 +54,22 @@ __forceinline__ __device__ uint4 load_uint4(const char* ptr)
   return regs;
 }
 
-// Strategy 1: String-parallel
-// This strategy assigns strings to warps so that each warp can cooperatively copy from the input
-// location of the string to the corresponding output location. Large datatype (uint4) is used for
-// stores. This strategy is best suited for large strings.
+/**
+ * @brief Gather characters from the input iterator, with string parallel strategy.
+ *
+ * This strategy assigns strings to warps so that each warp can cooperatively copy from the input
+ * location of the string to the corresponding output location. Large datatype (uint4) is used for
+ * stores. This strategy is best suited for large strings.
+ *
+ * @tparam StringIterator Iterator should produce `string_view` objects.
+ * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
+ *
+ * @param strings_begin Start of the iterator to retrieve `string_view` instances.
+ * @param out_chars Output buffer for gathered characters.
+ * @param out_offsets The offset values associated with the output buffer.
+ * @param string_indices Start of index iterator.
+ * @param total_out_strings Number of output strings to be gathered.
+ */
 template <typename StringIterator, typename MapIterator>
 __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
                                                 char* out_chars,
@@ -127,11 +139,22 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
   }
 }
 
-// Strategy 2: Char-parallel
-// This strategy assigns characters to threads, and uses binary search for getting the string
-// index. To improve the binary search performance, fixed number of strings per threadblock is
-// used. This strategy is best suited for small strings.
-
+/**
+ * @brief Gather characters from the input iterator, with char parallel strategy.
+ *
+ * This strategy assigns characters to threads, and uses binary search for getting the string
+ * index. To improve the binary search performance, fixed number of strings per threadblock is
+ * used. This strategy is best suited for small strings.
+ *
+ * @tparam StringIterator Iterator should produce `string_view` objects.
+ * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
+ *
+ * @param strings_begin Start of the iterator to retrieve `string_view` instances.
+ * @param out_chars Output buffer for gathered characters.
+ * @param out_offsets The offset values associated with the output buffer.
+ * @param string_indices Start of index iterator.
+ * @param total_out_strings Number of output strings to be gathered.
+ */
 template <int strings_per_threadblock, typename StringIterator, typename MapIterator>
 __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
                                               char* out_chars,
@@ -185,7 +208,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
  * @tparam StringIterator Iterator should produce `string_view` objects.
  * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`.
  *
- * @param strings_begin Start of the iterator to retrieve `string_view` instances
+ * @param strings_begin Start of the iterator to retrieve `string_view` instances.
  * @param map_begin Start of index iterator.
  * @param map_end End of index iterator.
  * @param offsets The offset values to be associated with the output chars column.
@@ -210,10 +233,13 @@ std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
   auto const d_chars = chars_column->mutable_view().template data<char>();
 
   constexpr int warps_per_threadblock = 4;
+  // String parallel strategy will be used if average string length is above this threshold.
+  // Otherwise, char parallel strategy will be used.
+  constexpr size_type string_parallel_threshold = 32;
 
   size_type average_string_length = chars_bytes / output_count;
 
-  if (average_string_length > 32) {
+  if (average_string_length > string_parallel_threshold) {
     constexpr int max_threadblocks = 65536;
     gather_chars_fn_string_parallel<<<
       min((static_cast<int>(output_count) + warps_per_threadblock - 1) / warps_per_threadblock,