Performance improvement in cudf::strings::join_strings for long strin…

…gs (#13283) Improves performance for longer strings with `cudf::strings::join_strings()` API. The new implementation is inspired by recent changes in json/writer_impl.cu which uses the `make_strings_children` factor that accepts an iterator of string pointers to gather the results into a single column of data but does lose some performance for smallish-strings (less than 32 bytes). So the original code is refactored to work on smaller strings. Also, a benchmark implementation is added in this PR to test against various sized input columns. For longer strings, the performance improvement is up to 100x for very long strings (>512 bytes). Reference #13048 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Nghia Truong (https://github.com/ttnghia) URL: #13283
rapidsai · May 15, 2023 · 4c456cb · 4c456cb
1 parent 23a77a6
commit 4c456cb
Show file tree

Hide file tree

Showing 4 changed files with 191 additions and 73 deletions.
diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -300,8 +300,8 @@ ConfigureBench(
 )
 
 ConfigureNVBench(
-  STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/lengths.cpp string/like.cpp
-  string/reverse.cpp
+  STRINGS_NVBENCH string/case.cpp string/char_types.cpp string/join_strings.cpp string/lengths.cpp
+  string/like.cpp string/reverse.cpp
 )
 
 # ##################################################################################################

diff --git a/cpp/benchmarks/string/join_strings.cpp b/cpp/benchmarks/string/join_strings.cpp
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+static void bench_join(nvbench::state& state)
+{
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+
+  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const table_profile = data_profile_builder().distribution(
+    cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
+  auto const table =
+    create_random_table({cudf::type_id::STRING}, row_count{num_rows}, table_profile);
+  cudf::strings_column_view input(table->view().column(0));
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  // gather some throughput statistics as well
+  auto const chars_size = input.chars_size();
+  state.add_element_count(chars_size, "chars_size");            // number of bytes;
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read;
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written
+
+  std::string separator(":");
+  std::string narep("null");
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto result = cudf::strings::join_strings(input, separator, narep);
+  });
+}
+
+NVBENCH_BENCH(bench_join)
+  .set_name("strings_join")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
diff --git a/cpp/src/strings/combine/join.cu b/cpp/src/strings/combine/join.cu
@@ -18,11 +18,12 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/scalar/scalar_device_view.cuh>
 #include <cudf/strings/combine.hpp>
 #include <cudf/strings/detail/combine.hpp>
+#include <cudf/strings/detail/strings_column_factories.cuh>
 #include <cudf/strings/detail/utilities.cuh>
-#include <cudf/strings/detail/utilities.hpp>
 #include <cudf/strings/string_view.cuh>
 #include <cudf/strings/strings_column_view.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -33,95 +34,141 @@
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/for_each.h>
-#include <thrust/functional.h>
 #include <thrust/iterator/counting_iterator.h>
-#include <thrust/transform_scan.h>
 
 namespace cudf {
 namespace strings {
 namespace detail {
 
-std::unique_ptr<column> join_strings(strings_column_view const& strings,
+namespace {
+
+/**
+ * @brief Threshold to decide on using string-per-thread vs the string-gather
+ * approaches.
+ *
+ * If the average byte length of a string in a column exceeds this value then
+ * the string-gather function is used.
+ * Otherwise, a regular string-parallel function is used.
+ *
+ * This value was found using the strings_join benchmark results.
+ */
+constexpr size_type AVG_CHAR_BYTES_THRESHOLD = 32;
+
+struct join_base_fn {
+  column_device_view const d_strings;
+  string_view d_separator;
+  string_scalar_device_view d_narep;
+
+  __device__ thrust::pair<string_view, string_view> process_string(size_type idx) const
+  {
+    string_view d_str{};
+    string_view d_sep = (idx + 1 < d_strings.size()) ? d_separator : d_str;
+    if (d_strings.is_null(idx)) {
+      if (d_narep.is_valid()) {
+        d_str = d_narep.value();
+      } else {
+        // if null and no narep, don't output a separator either
+        d_sep = d_str;
+      }
+    } else {
+      d_str = d_strings.element<string_view>(idx);
+    }
+    return {d_str, d_sep};
+  }
+};
+
+/**
+ * @brief Compute output sizes and write output bytes
+ *
+ * This functor is suitable for make_strings_children
+ */
+struct join_fn : public join_base_fn {
+  size_type* d_offsets{};
+  char* d_chars{};
+
+  join_fn(column_device_view const d_strings,
+          string_view d_separator,
+          string_scalar_device_view d_narep)
+    : join_base_fn{d_strings, d_separator, d_narep}
+  {
+  }
+
+  __device__ void operator()(size_type idx) const
+  {
+    auto const [d_str, d_sep] = process_string(idx);
+
+    char* d_buffer  = d_chars ? d_chars + d_offsets[idx] : nullptr;
+    size_type bytes = 0;
+    if (d_buffer) {
+      d_buffer = detail::copy_string(d_buffer, d_str);
+      d_buffer = detail::copy_string(d_buffer, d_sep);
+    } else {
+      bytes += d_str.size_bytes() + d_sep.size_bytes();
+    }
+    if (!d_chars) { d_offsets[idx] = bytes; }
+  }
+};
+
+struct join_gather_fn : public join_base_fn {
+  join_gather_fn(column_device_view const d_strings,
+                 string_view d_separator,
+                 string_scalar_device_view d_narep)
+    : join_base_fn{d_strings, d_separator, d_narep}
+  {
+  }
+
+  __device__ string_index_pair operator()(size_type idx) const
+  {
+    auto const [d_str, d_sep] = process_string(idx / 2);
+    // every other string is the separator
+    return idx % 2 ? string_index_pair{d_sep.data(), d_sep.size_bytes()}
+                   : string_index_pair{d_str.data(), d_str.size_bytes()};
+  }
+};
+}  // namespace
+
+std::unique_ptr<column> join_strings(strings_column_view const& input,
                                      string_scalar const& separator,
                                      string_scalar const& narep,
                                      rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
-  auto strings_count = strings.size();
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
+  if (input.is_empty()) { return make_empty_column(type_id::STRING); }
 
   CUDF_EXPECTS(separator.is_valid(stream), "Parameter separator must be a valid string_scalar");
 
   string_view d_separator(separator.data(), separator.size());
   auto d_narep = get_scalar_device_view(const_cast<string_scalar&>(narep));
 
-  auto strings_column = column_device_view::create(strings.parent(), stream);
-  auto d_strings      = *strings_column;
-
-  // create an offsets array for building the output memory layout
-  rmm::device_uvector<size_type> output_offsets(strings_count + 1, stream);
-  auto d_output_offsets = output_offsets.data();
-  // using inclusive-scan to compute last entry which is the total size
-  thrust::transform_inclusive_scan(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    thrust::make_counting_iterator<size_type>(strings_count),
-    d_output_offsets + 1,
-    [d_strings, d_separator, d_narep] __device__(size_type idx) {
-      size_type bytes = 0;
-      if (d_strings.is_null(idx)) {
-        if (!d_narep.is_valid()) return 0;  // skip nulls
-        bytes += d_narep.size();
-      } else
-        bytes += d_strings.element<string_view>(idx).size_bytes();
-      if ((idx + 1) < d_strings.size()) bytes += d_separator.size_bytes();
-      return bytes;
-    },
-    thrust::plus<size_type>());
-
-  output_offsets.set_element_to_zero_async(0, stream);
-  // total size is the last entry
-  size_type const bytes = output_offsets.back_element(stream);
-
-  // build offsets column (only 1 string so 2 offset entries)
-  auto offsets_column =
-    make_numeric_column(data_type{type_id::INT32}, 2, mask_state::UNALLOCATED, stream, mr);
-  auto offsets_view = offsets_column->mutable_view();
-  // set the first entry to 0 and the last entry to bytes
-  int32_t new_offsets[] = {0, static_cast<int32_t>(bytes)};
-  CUDF_CUDA_TRY(cudaMemcpyAsync(offsets_view.data<int32_t>(),
-                                new_offsets,
-                                sizeof(new_offsets),
-                                cudaMemcpyDefault,
-                                stream.value()));
-
-  // build null mask
-  // only one entry so it is either all valid or all null
+  auto d_strings = column_device_view::create(input.parent(), stream);
+
+  auto chars_column = [&] {
+    // build the strings column and commandeer the chars column
+    if ((input.size() == input.null_count()) ||
+        ((input.chars_size() / (input.size() - input.null_count())) <= AVG_CHAR_BYTES_THRESHOLD)) {
+      return std::get<1>(
+        make_strings_children(join_fn{*d_strings, d_separator, d_narep}, input.size(), stream, mr));
+    }
+    // dynamically feeds index pairs to build the output
+    auto indices = cudf::detail::make_counting_transform_iterator(
+      0, join_gather_fn{*d_strings, d_separator, d_narep});
+    auto joined_col = make_strings_column(indices, indices + (input.size() * 2), stream, mr);
+    return std::move(joined_col->release().children.back());
+  }();
+
+  // build the offsets: single string output has offsets [0,chars-size]
+  auto offsets = cudf::detail::make_device_uvector_async(
+    std::vector<size_type>({0, chars_column->size()}), stream, mr);
+  auto offsets_column = std::make_unique<column>(std::move(offsets), rmm::device_buffer{}, 0);
+
+  // build the null mask: only one output row so it is either all-valid or all-null
   auto const null_count =
-    static_cast<size_type>(strings.null_count() == strings_count && !narep.is_valid(stream));
-  auto null_mask    = null_count
-                        ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr)
-                        : rmm::device_buffer{0, stream, mr};
-  auto chars_column = create_chars_child_column(bytes, stream, mr);
-  auto d_chars      = chars_column->mutable_view().data<char>();
-  thrust::for_each_n(
-    rmm::exec_policy(stream),
-    thrust::make_counting_iterator<size_type>(0),
-    strings_count,
-    [d_strings, d_separator, d_narep, d_output_offsets, d_chars] __device__(size_type idx) {
-      size_type offset = d_output_offsets[idx];
-      char* d_buffer   = d_chars + offset;
-      if (d_strings.is_null(idx)) {
-        if (!d_narep.is_valid())
-          return;  // do not write to buffer if element is null (including separator)
-        d_buffer = detail::copy_string(d_buffer, d_narep.value());
-      } else {
-        string_view d_str = d_strings.element<string_view>(idx);
-        d_buffer          = detail::copy_string(d_buffer, d_str);
-      }
-      if ((idx + 1) < d_strings.size()) d_buffer = detail::copy_string(d_buffer, d_separator);
-    });
+    static_cast<size_type>(input.null_count() == input.size() && !narep.is_valid(stream));
+  auto null_mask = null_count
+                     ? cudf::detail::create_null_mask(1, cudf::mask_state::ALL_NULL, stream, mr)
+                     : rmm::device_buffer{0, stream, mr};
 
+  // perhaps this return a string_scalar instead of a single-row column
   return make_strings_column(
     1, std::move(offsets_column), std::move(chars_column), null_count, std::move(null_mask));
 }

diff --git a/cpp/tests/strings/combine/join_strings_tests.cpp b/cpp/tests/strings/combine/join_strings_tests.cpp
@@ -58,6 +58,19 @@ TEST_F(JoinStringsTest, Join)
   }
 }
 
+TEST_F(JoinStringsTest, JoinLongStrings)
+{
+  std::string data(200, '0');
+  cudf::test::strings_column_wrapper input({data, data, data, data});
+
+  auto results =
+    cudf::strings::join_strings(cudf::strings_column_view(input), cudf::string_scalar("+"));
+
+  auto expected_data = data + "+" + data + "+" + data + "+" + data;
+  cudf::test::strings_column_wrapper expected({expected_data});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+}
+
 TEST_F(JoinStringsTest, JoinZeroSizeStringsColumn)
 {
   cudf::column_view zero_size_strings_column(