Large strings support in cudf::merge (#15374)

Enable large strings support in `cudf::merge`. Simplifies the strings specialization to use the gather-based strings factory function which is already optimized for long strings and is now appropriately enabled for large strings. Also moved source from `include/cudf/strings/detail/merge.cuh` to `src/strings/merge/merge.cu` file since the template implemenation was not actually required. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Bradley Dice (https://github.com/bdice) - https://github.com/nvdbaranec - Vukasin Milovanovic (https://github.com/vuule) URL: #15374
rapidsai · Apr 22, 2024 · a2c81e7 · a2c81e7
1 parent 9fa247f
commit a2c81e7
Show file tree

Hide file tree

Showing 10 changed files with 267 additions and 156 deletions.
diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -586,6 +586,7 @@ add_library(
   src/strings/filling/fill.cu
   src/strings/filter_chars.cu
   src/strings/like.cu
+  src/strings/merge/merge.cu
   src/strings/padding.cu
   src/strings/regex/regcomp.cpp
   src/strings/regex/regexec.cpp

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
@@ -236,7 +236,9 @@ ConfigureNVBench(HASHING_NVBENCH hashing/hash.cpp)
 # ##################################################################################################
 # * merge benchmark -------------------------------------------------------------------------------
 ConfigureBench(MERGE_BENCH merge/merge.cpp)
-ConfigureNVBench(MERGE_NVBENCH merge/merge_structs.cpp merge/merge_lists.cpp)
+ConfigureNVBench(
+  MERGE_NVBENCH merge/merge_lists.cpp merge/merge_structs.cpp merge/merge_strings.cpp
+)
 
 # ##################################################################################################
 # * null_mask benchmark ---------------------------------------------------------------------------

diff --git a/cpp/benchmarks/merge/merge_strings.cpp b/cpp/benchmarks/merge/merge_strings.cpp
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+
+#include <cudf/merge.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+void nvbench_merge_strings(nvbench::state& state)
+{
+  auto stream = cudf::get_default_stream();
+
+  auto const num_rows  = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const row_width = static_cast<cudf::size_type>(state.get_int64("row_width"));
+  if (static_cast<std::size_t>(2 * num_rows) * static_cast<std::size_t>(row_width) >=
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
+    state.skip("Skip benchmarks greater than size_type limit");
+  }
+
+  data_profile const table_profile =
+    data_profile_builder()
+      .distribution(cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width)
+      .no_validity();
+  auto const source_tables = create_random_table(
+    {cudf::type_id::STRING, cudf::type_id::STRING}, row_count{num_rows}, table_profile);
+
+  auto const sorted_lhs = cudf::sort(cudf::table_view({source_tables->view().column(0)}));
+  auto const sorted_rhs = cudf::sort(cudf::table_view({source_tables->view().column(1)}));
+  auto const lhs        = sorted_lhs->view().column(0);
+  auto const rhs        = sorted_rhs->view().column(0);
+
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value()));
+  auto chars_size = cudf::strings_column_view(lhs).chars_size(stream) +
+                    cudf::strings_column_view(rhs).chars_size(stream);
+  state.add_global_memory_reads<nvbench::int8_t>(chars_size);   // all bytes are read
+  state.add_global_memory_writes<nvbench::int8_t>(chars_size);  // all bytes are written
+
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    [[maybe_unused]] auto result = cudf::merge(
+      {cudf::table_view({lhs}), cudf::table_view({rhs})}, {0}, {cudf::order::ASCENDING});
+  });
+}
+
+NVBENCH_BENCH(nvbench_merge_strings)
+  .set_name("merge_strings")
+  .add_int64_axis("row_width", {32, 64, 128, 256, 512, 1024, 2048, 4096})
+  .add_int64_axis("num_rows", {4096, 32768, 262144, 2097152, 16777216});
diff --git a/cpp/include/cudf/strings/detail/merge.cuh b/cpp/include/cudf/strings/detail/merge.cuh
diff --git a/cpp/include/cudf/strings/detail/merge.hpp b/cpp/include/cudf/strings/detail/merge.hpp
@@ -0,0 +1,41 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/column/column.hpp>
+#include <cudf/detail/merge.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf ::strings ::detail {
+/**
+ * @brief Merges two strings columns
+ *
+ * @param lhs First column
+ * @param rhs Second column
+ * @param row_order Indices for each column
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
+ */
+std::unique_ptr<column> merge(strings_column_view const& lhs,
+                              strings_column_view const& rhs,
+                              cudf::detail::index_vector const& row_order,
+                              rmm::cuda_stream_view stream,
+                              rmm::device_async_resource_ref mr);
+
+}  // namespace cudf::strings::detail
diff --git a/cpp/include/cudf/strings/detail/strings_children.cuh b/cpp/include/cudf/strings/detail/strings_children.cuh
@@ -164,22 +164,22 @@ std::pair<std::unique_ptr<column>, int64_t> make_offsets_child_column(
     });
   auto input_itr = cudf::detail::make_counting_transform_iterator(0, map_fn);
   // Use the sizes-to-offsets iterator to compute the total number of elements
-  auto const total_elements =
+  auto const total_bytes =
     cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets, stream);
 
-  // TODO: replace exception with if-statement when enabling creating INT64 offsets
-  CUDF_EXPECTS(total_elements <= size_type_max,
-               "Size of output exceeds the character size limit",
+  auto const threshold = get_offset64_threshold();
+  CUDF_EXPECTS(is_large_strings_enabled() || (total_bytes < threshold),
+               "Size of output exceeds the column size limit",
                std::overflow_error);
-  // if (total_elements >= get_offset64_threshold()) {
-  //   // recompute as int64 offsets when above the threshold
-  //   offsets_column = make_numeric_column(
-  //     data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  //   auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
-  //   sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
-  // }
-
-  return std::pair(std::move(offsets_column), total_elements);
+  if (total_bytes >= get_offset64_threshold()) {
+    // recompute as int64 offsets when above the threshold
+    offsets_column = make_numeric_column(
+      data_type{type_id::INT64}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
+    auto d_offsets64 = offsets_column->mutable_view().template data<int64_t>();
+    cudf::detail::sizes_to_offsets(input_itr, input_itr + strings_count + 1, d_offsets64, stream);
+  }
+
+  return std::pair(std::move(offsets_column), total_bytes);
 }
 
 }  // namespace detail

diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -86,9 +86,10 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
       return (item.first != nullptr ? static_cast<size_type>(item.second) : size_type{0});
     });
   auto offsets_transformer_itr = thrust::make_transform_iterator(begin, offsets_transformer);
-  auto [offsets_column, bytes] = cudf::detail::make_offsets_child_column(
+  auto [offsets_column, bytes] = cudf::strings::detail::make_offsets_child_column(
     offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr);
-  auto offsets_view = offsets_column->view();
+  auto const d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets_column->view());
 
   // create null mask
   auto validator = [] __device__(string_index_pair const item) { return item.first != nullptr; };
@@ -98,11 +99,10 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
     (null_count > 0) ? std::move(new_nulls.first) : rmm::device_buffer{0, stream, mr};
 
   // build chars column
-  auto chars_data = [offsets_view, bytes = bytes, begin, strings_count, null_count, stream, mr] {
+  auto chars_data = [d_offsets, bytes = bytes, begin, strings_count, null_count, stream, mr] {
     auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
     // use a character-parallel kernel for long string lengths
     if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
-      auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets_view);
       auto const str_begin = thrust::make_transform_iterator(
         begin, cuda::proclaim_return_type<string_view>([] __device__(auto ip) {
           return string_view{ip.first, ip.second};
@@ -121,12 +121,11 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
       auto d_chars    = chars_data.data();
       auto copy_chars = [d_chars] __device__(auto item) {
         string_index_pair const str = thrust::get<0>(item);
-        size_type const offset      = thrust::get<1>(item);
+        int64_t const offset        = thrust::get<1>(item);
         if (str.first != nullptr) memcpy(d_chars + offset, str.first, str.second);
       };
       thrust::for_each_n(rmm::exec_policy(stream),
-                         thrust::make_zip_iterator(
-                           thrust::make_tuple(begin, offsets_view.template begin<size_type>())),
+                         thrust::make_zip_iterator(thrust::make_tuple(begin, d_offsets)),
                          strings_count,
                          copy_chars);
       return chars_data;
@@ -168,21 +167,15 @@ std::unique_ptr<column> make_strings_column(CharIterator chars_begin,
 {
   CUDF_FUNC_RANGE();
   size_type strings_count = thrust::distance(offsets_begin, offsets_end) - 1;
-  size_type bytes         = std::distance(chars_begin, chars_end) * sizeof(char);
-  if (strings_count == 0) return make_empty_column(type_id::STRING);
+  if (strings_count == 0) { return make_empty_column(type_id::STRING); }
 
+  int64_t const bytes = std::distance(chars_begin, chars_end) * sizeof(char);
   CUDF_EXPECTS(bytes >= 0, "invalid offsets data");
 
   // build offsets column -- this is the number of strings + 1
-  auto offsets_column = make_numeric_column(
-    data_type{type_to_id<size_type>()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr);
-  auto offsets_view = offsets_column->mutable_view();
-  thrust::transform(rmm::exec_policy(stream),
-                    offsets_begin,
-                    offsets_end,
-                    offsets_view.data<int32_t>(),
-                    cuda::proclaim_return_type<int32_t>(
-                      [] __device__(auto offset) { return static_cast<int32_t>(offset); }));
+  auto [offsets_column, computed_bytes] =
+    cudf::strings::detail::make_offsets_child_column(offsets_begin, offsets_end, stream, mr);
+  CUDF_EXPECTS(bytes == computed_bytes, "unexpected byte count");
 
   // build chars column
   rmm::device_uvector<char> chars_data(bytes, stream, mr);

diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
@@ -27,7 +27,7 @@
 #include <cudf/dictionary/detail/update_keys.hpp>
 #include <cudf/lists/detail/concatenate.hpp>
 #include <cudf/lists/lists_column_view.hpp>
-#include <cudf/strings/detail/merge.cuh>
+#include <cudf/strings/detail/merge.hpp>
 #include <cudf/structs/structs_column_view.hpp>
 #include <cudf/table/experimental/row_operators.cuh>
 #include <cudf/table/table.hpp>
@@ -434,18 +434,8 @@ std::unique_ptr<column> column_merger::operator()<cudf::string_view>(
   rmm::cuda_stream_view stream,
   rmm::device_async_resource_ref mr) const
 {
-  auto column = strings::detail::merge<index_type>(strings_column_view(lcol),
-                                                   strings_column_view(rcol),
-                                                   row_order_.begin(),
-                                                   row_order_.end(),
-                                                   stream,
-                                                   mr);
-  if (lcol.has_nulls() || rcol.has_nulls()) {
-    auto merged_view = column->mutable_view();
-    materialize_bitmask(
-      lcol, rcol, merged_view.null_mask(), merged_view.size(), row_order_.data(), stream);
-  }
-  return column;
+  return strings::detail::merge(
+    strings_column_view(lcol), strings_column_view(rcol), row_order_, stream, mr);
 }
 
 // specialization for dictionary