rapidsai · rapids-bot · Mar 1, 2024 · Jan 29, 2024 · Jan 29, 2024 · Jan 29, 2024
@@ -14,6 +14,7 @@
  * limitations under the License.
  */
 
+#include "delta_binary.cuh"
 #include "page_decode.cuh"
 
 #include <io/utilities/column_buffer.hpp>
@@ -40,26 +41,136 @@ constexpr int rolling_buf_size = LEVEL_DECODE_BUF_SIZE;
 using unused_state_buf = page_state_buffers_s<0, 0, 0>;
 
 /**
+ * @brief Calculate string bytes for DELTA_LENGTH_BYTE_ARRAY encoded pages
+ *
+ * Result is valid only on thread 0.
+ *
+ * Only thread 0 does any work: it uses a function-local `delta_binary_decoder` to locate the
+ * end of the delta-binary block that begins at `s->data_start`, and reports the number of
+ * bytes between that position and `s->data_end`. Every other thread returns 0 immediately.
+ *
+ * NOTE(review): presumably the delta-binary block holds the encoded string lengths and the
+ * character data follows it directly (inferred from the variable names and in-body comments);
+ * confirm against `delta_binary_decoder::find_end_of_block`.
+ * NOTE(review): the pointer distance is narrowed to `size_type` with a static_cast; no range
+ * check is performed here.
+ *
+ * @param s The local page info
+ * @param t Thread index
+ * @return Bytes between the end of the delta-binary block and `s->data_end` on thread 0;
+ * 0 on all other threads
+ */
+__device__ size_type gpuDeltaLengthPageStringSize(page_state_s* s, int t)
+{
+  if (t == 0) {
+    // find the beginning of char data
+    delta_binary_decoder string_lengths;
+    auto const* string_start = string_lengths.find_end_of_block(s->data_start, s->data_end);
+    // distance is size of string data
+    return static_cast<size_type>(std::distance(string_start, s->data_end));
+  }
+  return 0;
+}
+
+/**
+ * @brief Calculate string bytes for DELTA_BYTE_ARRAY encoded pages
+ *
+ * This expects all threads in the thread block (preprocess_block_size).
+ *
+ * The function contains two unconditional `__syncthreads()` calls followed by a block-wide
+ * reduction, so every thread of the block must call it, even though only the first two warps
+ * traverse data.
+ *
+ * Outline:
+ *  - Thread 0 calls `prefixes.find_end_of_block()` on the page data and initializes `suffixes`
+ *    at the pointer it returns. (Presumably `find_end_of_block` also initializes `prefixes`,
+ *    since its fields are read afterwards; confirm in delta_binary.cuh.)
+ *  - Threads [0, warp_size) walk `prefixes`; threads [warp_size, 2 * warp_size) walk
+ *    `suffixes`. All remaining threads get a null decoder pointer and contribute 0.
+ *  - Within each of the two warps, every lane accumulates the decoded values it owns for each
+ *    mini-block, the per-lane sums are combined with `cub::WarpReduce`, and lane 0 adds the
+ *    warp sum to the first value (read from `last_value`).
+ *  - The two lane-0 totals are combined with `single_lane_block_sum_reduce`.
+ *
+ * Shared memory used: two `WarpReduce::TempStorage` slots (one per participating warp) and the
+ * two `delta_binary_decoder` instances.
+ *
+ * NOTE(review): the mini-block loop strides by the literal 32 although `warp_size` is in scope;
+ * presumably the two are equal.
+ * NOTE(review): there is no `__syncwarp()` between the lanes' reads of the decoder state in the
+ * summing loop and lane 0's call to `setup_next_mini_block(true)`; confirm that this ordering
+ * is safe under independent thread scheduling.
+ * NOTE(review): the accumulated `size_t` total is narrowed to `size_type` without a range
+ * check.
+ *
+ * @param s The local page info
+ * @param t Thread index
+ * @return Sum of the values of both delta-binary blocks, valid only on thread 0
+ */
+__device__ size_type gpuDeltaPageStringSize(page_state_s* s, int t)
+{
+  using cudf::detail::warp_size;
+  using WarpReduce = cub::WarpReduce<uleb128_t>;
+  __shared__ typename WarpReduce::TempStorage temp_storage[2];  // one slot per summing warp
+
+  __shared__ __align__(16) delta_binary_decoder prefixes;
+  __shared__ __align__(16) delta_binary_decoder suffixes;
+
+  int const lane_id = t % warp_size;
+  int const warp_id = t / warp_size;
+
+  if (t == 0) {
+    auto const* suffix_start = prefixes.find_end_of_block(s->data_start, s->data_end);
+    suffixes.init_binary_block(suffix_start, s->data_end);
+  }
+  __syncthreads();  // publish thread 0's decoder setup to the rest of the block
+
+  // two warps will traverse the prefixes and suffixes and sum them up
+  auto const db = t < warp_size ? &prefixes : t < 2 * warp_size ? &suffixes : nullptr;
+
+  size_t total_bytes = 0;  // stays 0 on every thread except lane 0 of the first two warps
+  if (db != nullptr) {
+    // initialize with first value (which is stored in last_value)
+    if (lane_id == 0) { total_bytes = db->last_value; }
+
+    uleb128_t lane_sum = 0;
+    while (db->current_value_idx < db->num_encoded_values(true)) {
+      // calculate values for current mini-block
+      db->calc_mini_block_values(lane_id);
+
+      // get per lane sum for mini-block
+      for (uint32_t i = 0; i < db->values_per_mb; i += 32) {  // NOTE(review): 32 presumably == warp_size
+        uint32_t const idx = db->current_value_idx + i + lane_id;
+        if (idx < db->value_count) {
+          lane_sum += db->value[rolling_index<delta_rolling_buf_size>(idx)];
+        }
+      }
+
+      if (lane_id == 0) { db->setup_next_mini_block(true); }  // only lane 0 advances the decoder
+      __syncwarp();  // rest of the warp waits here before re-testing the loop condition
+    }
+
+    // get sum for warp.
+    // note: warp_sum will only be valid on lane 0.
+    auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum);
+
+    if (lane_id == 0) { total_bytes += warp_sum; }
+  }
+  __syncthreads();  // reached by all threads, including those with db == nullptr
+
+  // now sum up total_bytes from the two warps. result is only valid on thread 0.
+  auto const final_bytes =
+    cudf::detail::single_lane_block_sum_reduce<preprocess_block_size, 0>(total_bytes);
+
+  return static_cast<size_type>(final_bytes);
+}
+
+/**
+ * @brief Calculate the number of string bytes in the page.
  *
  * This function expects the dictionary position to be at 0 and will traverse
- * the entire thing.
+ * the entire thing (for plain and dictionary encoding).
  *
- * Operates on a single warp only. Expects t < 32
+ * This expects all threads in the thread block (preprocess_block_size). Result is only
+ * valid on thread 0.
  *
  * @param s The local page info
  * @param t Thread index
  */
 __device__ size_type gpuDecodeTotalPageStringSize(page_state_s* s, int t)
 {
+  using cudf::detail::warp_size;
   size_type target_pos = s->num_input_values;
   size_type str_len    = 0;
-  if (s->dict_base) {
-    auto const [new_target_pos, len] =
-      gpuDecodeDictionaryIndices<true, unused_state_buf>(s, nullptr, target_pos, t);
-    target_pos = new_target_pos;
-    str_len    = len;
-  } else if ((s->col.data_type & 7) == BYTE_ARRAY) {
-    str_len = gpuInitStringDescriptors<true, unused_state_buf>(s, nullptr, target_pos, t);
+  switch (s->page.encoding) {
+    case Encoding::PLAIN_DICTIONARY:
+    case Encoding::RLE_DICTIONARY:
+      // TODO: make this block-based instead of just 1 warp
+      if (t < warp_size && s->dict_base) {
+        auto const [new_target_pos, len] =
+          gpuDecodeDictionaryIndices<true, unused_state_buf>(s, nullptr, target_pos, t);
+        target_pos = new_target_pos;
+        str_len    = len;
+      }
+      break;
+
+    case Encoding::PLAIN:
+      // TODO: since this is really just an estimate, we could just return
+      // s->dict_size (overestimate) or
+      // s->dict_size - sizeof(int) * s->page.num_input_values (underestimate)
+      if (t < warp_size && (s->col.data_type & 7) == BYTE_ARRAY) {
+        str_len = gpuInitStringDescriptors<true, unused_state_buf>(s, nullptr, target_pos, t);
+      }
+      break;
+
+    case Encoding::DELTA_LENGTH_BYTE_ARRAY: str_len = gpuDeltaLengthPageStringSize(s, t); break;
+
+    case Encoding::DELTA_BYTE_ARRAY: str_len = gpuDeltaPageStringSize(s, t); break;
+
+    default:
+      // not a valid string encoding, so just return 0
+      break;
   }
   if (!t) { s->dict_pos = target_pos; }
   return str_len;
@@ -348,9 +459,9 @@ CUDF_KERNEL void __launch_bounds__(preprocess_block_size)
   }
 
   // retrieve total string size.
-  // TODO: make this block-based instead of just 1 warp
   if (compute_string_sizes) {
-    if (t < 32) { s->page.str_bytes = gpuDecodeTotalPageStringSize(s, t); }
+    auto const str_bytes = gpuDecodeTotalPageStringSize(s, t);
+    if (t == 0) { s->page.str_bytes = str_bytes; }
   }
 
   // update output results:

@@ -1292,6 +1292,7 @@ inline __device__ bool setupLocalPageInfo(page_state_s* const s,
       s->dict_bits = 0;
       s->dict_base = nullptr;
       s->dict_size = 0;
+      s->dict_val  = 0;
       // NOTE:  if additional encodings are supported in the future, modifications must
       // be made to is_supported_encoding() in reader_impl_preprocess.cu
       switch (s->page.encoding) {

@@ -546,6 +546,7 @@ __device__ thrust::pair<size_t, size_t> totalDeltaByteArraySize(uint8_t const* d
     // get sum for warp.
     // note: warp_sum will only be valid on lane 0.
     auto const warp_sum = WarpReduce(temp_storage[warp_id]).Sum(lane_sum);
+    __syncwarp();
     auto const warp_max = WarpReduce(temp_storage[warp_id]).Reduce(lane_max, cub::Max());
 
     if (lane_id == 0) {