rapidsai · rapids-bot · May 15, 2024 · Apr 30, 2024 · Apr 30, 2024 · Apr 30, 2024
diff --git a/cpp/include/cudf/detail/utilities/base64_utils.hpp b/cpp/include/cudf/detail/utilities/base64_utils.hpp
@@ -0,0 +1,159 @@
+/*
+   base64_utils.cpp and base64_utils.hpp
+
+   base64 encoding and decoding with C++.
+
+   Version: 1.01.00
+
+   Copyright (C) 2004-2017 René Nyffenegger
+
+   This source code is provided 'as-is', without any express or implied
+   warranty. In no event will the author be held liable for any damages
+   arising from the use of this software.
+
+   Permission is granted to anyone to use this software for any purpose,
+   including commercial applications, and to alter it and redistribute it
+   freely, subject to the following restrictions:
+
+   1. The origin of this source code must not be misrepresented; you must not
+      claim that you wrote the original source code. If you use this source code
+      in a product, an acknowledgment in the product documentation would be
+      appreciated but is not required.
+
+   2. Altered source versions must be plainly marked as such, and must not be
+      misrepresented as being the original source code.
+
+   3. This notice may not be removed or altered from any source distribution.
+
+   René Nyffenegger rene.nyffenegger@adp-gmbh.ch
+
+*/
+
+/**
+ * @file base64_utils.hpp
+ * @brief base64 string encoding/decoding utilities and implementation
+ */
+
+#pragma once
+
+// altered: including required std headers
+#include <array>
+#include <iostream>
+#include <string>
+#include <vector>
+
+// altered: merged base64.h and base64.cpp into one file.
+// altered: applying clang-format for libcudf on this file.
+
+// altered: use cudf namespaces
+namespace cudf::detail {
+
+static const std::string base64_chars =
+  "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+  "abcdefghijklmnopqrstuvwxyz"
+  "0123456789+/";
+
+static inline auto is_base64(unsigned char c) { return (isalnum(c) or (c == '+') or (c == '/')); }
+
+// merging the encoder wrapper into the single function
+std::string base64_encode(std::string_view string_to_encode)
+{
+  // get bytes to encode and length
+  auto bytes_to_encode = reinterpret_cast<const unsigned char*>(string_to_encode.data());
+  auto input_length    = string_to_encode.size();
+
+  std::string encoded;
+  std::array<unsigned char, 4> char_array_4;
+  std::array<unsigned char, 3> char_array_3;
+  int i = 0;
+  int j = 0;
+
+  // altered: added braces to one liner loops in the rest of this function
+  while (input_length--) {
+    char_array_3[i++] = *(bytes_to_encode++);
+    if (i == 3) {
+      char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
+      char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
+      char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
+      char_array_4[3] = char_array_3[2] & 0x3f;
+
+      for (i = 0; (i < 4); i++) {
+        encoded += base64_chars[char_array_4[i]];
+      }
+      i = 0;
+    }
+  }
+
+  if (i) {
+    for (j = i; j < 3; j++) {
+      char_array_3[j] = '\0';
+    }
+
+    char_array_4[0] = (char_array_3[0] & 0xfc) >> 2;
+    char_array_4[1] = ((char_array_3[0] & 0x03) << 4) + ((char_array_3[1] & 0xf0) >> 4);
+    char_array_4[2] = ((char_array_3[1] & 0x0f) << 2) + ((char_array_3[2] & 0xc0) >> 6);
+
+    for (j = 0; (j < i + 1); j++) {
+      encoded += base64_chars[char_array_4[j]];
+    }
+    while ((i++ < 3)) {
+      encoded += '=';
+    }
+  }
+
+  return encoded;
+}
+
+// base64 decode lambda function
+std::string base64_decode(std::string_view encoded_string)
+{
+  std::array<unsigned char, 4> char_array_4;
+  std::array<unsigned char, 3> char_array_3;
+  std::string decoded;
+  size_t input_len = encoded_string.size();
+
+  int i   = 0;
+  int j   = 0;
+  int in_ = 0;
+
+  // altered: added braces to one liner loops in the rest of this function
+  while (input_len-- and (encoded_string[in_] != '=') and is_base64(encoded_string[in_])) {
+    char_array_4[i++] = encoded_string[in_];
+    in_++;
+    if (i == 4) {
+      for (i = 0; i < 4; i++) {
+        char_array_4[i] = base64_chars.find(char_array_4[i]) & 0xff;
+      }
+
+      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+      for (i = 0; (i < 3); i++) {
+        decoded += char_array_3[i];
+      }
+      i = 0;
+    }
+  }
+
+  // altered: modify to i!=0 for better readability
+  if (i != 0) {
+    for (j = 0; j < i; j++) {
+      char_array_4[j] = base64_chars.find(char_array_4[j]) & 0xff;
+    }
+    char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
+    char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
+    // altered: TODO: arrow source code doesn't have the below line.
+    // altered: This is inconsequential as it is never appended to
+    // altered: `decoded` as max(i) = 3 and 0 <= j < 2.
+    char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
+
+    for (j = 0; j < i - 1; j++) {
+      decoded += char_array_3[j];
+    }
+  }
+
+  return decoded;
+}
+
+}  // namespace cudf::detail
diff --git a/cpp/include/cudf/io/ipc/Message.fbs b/cpp/include/cudf/io/ipc/Message.fbs
@@ -0,0 +1,155 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+include "Schema.fbs";
+
+namespace cudf.io.parquet.flatbuf;
+
+/// ----------------------------------------------------------------------
+/// Data structures for describing a table row batch (a collection of
+/// equal-length Arrow arrays)
+
+/// Metadata about a field at some level of a nested type tree (but not
+/// its children).
+///
+/// For example, a List<Int16> with values `[[1, 2, 3], null, [4], [5, 6], null]`
+/// would have {length: 5, null_count: 2} for its List node, and {length: 6,
+/// null_count: 0} for its Int16 node, as separate FieldNode structs
+struct FieldNode {
+  /// The number of value slots in the Arrow array at this level of a nested
+  /// tree
+  length: long;
+
+  /// The number of observed nulls. Fields with null_count == 0 may choose not
+  /// to write their physical validity bitmap out as a materialized buffer,
+  /// instead setting the length of the bitmap buffer to 0.
+  null_count: long;
+}
+
+enum CompressionType:byte {
+  // LZ4 frame format, for portability, as provided by lz4frame.h or wrappers
+  // thereof. Not to be confused with "raw" (also called "block") format
+  // provided by lz4.h
+  LZ4_FRAME,
+
+  // Zstandard
+  ZSTD
+}
+
+/// Provided for forward compatibility in case we need to support different
+/// strategies for compressing the IPC message body (like whole-body
+/// compression rather than buffer-level) in the future
+enum BodyCompressionMethod:byte {
+  /// Each constituent buffer is first compressed with the indicated
+  /// compressor, and then written with the uncompressed length in the first 8
+  /// bytes as a 64-bit little-endian signed integer followed by the compressed
+  /// buffer bytes (and then padding as required by the protocol). The
+  /// uncompressed length may be set to -1 to indicate that the data that
+  /// follows is not compressed, which can be useful for cases where
+  /// compression does not yield appreciable savings.
+  BUFFER
+}
+
+/// Optional compression for the memory buffers constituting IPC message
+/// bodies. Intended for use with RecordBatch but could be used for other
+/// message types
+table BodyCompression {
+  /// Compressor library.
+  /// For LZ4_FRAME, each compressed buffer must consist of a single frame.
+  codec: CompressionType = LZ4_FRAME;
+
+  /// Indicates the way the record batch body was compressed
+  method: BodyCompressionMethod = BUFFER;
+}
+
+/// A data header describing the shared memory layout of a "record" or "row"
+/// batch. Some systems call this a "row batch" internally and others a "record
+/// batch".
+table RecordBatch {
+  /// number of records / rows. The arrays in the batch should all have this
+  /// length
+  length: long;
+
+  /// Nodes correspond to the pre-ordered flattened logical schema
+  nodes: [FieldNode];
+
+  /// Buffers correspond to the pre-ordered flattened buffer tree
+  ///
+  /// The number of buffers appended to this list depends on the schema. For
+  /// example, most primitive arrays will have 2 buffers, 1 for the validity
+  /// bitmap and 1 for the values. For struct arrays, there will only be a
+  /// single buffer for the validity (nulls) bitmap
+  buffers: [Buffer];
+
+  /// Optional compression of the message body
+  compression: BodyCompression;
+
+  /// Some types such as Utf8View are represented using a variable number of buffers.
+  /// For each such Field in the pre-ordered flattened logical schema, there will be
+  /// an entry in variadicBufferCounts to indicate the number of number of variadic
+  /// buffers which belong to that Field in the current RecordBatch.
+  ///
+  /// For example, the schema
+  ///     col1: Struct<alpha: Int32, beta: BinaryView, gamma: Float64>
+  ///     col2: Utf8View
+  /// contains two Fields with variadic buffers so variadicBufferCounts will have
+  /// two entries, the first counting the variadic buffers of `col1.beta` and the
+  /// second counting `col2`'s.
+  ///
+  /// This field may be omitted if and only if the schema contains no Fields with
+  /// a variable number of buffers, such as BinaryView and Utf8View.
+  variadicBufferCounts: [long];
+}
+
+/// For sending dictionary encoding information. Any Field can be
+/// dictionary-encoded, but in this case none of its children may be
+/// dictionary-encoded.
+/// There is one vector / column per dictionary, but that vector / column
+/// may be spread across multiple dictionary batches by using the isDelta
+/// flag
+
+table DictionaryBatch {
+  id: long;
+  data: RecordBatch;
+
+  /// If isDelta is true the values in the dictionary are to be appended to a
+  /// dictionary with the indicated id. If isDelta is false this dictionary
+  /// should replace the existing dictionary.
+  isDelta: bool = false;
+}
+
+/// ----------------------------------------------------------------------
+/// The root Message type
+
+/// This union enables us to easily send different message types without
+/// redundant storage, and in the future we can easily add new message types.
+///
+/// Arrow implementations do not need to implement all of the message types,
+/// which may include experimental metadata types. For maximum compatibility,
+/// it is best to send data using RecordBatch
+union MessageHeader {
+  Schema
+}
+
+table Message {
+  version: cudf.io.parquet.flatbuf.MetadataVersion;
+  header: MessageHeader;
+  bodyLength: long;
+  custom_metadata: [ KeyValue ];
+}
+
+root_type Message;