From bb991ef6e37a65a6b5a05203e71cc1ae8d4444d3 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 11 Jun 2024 08:49:09 +0000
Subject: [PATCH 01/33] validation of tokens code

---
 cpp/CMakeLists.txt                       |   1 +
 cpp/include/cudf/io/json.hpp             |  59 +++++++++
 cpp/src/io/json/nested_json.hpp          |  16 +++
 cpp/src/io/json/nested_json_gpu.cu       |   5 +-
 cpp/src/io/json/output_writer_iterator.h | 140 ++++++++++++++++++++
 cpp/src/io/json/process_tokens.cu        | 161 +++++++++++++++++++++++
 cpp/tests/io/json_test.cpp               |  78 +++++++++++
 7 files changed, 459 insertions(+), 1 deletion(-)
 create mode 100644 cpp/src/io/json/output_writer_iterator.h
 create mode 100644 cpp/src/io/json/process_tokens.cu

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index ca85996b990..cf41b851970 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -392,6 +392,7 @@ add_library(
   src/io/json/nested_json_gpu.cu
   src/io/json/read_json.cu
   src/io/json/parser_features.cpp
+  src/io/json/process_tokens.cu
   src/io/json/write_json.cu
   src/io/orc/aggregate_orc_metadata.cpp
   src/io/orc/dict_enc.cu
diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 65ba8f25577..7525c67fa4d 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -129,6 +129,14 @@ class json_reader_options {
   // Whether to recover after an invalid JSON line
   json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
 
+  // Validation checks for spark
+  // Allow leading zeros for numeric values.
+  bool _allow_numeric_leading_zeros = true;
+  // Allow unquoted control characters
+  bool allowUnquotedControlChars = true;
+  // Additional values to recognize as null values
+  std::vector<std::string> _na_values;
+
   /**
    * @brief Constructor from source info.
    *
@@ -298,6 +306,20 @@ class json_reader_options {
    */
   json_recovery_mode_t recovery_mode() const { return _recovery_mode; }
 
+  /**
+   * @brief Whether leading zeros are allowed in numeric values.
+   *
+   * @return true if leading zeros are allowed in numeric values
+   */
+  [[nodiscard]] bool is_allowed_numeric_leading_zeros() const { return _allow_numeric_leading_zeros; }
+
+  /**
+   * @brief Returns additional values to recognize as null values.
+   *
+   * @return Additional values to recognize as null values
+   */
+  [[nodiscard]] std::vector<std::string> const& get_na_values() const { return _na_values; }
+
   /**
    * @brief Set data types for columns to be read.
    *
@@ -427,6 +449,20 @@ class json_reader_options {
    * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines.
    */
   void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; }
+
+  /**
+   * @brief Set Whether leading zeros are allowed in numeric values.
+   *
+   * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
+   */
+  void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; }
+
+  /**
+   * @brief Sets additional values to recognize as null values.
+   *
+   * @param vals Vector of values to be considered to be null
+   */
+  void set_na_values(std::vector<std::string> vals) { _na_values = std::move(vals); }
 };
 
 /**
@@ -638,6 +674,29 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set Whether leading zeros are allowed in numeric values.
+   *
+   * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
+   */
+  json_reader_options_builder& numeric_leading_zeros(bool val)
+  {
+    options.allow_numeric_leading_zeros(val);
+    return *this;
+  }
+
+  /**
+   * @brief Sets additional values to recognize as null values.
+   *
+   * @param vals Vector of values to be considered to be null
+   * @return this for chaining
+   */
+  json_reader_options_builder& na_values(std::vector<std::string> vals)
+  {
+    options.set_na_values(std::move(vals));
+    return *this;
+  }
+
   /**
    * @brief move json_reader_options member once it's built.
    */
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index e12892a2d50..e858594e4b0 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -221,6 +221,22 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
   device_span<SymbolOffsetT const> token_indices,
   rmm::cuda_stream_view stream);
 
+/**
+ * @brief Validate the tokens conforming to behavior given in options.
+ * 
+ * @param d_input The string of input characters
+ * @param tokens The tokens to be post-processed
+ * @param token_indices The tokens' corresponding indices that are post-processed
+ * @param options Parsing options specifying the parsing behaviour
+ * @param stream The cuda stream to dispatch GPU kernels to
+ */
+void validate_token_stream(
+  device_span<char const> d_input,
+  device_span<PdaTokenT> tokens,
+  device_span<SymbolOffsetT> token_indices,
+  cudf::io::json_reader_options const& options,
+  rmm::cuda_stream_view stream);
+
 /**
  * @brief Parses the given JSON string and generates a tree representation of the given input.
  *
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index b243e4ba006..eeea2a0edf3 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -1655,6 +1655,7 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> ge
 
   if (delimiter_offset == 1) {
     tokens.set_element(0, token_t::LineEnd, stream);
+    validate_token_stream(json_in, tokens, tokens_indices, options, stream);
     auto [filtered_tokens, filtered_tokens_indices] =
       process_token_stream(tokens, tokens_indices, stream);
     tokens         = std::move(filtered_tokens);
@@ -2079,7 +2080,9 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
   parse_opts.keepquotes = options.is_enabled_keep_quotes();
   parse_opts.trie_true  = cudf::detail::create_serialized_trie({"true"}, stream);
   parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
-  parse_opts.trie_na    = cudf::detail::create_serialized_trie({"", "null"}, stream);
+  std::vector<std::string> na_values{"", "null"};
+  na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end());
+  parse_opts.trie_na    = cudf::detail::create_serialized_trie(na_values, stream);
   return parse_opts;
 }
 
diff --git a/cpp/src/io/json/output_writer_iterator.h b/cpp/src/io/json/output_writer_iterator.h
new file mode 100644
index 00000000000..20228b4efac
--- /dev/null
+++ b/cpp/src/io/json/output_writer_iterator.h
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+// Output writer iterator
+#pragma once
+
+#include <thrust/iterator/iterator_adaptor.h>
+
+THRUST_NAMESPACE_BEGIN
+namespace detail {
+
+// Proxy reference that calls BinaryFunction with Iterator value and the rhs of assignment operator
+template <typename BinaryFunction, typename Iterator>
+class output_writer_iterator_proxy {
+ public:
+  __host__ __device__ output_writer_iterator_proxy(const Iterator& index_iter, BinaryFunction fun)
+    : index_iter(index_iter), fun(fun)
+  {
+  }
+  template <typename T>
+  __host__ __device__ output_writer_iterator_proxy operator=(const T& x)
+  {
+    fun(*index_iter, x);
+    return *this;
+  }
+
+ private:
+  Iterator index_iter;
+  BinaryFunction fun;
+};
+
+// Register output_writer_iterator_proxy with 'is_proxy_reference' from
+// type_traits to enable its use with algorithms.
+template <class BinaryFunction, class Iterator>
+struct is_proxy_reference<output_writer_iterator_proxy<BinaryFunction, Iterator>>
+  : public thrust::detail::true_type {
+};
+
+}  // namespace detail
+
+/**
+ * @brief Transform output iterator with custom writer binary function which takes index and value.
+ *
+ * @code {.cpp}
+ * #include <thrust/iterator/output_writer_iterator.cuh>
+ * #include <thrust/device_vector.h>
+ * #include <thrust/iterator/counting_iterator.h>
+ * #include <thrust/iterator/transform_iterator.h>
+ *
+ * struct set_bits_field {
+ *   int* bitfield;
+ *   __device__ inline void set_bit(size_t bit_index)
+ *   {
+ *     atomicOr(&bitfield[bit_index/32], (int{1} << (bit_index % 32)));
+ *   }
+ *   __device__ inline void clear_bit(size_t bit_index)
+ *   {
+ *     atomicAnd(&bitfield[bit_index / 32], ~(int{1} << (bit_index % 32)));
+ *   }
+ *   // Index, value
+ *   __device__ void operator()(size_t i, bool x)
+ *   {
+ *     if (x)
+ *       set_bit(i);
+ *     else
+ *       clear_bit(i);
+ *   }
+ * };
+ *
+ * thrust::device_vector<int> v(1, 0x00000000);
+ * auto result_begin = thrust::make_output_writer_iterator(thrust::make_counting_iterator(0),
+ *                                                 set_bits_field{v.data().get()});
+ * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
+ *   [] __device__ (int x) {   return x%2; });
+ * thrust::copy(thrust::device, value, value+32, result_begin);
+ * assert(v[0] == 0xaaaaaaaa);
+ * @endcode
+ *
+ *
+ * @tparam BinaryFunction Binary function to be called with the Iterator value and the rhs of
+ * assignment operator.
+ * @tparam Iterator iterator type that acts as index of the output.
+ */
+template <typename BinaryFunction, typename Iterator>
+class output_writer_iterator
+  : public thrust::iterator_adaptor<
+      output_writer_iterator<BinaryFunction, Iterator>,
+      Iterator,
+      thrust::use_default,
+      thrust::use_default,
+      thrust::use_default,
+      thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>> {
+ public:
+  // parent class.
+  typedef thrust::iterator_adaptor<
+    output_writer_iterator<BinaryFunction, Iterator>,
+    Iterator,
+    thrust::use_default,
+    thrust::use_default,
+    thrust::use_default,
+    thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>>
+    super_t;
+  // friend thrust::iterator_core_access to allow it access to the private interface dereference()
+  friend class thrust::iterator_core_access;
+  __host__ __device__ output_writer_iterator(Iterator const& x, BinaryFunction fun)
+    : super_t(x), fun(fun)
+  {
+  }
+
+ private:
+  BinaryFunction fun;
+
+  // thrust::iterator_core_access accesses this function
+  __host__ __device__ typename super_t::reference dereference() const
+  {
+    return thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>(
+      this->base_reference(), fun);
+  }
+};
+
+template <typename BinaryFunction, typename Iterator>
+output_writer_iterator<BinaryFunction, Iterator> __host__ __device__
+make_output_writer_iterator(Iterator out, BinaryFunction fun)
+{
+  return output_writer_iterator<BinaryFunction, Iterator>(out, fun);
+}  // end make_output_writer_iterator
+THRUST_NAMESPACE_END
\ No newline at end of file
diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
new file mode 100644
index 00000000000..d1ed02052dc
--- /dev/null
+++ b/cpp/src/io/json/process_tokens.cu
@@ -0,0 +1,161 @@
+
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "nested_json.hpp"
+// #include "tabulate_output_iterator.cuh"
+#include "output_writer_iterator.h"
+
+#include <cudf/detail/utilities/vector_factories.hpp>
+#include <cudf/io/detail/tokenize_json.hpp>
+
+#include <rmm/device_uvector.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/functional>
+#include <thrust/scan.h>
+#include <thrust/transform_scan.h>
+#include <thrust/tuple.h>
+
+#include <sys/types.h>
+
+namespace cudf::io::json {
+namespace detail {
+
+struct write_if {
+  using token_t   = cudf::io::json::token_t;
+  using scan_type = thrust::pair<token_t, bool>;
+  PdaTokenT* tokens;
+  size_t n;
+  // Index, value
+  __device__ void operator()(size_type i, scan_type x)
+  {
+    if (i == n - 1 or tokens[i + 1] == token_t::LineEnd) {
+      if (x.first == token_t::ErrorBegin and tokens[i] != token_t::ErrorBegin) {
+        tokens[i] = token_t::ErrorBegin;
+        // printf("writing\n");
+      }
+    }
+  }
+};
+
+void validate_token_stream(device_span<char const> d_input,
+                           device_span<PdaTokenT> tokens,
+                           device_span<SymbolOffsetT> token_indices,
+                           cudf::io::json_reader_options const& options,
+                           rmm::cuda_stream_view stream)
+{
+  if (getenv("SPARK_JSON")) {
+    using token_t = cudf::io::json::token_t;
+    auto validate_tokens =
+      [data = d_input.data(),
+       allow_numeric_leading_zeros =
+         options.is_allowed_numeric_leading_zeros()] __device__(SymbolOffsetT start,
+                                                                SymbolOffsetT end) -> bool {
+      // Leading zeros.
+      if (!allow_numeric_leading_zeros and data[start] == '0') return false;
+      return true;
+    };
+    auto num_tokens = tokens.size();
+    auto count_it   = thrust::make_counting_iterator(0);
+    auto predicate  = [tokens        = tokens.begin(),
+                      token_indices = token_indices.begin(),
+                      validate_tokens] __device__(auto i) -> bool {
+      if (tokens[i] == token_t::ValueEnd) {
+        return !validate_tokens(token_indices[i - 1], token_indices[i]);
+      }
+      return false;
+    };
+
+    using scan_type        = write_if::scan_type;
+    auto conditional_write = write_if{tokens.begin(), num_tokens};
+    // auto conditional_output_it = tokens.begin();
+    // auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write);
+    auto conditional_output_it =
+      thrust::make_output_writer_iterator(thrust::make_counting_iterator(0), conditional_write);
+    auto transform_op = cuda::proclaim_return_type<scan_type>(
+      [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
+        if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
+        return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
+      });
+    auto binary_op = cuda::proclaim_return_type<scan_type>(
+      [] __device__(scan_type prev, scan_type curr) -> scan_type {
+        auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first);
+        return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second);
+      });
+    rmm::device_uvector<int8_t> error(num_tokens, stream);
+    thrust::transform(rmm::exec_policy(stream),
+                      count_it,
+                      count_it + num_tokens,
+                      error.begin(),
+                      predicate);  // in-place scan
+    printf("error:");
+    for (auto tk : cudf::detail::make_std_vector_sync(error, stream))
+      printf("%d ", tk);
+    printf("\n");
+
+    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                     count_it,
+                                     count_it + num_tokens,
+                                     conditional_output_it,
+                                     transform_op,
+                                     binary_op);  // in-place scan
+  }
+  printf("pre_process_token:");
+  for (auto tk : cudf::detail::make_std_vector_sync(device_span<PdaTokenT const>(tokens), stream))
+    printf("%d ", tk);
+  printf("\n");
+
+  // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE
+  // 1   0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  1  1
+  // 1   1  1  1  1  1  1  2  2  2  2  2  2  2  2  2  3  4
+  // auto unary_op = [] __device__ (auto) -> token_t { return token_t::ErrorBegin; };
+  // thrust::transform_if(rmm::exec_policy(stream), count_it, count_it + num_tokens, tokens.begin(),
+  // unary_op, predicate); auto num_rows = thrust::count(rmm::exec_policy(stream), tokens.begin(),
+  // tokens.end(), token_t::LineEnd); rmm::device_uvector<bool> row_is_error(num_rows, stream);
+  // rmm::device_uvector<SymbolOffsetT> row_index(num_tokens, stream);
+  // auto is_LineEnd = [] __device__ (auto token) -> SymbolOffsetT { return token ==
+  // token_t::LineEnd; }; thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+  //   tokens.begin(), tokens.end(), row_index.begin(), is_LineEnd, thrust::plus<SymbolOffsetT>{});
+  // auto is_error_it = thrust::make_transform_iterator(tokens.begin(), [] __device__ (auto token)
+  // -> bool { return token == token_t::ErrorBegin; });
+  // thrust::reduce_by_key(rmm::exec_policy(stream), row_index.begin(), row_index.end(),
+  // is_error_it, thrust::make_discard_iterator(), row_is_error.begin());
+
+  // if current == ErrorBegin and tokens[i+1]==LE or i==n-1) then write ErrorBegin to tokens[i],
+  // else nothing. if VB, or SB, then if validate(token[i], token[i+1])==false,
+  //
+  // Transform_if to errors tokens
+  // Count LE (num_rows)
+  // create int vector [num_rows], bool[num_rows]
+  // TODO: fuse them together with single scan algorithm.
+  // LE==1, to scan to row_index.
+  // reduce_by_key, to check if it has any error, and number of non-errors tokens.
+  // reduce them to get output tokens count.
+  // copy_if -> use cub cub::DeviceSelect::If(d_data, d_num_selected_out, num_items, select_op)
+}
+
+// corner cases: empty LE,
+// alternate Decoupled look back idea:
+// count LineEnd, allocate bool[num_rows] as is_error.
+// decoupled look back for LineEnd tokens for row indices,
+//  transform_if to error tokens and write to bool[row_index] atomically (reduce and write within
+//  warp/block?)
+// decoupled look back for LineEnd tokens for row indices,
+//  (if not error row & not lineEnd token) -> decoupled look back for output indices,
+//  CopyIf (if not error row & not lineEnd token) write to output.
+}  // namespace detail
+}  // namespace cudf::io::json
\ No newline at end of file
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 57aa2721756..5e51a1c759a 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -2161,6 +2161,84 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync)
   cudf::io::set_host_memory_resource(last_mr);
 }
 
+// Validation
+TEST_F(JsonReaderTest, ValueValidation)
+{
+  // parsing error as null rows
+  std::string data =
+    // 0 -> a: -2 (valid)
+    R"({"a":-2 }{})"
+    "\n"
+    // 1 -> (invalid)
+    R"({"b":{}should_be_invalid})"
+    "\n"
+    // 2 -> b (valid)
+    R"({"b":{"a":3} })"
+    "\n"
+    // 3 -> c: (valid/null based on option)
+    R"({"a": 1, "c":nan, "d": "null" } )"
+    "\n"
+    "\n"
+    // 4 -> (valid/null based on option)
+    R"({"a":04, "c": 1.23, "d": "abc"} 123)"
+    "\n"
+    // 5 -> (valid)
+    R"({"a":5}//Comment after record)"
+    "\n"
+    // 6 -> ((valid/null based on option)
+    R"({"a":06} //Comment after whitespace)"
+    "\n"
+    // 7 -> (invalid)
+    R"({"a":5 //Invalid Comment within record})";
+
+  // leadingZeros allowed
+  // na_values,
+  {
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+        .lines(true)
+        .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+    EXPECT_EQ(result.tbl->num_columns(), 4);
+    EXPECT_EQ(result.tbl->num_rows(), 8);
+    auto b_a_col  = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0});
+    auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0},
+                                  {true, false, false, true, true, true, true, false}};
+    auto b_column = cudf::test::structs_column_wrapper(
+      {b_a_col}, {false, false, true, false, false, false, false, false});
+    auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0},
+                                    {false, false, false, false, true, false, false, false});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column);
+  }
+  // leadingZeros not allowed, NaN allowed
+  {
+    cudf::io::json_reader_options in_options =
+      cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
+        .lines(true)
+        .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+        .numeric_leading_zeros(false)
+        .na_values({"nan"});
+    cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
+
+    EXPECT_EQ(result.tbl->num_columns(), 4);
+    EXPECT_EQ(result.tbl->num_rows(), 8);
+    EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8); // empty column
+    auto b_a_col  = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0});
+    auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0},
+                                  {true, false, false, true, false, true, false, false}};
+    auto b_column = cudf::test::structs_column_wrapper(
+      {b_a_col}, {false, false, true, false, false, false, false, false});
+    auto c_column = int8_wrapper({0, 0, 0, 0, 0, 0, 0, 0},
+                                    {false, false, false, false, false, false, false, false});
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column);
+    CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column);
+  }
+}
+
 TEST_F(JsonReaderTest, MixedTypes)
 {
   using LCWS    = cudf::test::lists_column_wrapper<cudf::string_view>;

From 4e707cb706fe6044451dfce683cbe38bc410c0d6 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 18 Jun 2024 02:10:57 +0000
Subject: [PATCH 02/33] fix pre-commit check failures

---
 cpp/include/cudf/io/json.hpp             |  6 +++++-
 cpp/src/io/json/nested_json.hpp          | 13 ++++++-------
 cpp/src/io/json/nested_json_gpu.cu       |  2 +-
 cpp/src/io/json/output_writer_iterator.h |  7 +++----
 cpp/src/io/json/process_tokens.cu        |  2 +-
 cpp/tests/io/json_test.cpp               | 10 +++++-----
 6 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 7525c67fa4d..6fc90865666 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -311,7 +311,10 @@ class json_reader_options {
    *
    * @return true if leading zeros are allowed in numeric values
    */
-  [[nodiscard]] bool is_allowed_numeric_leading_zeros() const { return _allow_numeric_leading_zeros; }
+  [[nodiscard]] bool is_allowed_numeric_leading_zeros() const
+  {
+    return _allow_numeric_leading_zeros;
+  }
 
   /**
    * @brief Returns additional values to recognize as null values.
@@ -678,6 +681,7 @@ class json_reader_options_builder {
    * @brief Set Whether leading zeros are allowed in numeric values.
    *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
+   * @return this for chaining
    */
   json_reader_options_builder& numeric_leading_zeros(bool val)
   {
diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp
index e858594e4b0..6d80b41c907 100644
--- a/cpp/src/io/json/nested_json.hpp
+++ b/cpp/src/io/json/nested_json.hpp
@@ -223,19 +223,18 @@ std::pair<rmm::device_uvector<PdaTokenT>, rmm::device_uvector<SymbolOffsetT>> pr
 
 /**
  * @brief Validate the tokens conforming to behavior given in options.
- * 
+ *
  * @param d_input The string of input characters
  * @param tokens The tokens to be post-processed
  * @param token_indices The tokens' corresponding indices that are post-processed
  * @param options Parsing options specifying the parsing behaviour
  * @param stream The cuda stream to dispatch GPU kernels to
  */
-void validate_token_stream(
-  device_span<char const> d_input,
-  device_span<PdaTokenT> tokens,
-  device_span<SymbolOffsetT> token_indices,
-  cudf::io::json_reader_options const& options,
-  rmm::cuda_stream_view stream);
+void validate_token_stream(device_span<char const> d_input,
+                           device_span<PdaTokenT> tokens,
+                           device_span<SymbolOffsetT> token_indices,
+                           cudf::io::json_reader_options const& options,
+                           rmm::cuda_stream_view stream);
 
 /**
  * @brief Parses the given JSON string and generates a tree representation of the given input.
diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu
index eeea2a0edf3..a7e700f32ec 100644
--- a/cpp/src/io/json/nested_json_gpu.cu
+++ b/cpp/src/io/json/nested_json_gpu.cu
@@ -2082,7 +2082,7 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt
   parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream);
   std::vector<std::string> na_values{"", "null"};
   na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end());
-  parse_opts.trie_na    = cudf::detail::create_serialized_trie(na_values, stream);
+  parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream);
   return parse_opts;
 }
 
diff --git a/cpp/src/io/json/output_writer_iterator.h b/cpp/src/io/json/output_writer_iterator.h
index 20228b4efac..f6276771af4 100644
--- a/cpp/src/io/json/output_writer_iterator.h
+++ b/cpp/src/io/json/output_writer_iterator.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -46,8 +46,7 @@ class output_writer_iterator_proxy {
 // type_traits to enable its use with algorithms.
 template <class BinaryFunction, class Iterator>
 struct is_proxy_reference<output_writer_iterator_proxy<BinaryFunction, Iterator>>
-  : public thrust::detail::true_type {
-};
+  : public thrust::detail::true_type {};
 
 }  // namespace detail
 
@@ -137,4 +136,4 @@ make_output_writer_iterator(Iterator out, BinaryFunction fun)
 {
   return output_writer_iterator<BinaryFunction, Iterator>(out, fun);
 }  // end make_output_writer_iterator
-THRUST_NAMESPACE_END
\ No newline at end of file
+THRUST_NAMESPACE_END
diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index d1ed02052dc..96ae4f7817b 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -158,4 +158,4 @@ void validate_token_stream(device_span<char const> d_input,
 //  (if not error row & not lineEnd token) -> decoupled look back for output indices,
 //  CopyIf (if not error row & not lineEnd token) write to output.
 }  // namespace detail
-}  // namespace cudf::io::json
\ No newline at end of file
+}  // namespace cudf::io::json
diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp
index 5e51a1c759a..1d51c571c9a 100644
--- a/cpp/tests/io/json_test.cpp
+++ b/cpp/tests/io/json_test.cpp
@@ -2202,9 +2202,9 @@ TEST_F(JsonReaderTest, ValueValidation)
 
     EXPECT_EQ(result.tbl->num_columns(), 4);
     EXPECT_EQ(result.tbl->num_rows(), 8);
-    auto b_a_col  = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0});
-    auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0},
-                                  {true, false, false, true, true, true, true, false}};
+    auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0});
+    auto a_column =
+      int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, {true, false, false, true, true, true, true, false}};
     auto b_column = cudf::test::structs_column_wrapper(
       {b_a_col}, {false, false, true, false, false, false, false, false});
     auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0},
@@ -2225,14 +2225,14 @@ TEST_F(JsonReaderTest, ValueValidation)
 
     EXPECT_EQ(result.tbl->num_columns(), 4);
     EXPECT_EQ(result.tbl->num_rows(), 8);
-    EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8); // empty column
+    EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8);  // empty column
     auto b_a_col  = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0});
     auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0},
                                   {true, false, false, true, false, true, false, false}};
     auto b_column = cudf::test::structs_column_wrapper(
       {b_a_col}, {false, false, true, false, false, false, false, false});
     auto c_column = int8_wrapper({0, 0, 0, 0, 0, 0, 0, 0},
-                                    {false, false, false, false, false, false, false, false});
+                                 {false, false, false, false, false, false, false, false});
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column);

From 0c2e4da8ac1b594af3cf68a75d8bf675bdad5e0c Mon Sep 17 00:00:00 2001
From: "Robert (Bobby) Evans" <revans2@gmail.com>
Date: Fri, 2 Aug 2024 16:13:48 -0500
Subject: [PATCH 03/33] Add Spark Compatible JSON validation (#10)

---
 cpp/include/cudf/io/json.hpp                  |  99 ++++++-
 cpp/src/io/json/process_tokens.cu             | 250 ++++++++++++++++--
 .../main/java/ai/rapids/cudf/JSONOptions.java |  67 ++++-
 java/src/main/java/ai/rapids/cudf/Table.java  |  73 ++++-
 java/src/main/native/src/TableJni.cpp         |  36 ++-
 .../test/java/ai/rapids/cudf/TableTest.java   | 201 ++++++++++++++
 6 files changed, 689 insertions(+), 37 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 3d0d263e2bb..ad9f9a4aa5c 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -131,10 +131,14 @@ class json_reader_options {
   json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
 
   // Validation checks for spark
+  // Should the json validation be strict of not
+  bool _strict_validation = false;
   // Allow leading zeros for numeric values.
   bool _allow_numeric_leading_zeros = true;
+  // Allow nonnumeric numbers. NaN/Inf
+  bool _allow_nonnumeric_numbers = true;
   // Allow unquoted control characters
-  bool allowUnquotedControlChars = true;
+  bool _allow_unquoted_control_chars = true;
   // Additional values to recognize as null values
   std::vector<std::string> _na_values;
 
@@ -309,7 +313,15 @@ class json_reader_options {
   [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; }
 
   /**
-   * @brief Whether leading zeros are allowed in numeric values.
+   * @brief Whether json validation should be enforced strictly or not.
+   *
+   * @return true if it should be.
+   */
+  [[nodiscard]] bool is_strict_validation() const { return _strict_validation; }
+
+  /**
+   * @brief Whether leading zeros are allowed in numeric values. strict validation
+   * must be enabled for this to work.
    *
    * @return true if leading zeros are allowed in numeric values
    */
@@ -318,6 +330,22 @@ class json_reader_options {
     return _allow_numeric_leading_zeros;
   }
 
+  /**
+   * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity, and 
+   * -Infinity. strict validation must be enabled for this to work.
+   *
+   * @return true if leading zeros are allowed in numeric values
+   */
+  [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; }
+
+  /**
+   * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 be allowed
+   * without some form of escaping. Strict validation must be enabled for this to work.
+   *
+   * @return true if unquoted control chars are allowed.
+   */
+  [[nodiscard]] bool is_allowed_unquoted_control_chars() const { return _allow_unquoted_control_chars; }
+
   /**
    * @brief Returns additional values to recognize as null values.
    *
@@ -456,12 +484,37 @@ class json_reader_options {
   void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; }
 
   /**
-   * @brief Set Whether leading zeros are allowed in numeric values.
+   * @brief Set whether strict validation is enabled or not.
+   *
+   * @param val Boolean value to indicate whether strict validation is enabled.
+   */
+  void set_strict_validation(bool val) { _strict_validation = val; }
+
+  /**
+   * @brief Set whether leading zeros are allowed in numeric values. strict validation
+   * must be enabled for this to work.
    *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
    */
   void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; }
 
+  /**
+   * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, 
+   * Infinity, and -Infinity. strict validation must be enabled for this to work.
+   *
+   * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
+   */
+  void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; }
+
+  /**
+   * @brief Set whether in a quoted string should characters greater than or equal to 0 
+   * and less than 32 be allowed without some form of escaping. Strict validation must 
+   * be enabled for this to work.
+   *
+   * @param val true to indicate wether unquoted control chars are allowed.
+   */
+  void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; }
+
   /**
    * @brief Sets additional values to recognize as null values.
    *
@@ -680,7 +733,19 @@ class json_reader_options_builder {
   }
 
   /**
-   * @brief Set Whether leading zeros are allowed in numeric values.
+   * @brief Set whether json validation should be strict or not.
+   *
+   * @param val Boolean value to indicate whether json validation should be strict or not.
+   */
+  json_reader_options_builder& strict_validation(bool val)
+  {
+    options.set_strict_validation(val);
+    return *this;
+  }
+
+  /**
+   * @brief Set Whether leading zeros are allowed in numeric values. strict validation must
+   * be enabled for this to have any effect.
    *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
    * @return this for chaining
@@ -691,6 +756,32 @@ class json_reader_options_builder {
     return *this;
   }
 
+  /**
+   * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, 
+   * +INF, -INF, +Infinity, Infinity, and -Infinity.
+   * strict validation must be enabled for this to have any effect.
+   *
+   * @param val Boolean value to indicate if unquoted nonnumeric values are
+   * valid json or not.
+   */
+  json_reader_options_builder& nonnumeric_numbers(bool val)
+  {
+    options.allow_nonnumeric_numbers(val);
+    return *this;
+  }
+
+  /**
+   * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without 
+   * some form of escaping. strict validation must be enabled for this to have any effect.
+   *
+   * @param val Boolean value to indicate if unquoted control chars are allowed or not.
+   */
+  json_reader_options_builder& unquoted_control_chars(bool val)
+  {
+    options.allow_unquoted_control_chars(val);
+    return *this;
+  }
+
   /**
    * @brief Sets additional values to recognize as null values.
    *
diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 96ae4f7817b..aee22e3fdf0 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -52,30 +52,246 @@ struct write_if {
   }
 };
 
+enum class number_state {
+  start = 0,
+  saw_neg, // not a complete state
+  leading_zero,
+  whole,
+  saw_radix, // not a complete state
+  fraction,
+  start_exponent, // not a complete state
+  after_sign_exponent, // not a complete state
+  exponent
+};
+
+enum class string_state {
+  normal = 0,
+  escaped, // not a complete state
+  escaped_u // not a complete state
+};
+
+__device__ inline bool substr_eq(const char * data,
+    SymbolOffsetT const start,
+    SymbolOffsetT const end,
+    SymbolOffsetT const expected_len,
+    const char * expected) {
+  if (end - start != expected_len) {
+    return false;
+  } else {
+    for (auto idx = 0; idx < expected_len; idx++) {
+      if (data[start + idx] != expected[idx]) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
 void validate_token_stream(device_span<char const> d_input,
                            device_span<PdaTokenT> tokens,
                            device_span<SymbolOffsetT> token_indices,
                            cudf::io::json_reader_options const& options,
                            rmm::cuda_stream_view stream)
 {
-  if (getenv("SPARK_JSON")) {
+  if (options.is_strict_validation()) {
     using token_t = cudf::io::json::token_t;
-    auto validate_tokens =
+    auto validate_values =
       [data = d_input.data(),
        allow_numeric_leading_zeros =
-         options.is_allowed_numeric_leading_zeros()] __device__(SymbolOffsetT start,
-                                                                SymbolOffsetT end) -> bool {
-      // Leading zeros.
-      if (!allow_numeric_leading_zeros and data[start] == '0') return false;
-      return true;
+         options.is_allowed_numeric_leading_zeros(),
+       allow_nonnumeric =
+         options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
+                                                             SymbolOffsetT end) -> bool {
+      // This validates an unquoted value. A value must match https://www.json.org/json-en.html
+      // but the leading and training whitespace should already have been removed, and is not
+      // a string
+      auto c = data[start];
+      if ('n' == c) {
+        return substr_eq(data, start, end, 4, "null");
+      } else if ('t' == c) {
+        return substr_eq(data, start, end, 4, "true");
+      } else if ('f' == c) {
+        return substr_eq(data, start, end, 5, "false");
+      } else if (allow_nonnumeric && c == 'N') {
+        return substr_eq(data, start, end, 3, "NaN");
+      } else if (allow_nonnumeric && c == 'I') {
+        return substr_eq(data, start, end, 8, "Infinity");
+      } else if (allow_nonnumeric && c == '+') {
+        return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity");
+      } else if ('-' == c || c <= '9' && 'c' >= '0') {
+        // number
+        auto num_state = number_state::start;
+        for (auto at = start; at < end; at++) {
+          c = data[at];
+          switch (num_state) {
+            case number_state::start:
+              if ('-' == c) {
+                num_state = number_state::saw_neg;
+              } else if ('0' == c) {
+                num_state = number_state::leading_zero;
+              } else if (c >= '1' && c <= '9') {
+                num_state = number_state::whole;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::saw_neg:
+              if ('0' == c) {
+                num_state = number_state::leading_zero;
+              } else if (c >= '1' && c <= '9') {
+                num_state = number_state::whole;
+              } else if (allow_nonnumeric && 'I' == c) {
+                return substr_eq(data, start, end, 4, "-INF") || substr_eq(data, start, end, 9, "-Infinity");
+              } else {
+                return false;
+              }
+              break;
+            case number_state::leading_zero:
+              if (allow_numeric_leading_zeros && c >= '0' && c <= '9') {
+                num_state = number_state::whole;
+              } else if ('.' == c) {
+                num_state = number_state::saw_radix;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::start_exponent;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::whole:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::whole;
+              } else if ('.' == c) {
+                num_state = number_state::saw_radix;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::start_exponent;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::saw_radix:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::fraction;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::start_exponent;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::fraction:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::fraction;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::start_exponent;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::start_exponent:
+              if ('+' == c || '-' == c) {
+                num_state = number_state::after_sign_exponent;
+              } else if (c >= '0' && c <= '9') {
+                num_state = number_state::exponent;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::after_sign_exponent:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::exponent;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::exponent:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::exponent;
+              } else {
+                return false;
+              }
+              break;
+          }
+        }
+        return num_state != number_state::after_sign_exponent &&
+          num_state != number_state::start_exponent &&
+          num_state != number_state::saw_neg &&
+          num_state != number_state::saw_radix;
+      } else {
+        return false;
+      }
+    };
+
+    auto validate_strings =
+      [data = d_input.data(),
+       allow_unquoted_control_chars = 
+         options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start,
+                                                                 SymbolOffsetT end) -> bool {
+      // This validates a quoted string. A string must match https://www.json.org/json-en.html
+      // but we already know that it has a starting and ending " and all white space has been
+      // stripped out. Also the base CUDF validation makes sure escaped chars are correct
+      // so we only need to worry about unquoted control chars     
+
+      auto state = string_state::normal;
+      auto u_count = 0;
+      for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
+        auto c = data[idx];
+        if (!allow_unquoted_control_chars && c >= 0 && c < 32) {
+          return false;
+        }
+
+        switch (state) {
+          case string_state::normal:
+            if (c == '\\') {
+              state = string_state::escaped;
+            }
+            break;
+          case string_state::escaped:
+            // in Spark you can allow any char to be escaped, but CUDF
+            // validates it in some cases so we need to also validate it.
+            if (c == 'u') {
+              state = string_state::escaped_u;
+              u_count = 0;
+            } else if (c == '"' ||
+                c == '\\' ||
+                c == '/' ||
+                c == 'b' ||
+                c == 'f' ||
+                c == 'n' ||
+                c == 'r' ||
+                c == 't') {
+              state = string_state::normal;
+            } else {
+              return false;
+            }
+            break;
+          case string_state::escaped_u:
+            if ((c >= '0' && c <= '9') ||
+                (c >= 'a' && c <= 'f') ||
+                (c >= 'A' && c <= 'F')) {
+              u_count++;
+              if (u_count == 4) {
+                state = string_state::normal;
+                u_count = 0;
+              }
+            } else {
+              return false;
+            }
+            break;
+        }
+      }
+      return string_state::normal == state;
     };
+
     auto num_tokens = tokens.size();
     auto count_it   = thrust::make_counting_iterator(0);
     auto predicate  = [tokens        = tokens.begin(),
                       token_indices = token_indices.begin(),
-                      validate_tokens] __device__(auto i) -> bool {
+                      validate_values,
+                      validate_strings] __device__(auto i) -> bool {
       if (tokens[i] == token_t::ValueEnd) {
-        return !validate_tokens(token_indices[i - 1], token_indices[i]);
+        return !validate_values(token_indices[i - 1], token_indices[i]);
+      } else if (tokens[i] == token_t::FieldNameEnd ||
+          tokens[i] == token_t::StringEnd) {
+        return !validate_strings(token_indices[i - 1], token_indices[i]);
       }
       return false;
     };
@@ -102,10 +318,10 @@ void validate_token_stream(device_span<char const> d_input,
                       count_it + num_tokens,
                       error.begin(),
                       predicate);  // in-place scan
-    printf("error:");
-    for (auto tk : cudf::detail::make_std_vector_sync(error, stream))
-      printf("%d ", tk);
-    printf("\n");
+    //printf("error:");
+    //for (auto tk : cudf::detail::make_std_vector_sync(error, stream))
+    //  printf("%d ", tk);
+    //printf("\n");
 
     thrust::transform_inclusive_scan(rmm::exec_policy(stream),
                                      count_it,
@@ -114,10 +330,10 @@ void validate_token_stream(device_span<char const> d_input,
                                      transform_op,
                                      binary_op);  // in-place scan
   }
-  printf("pre_process_token:");
-  for (auto tk : cudf::detail::make_std_vector_sync(device_span<PdaTokenT const>(tokens), stream))
-    printf("%d ", tk);
-  printf("\n");
+  //printf("pre_process_token:");
+  //for (auto tk : cudf::detail::make_std_vector_sync(device_span<PdaTokenT const>(tokens), stream))
+  //  printf("%d ", tk);
+  //printf("\n");
 
   // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE
   // 1   0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  1  1
diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index b37d0d88ec9..50cce4590c1 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -34,6 +34,10 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean normalizeWhitespace;
   private final boolean mixedTypesAsStrings;
   private final boolean keepStringQuotes;
+  private final boolean allowLeadingZeros;
+  private final boolean strictValidation;
+  private final boolean allowNonNumericNumbers;
+  private final boolean allowUnquotedControlChars;
 
   private JSONOptions(Builder builder) {
     super(builder);
@@ -44,6 +48,10 @@ private JSONOptions(Builder builder) {
     normalizeWhitespace = builder.normalizeWhitespace;
     mixedTypesAsStrings = builder.mixedTypesAsStrings;
     keepStringQuotes = builder.keepQuotes;
+    strictValidation = builder.strictValidation;
+    allowLeadingZeros = builder.allowLeadingZeros;
+    allowNonNumericNumbers = builder.allowNonNumericNumbers;
+    allowUnquotedControlChars = builder.allowUnquotedControlChars;
   }
 
   public boolean isDayFirst() {
@@ -75,6 +83,22 @@ public boolean keepStringQuotes() {
     return keepStringQuotes;
   }
 
+  public boolean strictValidation() {
+    return strictValidation;
+  }
+
+  public boolean leadingZerosAllowed() {
+    return allowLeadingZeros;
+  }
+
+  public boolean nonNumericNumbersAllowed() {
+    return allowNonNumericNumbers;
+  }
+
+  public boolean unquotedControlChars() {
+    return allowUnquotedControlChars;
+  }
+
   @Override
   String[] getIncludeColumnNames() {
     throw new UnsupportedOperationException("JSON reader didn't support column prune");
@@ -85,6 +109,10 @@ public static Builder builder() {
   }
 
   public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptions.Builder> {
+    private boolean strictValidation = false;
+    private boolean allowUnquotedControlChars = true;
+    private boolean allowNonNumericNumbers = false;
+    private boolean allowLeadingZeros = false;
     private boolean dayFirst = false;
     private boolean lines = true;
 
@@ -95,10 +123,47 @@ public static final class Builder  extends ColumnFilterOptions.Builder<JSONOptio
     private boolean mixedTypesAsStrings = false;
     private boolean keepQuotes = false;
 
+    /**
+     * Should json validation be strict or not
+     */
+    public Builder withStrictValidation(boolean isAllowed) {
+      strictValidation = isAllowed;
+      return this;
+    }
+
+    /**
+     * Should leading zeros on numbers be allowed or not. Strict validation
+     * must be enabled for this to have any effect.
+     */
+    public Builder withLeadingZeros(boolean isAllowed) {
+      allowLeadingZeros = isAllowed;
+      return this;
+    }
+
+    /**
+     * Should non-numeric numbers be allowed or not. Strict validation
+     * must be enabled for this to have any effect.
+     */
+    public Builder withNonNumericNumbers(boolean isAllowed) {
+      allowNonNumericNumbers = isAllowed;
+      return this;
+    }
+
+    /**
+     * Should unquoted control chars be allowed in strings. Strict validation
+     * must be enabled for this to have any effect.
+     */
+    public Builder withUnquotedControlChars(boolean isAllowed) {
+      allowUnquotedControlChars = isAllowed;
+      return this;
+    }
+
+    // TODO need to finish this for other configs...
+
     /**
      * Whether to parse dates as DD/MM versus MM/DD
      * @param dayFirst true: DD/MM, false, MM/DD
-     * @return
+     * @return builder for chaining
      */
     public Builder withDayFirst(boolean dayFirst) {
       this.dayFirst = dayFirst;
diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java
index 4e737451ed6..2ac3ed01d3a 100644
--- a/java/src/main/java/ai/rapids/cudf/Table.java
+++ b/java/src/main/java/ai/rapids/cudf/Table.java
@@ -254,7 +254,11 @@ private static native long readJSON(int[] numChildren, String[] columnNames,
                                         boolean normalizeSingleQuotes,
                                         boolean normalizeWhitespace,
                                         boolean mixedTypesAsStrings,
-                                        boolean keepStringQuotes) throws CudfException;
+                                        boolean keepStringQuotes,
+                                        boolean strictValidation,
+                                        boolean allowLeadingZeros,
+                                        boolean allowNonNumericNumbers,
+                                        boolean allowUnquotedControl) throws CudfException;
 
   private static native long readJSONFromDataSource(int[] numChildren, String[] columnNames,
                                       int[] dTypeIds, int[] dTypeScales,
@@ -264,6 +268,10 @@ private static native long readJSONFromDataSource(int[] numChildren, String[] co
                                       boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
+                                      boolean strictValidation,
+                                      boolean allowLeadingZeros,
+                                      boolean allowNonNumericNumbers,
+                                      boolean allowUnquotedControl,
                                       long dsHandle) throws CudfException;
 
   private static native long readAndInferJSONFromDataSource(boolean dayFirst, boolean lines,
@@ -272,7 +280,12 @@ private static native long readAndInferJSONFromDataSource(boolean dayFirst, bool
                                       boolean normalizeWhitespace,
                                       boolean mixedTypesAsStrings,
                                       boolean keepStringQuotes,
+                                      boolean strictValidation,
+                                      boolean allowLeadingZeros,
+                                      boolean allowNonNumericNumbers,
+                                      boolean allowUnquotedControl,
                                       long dsHandle) throws CudfException;
+
   private static native long readAndInferJSON(long address, long length,
                                               boolean dayFirst,
                                               boolean lines,
@@ -280,7 +293,11 @@ private static native long readAndInferJSON(long address, long length,
                                               boolean normalizeSingleQuotes,
                                               boolean normalizeWhitespace,
                                               boolean mixedTypesAsStrings,
-                                              boolean keepStringQuotes) throws CudfException;
+                                              boolean keepStringQuotes,
+                                              boolean strictValidation,
+                                              boolean allowLeadingZeros,
+                                              boolean allowNonNumericNumbers,
+                                              boolean allowUnquotedControl) throws CudfException;
 
   /**
    * Read in Parquet formatted data.
@@ -1271,7 +1288,11 @@ public static Table readJSON(Schema schema, JSONOptions opts, File path) {
                     opts.isNormalizeSingleQuotes(),
                     opts.isNormalizeWhitespace(),
                     opts.isMixedTypesAsStrings(),
-                opts.keepStringQuotes()))) {
+                    opts.keepStringQuotes(),
+                    opts.strictValidation(),
+                    opts.leadingZerosAllowed(),
+                    opts.nonNumericNumbersAllowed(),
+                    opts.unquotedControlChars()))) {
 
       return gatherJSONColumns(schema, twm, -1);
     }
@@ -1349,7 +1370,12 @@ public static TableWithMeta readJSON(JSONOptions opts, HostMemoryBuffer buffer,
         opts.isDayFirst(), opts.isLines(), opts.isRecoverWithNull(),
         opts.isNormalizeSingleQuotes(),
         opts.isNormalizeWhitespace(),
-        opts.isMixedTypesAsStrings(), opts.keepStringQuotes()));
+        opts.isMixedTypesAsStrings(),
+        opts.keepStringQuotes(),
+        opts.strictValidation(),
+        opts.leadingZerosAllowed(),
+        opts.nonNumericNumbersAllowed(),
+        opts.unquotedControlChars()));
   }
 
   /**
@@ -1367,6 +1393,10 @@ public static TableWithMeta readAndInferJSON(JSONOptions opts, DataSource ds) {
           opts.isNormalizeWhitespace(),
           opts.isMixedTypesAsStrings(),
           opts.keepStringQuotes(),
+          opts.strictValidation(),
+          opts.leadingZerosAllowed(),
+          opts.nonNumericNumbersAllowed(),
+          opts.unquotedControlChars(),
           dsHandle));
         return twm;
       } finally {
@@ -1409,10 +1439,18 @@ public static Table readJSON(Schema schema, JSONOptions opts, HostMemoryBuffer b
     try (TableWithMeta twm = new TableWithMeta(readJSON(
             schema.getFlattenedNumChildren(), schema.getFlattenedColumnNames(),
             schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), null,
-            buffer.getAddress() + offset, len, opts.isDayFirst(), opts.isLines(),
-            opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+            buffer.getAddress() + offset, len,
+            opts.isDayFirst(),
+            opts.isLines(),
+            opts.isRecoverWithNull(),
+            opts.isNormalizeSingleQuotes(),
             opts.isNormalizeWhitespace(),
-            opts.isMixedTypesAsStrings(), opts.keepStringQuotes()))) {
+            opts.isMixedTypesAsStrings(),
+            opts.keepStringQuotes(),
+            opts.strictValidation(),
+            opts.leadingZerosAllowed(),
+            opts.nonNumericNumbersAllowed(),
+            opts.unquotedControlChars()))) {
       return gatherJSONColumns(schema, twm, emptyRowCount);
     }
   }
@@ -1433,17 +1471,26 @@ public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds) {
    * @param schema the schema of the data. You may use Schema.INFERRED to infer the schema.
    * @param opts various JSON parsing options.
    * @param ds the DataSource to read from.
-   * @param emtpyRowCount the number of rows to return if no columns were read.
+   * @param emptyRowCount the number of rows to return if no columns were read.
    * @return the data parsed as a table on the GPU.
    */
-  public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emtpyRowCount) {
+  public static Table readJSON(Schema schema, JSONOptions opts, DataSource ds, int emptyRowCount) {
     long dsHandle = DataSourceHelper.createWrapperDataSource(ds);
     try (TableWithMeta twm = new TableWithMeta(readJSONFromDataSource(schema.getFlattenedNumChildren(),
-        schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(), opts.isDayFirst(),
-        opts.isLines(), opts.isRecoverWithNull(), opts.isNormalizeSingleQuotes(),
+        schema.getFlattenedColumnNames(), schema.getFlattenedTypeIds(), schema.getFlattenedTypeScales(),
+        opts.isDayFirst(),
+        opts.isLines(),
+        opts.isRecoverWithNull(),
+        opts.isNormalizeSingleQuotes(),
         opts.isNormalizeWhitespace(),
-        opts.isMixedTypesAsStrings(), opts.keepStringQuotes(), dsHandle))) {
-      return gatherJSONColumns(schema, twm, emtpyRowCount);
+        opts.isMixedTypesAsStrings(),
+        opts.keepStringQuotes(),
+        opts.strictValidation(),
+        opts.leadingZerosAllowed(),
+        opts.nonNumericNumbersAllowed(),
+        opts.unquotedControlChars(),
+        dsHandle))) {
+      return gatherJSONColumns(schema, twm, emptyRowCount);
     } finally {
       DataSourceHelper.destroyWrapperDataSource(dsHandle);
     }
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index c58cd732b39..424ace416e8 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1610,6 +1610,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
                                                          jboolean normalize_whitespace,
                                                          jboolean mixed_types_as_string,
                                                          jboolean keep_quotes,
+                                                         jboolean strict_validation,
+                                                         jboolean allow_leading_zeros,
+                                                         jboolean allow_nonnumeric_numbers,
+                                                         jboolean allow_unquoted_control,
                                                          jlong ds_handle)
 {
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
@@ -1629,6 +1633,10 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .strict_validation(strict_validation)
+        .numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control)
         .keep_quotes(keep_quotes);
 
     auto result =
@@ -1649,7 +1657,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
                                                                    jboolean normalize_single_quotes,
                                                                    jboolean normalize_whitespace,
                                                                    jboolean mixed_types_as_string,
-                                                                   jboolean keep_quotes)
+                                                                   jboolean keep_quotes,
+                                                                   jboolean strict_validation,
+                                                                   jboolean allow_leading_zeros,
+                                                                   jboolean allow_nonnumeric_numbers,
+                                                                   jboolean allow_unquoted_control)
 {
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {
@@ -1671,6 +1683,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
         .recovery_mode(recovery_mode)
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
+        .strict_validation(strict_validation)
+        .numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control)
         .mixed_types_as_string(mixed_types_as_string)
         .keep_quotes(keep_quotes);
 
@@ -1777,6 +1793,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
                                                  jboolean normalize_whitespace,
                                                  jboolean mixed_types_as_string,
                                                  jboolean keep_quotes,
+                                                 jboolean strict_validation,
+                                                 jboolean allow_leading_zeros,
+                                                 jboolean allow_nonnumeric_numbers,
+                                                 jboolean allow_unquoted_control,
                                                  jlong ds_handle)
 {
   JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0);
@@ -1811,6 +1831,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .strict_validation(strict_validation)
+        .numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control)
         .keep_quotes(keep_quotes);
 
     if (!n_types.is_null()) {
@@ -1861,7 +1885,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
                                                            jboolean normalize_single_quotes,
                                                            jboolean normalize_whitespace,
                                                            jboolean mixed_types_as_string,
-                                                           jboolean keep_quotes)
+                                                           jboolean keep_quotes,
+                                                           jboolean strict_validation,
+                                                           jboolean allow_leading_zeros,
+                                                           jboolean allow_nonnumeric_numbers,
+                                                           jboolean allow_unquoted_control)
 {
   bool read_buffer = true;
   if (buffer == 0) {
@@ -1910,6 +1938,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
+        .strict_validation(strict_validation)
+        .numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control)
         .keep_quotes(keep_quotes);
 
     if (!n_types.is_null()) {
diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java
index 050bcbb268f..56fe63598d9 100644
--- a/java/src/test/java/ai/rapids/cudf/TableTest.java
+++ b/java/src/test/java/ai/rapids/cudf/TableTest.java
@@ -437,6 +437,7 @@ void testReadWhitespacesJSONFile() throws IOException {
     }
   }
 
+  @Test
   void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
     Schema schema = Schema.builder()
         .column(DType.STRING, "A")
@@ -455,6 +456,206 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException {
     }
   }
 
+  private static final byte[] JSON_VALIDATION_BUFFER = (
+      "{\"a\":true}\n" +
+      "{\"a\":false}\n" +
+      "{\"a\":null}\n" +
+      "{\"a\":true, \"b\":truee}\n" +
+      "{\"a\":true, \"b\":\"nulll\"}\n" +
+      "{\"a\": 1}\n" +
+      "{\"a\": 0}\n" +
+      "{\"a\": -}\n" +
+      "{\"a\": -0}\n" +
+      "{\"a\": -01}\n" +
+
+      "{\"a\": 01}\n" +
+      "{\"a\": -0.1}\n" +
+      "{\"a\": -00.1}\n" +
+      "{\"a\": NaN}\n" +
+      "{\"a\": INF}\n" +
+      "{\"a\": +INF}\n" +
+      "{\"a\": -INF}\n" +
+      "{\"a\": +Infinity}\n" +
+      "{\"a\": Infinity}\n" +
+      "{\"a\": -Infinity}\n" +
+
+      "{\"a\": INFinity}\n" +
+      "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" +
+      "{\"a\":12.}\n" +
+      "{\"a\": -3.4e+38}\n" +
+      "{\"a\": -3.4e-38}\n" +
+      "{\"a\": 1.4e38}\n" +
+      "{\"a\": -3.4E+38}\n" +
+      "{\"a\": -3.4E-38}\n" +
+      "{\"a\": 1.4E38}\n" +
+      "{\"a\": -3.4E+}\n" +
+
+      "{\"a\": -3.4E-}\n" +
+      "{\"a\": \"A\u0000B\"}\n" +
+      "{\"a\": \"A\\u0000B\"}\n" +
+      "{\"a\": \"A\u0001B\"}\n" +
+      "{\"a\": \"A\\u0001B\"}\n" +
+      "{\"a\": \"A\u001FB\"}\n" +
+      "{\"a\": \"A\\u001FB\"}\n" +
+      "{\"a\": \"A\u0020B\"}\n" +
+      "{\"a\": \"A\\u0020B\"}\n" +
+      "{\"a\": \"\\u12\"}\n" +
+
+      "{\"a\": \"\\z\"}\n" +
+      "{\"a\": \"\\r\"}\n" +
+      "{\"a\": \"something\", \"b\": \"\\z\"}\n"
+  ).getBytes(StandardCharsets.UTF_8);
+
+  @Test
+  void testJSONValidationNoStrict() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(false)
+        .withLeadingZeros(false)
+        .withNonNumericNumbers(false)
+        .withUnquotedControlChars(true)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01",
+            "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity",
+            "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+",
+            "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", "\"something\"")
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
+  void testJSONValidation() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(true)
+        .withLeadingZeros(false)
+        .withNonNumericNumbers(false)
+        .withUnquotedControlChars(true)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, null, "true", "1", "0", null, "-0", null,
+            null, "-0.1", null, null, null, null, null, null, null, null,
+            null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
+            null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", null)
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
+  void testJSONValidationLeadingZeros() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(true)
+        .withLeadingZeros(true)
+        .withNonNumericNumbers(false)
+        .withUnquotedControlChars(true)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, null, "true", "1", "0", null, "-0", "-01",
+            "01", "-0.1", "-00.1", null, null, null, null, null, null, null,
+            null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
+            null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", null)
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
+  void testJSONValidationNonNumeric() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(true)
+        .withLeadingZeros(false)
+        .withNonNumericNumbers(true)
+        .withUnquotedControlChars(true)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, null, "true", "1", "0", null, "-0", null,
+            null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity",
+            null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
+            null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", null)
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
+  @Test
+  void testJSONValidationUnquotedControl() {
+    Schema schema = Schema.builder()
+        .column(DType.STRING, "a")
+        .build();
+    JSONOptions opts = JSONOptions.builder()
+        .withRecoverWithNull(true)
+        .withMixedTypesAsStrings(true)
+        .withNormalizeWhitespace(true)
+        .withKeepQuotes(true)
+        .withNormalizeSingleQuotes(true)
+        .withStrictValidation(true)
+        .withLeadingZeros(false)
+        .withNonNumericNumbers(false)
+        .withUnquotedControlChars(false)
+        .build();
+    try (Table expected = new Table.TestBuilder()
+        .column(
+            "true", "false", null, null, "true", "1", "0", null, "-0", null,
+            null, "-0.1", null, null, null, null, null, null, null, null,
+            null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null,
+            null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null,
+            null, "\"\r\"", null)
+        .build();
+         MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER);
+         Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) {
+      assertTablesAreEqual(expected, table);
+    }
+  }
+
   private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" +
       "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" +
       "{\"d\":[1,2,3]}\n" +

From e94493732be3cd0d6d9ab0c5e9b85da970473950 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 9 Aug 2024 13:39:03 +0000
Subject: [PATCH 04/33] style fixes

---
 cpp/include/cudf/io/json.hpp          | 31 +++++----
 cpp/src/io/json/process_tokens.cu     | 95 ++++++++++++---------------
 java/src/main/native/src/TableJni.cpp | 31 ++++-----
 3 files changed, 75 insertions(+), 82 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 7ad91f93a6e..dfff9beae09 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -331,20 +331,23 @@ class json_reader_options {
   }
 
   /**
-   * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity, and 
-   * -Infinity. strict validation must be enabled for this to work.
+   * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity,
+   * and -Infinity. strict validation must be enabled for this to work.
    *
    * @return true if leading zeros are allowed in numeric values
    */
   [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; }
 
   /**
-   * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 be allowed
-   * without some form of escaping. Strict validation must be enabled for this to work.
+   * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32
+   * be allowed without some form of escaping. Strict validation must be enabled for this to work.
    *
    * @return true if unquoted control chars are allowed.
    */
-  [[nodiscard]] bool is_allowed_unquoted_control_chars() const { return _allow_unquoted_control_chars; }
+  [[nodiscard]] bool is_allowed_unquoted_control_chars() const
+  {
+    return _allow_unquoted_control_chars;
+  }
 
   /**
    * @brief Returns additional values to recognize as null values.
@@ -499,7 +502,7 @@ class json_reader_options {
   void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; }
 
   /**
-   * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, 
+   * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity,
    * Infinity, and -Infinity. strict validation must be enabled for this to work.
    *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
@@ -507,11 +510,11 @@ class json_reader_options {
   void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; }
 
   /**
-   * @brief Set whether in a quoted string should characters greater than or equal to 0 
-   * and less than 32 be allowed without some form of escaping. Strict validation must 
+   * @brief Set whether in a quoted string should characters greater than or equal to 0
+   * and less than 32 be allowed without some form of escaping. Strict validation must
    * be enabled for this to work.
    *
-   * @param val true to indicate wether unquoted control chars are allowed.
+   * @param val true to indicate whether unquoted control chars are allowed.
    */
   void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; }
 
@@ -736,6 +739,7 @@ class json_reader_options_builder {
    * @brief Set whether json validation should be strict or not.
    *
    * @param val Boolean value to indicate whether json validation should be strict or not.
+   * @return this for chaining
    */
   json_reader_options_builder& strict_validation(bool val)
   {
@@ -757,12 +761,12 @@ class json_reader_options_builder {
   }
 
   /**
-   * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, 
+   * @brief Set whether specific unquoted number values are valid JSON. The values are NaN,
    * +INF, -INF, +Infinity, Infinity, and -Infinity.
    * strict validation must be enabled for this to have any effect.
    *
-   * @param val Boolean value to indicate if unquoted nonnumeric values are
-   * valid json or not.
+   * @param val Boolean value to indicate if unquoted nonnumeric values are valid json or not.
+   * @return this for chaining
    */
   json_reader_options_builder& nonnumeric_numbers(bool val)
   {
@@ -771,10 +775,11 @@ class json_reader_options_builder {
   }
 
   /**
-   * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without 
+   * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without
    * some form of escaping. strict validation must be enabled for this to have any effect.
    *
    * @param val Boolean value to indicate if unquoted control chars are allowed or not.
+   * @return this for chaining
    */
   json_reader_options_builder& unquoted_control_chars(bool val)
   {
diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index aee22e3fdf0..906b5169064 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -54,34 +54,33 @@ struct write_if {
 
 enum class number_state {
   start = 0,
-  saw_neg, // not a complete state
+  saw_neg,  // not a complete state
   leading_zero,
   whole,
-  saw_radix, // not a complete state
+  saw_radix,  // not a complete state
   fraction,
-  start_exponent, // not a complete state
-  after_sign_exponent, // not a complete state
+  start_exponent,       // not a complete state
+  after_sign_exponent,  // not a complete state
   exponent
 };
 
 enum class string_state {
   normal = 0,
-  escaped, // not a complete state
-  escaped_u // not a complete state
+  escaped,   // not a complete state
+  escaped_u  // not a complete state
 };
 
-__device__ inline bool substr_eq(const char * data,
-    SymbolOffsetT const start,
-    SymbolOffsetT const end,
-    SymbolOffsetT const expected_len,
-    const char * expected) {
+__device__ inline bool substr_eq(const char* data,
+                                 SymbolOffsetT const start,
+                                 SymbolOffsetT const end,
+                                 SymbolOffsetT const expected_len,
+                                 const char* expected)
+{
   if (end - start != expected_len) {
     return false;
   } else {
     for (auto idx = 0; idx < expected_len; idx++) {
-      if (data[start + idx] != expected[idx]) {
-        return false;
-      }
+      if (data[start + idx] != expected[idx]) { return false; }
     }
   }
   return true;
@@ -96,9 +95,8 @@ void validate_token_stream(device_span<char const> d_input,
   if (options.is_strict_validation()) {
     using token_t = cudf::io::json::token_t;
     auto validate_values =
-      [data = d_input.data(),
-       allow_numeric_leading_zeros =
-         options.is_allowed_numeric_leading_zeros(),
+      [data                        = d_input.data(),
+       allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
        allow_nonnumeric =
          options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
                                                              SymbolOffsetT end) -> bool {
@@ -117,7 +115,8 @@ void validate_token_stream(device_span<char const> d_input,
       } else if (allow_nonnumeric && c == 'I') {
         return substr_eq(data, start, end, 8, "Infinity");
       } else if (allow_nonnumeric && c == '+') {
-        return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity");
+        return substr_eq(data, start, end, 4, "+INF") ||
+               substr_eq(data, start, end, 9, "+Infinity");
       } else if ('-' == c || c <= '9' && 'c' >= '0') {
         // number
         auto num_state = number_state::start;
@@ -141,7 +140,8 @@ void validate_token_stream(device_span<char const> d_input,
               } else if (c >= '1' && c <= '9') {
                 num_state = number_state::whole;
               } else if (allow_nonnumeric && 'I' == c) {
-                return substr_eq(data, start, end, 4, "-INF") || substr_eq(data, start, end, 9, "-Infinity");
+                return substr_eq(data, start, end, 4, "-INF") ||
+                       substr_eq(data, start, end, 9, "-Infinity");
               } else {
                 return false;
               }
@@ -212,9 +212,8 @@ void validate_token_stream(device_span<char const> d_input,
           }
         }
         return num_state != number_state::after_sign_exponent &&
-          num_state != number_state::start_exponent &&
-          num_state != number_state::saw_neg &&
-          num_state != number_state::saw_radix;
+               num_state != number_state::start_exponent && num_state != number_state::saw_neg &&
+               num_state != number_state::saw_radix;
       } else {
         return false;
       }
@@ -222,54 +221,42 @@ void validate_token_stream(device_span<char const> d_input,
 
     auto validate_strings =
       [data = d_input.data(),
-       allow_unquoted_control_chars = 
+       allow_unquoted_control_chars =
          options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start,
                                                                  SymbolOffsetT end) -> bool {
       // This validates a quoted string. A string must match https://www.json.org/json-en.html
       // but we already know that it has a starting and ending " and all white space has been
       // stripped out. Also the base CUDF validation makes sure escaped chars are correct
-      // so we only need to worry about unquoted control chars     
+      // so we only need to worry about unquoted control chars
 
-      auto state = string_state::normal;
+      auto state   = string_state::normal;
       auto u_count = 0;
       for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
         auto c = data[idx];
-        if (!allow_unquoted_control_chars && c >= 0 && c < 32) {
-          return false;
-        }
+        if (!allow_unquoted_control_chars && c >= 0 && c < 32) { return false; }
 
         switch (state) {
           case string_state::normal:
-            if (c == '\\') {
-              state = string_state::escaped;
-            }
+            if (c == '\\') { state = string_state::escaped; }
             break;
           case string_state::escaped:
             // in Spark you can allow any char to be escaped, but CUDF
             // validates it in some cases so we need to also validate it.
             if (c == 'u') {
-              state = string_state::escaped_u;
+              state   = string_state::escaped_u;
               u_count = 0;
-            } else if (c == '"' ||
-                c == '\\' ||
-                c == '/' ||
-                c == 'b' ||
-                c == 'f' ||
-                c == 'n' ||
-                c == 'r' ||
-                c == 't') {
+            } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' ||
+                       c == 'r' || c == 't') {
               state = string_state::normal;
             } else {
               return false;
             }
             break;
           case string_state::escaped_u:
-            if ((c >= '0' && c <= '9') ||
-                (c >= 'a' && c <= 'f') ||
-                (c >= 'A' && c <= 'F')) {
+            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
               u_count++;
               if (u_count == 4) {
-                state = string_state::normal;
+                state   = string_state::normal;
                 u_count = 0;
               }
             } else {
@@ -289,8 +276,7 @@ void validate_token_stream(device_span<char const> d_input,
                       validate_strings] __device__(auto i) -> bool {
       if (tokens[i] == token_t::ValueEnd) {
         return !validate_values(token_indices[i - 1], token_indices[i]);
-      } else if (tokens[i] == token_t::FieldNameEnd ||
-          tokens[i] == token_t::StringEnd) {
+      } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) {
         return !validate_strings(token_indices[i - 1], token_indices[i]);
       }
       return false;
@@ -318,10 +304,10 @@ void validate_token_stream(device_span<char const> d_input,
                       count_it + num_tokens,
                       error.begin(),
                       predicate);  // in-place scan
-    //printf("error:");
-    //for (auto tk : cudf::detail::make_std_vector_sync(error, stream))
-    //  printf("%d ", tk);
-    //printf("\n");
+    // printf("error:");
+    // for (auto tk : cudf::detail::make_std_vector_sync(error, stream))
+    //   printf("%d ", tk);
+    // printf("\n");
 
     thrust::transform_inclusive_scan(rmm::exec_policy(stream),
                                      count_it,
@@ -330,10 +316,11 @@ void validate_token_stream(device_span<char const> d_input,
                                      transform_op,
                                      binary_op);  // in-place scan
   }
-  //printf("pre_process_token:");
-  //for (auto tk : cudf::detail::make_std_vector_sync(device_span<PdaTokenT const>(tokens), stream))
-  //  printf("%d ", tk);
-  //printf("\n");
+  // printf("pre_process_token:");
+  // for (auto tk : cudf::detail::make_std_vector_sync(device_span<PdaTokenT const>(tokens),
+  // stream))
+  //   printf("%d ", tk);
+  // printf("\n");
 
   // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE
   // 1   0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  1  1
diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index cf8a11783e2..b67db835a23 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1648,21 +1648,22 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
   CATCH_STD(env, 0);
 }
 
-JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
-                                                                   jclass,
-                                                                   jlong buffer,
-                                                                   jlong buffer_length,
-                                                                   jboolean day_first,
-                                                                   jboolean lines,
-                                                                   jboolean recover_with_null,
-                                                                   jboolean normalize_single_quotes,
-                                                                   jboolean normalize_whitespace,
-                                                                   jboolean mixed_types_as_string,
-                                                                   jboolean keep_quotes,
-                                                                   jboolean strict_validation,
-                                                                   jboolean allow_leading_zeros,
-                                                                   jboolean allow_nonnumeric_numbers,
-                                                                   jboolean allow_unquoted_control)
+JNIEXPORT jlong JNICALL
+Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
+                                           jclass,
+                                           jlong buffer,
+                                           jlong buffer_length,
+                                           jboolean day_first,
+                                           jboolean lines,
+                                           jboolean recover_with_null,
+                                           jboolean normalize_single_quotes,
+                                           jboolean normalize_whitespace,
+                                           jboolean mixed_types_as_string,
+                                           jboolean keep_quotes,
+                                           jboolean strict_validation,
+                                           jboolean allow_leading_zeros,
+                                           jboolean allow_nonnumeric_numbers,
+                                           jboolean allow_unquoted_control)
 {
   JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0);
   if (buffer_length <= 0) {

From 23072c05b776203df415f08e68c88786c9768c98 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Fri, 9 Aug 2024 09:03:05 -0500
Subject: [PATCH 05/33] Update json normalization to take device_buffer

* change device_uvector to device_buffer
* update tests

---------

Co-authored-by: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
---
 cpp/include/cudf/io/detail/json.hpp           |  4 ++--
 cpp/src/io/json/json_normalization.cu         | 23 +++++++++++--------
 cpp/src/io/json/read_json.cu                  | 16 ++++++-------
 .../io/json/json_quote_normalization_test.cpp |  9 ++++----
 .../json_whitespace_normalization_test.cu     |  7 +++---
 5 files changed, 30 insertions(+), 29 deletions(-)

diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp
index 42b10a78ce8..38ba4f675c3 100644
--- a/cpp/include/cudf/io/detail/json.hpp
+++ b/cpp/include/cudf/io/detail/json.hpp
@@ -61,7 +61,7 @@ void write_json(data_sink* sink,
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
+void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr);
 
@@ -72,7 +72,7 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<char>
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
-void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<char>>& indata,
+void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
                           rmm::cuda_stream_view stream,
                           rmm::device_async_resource_ref mr);
 }  // namespace io::json::detail
diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu
index 760b2214365..2cdb967018a 100644
--- a/cpp/src/io/json/json_normalization.cu
+++ b/cpp/src/io/json/json_normalization.cu
@@ -16,6 +16,7 @@
 
 #include "io/fst/lookup_tables.cuh"
 
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/io/detail/json.hpp>
 #include <cudf/types.hpp>
 
@@ -298,10 +299,11 @@ struct TransduceToNormalizedWS {
 
 namespace detail {
 
-void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
+void normalize_single_quotes(datasource::owning_buffer<rmm::device_buffer>& indata,
                              rmm::cuda_stream_view stream,
                              rmm::device_async_resource_ref mr)
 {
+  CUDF_FUNC_RANGE();
   static constexpr std::int32_t min_out = 0;
   static constexpr std::int32_t max_out = 2;
   auto parser =
@@ -311,25 +313,26 @@ void normalize_single_quotes(datasource::owning_buffer<rmm::device_uvector<Symbo
                             normalize_quotes::TransduceToNormalizedQuotes{}),
                           stream);
 
-  rmm::device_uvector<SymbolT> outbuf(indata.size() * 2, stream, mr);
+  rmm::device_buffer outbuf(indata.size() * 2, stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(indata.data(),
+  parser.Transduce(reinterpret_cast<SymbolT const*>(indata.data()),
                    static_cast<SymbolOffsetT>(indata.size()),
-                   outbuf.data(),
+                   static_cast<SymbolT*>(outbuf.data()),
                    thrust::make_discard_iterator(),
                    outbuf_size.data(),
                    normalize_quotes::start_state,
                    stream);
 
   outbuf.resize(outbuf_size.value(stream), stream);
-  datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
+  datasource::owning_buffer<rmm::device_buffer> outdata(std::move(outbuf));
   std::swap(indata, outdata);
 }
 
-void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<SymbolT>>& indata,
+void normalize_whitespace(datasource::owning_buffer<rmm::device_buffer>& indata,
                           rmm::cuda_stream_view stream,
                           rmm::device_async_resource_ref mr)
 {
+  CUDF_FUNC_RANGE();
   static constexpr std::int32_t min_out = 0;
   static constexpr std::int32_t max_out = 2;
   auto parser =
@@ -339,18 +342,18 @@ void normalize_whitespace(datasource::owning_buffer<rmm::device_uvector<SymbolT>
                             normalize_whitespace::TransduceToNormalizedWS{}),
                           stream);
 
-  rmm::device_uvector<SymbolT> outbuf(indata.size(), stream, mr);
+  rmm::device_buffer outbuf(indata.size(), stream, mr);
   rmm::device_scalar<SymbolOffsetT> outbuf_size(stream, mr);
-  parser.Transduce(indata.data(),
+  parser.Transduce(reinterpret_cast<SymbolT const*>(indata.data()),
                    static_cast<SymbolOffsetT>(indata.size()),
-                   outbuf.data(),
+                   static_cast<SymbolT*>(outbuf.data()),
                    thrust::make_discard_iterator(),
                    outbuf_size.data(),
                    normalize_whitespace::start_state,
                    stream);
 
   outbuf.resize(outbuf_size.value(stream), stream);
-  datasource::owning_buffer<rmm::device_uvector<SymbolT>> outdata(std::move(outbuf));
+  datasource::owning_buffer<rmm::device_buffer> outdata(std::move(outbuf));
   std::swap(indata, outdata);
 }
 
diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 590f70864b1..e0d0497e0a2 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -168,7 +168,7 @@ size_t estimate_size_per_subchunk(size_t chunk_size)
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @returns Data source owning buffer enclosing the bytes read
  */
-datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
+datasource::owning_buffer<rmm::device_buffer> get_record_range_raw_input(
   host_span<std::unique_ptr<datasource>> sources,
   json_reader_options const& reader_opts,
   rmm::cuda_stream_view stream)
@@ -200,8 +200,8 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
       ? total_source_size * estimated_compression_ratio + header_size
       : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) +
           num_extra_delimiters;
-  rmm::device_uvector<char> buffer(buffer_size, stream);
-  device_span<char> bufspan(buffer);
+  rmm::device_buffer buffer(buffer_size, stream);
+  device_span<char> bufspan(reinterpret_cast<char*>(buffer.data()), buffer.size());
 
   // Offset within buffer indicating first read position
   std::int64_t buffer_offset = 0;
@@ -213,8 +213,8 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
     chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream);
   if (first_delim_pos == -1) {
     // return empty owning datasource buffer
-    auto empty_buf = rmm::device_uvector<char>(0, stream);
-    return datasource::owning_buffer<rmm::device_uvector<char>>(std::move(empty_buf));
+    auto empty_buf = rmm::device_buffer(0, stream);
+    return datasource::owning_buffer<rmm::device_buffer>(std::move(empty_buf));
   } else if (!should_load_all_sources) {
     // Find next delimiter
     std::int64_t next_delim_pos = -1;
@@ -232,12 +232,12 @@ datasource::owning_buffer<rmm::device_uvector<char>> get_record_range_raw_input(
     }
     if (next_delim_pos < buffer_offset) next_delim_pos = buffer_offset + readbufspan.size();
 
-    return datasource::owning_buffer<rmm::device_uvector<char>>(
+    return datasource::owning_buffer<rmm::device_buffer>(
       std::move(buffer),
       reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
       next_delim_pos - first_delim_pos - shift_for_nonzero_offset);
   }
-  return datasource::owning_buffer<rmm::device_uvector<char>>(
+  return datasource::owning_buffer<rmm::device_buffer>(
     std::move(buffer),
     reinterpret_cast<uint8_t*>(buffer.data()) + first_delim_pos + shift_for_nonzero_offset,
     readbufspan.size() - first_delim_pos - shift_for_nonzero_offset);
@@ -249,7 +249,7 @@ table_with_metadata read_batch(host_span<std::unique_ptr<datasource>> sources,
                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  datasource::owning_buffer<rmm::device_uvector<char>> bufview =
+  datasource::owning_buffer<rmm::device_buffer> bufview =
     get_record_range_raw_input(sources, reader_opts, stream);
 
   // If input JSON buffer has single quotes and option to normalize single quotes is enabled,
diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp
index 55ad0afe499..3a9ba8d9f3b 100644
--- a/cpp/tests/io/json/json_quote_normalization_test.cpp
+++ b/cpp/tests/io/json/json_quote_normalization_test.cpp
@@ -26,7 +26,7 @@
 #include <cudf/io/json.hpp>
 #include <cudf/io/types.hpp>
 
-#include <rmm/device_uvector.hpp>
+#include <rmm/device_buffer.hpp>
 #include <rmm/mr/device/cuda_memory_resource.hpp>
 #include <rmm/mr/device/device_memory_resource.hpp>
 
@@ -42,12 +42,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou
     std::make_shared<rmm::mr::cuda_memory_resource>();
 
   auto stream_view  = cudf::test::get_default_stream();
-  auto device_input = cudf::detail::make_device_uvector_async(
-    host_input, stream_view, rmm::mr::get_current_device_resource());
+  auto device_input = rmm::device_buffer(
+    host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource());
 
   // Preprocessing FST
-  cudf::io::datasource::owning_buffer<rmm::device_uvector<char>> device_data(
-    std::move(device_input));
+  cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
   cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get());
 
   std::string preprocessed_host_output(device_data.size(), 0);
diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu
index 8ed5fa81b12..01dd17fab98 100644
--- a/cpp/tests/io/json/json_whitespace_normalization_test.cu
+++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu
@@ -38,12 +38,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou
   // Prepare cuda stream for data transfers & kernels
   auto stream_view = cudf::test::get_default_stream();
 
-  auto device_input = cudf::detail::make_device_uvector_async(
-    host_input, stream_view, rmm::mr::get_current_device_resource());
+  auto device_input = rmm::device_buffer(
+    host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource());
 
   // Preprocessing FST
-  cudf::io::datasource::owning_buffer<rmm::device_uvector<char>> device_data(
-    std::move(device_input));
+  cudf::io::datasource::owning_buffer<rmm::device_buffer> device_data(std::move(device_input));
   cudf::io::json::detail::normalize_whitespace(
     device_data, stream_view, rmm::mr::get_current_device_resource());
 

From a8853401bde56017f319eadd2382034fdbfa65e3 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Fri, 9 Aug 2024 20:32:18 +0000
Subject: [PATCH 06/33] fix char comparison error

---
 cpp/src/io/json/process_tokens.cu | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 906b5169064..3b225130ba2 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -19,6 +19,7 @@
 // #include "tabulate_output_iterator.cuh"
 #include "output_writer_iterator.h"
 
+#include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/io/detail/tokenize_json.hpp>
 
@@ -26,11 +27,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/functional>
-#include <thrust/scan.h>
 #include <thrust/transform_scan.h>
-#include <thrust/tuple.h>
-
-#include <sys/types.h>
 
 namespace cudf::io::json {
 namespace detail {
@@ -92,6 +89,7 @@ void validate_token_stream(device_span<char const> d_input,
                            cudf::io::json_reader_options const& options,
                            rmm::cuda_stream_view stream)
 {
+  CUDF_FUNC_RANGE();
   if (options.is_strict_validation()) {
     using token_t = cudf::io::json::token_t;
     auto validate_values =
@@ -233,7 +231,7 @@ void validate_token_stream(device_span<char const> d_input,
       auto u_count = 0;
       for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
         auto c = data[idx];
-        if (!allow_unquoted_control_chars && c >= 0 && c < 32) { return false; }
+        if (!allow_unquoted_control_chars && c >= '\0' && c < '\32') { return false; }
 
         switch (state) {
           case string_state::normal:

From ab1385df894ec8642279e52e5abb7722264880cd Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 15 Aug 2024 13:08:37 -0500
Subject: [PATCH 07/33] update char comparison

---
 cpp/src/io/json/process_tokens.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 3b225130ba2..85efaa191d2 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -231,7 +231,7 @@ void validate_token_stream(device_span<char const> d_input,
       auto u_count = 0;
       for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
         auto c = data[idx];
-        if (!allow_unquoted_control_chars && c >= '\0' && c < '\32') { return false; }
+        if (!allow_unquoted_control_chars && c >= 0 && c < 32) { return false; }
 
         switch (state) {
           case string_state::normal:

From f2e2b448e6fc02ba178aa8eef7f243d31e628e4a Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 26 Aug 2024 15:35:26 +0000
Subject: [PATCH 08/33] rename to tabulate_output_iterator.cuh

---
 cpp/src/io/json/process_tokens.cu             |  7 ++--
 ...terator.h => tabulate_output_iterator.cuh} | 40 +++++++++----------
 2 files changed, 23 insertions(+), 24 deletions(-)
 rename cpp/src/io/json/{output_writer_iterator.h => tabulate_output_iterator.cuh} (70%)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 85efaa191d2..a7c2d264880 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -16,8 +16,7 @@
  */
 
 #include "nested_json.hpp"
-// #include "tabulate_output_iterator.cuh"
-#include "output_writer_iterator.h"
+#include "tabulate_output_iterator.cuh"
 
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/utilities/vector_factories.hpp>
@@ -231,7 +230,7 @@ void validate_token_stream(device_span<char const> d_input,
       auto u_count = 0;
       for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
         auto c = data[idx];
-        if (!allow_unquoted_control_chars && c >= 0 && c < 32) { return false; }
+        if (!allow_unquoted_control_chars && c < 32) { return false; }
 
         switch (state) {
           case string_state::normal:
@@ -285,7 +284,7 @@ void validate_token_stream(device_span<char const> d_input,
     // auto conditional_output_it = tokens.begin();
     // auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write);
     auto conditional_output_it =
-      thrust::make_output_writer_iterator(thrust::make_counting_iterator(0), conditional_write);
+      thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0), conditional_write);
     auto transform_op = cuda::proclaim_return_type<scan_type>(
       [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
         if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
diff --git a/cpp/src/io/json/output_writer_iterator.h b/cpp/src/io/json/tabulate_output_iterator.cuh
similarity index 70%
rename from cpp/src/io/json/output_writer_iterator.h
rename to cpp/src/io/json/tabulate_output_iterator.cuh
index f6276771af4..fe457bfb5a7 100644
--- a/cpp/src/io/json/output_writer_iterator.h
+++ b/cpp/src/io/json/tabulate_output_iterator.cuh
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 
-// Output writer iterator
+// Tabulate Output iterator
 #pragma once
 
 #include <thrust/iterator/iterator_adaptor.h>
@@ -24,14 +24,14 @@ namespace detail {
 
 // Proxy reference that calls BinaryFunction with Iterator value and the rhs of assignment operator
 template <typename BinaryFunction, typename Iterator>
-class output_writer_iterator_proxy {
+class tabulate_output_iterator_proxy {
  public:
-  __host__ __device__ output_writer_iterator_proxy(const Iterator& index_iter, BinaryFunction fun)
+  __host__ __device__ tabulate_output_iterator_proxy(const Iterator& index_iter, BinaryFunction fun)
     : index_iter(index_iter), fun(fun)
   {
   }
   template <typename T>
-  __host__ __device__ output_writer_iterator_proxy operator=(const T& x)
+  __host__ __device__ tabulate_output_iterator_proxy operator=(const T& x)
   {
     fun(*index_iter, x);
     return *this;
@@ -42,19 +42,19 @@ class output_writer_iterator_proxy {
   BinaryFunction fun;
 };
 
-// Register output_writer_iterator_proxy with 'is_proxy_reference' from
+// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from
 // type_traits to enable its use with algorithms.
 template <class BinaryFunction, class Iterator>
-struct is_proxy_reference<output_writer_iterator_proxy<BinaryFunction, Iterator>>
+struct is_proxy_reference<tabulate_output_iterator_proxy<BinaryFunction, Iterator>>
   : public thrust::detail::true_type {};
 
 }  // namespace detail
 
 /**
- * @brief Transform output iterator with custom writer binary function which takes index and value.
+ * @brief Transform output iterator with custom binary function which takes index and value.
  *
  * @code {.cpp}
- * #include <thrust/iterator/output_writer_iterator.cuh>
+ * #include <thrust/iterator/tabulate_output_iterator.cuh>
  * #include <thrust/device_vector.h>
  * #include <thrust/iterator/counting_iterator.h>
  * #include <thrust/iterator/transform_iterator.h>
@@ -80,7 +80,7 @@ struct is_proxy_reference<output_writer_iterator_proxy<BinaryFunction, Iterator>
  * };
  *
  * thrust::device_vector<int> v(1, 0x00000000);
- * auto result_begin = thrust::make_output_writer_iterator(thrust::make_counting_iterator(0),
+ * auto result_begin = thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0),
  *                                                 set_bits_field{v.data().get()});
  * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
  *   [] __device__ (int x) {   return x%2; });
@@ -94,27 +94,27 @@ struct is_proxy_reference<output_writer_iterator_proxy<BinaryFunction, Iterator>
  * @tparam Iterator iterator type that acts as index of the output.
  */
 template <typename BinaryFunction, typename Iterator>
-class output_writer_iterator
+class tabulate_output_iterator
   : public thrust::iterator_adaptor<
-      output_writer_iterator<BinaryFunction, Iterator>,
+      tabulate_output_iterator<BinaryFunction, Iterator>,
       Iterator,
       thrust::use_default,
       thrust::use_default,
       thrust::use_default,
-      thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>> {
+      thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, Iterator>> {
  public:
   // parent class.
   typedef thrust::iterator_adaptor<
-    output_writer_iterator<BinaryFunction, Iterator>,
+    tabulate_output_iterator<BinaryFunction, Iterator>,
     Iterator,
     thrust::use_default,
     thrust::use_default,
     thrust::use_default,
-    thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>>
+    thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, Iterator>>
     super_t;
   // friend thrust::iterator_core_access to allow it access to the private interface dereference()
   friend class thrust::iterator_core_access;
-  __host__ __device__ output_writer_iterator(Iterator const& x, BinaryFunction fun)
+  __host__ __device__ tabulate_output_iterator(Iterator const& x, BinaryFunction fun)
     : super_t(x), fun(fun)
   {
   }
@@ -125,15 +125,15 @@ class output_writer_iterator
   // thrust::iterator_core_access accesses this function
   __host__ __device__ typename super_t::reference dereference() const
   {
-    return thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>(
+    return thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, Iterator>(
       this->base_reference(), fun);
   }
 };
 
 template <typename BinaryFunction, typename Iterator>
-output_writer_iterator<BinaryFunction, Iterator> __host__ __device__
-make_output_writer_iterator(Iterator out, BinaryFunction fun)
+tabulate_output_iterator<BinaryFunction, Iterator> __host__ __device__
+make_tabulate_output_iterator(Iterator out, BinaryFunction fun)
 {
-  return output_writer_iterator<BinaryFunction, Iterator>(out, fun);
-}  // end make_output_writer_iterator
+  return tabulate_output_iterator<BinaryFunction, Iterator>(out, fun);
+}  // end make_tabulate_output_iterator
 THRUST_NAMESPACE_END

From 09632189c976ae247b9aa36bc3fd4a451739f693 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 26 Aug 2024 15:58:49 +0000
Subject: [PATCH 09/33] absorb counting_iterator to tabulate_output_iterator

Signed-off-by: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
---
 cpp/src/io/json/process_tokens.cu            | 11 ++--
 cpp/src/io/json/tabulate_output_iterator.cuh | 56 +++++++++-----------
 2 files changed, 30 insertions(+), 37 deletions(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index a7c2d264880..224d90e0b24 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -279,13 +279,10 @@ void validate_token_stream(device_span<char const> d_input,
       return false;
     };
 
-    using scan_type        = write_if::scan_type;
-    auto conditional_write = write_if{tokens.begin(), num_tokens};
-    // auto conditional_output_it = tokens.begin();
-    // auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write);
-    auto conditional_output_it =
-      thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0), conditional_write);
-    auto transform_op = cuda::proclaim_return_type<scan_type>(
+    using scan_type            = write_if::scan_type;
+    auto conditional_write     = write_if{tokens.begin(), num_tokens};
+    auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write);
+    auto transform_op          = cuda::proclaim_return_type<scan_type>(
       [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
         if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
         return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh
index fe457bfb5a7..8c677696a26 100644
--- a/cpp/src/io/json/tabulate_output_iterator.cuh
+++ b/cpp/src/io/json/tabulate_output_iterator.cuh
@@ -17,35 +17,36 @@
 // Tabulate Output iterator
 #pragma once
 
+#include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/iterator_adaptor.h>
 
 THRUST_NAMESPACE_BEGIN
 namespace detail {
 
-// Proxy reference that calls BinaryFunction with Iterator value and the rhs of assignment operator
-template <typename BinaryFunction, typename Iterator>
+// Proxy reference that calls BinaryFunction with index value and the rhs of assignment operator
+template <typename BinaryFunction, typename IndexT>
 class tabulate_output_iterator_proxy {
  public:
-  __host__ __device__ tabulate_output_iterator_proxy(const Iterator& index_iter, BinaryFunction fun)
-    : index_iter(index_iter), fun(fun)
+  __host__ __device__ tabulate_output_iterator_proxy(const IndexT index, BinaryFunction fun)
+    : index(index), fun(fun)
   {
   }
   template <typename T>
-  __host__ __device__ tabulate_output_iterator_proxy operator=(const T& x)
+  __host__ __device__ tabulate_output_iterator_proxy operator=(const T& rhs_value)
   {
-    fun(*index_iter, x);
+    fun(index, rhs_value);
     return *this;
   }
 
  private:
-  Iterator index_iter;
+  IndexT index;
   BinaryFunction fun;
 };
 
 // Register tabulate_output_iterator_proxy with 'is_proxy_reference' from
 // type_traits to enable its use with algorithms.
-template <class BinaryFunction, class Iterator>
-struct is_proxy_reference<tabulate_output_iterator_proxy<BinaryFunction, Iterator>>
+template <class BinaryFunction, class IndexT>
+struct is_proxy_reference<tabulate_output_iterator_proxy<BinaryFunction, IndexT>>
   : public thrust::detail::true_type {};
 
 }  // namespace detail
@@ -80,8 +81,7 @@ struct is_proxy_reference<tabulate_output_iterator_proxy<BinaryFunction, Iterato
  * };
  *
  * thrust::device_vector<int> v(1, 0x00000000);
- * auto result_begin = thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0),
- *                                                 set_bits_field{v.data().get()});
+ * auto result_begin = thrust::make_tabulate_output_iterator(set_bits_field{v.data().get()});
  * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
  *   [] __device__ (int x) {   return x%2; });
  * thrust::copy(thrust::device, value, value+32, result_begin);
@@ -93,31 +93,27 @@ struct is_proxy_reference<tabulate_output_iterator_proxy<BinaryFunction, Iterato
  * assignment operator.
  * @tparam Iterator iterator type that acts as index of the output.
  */
-template <typename BinaryFunction, typename Iterator>
+template <typename BinaryFunction, typename IndexT = ptrdiff_t>
 class tabulate_output_iterator
   : public thrust::iterator_adaptor<
-      tabulate_output_iterator<BinaryFunction, Iterator>,
-      Iterator,
+      tabulate_output_iterator<BinaryFunction, IndexT>,
+      counting_iterator<IndexT>,
       thrust::use_default,
       thrust::use_default,
       thrust::use_default,
-      thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, Iterator>> {
+      thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, IndexT>> {
  public:
   // parent class.
-  typedef thrust::iterator_adaptor<
-    tabulate_output_iterator<BinaryFunction, Iterator>,
-    Iterator,
+  using super_t = thrust::iterator_adaptor<
+    tabulate_output_iterator<BinaryFunction, IndexT>,
+    counting_iterator<IndexT>,
     thrust::use_default,
     thrust::use_default,
     thrust::use_default,
-    thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, Iterator>>
-    super_t;
+    thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, IndexT>>;
   // friend thrust::iterator_core_access to allow it access to the private interface dereference()
   friend class thrust::iterator_core_access;
-  __host__ __device__ tabulate_output_iterator(Iterator const& x, BinaryFunction fun)
-    : super_t(x), fun(fun)
-  {
-  }
+  __host__ __device__ tabulate_output_iterator(BinaryFunction fun) : fun(fun) {}
 
  private:
   BinaryFunction fun;
@@ -125,15 +121,15 @@ class tabulate_output_iterator
   // thrust::iterator_core_access accesses this function
   __host__ __device__ typename super_t::reference dereference() const
   {
-    return thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, Iterator>(
-      this->base_reference(), fun);
+    return thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, IndexT>(*this->base(),
+                                                                                  fun);
   }
 };
 
-template <typename BinaryFunction, typename Iterator>
-tabulate_output_iterator<BinaryFunction, Iterator> __host__ __device__
-make_tabulate_output_iterator(Iterator out, BinaryFunction fun)
+template <typename BinaryFunction>
+tabulate_output_iterator<BinaryFunction> __host__ __device__
+make_tabulate_output_iterator(BinaryFunction fun)
 {
-  return tabulate_output_iterator<BinaryFunction, Iterator>(out, fun);
+  return tabulate_output_iterator<BinaryFunction>(fun);
 }  // end make_tabulate_output_iterator
 THRUST_NAMESPACE_END

From be7402c4d685fbf3f5ca747b219adc85add3ae38 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 26 Aug 2024 18:12:59 +0000
Subject: [PATCH 10/33] update documentation

---
 cpp/include/cudf/io/json.hpp | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index faf8cccf552..433bb57941f 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -323,6 +323,8 @@ class json_reader_options {
    * @brief Whether leading zeros are allowed in numeric values. strict validation
    * must be enabled for this to work.
    *
+   * @note: strict_validation must be enabled for this to work.
+   *
    * @return true if leading zeros are allowed in numeric values
    */
   [[nodiscard]] bool is_allowed_numeric_leading_zeros() const
@@ -334,6 +336,8 @@ class json_reader_options {
    * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity,
    * and -Infinity. strict validation must be enabled for this to work.
    *
+   * @note: strict_validation must be enabled for this to work.
+   *
    * @return true if leading zeros are allowed in numeric values
    */
   [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; }
@@ -342,6 +346,8 @@ class json_reader_options {
    * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32
    * be allowed without some form of escaping. Strict validation must be enabled for this to work.
    *
+   * @note: strict_validation must be enabled for this to work.
+   *
    * @return true if unquoted control chars are allowed.
    */
   [[nodiscard]] bool is_allowed_unquoted_control_chars() const
@@ -497,6 +503,8 @@ class json_reader_options {
    * @brief Set whether leading zeros are allowed in numeric values. strict validation
    * must be enabled for this to work.
    *
+   * @note: strict_validation must be enabled for this to work.
+   *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
    */
   void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; }
@@ -505,6 +513,8 @@ class json_reader_options {
    * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity,
    * Infinity, and -Infinity. strict validation must be enabled for this to work.
    *
+   * @note: strict_validation must be enabled for this to work.
+   *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
    */
   void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; }
@@ -514,6 +524,8 @@ class json_reader_options {
    * and less than 32 be allowed without some form of escaping. Strict validation must
    * be enabled for this to work.
    *
+   * @note: strict_validation must be enabled for this to work.
+   *
    * @param val true to indicate whether unquoted control chars are allowed.
    */
   void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; }
@@ -751,6 +763,8 @@ class json_reader_options_builder {
    * @brief Set Whether leading zeros are allowed in numeric values. strict validation must
    * be enabled for this to have any effect.
    *
+   * @note: strict_validation must be enabled for this to work.
+   *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
    * @return this for chaining
    */
@@ -765,6 +779,8 @@ class json_reader_options_builder {
    * +INF, -INF, +Infinity, Infinity, and -Infinity.
    * strict validation must be enabled for this to have any effect.
    *
+   * @note: strict_validation must be enabled for this to work.
+   *
    * @param val Boolean value to indicate if unquoted nonnumeric values are valid json or not.
    * @return this for chaining
    */
@@ -778,6 +794,8 @@ class json_reader_options_builder {
    * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without
    * some form of escaping. strict validation must be enabled for this to have any effect.
    *
+   * @note: strict_validation must be enabled for this to work.
+   *
    * @param val Boolean value to indicate if unquoted control chars are allowed or not.
    * @return this for chaining
    */

From b114401843d5af5e631a7419fe225e8bb1d86267 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 26 Aug 2024 18:19:03 +0000
Subject: [PATCH 11/33] add na_values to validation

---
 cpp/src/io/json/process_tokens.cu | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 224d90e0b24..c78a8fd818d 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -15,6 +15,7 @@
  * limitations under the License.
  */
 
+#include "io/utilities/trie.cuh"
 #include "nested_json.hpp"
 #include "tabulate_output_iterator.cuh"
 
@@ -91,8 +92,12 @@ void validate_token_stream(device_span<char const> d_input,
   CUDF_FUNC_RANGE();
   if (options.is_strict_validation()) {
     using token_t = cudf::io::json::token_t;
+    cudf::detail::optional_trie trie_na =
+      cudf::detail::create_serialized_trie(options.get_na_values(), stream);
+    auto trie_na_view = cudf::detail::make_trie_view(trie_na);
     auto validate_values =
       [data                        = d_input.data(),
+       trie_na                     = trie_na_view,
        allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
        allow_nonnumeric =
          options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
@@ -100,8 +105,11 @@ void validate_token_stream(device_span<char const> d_input,
       // This validates an unquoted value. A value must match https://www.json.org/json-en.html
       // but the leading and training whitespace should already have been removed, and is not
       // a string
-      auto c = data[start];
-      if ('n' == c) {
+      auto c               = data[start];
+      auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
+      if (is_null_literal) {
+        return true;
+      } else if ('n' == c) {
         return substr_eq(data, start, end, 4, "null");
       } else if ('t' == c) {
         return substr_eq(data, start, end, 4, "true");

From a1e9afcc9055fbc516cdda4dd216a8b9c02695bf Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 26 Aug 2024 18:19:16 +0000
Subject: [PATCH 12/33] add strict validation to test

---
 cpp/tests/io/json/json_test.cpp | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index f06053b44a7..7316026d4d7 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2169,14 +2169,15 @@ TEST_F(JsonReaderTest, ValueValidation)
     cudf::io::json_reader_options in_options =
       cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
         .lines(true)
-        .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL);
+        .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+        .strict_validation(true);
     cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
     EXPECT_EQ(result.tbl->num_columns(), 4);
     EXPECT_EQ(result.tbl->num_rows(), 8);
-    auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0});
-    auto a_column =
-      int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, {true, false, false, true, true, true, true, false}};
+    auto b_a_col  = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0});
+    auto a_column = int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0},
+                                  {true, false, false, false, true, true, true, false}};
     auto b_column = cudf::test::structs_column_wrapper(
       {b_a_col}, {false, false, true, false, false, false, false, false});
     auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0},
@@ -2192,7 +2193,8 @@ TEST_F(JsonReaderTest, ValueValidation)
         .lines(true)
         .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
         .numeric_leading_zeros(false)
-        .na_values({"nan"});
+        .na_values({"nan"})
+        .strict_validation(true);
     cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
     EXPECT_EQ(result.tbl->num_columns(), 4);

From ec78ef959dde124a56784b0591c1d49a5763f2d8 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 26 Aug 2024 18:42:02 +0000
Subject: [PATCH 13/33] rename tabulate_output_iterator namespace

---
 cpp/src/io/json/process_tokens.cu            |  2 +-
 cpp/src/io/json/tabulate_output_iterator.cuh | 52 ++++++++++----------
 2 files changed, 26 insertions(+), 28 deletions(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index c78a8fd818d..e1a50dd6070 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -289,7 +289,7 @@ void validate_token_stream(device_span<char const> d_input,
 
     using scan_type            = write_if::scan_type;
     auto conditional_write     = write_if{tokens.begin(), num_tokens};
-    auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write);
+    auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write);
     auto transform_op          = cuda::proclaim_return_type<scan_type>(
       [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
         if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh
index 8c677696a26..46887beefbd 100644
--- a/cpp/src/io/json/tabulate_output_iterator.cuh
+++ b/cpp/src/io/json/tabulate_output_iterator.cuh
@@ -20,7 +20,7 @@
 #include <thrust/iterator/counting_iterator.h>
 #include <thrust/iterator/iterator_adaptor.h>
 
-THRUST_NAMESPACE_BEGIN
+namespace cudf {
 namespace detail {
 
 // Proxy reference that calls BinaryFunction with index value and the rhs of assignment operator
@@ -43,19 +43,11 @@ class tabulate_output_iterator_proxy {
   BinaryFunction fun;
 };
 
-// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from
-// type_traits to enable its use with algorithms.
-template <class BinaryFunction, class IndexT>
-struct is_proxy_reference<tabulate_output_iterator_proxy<BinaryFunction, IndexT>>
-  : public thrust::detail::true_type {};
-
-}  // namespace detail
-
 /**
  * @brief Transform output iterator with custom binary function which takes index and value.
  *
  * @code {.cpp}
- * #include <thrust/iterator/tabulate_output_iterator.cuh>
+ * #include "tabulate_output_iterator.cuh"
  * #include <thrust/device_vector.h>
  * #include <thrust/iterator/counting_iterator.h>
  * #include <thrust/iterator/transform_iterator.h>
@@ -95,22 +87,20 @@ struct is_proxy_reference<tabulate_output_iterator_proxy<BinaryFunction, IndexT>
  */
 template <typename BinaryFunction, typename IndexT = ptrdiff_t>
 class tabulate_output_iterator
-  : public thrust::iterator_adaptor<
-      tabulate_output_iterator<BinaryFunction, IndexT>,
-      counting_iterator<IndexT>,
-      thrust::use_default,
-      thrust::use_default,
-      thrust::use_default,
-      thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, IndexT>> {
+  : public thrust::iterator_adaptor<tabulate_output_iterator<BinaryFunction, IndexT>,
+                                    thrust::counting_iterator<IndexT>,
+                                    thrust::use_default,
+                                    thrust::use_default,
+                                    thrust::use_default,
+                                    tabulate_output_iterator_proxy<BinaryFunction, IndexT>> {
  public:
   // parent class.
-  using super_t = thrust::iterator_adaptor<
-    tabulate_output_iterator<BinaryFunction, IndexT>,
-    counting_iterator<IndexT>,
-    thrust::use_default,
-    thrust::use_default,
-    thrust::use_default,
-    thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, IndexT>>;
+  using super_t = thrust::iterator_adaptor<tabulate_output_iterator<BinaryFunction, IndexT>,
+                                           thrust::counting_iterator<IndexT>,
+                                           thrust::use_default,
+                                           thrust::use_default,
+                                           thrust::use_default,
+                                           tabulate_output_iterator_proxy<BinaryFunction, IndexT>>;
   // friend thrust::iterator_core_access to allow it access to the private interface dereference()
   friend class thrust::iterator_core_access;
   __host__ __device__ tabulate_output_iterator(BinaryFunction fun) : fun(fun) {}
@@ -121,8 +111,7 @@ class tabulate_output_iterator
   // thrust::iterator_core_access accesses this function
   __host__ __device__ typename super_t::reference dereference() const
   {
-    return thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, IndexT>(*this->base(),
-                                                                                  fun);
+    return tabulate_output_iterator_proxy<BinaryFunction, IndexT>(*this->base(), fun);
   }
 };
 
@@ -132,4 +121,13 @@ make_tabulate_output_iterator(BinaryFunction fun)
 {
   return tabulate_output_iterator<BinaryFunction>(fun);
 }  // end make_tabulate_output_iterator
-THRUST_NAMESPACE_END
+
+}  // namespace detail
+}  // namespace cudf
+
+// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from
+// type_traits to enable its use with algorithms.
+template <class BinaryFunction, class IndexT>
+struct thrust::detail::is_proxy_reference<
+  cudf::detail::tabulate_output_iterator_proxy<BinaryFunction, IndexT>>
+  : public thrust::detail::true_type {};

From a225ce077e9c8ba70aaab8ba4f590e44b5b27840 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 26 Aug 2024 18:56:06 +0000
Subject: [PATCH 14/33] remove comments and notes

---
 cpp/src/io/json/process_tokens.cu | 53 -------------------------------
 1 file changed, 53 deletions(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index e1a50dd6070..6336ca64928 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -300,16 +300,6 @@ void validate_token_stream(device_span<char const> d_input,
         auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first);
         return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second);
       });
-    rmm::device_uvector<int8_t> error(num_tokens, stream);
-    thrust::transform(rmm::exec_policy(stream),
-                      count_it,
-                      count_it + num_tokens,
-                      error.begin(),
-                      predicate);  // in-place scan
-    // printf("error:");
-    // for (auto tk : cudf::detail::make_std_vector_sync(error, stream))
-    //   printf("%d ", tk);
-    // printf("\n");
 
     thrust::transform_inclusive_scan(rmm::exec_policy(stream),
                                      count_it,
@@ -318,49 +308,6 @@ void validate_token_stream(device_span<char const> d_input,
                                      transform_op,
                                      binary_op);  // in-place scan
   }
-  // printf("pre_process_token:");
-  // for (auto tk : cudf::detail::make_std_vector_sync(device_span<PdaTokenT const>(tokens),
-  // stream))
-  //   printf("%d ", tk);
-  // printf("\n");
-
-  // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE
-  // 1   0  0  0  0  0  0  1  0  0  0  0  0  0  0  0  1  1
-  // 1   1  1  1  1  1  1  2  2  2  2  2  2  2  2  2  3  4
-  // auto unary_op = [] __device__ (auto) -> token_t { return token_t::ErrorBegin; };
-  // thrust::transform_if(rmm::exec_policy(stream), count_it, count_it + num_tokens, tokens.begin(),
-  // unary_op, predicate); auto num_rows = thrust::count(rmm::exec_policy(stream), tokens.begin(),
-  // tokens.end(), token_t::LineEnd); rmm::device_uvector<bool> row_is_error(num_rows, stream);
-  // rmm::device_uvector<SymbolOffsetT> row_index(num_tokens, stream);
-  // auto is_LineEnd = [] __device__ (auto token) -> SymbolOffsetT { return token ==
-  // token_t::LineEnd; }; thrust::transform_inclusive_scan(rmm::exec_policy(stream),
-  //   tokens.begin(), tokens.end(), row_index.begin(), is_LineEnd, thrust::plus<SymbolOffsetT>{});
-  // auto is_error_it = thrust::make_transform_iterator(tokens.begin(), [] __device__ (auto token)
-  // -> bool { return token == token_t::ErrorBegin; });
-  // thrust::reduce_by_key(rmm::exec_policy(stream), row_index.begin(), row_index.end(),
-  // is_error_it, thrust::make_discard_iterator(), row_is_error.begin());
-
-  // if current == ErrorBegin and tokens[i+1]==LE or i==n-1) then write ErrorBegin to tokens[i],
-  // else nothing. if VB, or SB, then if validate(token[i], token[i+1])==false,
-  //
-  // Transform_if to errors tokens
-  // Count LE (num_rows)
-  // create int vector [num_rows], bool[num_rows]
-  // TODO: fuse them together with single scan algorithm.
-  // LE==1, to scan to row_index.
-  // reduce_by_key, to check if it has any error, and number of non-errors tokens.
-  // reduce them to get output tokens count.
-  // copy_if -> use cub cub::DeviceSelect::If(d_data, d_num_selected_out, num_items, select_op)
 }
-
-// corner cases: empty LE,
-// alternate Decoupled look back idea:
-// count LineEnd, allocate bool[num_rows] as is_error.
-// decoupled look back for LineEnd tokens for row indices,
-//  transform_if to error tokens and write to bool[row_index] atomically (reduce and write within
-//  warp/block?)
-// decoupled look back for LineEnd tokens for row indices,
-//  (if not error row & not lineEnd token) -> decoupled look back for output indices,
-//  CopyIf (if not error row & not lineEnd token) write to output.
 }  // namespace detail
 }  // namespace cudf::io::json

From 875a72b0cef208157aeb53cb4c1e8bf254c74399 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 3 Sep 2024 23:08:40 +0000
Subject: [PATCH 15/33] fix unsigned/signed issue with ARM systems

---
 cpp/src/io/json/process_tokens.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 6336ca64928..ec1f82e1792 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -238,7 +238,7 @@ void validate_token_stream(device_span<char const> d_input,
       auto u_count = 0;
       for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
         auto c = data[idx];
-        if (!allow_unquoted_control_chars && c < 32) { return false; }
+        if (!allow_unquoted_control_chars && c <= char{0} && c < char{32}) { return false; }
 
         switch (state) {
           case string_state::normal:

From be7f17e722522c4723ae1405a08fdd569953c8be Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 3 Sep 2024 23:11:43 +0000
Subject: [PATCH 16/33] remove comments

---
 java/src/main/java/ai/rapids/cudf/JSONOptions.java | 2 --
 1 file changed, 2 deletions(-)

diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index 50cce4590c1..4fcaf1e4a6b 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -158,8 +158,6 @@ public Builder withUnquotedControlChars(boolean isAllowed) {
       return this;
     }
 
-    // TODO need to finish this for other configs...
-
     /**
      * Whether to parse dates as DD/MM versus MM/DD
      * @param dayFirst true: DD/MM, false, MM/DD

From fb628774d6cd8a19a2411053ac658468a12ee519 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Wed, 4 Sep 2024 23:33:32 +0000
Subject: [PATCH 17/33] fix condition

---
 cpp/src/io/json/process_tokens.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index ec1f82e1792..61f4e559cbf 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -238,7 +238,7 @@ void validate_token_stream(device_span<char const> d_input,
       auto u_count = 0;
       for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
         auto c = data[idx];
-        if (!allow_unquoted_control_chars && c <= char{0} && c < char{32}) { return false; }
+        if (!allow_unquoted_control_chars && c >= char{0} && c < char{32}) { return false; }
 
         switch (state) {
           case string_state::normal:

From e4f7d047fab12063eceb386e5e8f5143e7243ee7 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 5 Sep 2024 21:23:51 +0000
Subject: [PATCH 18/33] fix char issue with typecast

---
 cpp/src/io/json/process_tokens.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 61f4e559cbf..59ea0dd1583 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -238,7 +238,9 @@ void validate_token_stream(device_span<char const> d_input,
       auto u_count = 0;
       for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
         auto c = data[idx];
-        if (!allow_unquoted_control_chars && c >= char{0} && c < char{32}) { return false; }
+        if (!allow_unquoted_control_chars && static_cast<int>(c) >= 0 && static_cast<int>(c) < 32) {
+          return false;
+        }
 
         switch (state) {
           case string_state::normal:

From 851fe3e58fa9d0a4fa2d2190506c4784acf4fbf9 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 5 Sep 2024 16:26:31 -0500
Subject: [PATCH 19/33] Update cpp/include/cudf/io/json.hpp

Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 cpp/include/cudf/io/json.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 433bb57941f..e28cb978606 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -500,7 +500,7 @@ class json_reader_options {
   void set_strict_validation(bool val) { _strict_validation = val; }
 
   /**
-   * @brief Set whether leading zeros are allowed in numeric values. strict validation
+   * @brief Set whether leading zeros are allowed in numeric values. Strict validation
    * must be enabled for this to work.
    *
    * @note: strict_validation must be enabled for this to work.

From 35e4b892841ed47e76637636312d775bd256e3b1 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Thu, 5 Sep 2024 16:26:37 -0500
Subject: [PATCH 20/33] Update cpp/include/cudf/io/json.hpp

Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 cpp/include/cudf/io/json.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index e28cb978606..fe2cd8603f0 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -511,7 +511,7 @@ class json_reader_options {
 
   /**
    * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity,
-   * Infinity, and -Infinity. strict validation must be enabled for this to work.
+   * Infinity, and -Infinity. Strict validation must be enabled for this to work.
    *
    * @note: strict_validation must be enabled for this to work.
    *

From 368182327d6b274584502fc4c0b479a53b5b33cb Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 5 Sep 2024 21:36:37 +0000
Subject: [PATCH 21/33] address review comments

---
 cpp/src/io/json/process_tokens.cu | 433 +++++++++++++++---------------
 1 file changed, 215 insertions(+), 218 deletions(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 59ea0dd1583..839d9e88c9a 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -43,28 +43,27 @@ struct write_if {
     if (i == n - 1 or tokens[i + 1] == token_t::LineEnd) {
       if (x.first == token_t::ErrorBegin and tokens[i] != token_t::ErrorBegin) {
         tokens[i] = token_t::ErrorBegin;
-        // printf("writing\n");
       }
     }
   }
 };
 
 enum class number_state {
-  start = 0,
-  saw_neg,  // not a complete state
-  leading_zero,
-  whole,
-  saw_radix,  // not a complete state
-  fraction,
-  start_exponent,       // not a complete state
-  after_sign_exponent,  // not a complete state
-  exponent
+  START = 0,
+  SAW_NEG,  // not a complete state
+  LEADING_ZERO,
+  WHOLE,
+  SAW_RADIX,  // not a complete state
+  FRACTION,
+  START_EXPONENT,       // not a complete state
+  AFTER_SIGN_EXPONENT,  // not a complete state
+  EXPONENT
 };
 
 enum class string_state {
-  normal = 0,
-  escaped,   // not a complete state
-  escaped_u  // not a complete state
+  NORMAL = 0,
+  ESCAPED,   // not a complete state
+  ESCAPED_U  // not a complete state
 };
 
 __device__ inline bool substr_eq(const char* data,
@@ -90,226 +89,224 @@ void validate_token_stream(device_span<char const> d_input,
                            rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
-  if (options.is_strict_validation()) {
-    using token_t = cudf::io::json::token_t;
-    cudf::detail::optional_trie trie_na =
-      cudf::detail::create_serialized_trie(options.get_na_values(), stream);
-    auto trie_na_view = cudf::detail::make_trie_view(trie_na);
-    auto validate_values =
-      [data                        = d_input.data(),
-       trie_na                     = trie_na_view,
-       allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
-       allow_nonnumeric =
-         options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
-                                                             SymbolOffsetT end) -> bool {
-      // This validates an unquoted value. A value must match https://www.json.org/json-en.html
-      // but the leading and training whitespace should already have been removed, and is not
-      // a string
-      auto c               = data[start];
-      auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
-      if (is_null_literal) {
-        return true;
-      } else if ('n' == c) {
-        return substr_eq(data, start, end, 4, "null");
-      } else if ('t' == c) {
-        return substr_eq(data, start, end, 4, "true");
-      } else if ('f' == c) {
-        return substr_eq(data, start, end, 5, "false");
-      } else if (allow_nonnumeric && c == 'N') {
-        return substr_eq(data, start, end, 3, "NaN");
-      } else if (allow_nonnumeric && c == 'I') {
-        return substr_eq(data, start, end, 8, "Infinity");
-      } else if (allow_nonnumeric && c == '+') {
-        return substr_eq(data, start, end, 4, "+INF") ||
-               substr_eq(data, start, end, 9, "+Infinity");
-      } else if ('-' == c || c <= '9' && 'c' >= '0') {
-        // number
-        auto num_state = number_state::start;
-        for (auto at = start; at < end; at++) {
-          c = data[at];
-          switch (num_state) {
-            case number_state::start:
-              if ('-' == c) {
-                num_state = number_state::saw_neg;
-              } else if ('0' == c) {
-                num_state = number_state::leading_zero;
-              } else if (c >= '1' && c <= '9') {
-                num_state = number_state::whole;
-              } else {
-                return false;
-              }
-              break;
-            case number_state::saw_neg:
-              if ('0' == c) {
-                num_state = number_state::leading_zero;
-              } else if (c >= '1' && c <= '9') {
-                num_state = number_state::whole;
-              } else if (allow_nonnumeric && 'I' == c) {
-                return substr_eq(data, start, end, 4, "-INF") ||
-                       substr_eq(data, start, end, 9, "-Infinity");
-              } else {
-                return false;
-              }
-              break;
-            case number_state::leading_zero:
-              if (allow_numeric_leading_zeros && c >= '0' && c <= '9') {
-                num_state = number_state::whole;
-              } else if ('.' == c) {
-                num_state = number_state::saw_radix;
-              } else if ('e' == c || 'E' == c) {
-                num_state = number_state::start_exponent;
-              } else {
-                return false;
-              }
-              break;
-            case number_state::whole:
-              if (c >= '0' && c <= '9') {
-                num_state = number_state::whole;
-              } else if ('.' == c) {
-                num_state = number_state::saw_radix;
-              } else if ('e' == c || 'E' == c) {
-                num_state = number_state::start_exponent;
-              } else {
-                return false;
-              }
-              break;
-            case number_state::saw_radix:
-              if (c >= '0' && c <= '9') {
-                num_state = number_state::fraction;
-              } else if ('e' == c || 'E' == c) {
-                num_state = number_state::start_exponent;
-              } else {
-                return false;
-              }
-              break;
-            case number_state::fraction:
-              if (c >= '0' && c <= '9') {
-                num_state = number_state::fraction;
-              } else if ('e' == c || 'E' == c) {
-                num_state = number_state::start_exponent;
-              } else {
-                return false;
-              }
-              break;
-            case number_state::start_exponent:
-              if ('+' == c || '-' == c) {
-                num_state = number_state::after_sign_exponent;
-              } else if (c >= '0' && c <= '9') {
-                num_state = number_state::exponent;
-              } else {
-                return false;
-              }
-              break;
-            case number_state::after_sign_exponent:
-              if (c >= '0' && c <= '9') {
-                num_state = number_state::exponent;
-              } else {
-                return false;
-              }
-              break;
-            case number_state::exponent:
-              if (c >= '0' && c <= '9') {
-                num_state = number_state::exponent;
-              } else {
-                return false;
-              }
-              break;
-          }
-        }
-        return num_state != number_state::after_sign_exponent &&
-               num_state != number_state::start_exponent && num_state != number_state::saw_neg &&
-               num_state != number_state::saw_radix;
-      } else {
-        return false;
-      }
-    };
-
-    auto validate_strings =
-      [data = d_input.data(),
-       allow_unquoted_control_chars =
-         options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start,
-                                                                 SymbolOffsetT end) -> bool {
-      // This validates a quoted string. A string must match https://www.json.org/json-en.html
-      // but we already know that it has a starting and ending " and all white space has been
-      // stripped out. Also the base CUDF validation makes sure escaped chars are correct
-      // so we only need to worry about unquoted control chars
-
-      auto state   = string_state::normal;
-      auto u_count = 0;
-      for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
-        auto c = data[idx];
-        if (!allow_unquoted_control_chars && static_cast<int>(c) >= 0 && static_cast<int>(c) < 32) {
-          return false;
-        }
-
-        switch (state) {
-          case string_state::normal:
-            if (c == '\\') { state = string_state::escaped; }
+  if (!options.is_strict_validation()) { return; }
+  using token_t = cudf::io::json::token_t;
+  cudf::detail::optional_trie trie_na =
+    cudf::detail::create_serialized_trie(options.get_na_values(), stream);
+  auto trie_na_view = cudf::detail::make_trie_view(trie_na);
+  auto validate_values =
+    [data                        = d_input.data(),
+     trie_na                     = trie_na_view,
+     allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
+     allow_nonnumeric =
+       options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
+                                                           SymbolOffsetT end) -> bool {
+    // This validates an unquoted value. A value must match https://www.json.org/json-en.html
+    // but the leading and training whitespace should already have been removed, and is not
+    // a string
+    auto c               = data[start];
+    auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
+    if (is_null_literal) {
+      return true;
+    } else if ('n' == c) {
+      return substr_eq(data, start, end, 4, "null");
+    } else if ('t' == c) {
+      return substr_eq(data, start, end, 4, "true");
+    } else if ('f' == c) {
+      return substr_eq(data, start, end, 5, "false");
+    } else if (allow_nonnumeric && c == 'N') {
+      return substr_eq(data, start, end, 3, "NaN");
+    } else if (allow_nonnumeric && c == 'I') {
+      return substr_eq(data, start, end, 8, "Infinity");
+    } else if (allow_nonnumeric && c == '+') {
+      return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity");
+    } else if ('-' == c || c <= '9' && 'c' >= '0') {
+      // number
+      auto num_state = number_state::START;
+      for (auto at = start; at < end; at++) {
+        c = data[at];
+        switch (num_state) {
+          case number_state::START:
+            if ('-' == c) {
+              num_state = number_state::SAW_NEG;
+            } else if ('0' == c) {
+              num_state = number_state::LEADING_ZERO;
+            } else if (c >= '1' && c <= '9') {
+              num_state = number_state::WHOLE;
+            } else {
+              return false;
+            }
             break;
-          case string_state::escaped:
-            // in Spark you can allow any char to be escaped, but CUDF
-            // validates it in some cases so we need to also validate it.
-            if (c == 'u') {
-              state   = string_state::escaped_u;
-              u_count = 0;
-            } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' ||
-                       c == 'r' || c == 't') {
-              state = string_state::normal;
+          case number_state::SAW_NEG:
+            if ('0' == c) {
+              num_state = number_state::LEADING_ZERO;
+            } else if (c >= '1' && c <= '9') {
+              num_state = number_state::WHOLE;
+            } else if (allow_nonnumeric && 'I' == c) {
+              return substr_eq(data, start, end, 4, "-INF") ||
+                     substr_eq(data, start, end, 9, "-Infinity");
+            } else {
+              return false;
+            }
+            break;
+          case number_state::LEADING_ZERO:
+            if (allow_numeric_leading_zeros && c >= '0' && c <= '9') {
+              num_state = number_state::WHOLE;
+            } else if ('.' == c) {
+              num_state = number_state::SAW_RADIX;
+            } else if ('e' == c || 'E' == c) {
+              num_state = number_state::START_EXPONENT;
+            } else {
+              return false;
+            }
+            break;
+          case number_state::WHOLE:
+            if (c >= '0' && c <= '9') {
+              num_state = number_state::WHOLE;
+            } else if ('.' == c) {
+              num_state = number_state::SAW_RADIX;
+            } else if ('e' == c || 'E' == c) {
+              num_state = number_state::START_EXPONENT;
+            } else {
+              return false;
+            }
+            break;
+          case number_state::SAW_RADIX:
+            if (c >= '0' && c <= '9') {
+              num_state = number_state::FRACTION;
+            } else if ('e' == c || 'E' == c) {
+              num_state = number_state::START_EXPONENT;
+            } else {
+              return false;
+            }
+            break;
+          case number_state::FRACTION:
+            if (c >= '0' && c <= '9') {
+              num_state = number_state::FRACTION;
+            } else if ('e' == c || 'E' == c) {
+              num_state = number_state::START_EXPONENT;
             } else {
               return false;
             }
             break;
-          case string_state::escaped_u:
-            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
-              u_count++;
-              if (u_count == 4) {
-                state   = string_state::normal;
-                u_count = 0;
-              }
+          case number_state::START_EXPONENT:
+            if ('+' == c || '-' == c) {
+              num_state = number_state::AFTER_SIGN_EXPONENT;
+            } else if (c >= '0' && c <= '9') {
+              num_state = number_state::EXPONENT;
+            } else {
+              return false;
+            }
+            break;
+          case number_state::AFTER_SIGN_EXPONENT:
+            if (c >= '0' && c <= '9') {
+              num_state = number_state::EXPONENT;
+            } else {
+              return false;
+            }
+            break;
+          case number_state::EXPONENT:
+            if (c >= '0' && c <= '9') {
+              num_state = number_state::EXPONENT;
             } else {
               return false;
             }
             break;
         }
       }
-      return string_state::normal == state;
-    };
+      return num_state != number_state::AFTER_SIGN_EXPONENT &&
+             num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG &&
+             num_state != number_state::SAW_RADIX;
+    } else {
+      return false;
+    }
+  };
+
+  auto validate_strings =
+    [data = d_input.data(),
+     allow_unquoted_control_chars =
+       options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start,
+                                                               SymbolOffsetT end) -> bool {
+    // This validates a quoted string. A string must match https://www.json.org/json-en.html
+    // but we already know that it has a starting and ending " and all white space has been
+    // stripped out. Also the base CUDF validation makes sure escaped chars are correct
+    // so we only need to worry about unquoted control chars
+
+    auto state   = string_state::NORMAL;
+    auto u_count = 0;
+    for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
+      auto c = data[idx];
+      if (!allow_unquoted_control_chars && static_cast<int>(c) >= 0 && static_cast<int>(c) < 32) {
+        return false;
+      }
 
-    auto num_tokens = tokens.size();
-    auto count_it   = thrust::make_counting_iterator(0);
-    auto predicate  = [tokens        = tokens.begin(),
-                      token_indices = token_indices.begin(),
-                      validate_values,
-                      validate_strings] __device__(auto i) -> bool {
-      if (tokens[i] == token_t::ValueEnd) {
-        return !validate_values(token_indices[i - 1], token_indices[i]);
-      } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) {
-        return !validate_strings(token_indices[i - 1], token_indices[i]);
+      switch (state) {
+        case string_state::NORMAL:
+          if (c == '\\') { state = string_state::ESCAPED; }
+          break;
+        case string_state::ESCAPED:
+          // in Spark you can allow any char to be escaped, but CUDF
+          // validates it in some cases so we need to also validate it.
+          if (c == 'u') {
+            state   = string_state::ESCAPED_U;
+            u_count = 0;
+          } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' ||
+                     c == 'r' || c == 't') {
+            state = string_state::NORMAL;
+          } else {
+            return false;
+          }
+          break;
+        case string_state::ESCAPED_U:
+          if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
+            u_count++;
+            if (u_count == 4) {
+              state   = string_state::NORMAL;
+              u_count = 0;
+            }
+          } else {
+            return false;
+          }
+          break;
       }
-      return false;
-    };
+    }
+    return string_state::NORMAL == state;
+  };
+
+  auto num_tokens = tokens.size();
+  auto count_it   = thrust::make_counting_iterator(0);
+  auto predicate  = [tokens        = tokens.begin(),
+                    token_indices = token_indices.begin(),
+                    validate_values,
+                    validate_strings] __device__(auto i) -> bool {
+    if (tokens[i] == token_t::ValueEnd) {
+      return !validate_values(token_indices[i - 1], token_indices[i]);
+    } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) {
+      return !validate_strings(token_indices[i - 1], token_indices[i]);
+    }
+    return false;
+  };
 
-    using scan_type            = write_if::scan_type;
-    auto conditional_write     = write_if{tokens.begin(), num_tokens};
-    auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write);
-    auto transform_op          = cuda::proclaim_return_type<scan_type>(
-      [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
-        if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
-        return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
-      });
-    auto binary_op = cuda::proclaim_return_type<scan_type>(
-      [] __device__(scan_type prev, scan_type curr) -> scan_type {
-        auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first);
-        return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second);
-      });
+  using scan_type            = write_if::scan_type;
+  auto conditional_write     = write_if{tokens.begin(), num_tokens};
+  auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write);
+  auto transform_op          = cuda::proclaim_return_type<scan_type>(
+    [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
+      if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
+      return {static_cast<token_t>(tokens[i]), tokens[i] == token_t::LineEnd};
+    });
+  auto binary_op = cuda::proclaim_return_type<scan_type>(
+    [] __device__(scan_type prev, scan_type curr) -> scan_type {
+      auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first);
+      return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second);
+    });
 
-    thrust::transform_inclusive_scan(rmm::exec_policy(stream),
-                                     count_it,
-                                     count_it + num_tokens,
-                                     conditional_output_it,
-                                     transform_op,
-                                     binary_op);  // in-place scan
-  }
+  thrust::transform_inclusive_scan(rmm::exec_policy(stream),
+                                   count_it,
+                                   count_it + num_tokens,
+                                   conditional_output_it,
+                                   transform_op,
+                                   binary_op);  // in-place scan
 }
 }  // namespace detail
 }  // namespace cudf::io::json

From 1d897f710e2dad2431bab98f68ee0b8ce09a2696 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 5 Sep 2024 21:37:09 +0000
Subject: [PATCH 22/33] fix doc

---
 cpp/src/io/json/tabulate_output_iterator.cuh | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh
index 46887beefbd..7cf3655e259 100644
--- a/cpp/src/io/json/tabulate_output_iterator.cuh
+++ b/cpp/src/io/json/tabulate_output_iterator.cuh
@@ -14,7 +14,6 @@
  * limitations under the License.
  */
 
-// Tabulate Output iterator
 #pragma once
 
 #include <thrust/iterator/counting_iterator.h>
@@ -44,7 +43,7 @@ class tabulate_output_iterator_proxy {
 };
 
 /**
- * @brief Transform output iterator with custom binary function which takes index and value.
+ * @brief Tabulate output iterator with custom binary function which takes index and value.
  *
  * @code {.cpp}
  * #include "tabulate_output_iterator.cuh"

From 6bf4d3f3f6e52510f1defd31e70cb3fd7781333f Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Thu, 5 Sep 2024 21:38:24 +0000
Subject: [PATCH 23/33] address review comments

---
 cpp/src/io/json/process_tokens.cu | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 839d9e88c9a..856fdbdb865 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -72,12 +72,9 @@ __device__ inline bool substr_eq(const char* data,
                                  SymbolOffsetT const expected_len,
                                  const char* expected)
 {
-  if (end - start != expected_len) {
-    return false;
-  } else {
-    for (auto idx = 0; idx < expected_len; idx++) {
-      if (data[start + idx] != expected[idx]) { return false; }
-    }
+  if (end - start != expected_len) { return false; }
+  for (auto idx = 0; idx < expected_len; idx++) {
+    if (data[start + idx] != expected[idx]) { return false; }
   }
   return true;
 }

From e093d641a8b081a8ceafd4394354af3c0ca9dd86 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 9 Sep 2024 21:31:53 +0000
Subject: [PATCH 24/33] address review comments

---
 cpp/include/cudf/io/json.hpp      | 27 ++++++++++++++++++++-------
 cpp/src/io/json/process_tokens.cu |  8 ++++----
 cpp/tests/io/json/json_test.cpp   |  4 ++--
 3 files changed, 26 insertions(+), 13 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index fe2cd8603f0..21466216a86 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -20,6 +20,7 @@
 
 #include <cudf/table/table_view.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 #include <rmm/resource_ref.hpp>
@@ -131,7 +132,7 @@ class json_reader_options {
   json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL;
 
   // Validation checks for spark
-  // Should the json validation be strict of not
+  // Should the json validation be strict or not
   bool _strict_validation = false;
   // Allow leading zeros for numeric values.
   bool _allow_numeric_leading_zeros = true;
@@ -503,32 +504,44 @@ class json_reader_options {
    * @brief Set whether leading zeros are allowed in numeric values. Strict validation
    * must be enabled for this to work.
    *
-   * @note: strict_validation must be enabled for this to work.
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
    *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
    */
-  void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; }
+  void allow_numeric_leading_zeros(bool val)
+  {
+    CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
+    _allow_numeric_leading_zeros = val;
+  }
 
   /**
    * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity,
    * Infinity, and -Infinity. Strict validation must be enabled for this to work.
    *
-   * @note: strict_validation must be enabled for this to work.
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
    *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
    */
-  void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; }
+  void allow_nonnumeric_numbers(bool val)
+  {
+    CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
+    _allow_nonnumeric_numbers = val;
+  }
 
   /**
    * @brief Set whether in a quoted string should characters greater than or equal to 0
    * and less than 32 be allowed without some form of escaping. Strict validation must
    * be enabled for this to work.
    *
-   * @note: strict_validation must be enabled for this to work.
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
    *
    * @param val true to indicate whether unquoted control chars are allowed.
    */
-  void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; }
+  void allow_unquoted_control_chars(bool val)
+  {
+    CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work.");
+    _allow_unquoted_control_chars = val;
+  }
 
   /**
    * @brief Sets additional values to recognize as null values.
diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 856fdbdb865..4558dd6014b 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -91,7 +91,7 @@ void validate_token_stream(device_span<char const> d_input,
   cudf::detail::optional_trie trie_na =
     cudf::detail::create_serialized_trie(options.get_na_values(), stream);
   auto trie_na_view = cudf::detail::make_trie_view(trie_na);
-  auto validate_values =
+  auto validate_values = cuda::proclaim_return_type<bool>(
     [data                        = d_input.data(),
      trie_na                     = trie_na_view,
      allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
@@ -217,9 +217,9 @@ void validate_token_stream(device_span<char const> d_input,
     } else {
       return false;
     }
-  };
+  });
 
-  auto validate_strings =
+  auto validate_strings = cuda::proclaim_return_type<bool>(
     [data = d_input.data(),
      allow_unquoted_control_chars =
        options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start,
@@ -268,7 +268,7 @@ void validate_token_stream(device_span<char const> d_input,
       }
     }
     return string_state::NORMAL == state;
-  };
+  });
 
   auto num_tokens = tokens.size();
   auto count_it   = thrust::make_counting_iterator(0);
diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp
index 212538fd891..960c19fce2e 100644
--- a/cpp/tests/io/json/json_test.cpp
+++ b/cpp/tests/io/json/json_test.cpp
@@ -2239,9 +2239,9 @@ TEST_F(JsonReaderTest, ValueValidation)
       cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()})
         .lines(true)
         .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL)
+        .strict_validation(true)
         .numeric_leading_zeros(false)
-        .na_values({"nan"})
-        .strict_validation(true);
+        .na_values({"nan"});
     cudf::io::table_with_metadata result = cudf::io::read_json(in_options);
 
     EXPECT_EQ(result.tbl->num_columns(), 4);

From 00ef690834b62fc165b5800c31f393034e6d0401 Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Mon, 9 Sep 2024 21:35:16 +0000
Subject: [PATCH 25/33] rename lambda name

---
 cpp/src/io/json/process_tokens.cu | 321 +++++++++++++++---------------
 1 file changed, 161 insertions(+), 160 deletions(-)

diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index 4558dd6014b..db30bd9d7e2 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -90,185 +90,186 @@ void validate_token_stream(device_span<char const> d_input,
   using token_t = cudf::io::json::token_t;
   cudf::detail::optional_trie trie_na =
     cudf::detail::create_serialized_trie(options.get_na_values(), stream);
-  auto trie_na_view = cudf::detail::make_trie_view(trie_na);
-  auto validate_values = cuda::proclaim_return_type<bool>(
+  auto trie_na_view            = cudf::detail::make_trie_view(trie_na);
+  auto validate_numeric_values = cuda::proclaim_return_type<bool>(
     [data                        = d_input.data(),
      trie_na                     = trie_na_view,
      allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),
      allow_nonnumeric =
        options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start,
                                                            SymbolOffsetT end) -> bool {
-    // This validates an unquoted value. A value must match https://www.json.org/json-en.html
-    // but the leading and training whitespace should already have been removed, and is not
-    // a string
-    auto c               = data[start];
-    auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
-    if (is_null_literal) {
-      return true;
-    } else if ('n' == c) {
-      return substr_eq(data, start, end, 4, "null");
-    } else if ('t' == c) {
-      return substr_eq(data, start, end, 4, "true");
-    } else if ('f' == c) {
-      return substr_eq(data, start, end, 5, "false");
-    } else if (allow_nonnumeric && c == 'N') {
-      return substr_eq(data, start, end, 3, "NaN");
-    } else if (allow_nonnumeric && c == 'I') {
-      return substr_eq(data, start, end, 8, "Infinity");
-    } else if (allow_nonnumeric && c == '+') {
-      return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity");
-    } else if ('-' == c || c <= '9' && 'c' >= '0') {
-      // number
-      auto num_state = number_state::START;
-      for (auto at = start; at < end; at++) {
-        c = data[at];
-        switch (num_state) {
-          case number_state::START:
-            if ('-' == c) {
-              num_state = number_state::SAW_NEG;
-            } else if ('0' == c) {
-              num_state = number_state::LEADING_ZERO;
-            } else if (c >= '1' && c <= '9') {
-              num_state = number_state::WHOLE;
-            } else {
-              return false;
-            }
-            break;
-          case number_state::SAW_NEG:
-            if ('0' == c) {
-              num_state = number_state::LEADING_ZERO;
-            } else if (c >= '1' && c <= '9') {
-              num_state = number_state::WHOLE;
-            } else if (allow_nonnumeric && 'I' == c) {
-              return substr_eq(data, start, end, 4, "-INF") ||
-                     substr_eq(data, start, end, 9, "-Infinity");
-            } else {
-              return false;
-            }
-            break;
-          case number_state::LEADING_ZERO:
-            if (allow_numeric_leading_zeros && c >= '0' && c <= '9') {
-              num_state = number_state::WHOLE;
-            } else if ('.' == c) {
-              num_state = number_state::SAW_RADIX;
-            } else if ('e' == c || 'E' == c) {
-              num_state = number_state::START_EXPONENT;
-            } else {
-              return false;
-            }
-            break;
-          case number_state::WHOLE:
-            if (c >= '0' && c <= '9') {
-              num_state = number_state::WHOLE;
-            } else if ('.' == c) {
-              num_state = number_state::SAW_RADIX;
-            } else if ('e' == c || 'E' == c) {
-              num_state = number_state::START_EXPONENT;
-            } else {
-              return false;
-            }
-            break;
-          case number_state::SAW_RADIX:
-            if (c >= '0' && c <= '9') {
-              num_state = number_state::FRACTION;
-            } else if ('e' == c || 'E' == c) {
-              num_state = number_state::START_EXPONENT;
-            } else {
-              return false;
-            }
-            break;
-          case number_state::FRACTION:
-            if (c >= '0' && c <= '9') {
-              num_state = number_state::FRACTION;
-            } else if ('e' == c || 'E' == c) {
-              num_state = number_state::START_EXPONENT;
-            } else {
-              return false;
-            }
-            break;
-          case number_state::START_EXPONENT:
-            if ('+' == c || '-' == c) {
-              num_state = number_state::AFTER_SIGN_EXPONENT;
-            } else if (c >= '0' && c <= '9') {
-              num_state = number_state::EXPONENT;
-            } else {
-              return false;
-            }
-            break;
-          case number_state::AFTER_SIGN_EXPONENT:
-            if (c >= '0' && c <= '9') {
-              num_state = number_state::EXPONENT;
-            } else {
-              return false;
-            }
-            break;
-          case number_state::EXPONENT:
-            if (c >= '0' && c <= '9') {
-              num_state = number_state::EXPONENT;
-            } else {
-              return false;
-            }
-            break;
+      // This validates an unquoted value. A value must match https://www.json.org/json-en.html
+      // but the leading and training whitespace should already have been removed, and is not
+      // a string
+      auto c               = data[start];
+      auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start});
+      if (is_null_literal) {
+        return true;
+      } else if ('n' == c) {
+        return substr_eq(data, start, end, 4, "null");
+      } else if ('t' == c) {
+        return substr_eq(data, start, end, 4, "true");
+      } else if ('f' == c) {
+        return substr_eq(data, start, end, 5, "false");
+      } else if (allow_nonnumeric && c == 'N') {
+        return substr_eq(data, start, end, 3, "NaN");
+      } else if (allow_nonnumeric && c == 'I') {
+        return substr_eq(data, start, end, 8, "Infinity");
+      } else if (allow_nonnumeric && c == '+') {
+        return substr_eq(data, start, end, 4, "+INF") ||
+               substr_eq(data, start, end, 9, "+Infinity");
+      } else if ('-' == c || c <= '9' && 'c' >= '0') {
+        // number
+        auto num_state = number_state::START;
+        for (auto at = start; at < end; at++) {
+          c = data[at];
+          switch (num_state) {
+            case number_state::START:
+              if ('-' == c) {
+                num_state = number_state::SAW_NEG;
+              } else if ('0' == c) {
+                num_state = number_state::LEADING_ZERO;
+              } else if (c >= '1' && c <= '9') {
+                num_state = number_state::WHOLE;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::SAW_NEG:
+              if ('0' == c) {
+                num_state = number_state::LEADING_ZERO;
+              } else if (c >= '1' && c <= '9') {
+                num_state = number_state::WHOLE;
+              } else if (allow_nonnumeric && 'I' == c) {
+                return substr_eq(data, start, end, 4, "-INF") ||
+                       substr_eq(data, start, end, 9, "-Infinity");
+              } else {
+                return false;
+              }
+              break;
+            case number_state::LEADING_ZERO:
+              if (allow_numeric_leading_zeros && c >= '0' && c <= '9') {
+                num_state = number_state::WHOLE;
+              } else if ('.' == c) {
+                num_state = number_state::SAW_RADIX;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::START_EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::WHOLE:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::WHOLE;
+              } else if ('.' == c) {
+                num_state = number_state::SAW_RADIX;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::START_EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::SAW_RADIX:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::FRACTION;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::START_EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::FRACTION:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::FRACTION;
+              } else if ('e' == c || 'E' == c) {
+                num_state = number_state::START_EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::START_EXPONENT:
+              if ('+' == c || '-' == c) {
+                num_state = number_state::AFTER_SIGN_EXPONENT;
+              } else if (c >= '0' && c <= '9') {
+                num_state = number_state::EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::AFTER_SIGN_EXPONENT:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+            case number_state::EXPONENT:
+              if (c >= '0' && c <= '9') {
+                num_state = number_state::EXPONENT;
+              } else {
+                return false;
+              }
+              break;
+          }
         }
+        return num_state != number_state::AFTER_SIGN_EXPONENT &&
+               num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG &&
+               num_state != number_state::SAW_RADIX;
+      } else {
+        return false;
       }
-      return num_state != number_state::AFTER_SIGN_EXPONENT &&
-             num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG &&
-             num_state != number_state::SAW_RADIX;
-    } else {
-      return false;
-    }
-  });
+    });
 
   auto validate_strings = cuda::proclaim_return_type<bool>(
     [data = d_input.data(),
      allow_unquoted_control_chars =
        options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start,
                                                                SymbolOffsetT end) -> bool {
-    // This validates a quoted string. A string must match https://www.json.org/json-en.html
-    // but we already know that it has a starting and ending " and all white space has been
-    // stripped out. Also the base CUDF validation makes sure escaped chars are correct
-    // so we only need to worry about unquoted control chars
+      // This validates a quoted string. A string must match https://www.json.org/json-en.html
+      // but we already know that it has a starting and ending " and all white space has been
+      // stripped out. Also the base CUDF validation makes sure escaped chars are correct
+      // so we only need to worry about unquoted control chars
 
-    auto state   = string_state::NORMAL;
-    auto u_count = 0;
-    for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
-      auto c = data[idx];
-      if (!allow_unquoted_control_chars && static_cast<int>(c) >= 0 && static_cast<int>(c) < 32) {
-        return false;
-      }
+      auto state   = string_state::NORMAL;
+      auto u_count = 0;
+      for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
+        auto c = data[idx];
+        if (!allow_unquoted_control_chars && static_cast<int>(c) >= 0 && static_cast<int>(c) < 32) {
+          return false;
+        }
 
-      switch (state) {
-        case string_state::NORMAL:
-          if (c == '\\') { state = string_state::ESCAPED; }
-          break;
-        case string_state::ESCAPED:
-          // in Spark you can allow any char to be escaped, but CUDF
-          // validates it in some cases so we need to also validate it.
-          if (c == 'u') {
-            state   = string_state::ESCAPED_U;
-            u_count = 0;
-          } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' ||
-                     c == 'r' || c == 't') {
-            state = string_state::NORMAL;
-          } else {
-            return false;
-          }
-          break;
-        case string_state::ESCAPED_U:
-          if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
-            u_count++;
-            if (u_count == 4) {
-              state   = string_state::NORMAL;
+        switch (state) {
+          case string_state::NORMAL:
+            if (c == '\\') { state = string_state::ESCAPED; }
+            break;
+          case string_state::ESCAPED:
+            // in Spark you can allow any char to be escaped, but CUDF
+            // validates it in some cases so we need to also validate it.
+            if (c == 'u') {
+              state   = string_state::ESCAPED_U;
               u_count = 0;
+            } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' ||
+                       c == 'r' || c == 't') {
+              state = string_state::NORMAL;
+            } else {
+              return false;
             }
-          } else {
-            return false;
-          }
-          break;
+            break;
+          case string_state::ESCAPED_U:
+            if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) {
+              u_count++;
+              if (u_count == 4) {
+                state   = string_state::NORMAL;
+                u_count = 0;
+              }
+            } else {
+              return false;
+            }
+            break;
+        }
       }
-    }
-    return string_state::NORMAL == state;
-  });
+      return string_state::NORMAL == state;
+    });
 
   auto num_tokens = tokens.size();
   auto count_it   = thrust::make_counting_iterator(0);

From cecb42fe276783582f1f958c4fa88afce612e5ad Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Mon, 9 Sep 2024 22:05:03 -0500
Subject: [PATCH 26/33] Apply suggestions from code review

Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 cpp/include/cudf/io/json.hpp | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 21466216a86..faa871638ba 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -133,10 +133,11 @@ class json_reader_options {
 
   // Validation checks for spark
   // Should the json validation be strict or not
+  // Note: strict validation enforces the JSON specification https://www.json.org/json-en.html
   bool _strict_validation = false;
   // Allow leading zeros for numeric values.
   bool _allow_numeric_leading_zeros = true;
-  // Allow nonnumeric numbers. NaN/Inf
+  // Allow non-numeric numbers: NaN, +INF, -INF, +Infinity, Infinity, -Infinity
   bool _allow_nonnumeric_numbers = true;
   // Allow unquoted control characters
   bool _allow_unquoted_control_chars = true;
@@ -321,10 +322,9 @@ class json_reader_options {
   [[nodiscard]] bool is_strict_validation() const { return _strict_validation; }
 
   /**
-   * @brief Whether leading zeros are allowed in numeric values. strict validation
-   * must be enabled for this to work.
+   * @brief Whether leading zeros are allowed in numeric values.
    *
-   * @note: strict_validation must be enabled for this to work.
+   * @note: This validation is enforced only if strict validation is enabled.
    *
    * @return true if leading zeros are allowed in numeric values
    */
@@ -335,9 +335,9 @@ class json_reader_options {
 
   /**
    * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity,
-   * and -Infinity. strict validation must be enabled for this to work.
+   * and -Infinity.
    *
-   * @note: strict_validation must be enabled for this to work.
+   * @note: This validation is enforced only if strict validation is enabled.
    *
    * @return true if leading zeros are allowed in numeric values
    */
@@ -345,9 +345,9 @@ class json_reader_options {
 
   /**
    * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32
-   * be allowed without some form of escaping. Strict validation must be enabled for this to work.
+   * be allowed without some form of escaping.
    *
-   * @note: strict_validation must be enabled for this to work.
+   * @note: This validation is enforced only if strict validation is enabled.
    *
    * @return true if unquoted control chars are allowed.
    */

From 9cd30984ccdc93d2bb1e8192b6166bfc53bd96ac Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Mon, 9 Sep 2024 22:05:21 -0500
Subject: [PATCH 27/33] Apply suggestions from code review

Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 java/src/main/java/ai/rapids/cudf/JSONOptions.java | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
index 4fcaf1e4a6b..c8308ca17ec 100644
--- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java
+++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java
@@ -34,8 +34,8 @@ public final class JSONOptions extends ColumnFilterOptions {
   private final boolean normalizeWhitespace;
   private final boolean mixedTypesAsStrings;
   private final boolean keepStringQuotes;
-  private final boolean allowLeadingZeros;
   private final boolean strictValidation;
+  private final boolean allowLeadingZeros;
   private final boolean allowNonNumericNumbers;
   private final boolean allowUnquotedControlChars;
 

From c816c73acb71cd8f6dfe64bccfd4290fbe6946aa Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 10 Sep 2024 03:06:56 +0000
Subject: [PATCH 28/33] update docs

---
 cpp/include/cudf/io/json.hpp      | 6 +++---
 cpp/src/io/json/process_tokens.cu | 4 ++--
 2 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index faa871638ba..d88f1c0f74c 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -776,7 +776,7 @@ class json_reader_options_builder {
    * @brief Set Whether leading zeros are allowed in numeric values. strict validation must
    * be enabled for this to have any effect.
    *
-   * @note: strict_validation must be enabled for this to work.
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
    *
    * @param val Boolean value to indicate whether leading zeros are allowed in numeric values
    * @return this for chaining
@@ -792,7 +792,7 @@ class json_reader_options_builder {
    * +INF, -INF, +Infinity, Infinity, and -Infinity.
    * strict validation must be enabled for this to have any effect.
    *
-   * @note: strict_validation must be enabled for this to work.
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
    *
    * @param val Boolean value to indicate if unquoted nonnumeric values are valid json or not.
    * @return this for chaining
@@ -807,7 +807,7 @@ class json_reader_options_builder {
    * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without
    * some form of escaping. strict validation must be enabled for this to have any effect.
    *
-   * @note: strict_validation must be enabled for this to work.
+   * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
    *
    * @param val Boolean value to indicate if unquoted control chars are allowed or not.
    * @return this for chaining
diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu
index db30bd9d7e2..83c7b663980 100644
--- a/cpp/src/io/json/process_tokens.cu
+++ b/cpp/src/io/json/process_tokens.cu
@@ -90,8 +90,8 @@ void validate_token_stream(device_span<char const> d_input,
   using token_t = cudf::io::json::token_t;
   cudf::detail::optional_trie trie_na =
     cudf::detail::create_serialized_trie(options.get_na_values(), stream);
-  auto trie_na_view            = cudf::detail::make_trie_view(trie_na);
-  auto validate_numeric_values = cuda::proclaim_return_type<bool>(
+  auto trie_na_view    = cudf::detail::make_trie_view(trie_na);
+  auto validate_values = cuda::proclaim_return_type<bool>(
     [data                        = d_input.data(),
      trie_na                     = trie_na_view,
      allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(),

From 53db70330afeb9b880f98fc708daac61ec74f1aa Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 9 Sep 2024 22:06:14 -0600
Subject: [PATCH 29/33] Update cpp/include/cudf/io/json.hpp

---
 cpp/include/cudf/io/json.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index d88f1c0f74c..6482bb7e7fd 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -773,7 +773,7 @@ class json_reader_options_builder {
   }
 
   /**
-   * @brief Set Whether leading zeros are allowed in numeric values. strict validation must
+   * @brief Set Whether leading zeros are allowed in numeric values. Strict validation must
    * be enabled for this to have any effect.
    *
    * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.

From c3832b6cfa8aa96f8a8fc024fc5540d354c87e55 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 9 Sep 2024 22:06:21 -0600
Subject: [PATCH 30/33] Update cpp/include/cudf/io/json.hpp

---
 cpp/include/cudf/io/json.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 6482bb7e7fd..7606ed1db83 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -790,7 +790,7 @@ class json_reader_options_builder {
   /**
    * @brief Set whether specific unquoted number values are valid JSON. The values are NaN,
    * +INF, -INF, +Infinity, Infinity, and -Infinity.
-   * strict validation must be enabled for this to have any effect.
+   * Strict validation must be enabled for this to have any effect.
    *
    * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
    *

From 070263e86d86e21a3bc0a584087187a583192758 Mon Sep 17 00:00:00 2001
From: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
Date: Mon, 9 Sep 2024 22:06:26 -0600
Subject: [PATCH 31/33] Update cpp/include/cudf/io/json.hpp

---
 cpp/include/cudf/io/json.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp
index 7606ed1db83..f52ec926f49 100644
--- a/cpp/include/cudf/io/json.hpp
+++ b/cpp/include/cudf/io/json.hpp
@@ -805,7 +805,7 @@ class json_reader_options_builder {
 
   /**
    * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without
-   * some form of escaping. strict validation must be enabled for this to have any effect.
+   * some form of escaping. Strict validation must be enabled for this to have any effect.
    *
    * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option.
    *

From fb0e85f53dd569dde12e6321ea5f5655cdaa100d Mon Sep 17 00:00:00 2001
From: Karthikeyan Natarajan <karthikeyann@users.noreply.github.com>
Date: Tue, 10 Sep 2024 19:50:47 +0000
Subject: [PATCH 32/33] fix strict_validation dependent options with if

---
 java/src/main/native/src/TableJni.cpp | 33 ++++++++++++++++-----------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index 2238195fa9d..f265547d6a7 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1646,11 +1646,12 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
         .strict_validation(strict_validation)
-        .numeric_leading_zeros(allow_leading_zeros)
-        .nonnumeric_numbers(allow_nonnumeric_numbers)
-        .unquoted_control_chars(allow_unquoted_control)
         .keep_quotes(keep_quotes);
-
+    if (strict_validation) {
+      opt.numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control);
+    }
     auto result =
       std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
 
@@ -1697,11 +1698,13 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
         .normalize_single_quotes(static_cast<bool>(normalize_single_quotes))
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .strict_validation(strict_validation)
-        .numeric_leading_zeros(allow_leading_zeros)
-        .nonnumeric_numbers(allow_nonnumeric_numbers)
-        .unquoted_control_chars(allow_unquoted_control)
         .mixed_types_as_string(mixed_types_as_string)
         .keep_quotes(keep_quotes);
+    if (strict_validation) {
+      opt.numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control);
+    }
 
     auto result =
       std::make_unique<cudf::io::table_with_metadata>(cudf::io::read_json(opts.build()));
@@ -1845,10 +1848,12 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
         .strict_validation(strict_validation)
-        .numeric_leading_zeros(allow_leading_zeros)
-        .nonnumeric_numbers(allow_nonnumeric_numbers)
-        .unquoted_control_chars(allow_unquoted_control)
         .keep_quotes(keep_quotes);
+    if (strict_validation) {
+      opt.numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control);
+    }
 
     if (!n_types.is_null()) {
       if (n_types.size() != n_scales.size()) {
@@ -1952,10 +1957,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
         .normalize_whitespace(static_cast<bool>(normalize_whitespace))
         .mixed_types_as_string(mixed_types_as_string)
         .strict_validation(strict_validation)
-        .numeric_leading_zeros(allow_leading_zeros)
-        .nonnumeric_numbers(allow_nonnumeric_numbers)
-        .unquoted_control_chars(allow_unquoted_control)
         .keep_quotes(keep_quotes);
+    if (strict_validation) {
+      opt.numeric_leading_zeros(allow_leading_zeros)
+        .nonnumeric_numbers(allow_nonnumeric_numbers)
+        .unquoted_control_chars(allow_unquoted_control);
+    }
 
     if (!n_types.is_null()) {
       if (n_types.size() != n_scales.size()) {

From 252c38b660be40bb37cc93cabba005cf4a344db1 Mon Sep 17 00:00:00 2001
From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com>
Date: Tue, 10 Sep 2024 17:03:04 -0500
Subject: [PATCH 33/33] fix typo

---
 java/src/main/native/src/TableJni.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp
index a687f0bdb79..40a111209b0 100644
--- a/java/src/main/native/src/TableJni.cpp
+++ b/java/src/main/native/src/TableJni.cpp
@@ -1649,7 +1649,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env,
         .strict_validation(strict_validation)
         .keep_quotes(keep_quotes);
     if (strict_validation) {
-      opt.numeric_leading_zeros(allow_leading_zeros)
+      opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
         .unquoted_control_chars(allow_unquoted_control);
     }
@@ -1702,7 +1702,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env,
         .mixed_types_as_string(mixed_types_as_string)
         .keep_quotes(keep_quotes);
     if (strict_validation) {
-      opt.numeric_leading_zeros(allow_leading_zeros)
+      opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
         .unquoted_control_chars(allow_unquoted_control);
     }
@@ -1851,7 +1851,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env,
         .strict_validation(strict_validation)
         .keep_quotes(keep_quotes);
     if (strict_validation) {
-      opt.numeric_leading_zeros(allow_leading_zeros)
+      opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
         .unquoted_control_chars(allow_unquoted_control);
     }
@@ -1960,7 +1960,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env,
         .strict_validation(strict_validation)
         .keep_quotes(keep_quotes);
     if (strict_validation) {
-      opt.numeric_leading_zeros(allow_leading_zeros)
+      opts.numeric_leading_zeros(allow_leading_zeros)
         .nonnumeric_numbers(allow_nonnumeric_numbers)
         .unquoted_control_chars(allow_unquoted_control);
     }