From bb991ef6e37a65a6b5a05203e71cc1ae8d4444d3 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 11 Jun 2024 08:49:09 +0000 Subject: [PATCH 01/33] validation of tokens code --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/io/json.hpp | 59 +++++++++ cpp/src/io/json/nested_json.hpp | 16 +++ cpp/src/io/json/nested_json_gpu.cu | 5 +- cpp/src/io/json/output_writer_iterator.h | 140 ++++++++++++++++++++ cpp/src/io/json/process_tokens.cu | 161 +++++++++++++++++++++++ cpp/tests/io/json_test.cpp | 78 +++++++++++ 7 files changed, 459 insertions(+), 1 deletion(-) create mode 100644 cpp/src/io/json/output_writer_iterator.h create mode 100644 cpp/src/io/json/process_tokens.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index ca85996b990..cf41b851970 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -392,6 +392,7 @@ add_library( src/io/json/nested_json_gpu.cu src/io/json/read_json.cu src/io/json/parser_features.cpp + src/io/json/process_tokens.cu src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 65ba8f25577..7525c67fa4d 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -129,6 +129,14 @@ class json_reader_options { // Whether to recover after an invalid JSON line json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; + // Validation checks for spark + // Allow leading zeros for numeric values. + bool _allow_numeric_leading_zeros = true; + // Allow unquoted control characters + bool allowUnquotedControlChars = true; + // Additional values to recognize as null values + std::vector _na_values; + /** * @brief Constructor from source info. * @@ -298,6 +306,20 @@ class json_reader_options { */ json_recovery_mode_t recovery_mode() const { return _recovery_mode; } + /** + * @brief Whether leading zeros are allowed in numeric values. + * + * @return true if leading zeros are allowed in numeric values + */ + [[nodiscard]] bool is_allowed_numeric_leading_zeros() const { return _allow_numeric_leading_zeros; } + + /** + * @brief Returns additional values to recognize as null values. + * + * @return Additional values to recognize as null values + */ + [[nodiscard]] std::vector const& get_na_values() const { return _na_values; } + /** * @brief Set data types for columns to be read. * @@ -427,6 +449,20 @@ class json_reader_options { * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. */ void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } + + /** + * @brief Set Whether leading zeros are allowed in numeric values. + * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + */ + void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; } + + /** + * @brief Sets additional values to recognize as null values. + * + * @param vals Vector of values to be considered to be null + */ + void set_na_values(std::vector vals) { _na_values = std::move(vals); } }; /** @@ -638,6 +674,29 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set Whether leading zeros are allowed in numeric values. + * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + */ + json_reader_options_builder& numeric_leading_zeros(bool val) + { + options.allow_numeric_leading_zeros(val); + return *this; + } + + /** + * @brief Sets additional values to recognize as null values. + * + * @param vals Vector of values to be considered to be null + * @return this for chaining + */ + json_reader_options_builder& na_values(std::vector vals) + { + options.set_na_values(std::move(vals)); + return *this; + } + /** * @brief move json_reader_options member once it's built. */ diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index e12892a2d50..e858594e4b0 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -221,6 +221,22 @@ std::pair, rmm::device_uvector> pr device_span token_indices, rmm::cuda_stream_view stream); +/** + * @brief Validate the tokens conforming to behavior given in options. + * + * @param d_input The string of input characters + * @param tokens The tokens to be post-processed + * @param token_indices The tokens' corresponding indices that are post-processed + * @param options Parsing options specifying the parsing behaviour + * @param stream The cuda stream to dispatch GPU kernels to + */ +void validate_token_stream( + device_span d_input, + device_span tokens, + device_span token_indices, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream); + /** * @brief Parses the given JSON string and generates a tree representation of the given input. * diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index b243e4ba006..eeea2a0edf3 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1655,6 +1655,7 @@ std::pair, rmm::device_uvector> ge if (delimiter_offset == 1) { tokens.set_element(0, token_t::LineEnd, stream); + validate_token_stream(json_in, tokens, tokens_indices, options, stream); auto [filtered_tokens, filtered_tokens_indices] = process_token_stream(tokens, tokens_indices, stream); tokens = std::move(filtered_tokens); @@ -2079,7 +2080,9 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt parse_opts.keepquotes = options.is_enabled_keep_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + std::vector na_values{"", "null"}; + na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); + parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); return parse_opts; } diff --git a/cpp/src/io/json/output_writer_iterator.h b/cpp/src/io/json/output_writer_iterator.h new file mode 100644 index 00000000000..20228b4efac --- /dev/null +++ b/cpp/src/io/json/output_writer_iterator.h @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +// Output writer iterator +#pragma once + +#include + +THRUST_NAMESPACE_BEGIN +namespace detail { + +// Proxy reference that calls BinaryFunction with Iterator value and the rhs of assignment operator +template +class output_writer_iterator_proxy { + public: + __host__ __device__ output_writer_iterator_proxy(const Iterator& index_iter, BinaryFunction fun) + : index_iter(index_iter), fun(fun) + { + } + template + __host__ __device__ output_writer_iterator_proxy operator=(const T& x) + { + fun(*index_iter, x); + return *this; + } + + private: + Iterator index_iter; + BinaryFunction fun; +}; + +// Register output_writer_iterator_proxy with 'is_proxy_reference' from +// type_traits to enable its use with algorithms. +template +struct is_proxy_reference> + : public thrust::detail::true_type { +}; + +} // namespace detail + +/** + * @brief Transform output iterator with custom writer binary function which takes index and value. + * + * @code {.cpp} + * #include + * #include + * #include + * #include + * + * struct set_bits_field { + * int* bitfield; + * __device__ inline void set_bit(size_t bit_index) + * { + * atomicOr(&bitfield[bit_index/32], (int{1} << (bit_index % 32))); + * } + * __device__ inline void clear_bit(size_t bit_index) + * { + * atomicAnd(&bitfield[bit_index / 32], ~(int{1} << (bit_index % 32))); + * } + * // Index, value + * __device__ void operator()(size_t i, bool x) + * { + * if (x) + * set_bit(i); + * else + * clear_bit(i); + * } + * }; + * + * thrust::device_vector v(1, 0x00000000); + * auto result_begin = thrust::make_output_writer_iterator(thrust::make_counting_iterator(0), + * set_bits_field{v.data().get()}); + * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0), + * [] __device__ (int x) { return x%2; }); + * thrust::copy(thrust::device, value, value+32, result_begin); + * assert(v[0] == 0xaaaaaaaa); + * @endcode + * + * + * @tparam BinaryFunction Binary function to be called with the Iterator value and the rhs of + * assignment operator. + * @tparam Iterator iterator type that acts as index of the output. + */ +template +class output_writer_iterator + : public thrust::iterator_adaptor< + output_writer_iterator, + Iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + thrust::detail::output_writer_iterator_proxy> { + public: + // parent class. + typedef thrust::iterator_adaptor< + output_writer_iterator, + Iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + thrust::detail::output_writer_iterator_proxy> + super_t; + // friend thrust::iterator_core_access to allow it access to the private interface dereference() + friend class thrust::iterator_core_access; + __host__ __device__ output_writer_iterator(Iterator const& x, BinaryFunction fun) + : super_t(x), fun(fun) + { + } + + private: + BinaryFunction fun; + + // thrust::iterator_core_access accesses this function + __host__ __device__ typename super_t::reference dereference() const + { + return thrust::detail::output_writer_iterator_proxy( + this->base_reference(), fun); + } +}; + +template +output_writer_iterator __host__ __device__ +make_output_writer_iterator(Iterator out, BinaryFunction fun) +{ + return output_writer_iterator(out, fun); +} // end make_output_writer_iterator +THRUST_NAMESPACE_END \ No newline at end of file diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu new file mode 100644 index 00000000000..d1ed02052dc --- /dev/null +++ b/cpp/src/io/json/process_tokens.cu @@ -0,0 +1,161 @@ + +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nested_json.hpp" +// #include "tabulate_output_iterator.cuh" +#include "output_writer_iterator.h" + +#include +#include + +#include +#include + +#include +#include +#include +#include + +#include + +namespace cudf::io::json { +namespace detail { + +struct write_if { + using token_t = cudf::io::json::token_t; + using scan_type = thrust::pair; + PdaTokenT* tokens; + size_t n; + // Index, value + __device__ void operator()(size_type i, scan_type x) + { + if (i == n - 1 or tokens[i + 1] == token_t::LineEnd) { + if (x.first == token_t::ErrorBegin and tokens[i] != token_t::ErrorBegin) { + tokens[i] = token_t::ErrorBegin; + // printf("writing\n"); + } + } + } +}; + +void validate_token_stream(device_span d_input, + device_span tokens, + device_span token_indices, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream) +{ + if (getenv("SPARK_JSON")) { + using token_t = cudf::io::json::token_t; + auto validate_tokens = + [data = d_input.data(), + allow_numeric_leading_zeros = + options.is_allowed_numeric_leading_zeros()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // Leading zeros. + if (!allow_numeric_leading_zeros and data[start] == '0') return false; + return true; + }; + auto num_tokens = tokens.size(); + auto count_it = thrust::make_counting_iterator(0); + auto predicate = [tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_tokens] __device__(auto i) -> bool { + if (tokens[i] == token_t::ValueEnd) { + return !validate_tokens(token_indices[i - 1], token_indices[i]); + } + return false; + }; + + using scan_type = write_if::scan_type; + auto conditional_write = write_if{tokens.begin(), num_tokens}; + // auto conditional_output_it = tokens.begin(); + // auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write); + auto conditional_output_it = + thrust::make_output_writer_iterator(thrust::make_counting_iterator(0), conditional_write); + auto transform_op = cuda::proclaim_return_type( + [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; + }); + auto binary_op = cuda::proclaim_return_type( + [] __device__(scan_type prev, scan_type curr) -> scan_type { + auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); + return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); + }); + rmm::device_uvector error(num_tokens, stream); + thrust::transform(rmm::exec_policy(stream), + count_it, + count_it + num_tokens, + error.begin(), + predicate); // in-place scan + printf("error:"); + for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) + printf("%d ", tk); + printf("\n"); + + thrust::transform_inclusive_scan(rmm::exec_policy(stream), + count_it, + count_it + num_tokens, + conditional_output_it, + transform_op, + binary_op); // in-place scan + } + printf("pre_process_token:"); + for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), stream)) + printf("%d ", tk); + printf("\n"); + + // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE + // 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 + // 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 4 + // auto unary_op = [] __device__ (auto) -> token_t { return token_t::ErrorBegin; }; + // thrust::transform_if(rmm::exec_policy(stream), count_it, count_it + num_tokens, tokens.begin(), + // unary_op, predicate); auto num_rows = thrust::count(rmm::exec_policy(stream), tokens.begin(), + // tokens.end(), token_t::LineEnd); rmm::device_uvector row_is_error(num_rows, stream); + // rmm::device_uvector row_index(num_tokens, stream); + // auto is_LineEnd = [] __device__ (auto token) -> SymbolOffsetT { return token == + // token_t::LineEnd; }; thrust::transform_inclusive_scan(rmm::exec_policy(stream), + // tokens.begin(), tokens.end(), row_index.begin(), is_LineEnd, thrust::plus{}); + // auto is_error_it = thrust::make_transform_iterator(tokens.begin(), [] __device__ (auto token) + // -> bool { return token == token_t::ErrorBegin; }); + // thrust::reduce_by_key(rmm::exec_policy(stream), row_index.begin(), row_index.end(), + // is_error_it, thrust::make_discard_iterator(), row_is_error.begin()); + + // if current == ErrorBegin and tokens[i+1]==LE or i==n-1) then write ErrorBegin to tokens[i], + // else nothing. if VB, or SB, then if validate(token[i], token[i+1])==false, + // + // Transform_if to errors tokens + // Count LE (num_rows) + // create int vector [num_rows], bool[num_rows] + // TODO: fuse them together with single scan algorithm. + // LE==1, to scan to row_index. + // reduce_by_key, to check if it has any error, and number of non-errors tokens. + // reduce them to get output tokens count. + // copy_if -> use cub cub::DeviceSelect::If(d_data, d_num_selected_out, num_items, select_op) +} + +// corner cases: empty LE, +// alternate Decoupled look back idea: +// count LineEnd, allocate bool[num_rows] as is_error. +// decoupled look back for LineEnd tokens for row indices, +// transform_if to error tokens and write to bool[row_index] atomically (reduce and write within +// warp/block?) +// decoupled look back for LineEnd tokens for row indices, +// (if not error row & not lineEnd token) -> decoupled look back for output indices, +// CopyIf (if not error row & not lineEnd token) write to output. +} // namespace detail +} // namespace cudf::io::json \ No newline at end of file diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 57aa2721756..5e51a1c759a 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2161,6 +2161,84 @@ TEST_F(JsonReaderTest, JSONLinesRecoveringSync) cudf::io::set_host_memory_resource(last_mr); } +// Validation +TEST_F(JsonReaderTest, ValueValidation) +{ + // parsing error as null rows + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2 }{})" + "\n" + // 1 -> (invalid) + R"({"b":{}should_be_invalid})" + "\n" + // 2 -> b (valid) + R"({"b":{"a":3} })" + "\n" + // 3 -> c: (valid/null based on option) + R"({"a": 1, "c":nan, "d": "null" } )" + "\n" + "\n" + // 4 -> (valid/null based on option) + R"({"a":04, "c": 1.23, "d": "abc"} 123)" + "\n" + // 5 -> (valid) + R"({"a":5}//Comment after record)" + "\n" + // 6 -> ((valid/null based on option) + R"({"a":06} //Comment after whitespace)" + "\n" + // 7 -> (invalid) + R"({"a":5 //Invalid Comment within record})"; + + // leadingZeros allowed + // na_values, + { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 4); + EXPECT_EQ(result.tbl->num_rows(), 8); + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, + {true, false, false, true, true, true, true, false}}; + auto b_column = cudf::test::structs_column_wrapper( + {b_a_col}, {false, false, true, false, false, false, false, false}); + auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0}, + {false, false, false, false, true, false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column); + } + // leadingZeros not allowed, NaN allowed + { + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .numeric_leading_zeros(false) + .na_values({"nan"}); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 4); + EXPECT_EQ(result.tbl->num_rows(), 8); + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8); // empty column + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, + {true, false, false, true, false, true, false, false}}; + auto b_column = cudf::test::structs_column_wrapper( + {b_a_col}, {false, false, true, false, false, false, false, false}); + auto c_column = int8_wrapper({0, 0, 0, 0, 0, 0, 0, 0}, + {false, false, false, false, false, false, false, false}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column); + } +} + TEST_F(JsonReaderTest, MixedTypes) { using LCWS = cudf::test::lists_column_wrapper; From 4e707cb706fe6044451dfce683cbe38bc410c0d6 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 18 Jun 2024 02:10:57 +0000 Subject: [PATCH 02/33] fix pre-commit check failures --- cpp/include/cudf/io/json.hpp | 6 +++++- cpp/src/io/json/nested_json.hpp | 13 ++++++------- cpp/src/io/json/nested_json_gpu.cu | 2 +- cpp/src/io/json/output_writer_iterator.h | 7 +++---- cpp/src/io/json/process_tokens.cu | 2 +- cpp/tests/io/json_test.cpp | 10 +++++----- 6 files changed, 21 insertions(+), 19 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7525c67fa4d..6fc90865666 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -311,7 +311,10 @@ class json_reader_options { * * @return true if leading zeros are allowed in numeric values */ - [[nodiscard]] bool is_allowed_numeric_leading_zeros() const { return _allow_numeric_leading_zeros; } + [[nodiscard]] bool is_allowed_numeric_leading_zeros() const + { + return _allow_numeric_leading_zeros; + } /** * @brief Returns additional values to recognize as null values. @@ -678,6 +681,7 @@ class json_reader_options_builder { * @brief Set Whether leading zeros are allowed in numeric values. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + * @return this for chaining */ json_reader_options_builder& numeric_leading_zeros(bool val) { diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index e858594e4b0..6d80b41c907 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -223,19 +223,18 @@ std::pair, rmm::device_uvector> pr /** * @brief Validate the tokens conforming to behavior given in options. - * + * * @param d_input The string of input characters * @param tokens The tokens to be post-processed * @param token_indices The tokens' corresponding indices that are post-processed * @param options Parsing options specifying the parsing behaviour * @param stream The cuda stream to dispatch GPU kernels to */ -void validate_token_stream( - device_span d_input, - device_span tokens, - device_span token_indices, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream); +void validate_token_stream(device_span d_input, + device_span tokens, + device_span token_indices, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream); /** * @brief Parses the given JSON string and generates a tree representation of the given input. diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index eeea2a0edf3..a7e700f32ec 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -2082,7 +2082,7 @@ cudf::io::parse_options parsing_options(cudf::io::json_reader_options const& opt parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); std::vector na_values{"", "null"}; na_values.insert(na_values.end(), options.get_na_values().begin(), options.get_na_values().end()); - parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie(na_values, stream); return parse_opts; } diff --git a/cpp/src/io/json/output_writer_iterator.h b/cpp/src/io/json/output_writer_iterator.h index 20228b4efac..f6276771af4 100644 --- a/cpp/src/io/json/output_writer_iterator.h +++ b/cpp/src/io/json/output_writer_iterator.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -46,8 +46,7 @@ class output_writer_iterator_proxy { // type_traits to enable its use with algorithms. template struct is_proxy_reference> - : public thrust::detail::true_type { -}; + : public thrust::detail::true_type {}; } // namespace detail @@ -137,4 +136,4 @@ make_output_writer_iterator(Iterator out, BinaryFunction fun) { return output_writer_iterator(out, fun); } // end make_output_writer_iterator -THRUST_NAMESPACE_END \ No newline at end of file +THRUST_NAMESPACE_END diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index d1ed02052dc..96ae4f7817b 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -158,4 +158,4 @@ void validate_token_stream(device_span d_input, // (if not error row & not lineEnd token) -> decoupled look back for output indices, // CopyIf (if not error row & not lineEnd token) write to output. } // namespace detail -} // namespace cudf::io::json \ No newline at end of file +} // namespace cudf::io::json diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 5e51a1c759a..1d51c571c9a 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2202,9 +2202,9 @@ TEST_F(JsonReaderTest, ValueValidation) EXPECT_EQ(result.tbl->num_columns(), 4); EXPECT_EQ(result.tbl->num_rows(), 8); - auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); - auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, - {true, false, false, true, true, true, true, false}}; + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = + int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, {true, false, false, true, true, true, true, false}}; auto b_column = cudf::test::structs_column_wrapper( {b_a_col}, {false, false, true, false, false, false, false, false}); auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0}, @@ -2225,14 +2225,14 @@ TEST_F(JsonReaderTest, ValueValidation) EXPECT_EQ(result.tbl->num_columns(), 4); EXPECT_EQ(result.tbl->num_rows(), 8); - EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8); // empty column + EXPECT_EQ(result.tbl->get_column(2).type().id(), cudf::type_id::INT8); // empty column auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); auto a_column = int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, {true, false, false, true, false, true, false, false}}; auto b_column = cudf::test::structs_column_wrapper( {b_a_col}, {false, false, true, false, false, false, false, false}); auto c_column = int8_wrapper({0, 0, 0, 0, 0, 0, 0, 0}, - {false, false, false, false, false, false, false, false}); + {false, false, false, false, false, false, false, false}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), a_column); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), b_column); CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(2), c_column); From 0c2e4da8ac1b594af3cf68a75d8bf675bdad5e0c Mon Sep 17 00:00:00 2001 From: "Robert (Bobby) Evans" Date: Fri, 2 Aug 2024 16:13:48 -0500 Subject: [PATCH 03/33] Add Spark Compatible JSON validation (#10) --- cpp/include/cudf/io/json.hpp | 99 ++++++- cpp/src/io/json/process_tokens.cu | 250 ++++++++++++++++-- .../main/java/ai/rapids/cudf/JSONOptions.java | 67 ++++- java/src/main/java/ai/rapids/cudf/Table.java | 73 ++++- java/src/main/native/src/TableJni.cpp | 36 ++- .../test/java/ai/rapids/cudf/TableTest.java | 201 ++++++++++++++ 6 files changed, 689 insertions(+), 37 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 3d0d263e2bb..ad9f9a4aa5c 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -131,10 +131,14 @@ class json_reader_options { json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; // Validation checks for spark + // Should the json validation be strict of not + bool _strict_validation = false; // Allow leading zeros for numeric values. bool _allow_numeric_leading_zeros = true; + // Allow nonnumeric numbers. NaN/Inf + bool _allow_nonnumeric_numbers = true; // Allow unquoted control characters - bool allowUnquotedControlChars = true; + bool _allow_unquoted_control_chars = true; // Additional values to recognize as null values std::vector _na_values; @@ -309,7 +313,15 @@ class json_reader_options { [[nodiscard]] json_recovery_mode_t recovery_mode() const { return _recovery_mode; } /** - * @brief Whether leading zeros are allowed in numeric values. + * @brief Whether json validation should be enforced strictly or not. + * + * @return true if it should be. + */ + [[nodiscard]] bool is_strict_validation() const { return _strict_validation; } + + /** + * @brief Whether leading zeros are allowed in numeric values. strict validation + * must be enabled for this to work. * * @return true if leading zeros are allowed in numeric values */ @@ -318,6 +330,22 @@ class json_reader_options { return _allow_numeric_leading_zeros; } + /** + * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity, and + * -Infinity. strict validation must be enabled for this to work. + * + * @return true if leading zeros are allowed in numeric values + */ + [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; } + + /** + * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 be allowed + * without some form of escaping. Strict validation must be enabled for this to work. + * + * @return true if unquoted control chars are allowed. + */ + [[nodiscard]] bool is_allowed_unquoted_control_chars() const { return _allow_unquoted_control_chars; } + /** * @brief Returns additional values to recognize as null values. * @@ -456,12 +484,37 @@ class json_reader_options { void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } /** - * @brief Set Whether leading zeros are allowed in numeric values. + * @brief Set whether strict validation is enabled or not. + * + * @param val Boolean value to indicate whether strict validation is enabled. + */ + void set_strict_validation(bool val) { _strict_validation = val; } + + /** + * @brief Set whether leading zeros are allowed in numeric values. strict validation + * must be enabled for this to work. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values */ void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; } + /** + * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, + * Infinity, and -Infinity. strict validation must be enabled for this to work. + * + * @param val Boolean value to indicate whether leading zeros are allowed in numeric values + */ + void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; } + + /** + * @brief Set whether in a quoted string should characters greater than or equal to 0 + * and less than 32 be allowed without some form of escaping. Strict validation must + * be enabled for this to work. + * + * @param val true to indicate wether unquoted control chars are allowed. + */ + void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; } + /** * @brief Sets additional values to recognize as null values. * @@ -680,7 +733,19 @@ class json_reader_options_builder { } /** - * @brief Set Whether leading zeros are allowed in numeric values. + * @brief Set whether json validation should be strict or not. + * + * @param val Boolean value to indicate whether json validation should be strict or not. + */ + json_reader_options_builder& strict_validation(bool val) + { + options.set_strict_validation(val); + return *this; + } + + /** + * @brief Set Whether leading zeros are allowed in numeric values. strict validation must + * be enabled for this to have any effect. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values * @return this for chaining @@ -691,6 +756,32 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, + * +INF, -INF, +Infinity, Infinity, and -Infinity. + * strict validation must be enabled for this to have any effect. + * + * @param val Boolean value to indicate if unquoted nonnumeric values are + * valid json or not. + */ + json_reader_options_builder& nonnumeric_numbers(bool val) + { + options.allow_nonnumeric_numbers(val); + return *this; + } + + /** + * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without + * some form of escaping. strict validation must be enabled for this to have any effect. + * + * @param val Boolean value to indicate if unquoted control chars are allowed or not. + */ + json_reader_options_builder& unquoted_control_chars(bool val) + { + options.allow_unquoted_control_chars(val); + return *this; + } + /** * @brief Sets additional values to recognize as null values. * diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 96ae4f7817b..aee22e3fdf0 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -52,30 +52,246 @@ struct write_if { } }; +enum class number_state { + start = 0, + saw_neg, // not a complete state + leading_zero, + whole, + saw_radix, // not a complete state + fraction, + start_exponent, // not a complete state + after_sign_exponent, // not a complete state + exponent +}; + +enum class string_state { + normal = 0, + escaped, // not a complete state + escaped_u // not a complete state +}; + +__device__ inline bool substr_eq(const char * data, + SymbolOffsetT const start, + SymbolOffsetT const end, + SymbolOffsetT const expected_len, + const char * expected) { + if (end - start != expected_len) { + return false; + } else { + for (auto idx = 0; idx < expected_len; idx++) { + if (data[start + idx] != expected[idx]) { + return false; + } + } + } + return true; +} + void validate_token_stream(device_span d_input, device_span tokens, device_span token_indices, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { - if (getenv("SPARK_JSON")) { + if (options.is_strict_validation()) { using token_t = cudf::io::json::token_t; - auto validate_tokens = + auto validate_values = [data = d_input.data(), allow_numeric_leading_zeros = - options.is_allowed_numeric_leading_zeros()] __device__(SymbolOffsetT start, - SymbolOffsetT end) -> bool { - // Leading zeros. - if (!allow_numeric_leading_zeros and data[start] == '0') return false; - return true; + options.is_allowed_numeric_leading_zeros(), + allow_nonnumeric = + options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates an unquoted value. A value must match https://www.json.org/json-en.html + // but the leading and training whitespace should already have been removed, and is not + // a string + auto c = data[start]; + if ('n' == c) { + return substr_eq(data, start, end, 4, "null"); + } else if ('t' == c) { + return substr_eq(data, start, end, 4, "true"); + } else if ('f' == c) { + return substr_eq(data, start, end, 5, "false"); + } else if (allow_nonnumeric && c == 'N') { + return substr_eq(data, start, end, 3, "NaN"); + } else if (allow_nonnumeric && c == 'I') { + return substr_eq(data, start, end, 8, "Infinity"); + } else if (allow_nonnumeric && c == '+') { + return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity"); + } else if ('-' == c || c <= '9' && 'c' >= '0') { + // number + auto num_state = number_state::start; + for (auto at = start; at < end; at++) { + c = data[at]; + switch (num_state) { + case number_state::start: + if ('-' == c) { + num_state = number_state::saw_neg; + } else if ('0' == c) { + num_state = number_state::leading_zero; + } else if (c >= '1' && c <= '9') { + num_state = number_state::whole; + } else { + return false; + } + break; + case number_state::saw_neg: + if ('0' == c) { + num_state = number_state::leading_zero; + } else if (c >= '1' && c <= '9') { + num_state = number_state::whole; + } else if (allow_nonnumeric && 'I' == c) { + return substr_eq(data, start, end, 4, "-INF") || substr_eq(data, start, end, 9, "-Infinity"); + } else { + return false; + } + break; + case number_state::leading_zero: + if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { + num_state = number_state::whole; + } else if ('.' == c) { + num_state = number_state::saw_radix; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::whole: + if (c >= '0' && c <= '9') { + num_state = number_state::whole; + } else if ('.' == c) { + num_state = number_state::saw_radix; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::saw_radix: + if (c >= '0' && c <= '9') { + num_state = number_state::fraction; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::fraction: + if (c >= '0' && c <= '9') { + num_state = number_state::fraction; + } else if ('e' == c || 'E' == c) { + num_state = number_state::start_exponent; + } else { + return false; + } + break; + case number_state::start_exponent: + if ('+' == c || '-' == c) { + num_state = number_state::after_sign_exponent; + } else if (c >= '0' && c <= '9') { + num_state = number_state::exponent; + } else { + return false; + } + break; + case number_state::after_sign_exponent: + if (c >= '0' && c <= '9') { + num_state = number_state::exponent; + } else { + return false; + } + break; + case number_state::exponent: + if (c >= '0' && c <= '9') { + num_state = number_state::exponent; + } else { + return false; + } + break; + } + } + return num_state != number_state::after_sign_exponent && + num_state != number_state::start_exponent && + num_state != number_state::saw_neg && + num_state != number_state::saw_radix; + } else { + return false; + } + }; + + auto validate_strings = + [data = d_input.data(), + allow_unquoted_control_chars = + options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates a quoted string. A string must match https://www.json.org/json-en.html + // but we already know that it has a starting and ending " and all white space has been + // stripped out. Also the base CUDF validation makes sure escaped chars are correct + // so we only need to worry about unquoted control chars + + auto state = string_state::normal; + auto u_count = 0; + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (!allow_unquoted_control_chars && c >= 0 && c < 32) { + return false; + } + + switch (state) { + case string_state::normal: + if (c == '\\') { + state = string_state::escaped; + } + break; + case string_state::escaped: + // in Spark you can allow any char to be escaped, but CUDF + // validates it in some cases so we need to also validate it. + if (c == 'u') { + state = string_state::escaped_u; + u_count = 0; + } else if (c == '"' || + c == '\\' || + c == '/' || + c == 'b' || + c == 'f' || + c == 'n' || + c == 'r' || + c == 't') { + state = string_state::normal; + } else { + return false; + } + break; + case string_state::escaped_u: + if ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f') || + (c >= 'A' && c <= 'F')) { + u_count++; + if (u_count == 4) { + state = string_state::normal; + u_count = 0; + } + } else { + return false; + } + break; + } + } + return string_state::normal == state; }; + auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); auto predicate = [tokens = tokens.begin(), token_indices = token_indices.begin(), - validate_tokens] __device__(auto i) -> bool { + validate_values, + validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { - return !validate_tokens(token_indices[i - 1], token_indices[i]); + return !validate_values(token_indices[i - 1], token_indices[i]); + } else if (tokens[i] == token_t::FieldNameEnd || + tokens[i] == token_t::StringEnd) { + return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; }; @@ -102,10 +318,10 @@ void validate_token_stream(device_span d_input, count_it + num_tokens, error.begin(), predicate); // in-place scan - printf("error:"); - for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) - printf("%d ", tk); - printf("\n"); + //printf("error:"); + //for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) + // printf("%d ", tk); + //printf("\n"); thrust::transform_inclusive_scan(rmm::exec_policy(stream), count_it, @@ -114,10 +330,10 @@ void validate_token_stream(device_span d_input, transform_op, binary_op); // in-place scan } - printf("pre_process_token:"); - for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), stream)) - printf("%d ", tk); - printf("\n"); + //printf("pre_process_token:"); + //for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), stream)) + // printf("%d ", tk); + //printf("\n"); // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE // 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index b37d0d88ec9..50cce4590c1 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -34,6 +34,10 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; + private final boolean allowLeadingZeros; + private final boolean strictValidation; + private final boolean allowNonNumericNumbers; + private final boolean allowUnquotedControlChars; private JSONOptions(Builder builder) { super(builder); @@ -44,6 +48,10 @@ private JSONOptions(Builder builder) { normalizeWhitespace = builder.normalizeWhitespace; mixedTypesAsStrings = builder.mixedTypesAsStrings; keepStringQuotes = builder.keepQuotes; + strictValidation = builder.strictValidation; + allowLeadingZeros = builder.allowLeadingZeros; + allowNonNumericNumbers = builder.allowNonNumericNumbers; + allowUnquotedControlChars = builder.allowUnquotedControlChars; } public boolean isDayFirst() { @@ -75,6 +83,22 @@ public boolean keepStringQuotes() { return keepStringQuotes; } + public boolean strictValidation() { + return strictValidation; + } + + public boolean leadingZerosAllowed() { + return allowLeadingZeros; + } + + public boolean nonNumericNumbersAllowed() { + return allowNonNumericNumbers; + } + + public boolean unquotedControlChars() { + return allowUnquotedControlChars; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -85,6 +109,10 @@ public static Builder builder() { } public static final class Builder extends ColumnFilterOptions.Builder { + private boolean strictValidation = false; + private boolean allowUnquotedControlChars = true; + private boolean allowNonNumericNumbers = false; + private boolean allowLeadingZeros = false; private boolean dayFirst = false; private boolean lines = true; @@ -95,10 +123,47 @@ public static final class Builder extends ColumnFilterOptions.Builder(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); auto result = @@ -1649,7 +1657,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, - jboolean keep_quotes) + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { @@ -1671,6 +1683,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .recovery_mode(recovery_mode) .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) + .strict_validation(strict_validation) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); @@ -1777,6 +1793,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean normalize_whitespace, jboolean mixed_types_as_string, jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control, jlong ds_handle) { JNI_NULL_CHECK(env, ds_handle, "no data source handle given", 0); @@ -1811,6 +1831,10 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); if (!n_types.is_null()) { @@ -1861,7 +1885,11 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean normalize_single_quotes, jboolean normalize_whitespace, jboolean mixed_types_as_string, - jboolean keep_quotes) + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { bool read_buffer = true; if (buffer == 0) { @@ -1910,6 +1938,10 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) + .strict_validation(strict_validation) + .numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); if (!n_types.is_null()) { diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index 050bcbb268f..56fe63598d9 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -437,6 +437,7 @@ void testReadWhitespacesJSONFile() throws IOException { } } + @Test void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { Schema schema = Schema.builder() .column(DType.STRING, "A") @@ -455,6 +456,206 @@ void testReadSingleQuotesJSONFileKeepQuotes() throws IOException { } } + private static final byte[] JSON_VALIDATION_BUFFER = ( + "{\"a\":true}\n" + + "{\"a\":false}\n" + + "{\"a\":null}\n" + + "{\"a\":true, \"b\":truee}\n" + + "{\"a\":true, \"b\":\"nulll\"}\n" + + "{\"a\": 1}\n" + + "{\"a\": 0}\n" + + "{\"a\": -}\n" + + "{\"a\": -0}\n" + + "{\"a\": -01}\n" + + + "{\"a\": 01}\n" + + "{\"a\": -0.1}\n" + + "{\"a\": -00.1}\n" + + "{\"a\": NaN}\n" + + "{\"a\": INF}\n" + + "{\"a\": +INF}\n" + + "{\"a\": -INF}\n" + + "{\"a\": +Infinity}\n" + + "{\"a\": Infinity}\n" + + "{\"a\": -Infinity}\n" + + + "{\"a\": INFinity}\n" + + "{\"a\":\"3710-11-10T02:46:58.732Z\"}\n" + + "{\"a\":12.}\n" + + "{\"a\": -3.4e+38}\n" + + "{\"a\": -3.4e-38}\n" + + "{\"a\": 1.4e38}\n" + + "{\"a\": -3.4E+38}\n" + + "{\"a\": -3.4E-38}\n" + + "{\"a\": 1.4E38}\n" + + "{\"a\": -3.4E+}\n" + + + "{\"a\": -3.4E-}\n" + + "{\"a\": \"A\u0000B\"}\n" + + "{\"a\": \"A\\u0000B\"}\n" + + "{\"a\": \"A\u0001B\"}\n" + + "{\"a\": \"A\\u0001B\"}\n" + + "{\"a\": \"A\u001FB\"}\n" + + "{\"a\": \"A\\u001FB\"}\n" + + "{\"a\": \"A\u0020B\"}\n" + + "{\"a\": \"A\\u0020B\"}\n" + + "{\"a\": \"\\u12\"}\n" + + + "{\"a\": \"\\z\"}\n" + + "{\"a\": \"\\r\"}\n" + + "{\"a\": \"something\", \"b\": \"\\z\"}\n" + ).getBytes(StandardCharsets.UTF_8); + + @Test + void testJSONValidationNoStrict() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(false) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, "true", "true", "1", "0", "-", "-0", "-01", + "01", "-0.1", "-00.1", "NaN", "INF", "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + "INFinity", "\"3710-11-10T02:46:58.732Z\"", "12.", "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", "-3.4E+", + "-3.4E-", "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", "\"something\"") + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidation() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationLeadingZeros() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(true) + .withNonNumericNumbers(false) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", "-01", + "01", "-0.1", "-00.1", null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationNonNumeric() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(true) + .withUnquotedControlChars(true) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, "NaN", null, "+INF", "-INF", "+Infinity", "Infinity", "-Infinity", + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, "\"A\u0000B\"", "\"A\u0000B\"", "\"A\u0001B\"", "\"A\u0001B\"", "\"A\u001FB\"", "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + + @Test + void testJSONValidationUnquotedControl() { + Schema schema = Schema.builder() + .column(DType.STRING, "a") + .build(); + JSONOptions opts = JSONOptions.builder() + .withRecoverWithNull(true) + .withMixedTypesAsStrings(true) + .withNormalizeWhitespace(true) + .withKeepQuotes(true) + .withNormalizeSingleQuotes(true) + .withStrictValidation(true) + .withLeadingZeros(false) + .withNonNumericNumbers(false) + .withUnquotedControlChars(false) + .build(); + try (Table expected = new Table.TestBuilder() + .column( + "true", "false", null, null, "true", "1", "0", null, "-0", null, + null, "-0.1", null, null, null, null, null, null, null, null, + null, "\"3710-11-10T02:46:58.732Z\"", null, "-3.4e+38", "-3.4e-38", "1.4e38", "-3.4E+38", "-3.4E-38", "1.4E38", null, + null, null, "\"A\u0000B\"", null, "\"A\u0001B\"", null, "\"A\u001FB\"", "\"A B\"", "\"A B\"", null, + null, "\"\r\"", null) + .build(); + MultiBufferDataSource source = sourceFrom(JSON_VALIDATION_BUFFER); + Table table = Table.readJSON(schema, opts, source, (int)expected.getRowCount())) { + assertTablesAreEqual(expected, table); + } + } + private static final byte[] NESTED_JSON_DATA_BUFFER = ("{\"a\":{\"c\":\"C1\"}}\n" + "{\"a\":{\"c\":\"C2\", \"b\":\"B2\"}}\n" + "{\"d\":[1,2,3]}\n" + From e94493732be3cd0d6d9ab0c5e9b85da970473950 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 9 Aug 2024 13:39:03 +0000 Subject: [PATCH 04/33] style fixes --- cpp/include/cudf/io/json.hpp | 31 +++++---- cpp/src/io/json/process_tokens.cu | 95 ++++++++++++--------------- java/src/main/native/src/TableJni.cpp | 31 ++++----- 3 files changed, 75 insertions(+), 82 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7ad91f93a6e..dfff9beae09 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -331,20 +331,23 @@ class json_reader_options { } /** - * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity, and - * -Infinity. strict validation must be enabled for this to work. + * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity, + * and -Infinity. strict validation must be enabled for this to work. * * @return true if leading zeros are allowed in numeric values */ [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; } /** - * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 be allowed - * without some form of escaping. Strict validation must be enabled for this to work. + * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 + * be allowed without some form of escaping. Strict validation must be enabled for this to work. * * @return true if unquoted control chars are allowed. */ - [[nodiscard]] bool is_allowed_unquoted_control_chars() const { return _allow_unquoted_control_chars; } + [[nodiscard]] bool is_allowed_unquoted_control_chars() const + { + return _allow_unquoted_control_chars; + } /** * @brief Returns additional values to recognize as null values. @@ -499,7 +502,7 @@ class json_reader_options { void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; } /** - * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, + * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, * Infinity, and -Infinity. strict validation must be enabled for this to work. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values @@ -507,11 +510,11 @@ class json_reader_options { void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; } /** - * @brief Set whether in a quoted string should characters greater than or equal to 0 - * and less than 32 be allowed without some form of escaping. Strict validation must + * @brief Set whether in a quoted string should characters greater than or equal to 0 + * and less than 32 be allowed without some form of escaping. Strict validation must * be enabled for this to work. * - * @param val true to indicate wether unquoted control chars are allowed. + * @param val true to indicate whether unquoted control chars are allowed. */ void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; } @@ -736,6 +739,7 @@ class json_reader_options_builder { * @brief Set whether json validation should be strict or not. * * @param val Boolean value to indicate whether json validation should be strict or not. + * @return this for chaining */ json_reader_options_builder& strict_validation(bool val) { @@ -757,12 +761,12 @@ class json_reader_options_builder { } /** - * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, + * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, * +INF, -INF, +Infinity, Infinity, and -Infinity. * strict validation must be enabled for this to have any effect. * - * @param val Boolean value to indicate if unquoted nonnumeric values are - * valid json or not. + * @param val Boolean value to indicate if unquoted nonnumeric values are valid json or not. + * @return this for chaining */ json_reader_options_builder& nonnumeric_numbers(bool val) { @@ -771,10 +775,11 @@ class json_reader_options_builder { } /** - * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without + * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without * some form of escaping. strict validation must be enabled for this to have any effect. * * @param val Boolean value to indicate if unquoted control chars are allowed or not. + * @return this for chaining */ json_reader_options_builder& unquoted_control_chars(bool val) { diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index aee22e3fdf0..906b5169064 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -54,34 +54,33 @@ struct write_if { enum class number_state { start = 0, - saw_neg, // not a complete state + saw_neg, // not a complete state leading_zero, whole, - saw_radix, // not a complete state + saw_radix, // not a complete state fraction, - start_exponent, // not a complete state - after_sign_exponent, // not a complete state + start_exponent, // not a complete state + after_sign_exponent, // not a complete state exponent }; enum class string_state { normal = 0, - escaped, // not a complete state - escaped_u // not a complete state + escaped, // not a complete state + escaped_u // not a complete state }; -__device__ inline bool substr_eq(const char * data, - SymbolOffsetT const start, - SymbolOffsetT const end, - SymbolOffsetT const expected_len, - const char * expected) { +__device__ inline bool substr_eq(const char* data, + SymbolOffsetT const start, + SymbolOffsetT const end, + SymbolOffsetT const expected_len, + const char* expected) +{ if (end - start != expected_len) { return false; } else { for (auto idx = 0; idx < expected_len; idx++) { - if (data[start + idx] != expected[idx]) { - return false; - } + if (data[start + idx] != expected[idx]) { return false; } } } return true; @@ -96,9 +95,8 @@ void validate_token_stream(device_span d_input, if (options.is_strict_validation()) { using token_t = cudf::io::json::token_t; auto validate_values = - [data = d_input.data(), - allow_numeric_leading_zeros = - options.is_allowed_numeric_leading_zeros(), + [data = d_input.data(), + allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), allow_nonnumeric = options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, SymbolOffsetT end) -> bool { @@ -117,7 +115,8 @@ void validate_token_stream(device_span d_input, } else if (allow_nonnumeric && c == 'I') { return substr_eq(data, start, end, 8, "Infinity"); } else if (allow_nonnumeric && c == '+') { - return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity"); + return substr_eq(data, start, end, 4, "+INF") || + substr_eq(data, start, end, 9, "+Infinity"); } else if ('-' == c || c <= '9' && 'c' >= '0') { // number auto num_state = number_state::start; @@ -141,7 +140,8 @@ void validate_token_stream(device_span d_input, } else if (c >= '1' && c <= '9') { num_state = number_state::whole; } else if (allow_nonnumeric && 'I' == c) { - return substr_eq(data, start, end, 4, "-INF") || substr_eq(data, start, end, 9, "-Infinity"); + return substr_eq(data, start, end, 4, "-INF") || + substr_eq(data, start, end, 9, "-Infinity"); } else { return false; } @@ -212,9 +212,8 @@ void validate_token_stream(device_span d_input, } } return num_state != number_state::after_sign_exponent && - num_state != number_state::start_exponent && - num_state != number_state::saw_neg && - num_state != number_state::saw_radix; + num_state != number_state::start_exponent && num_state != number_state::saw_neg && + num_state != number_state::saw_radix; } else { return false; } @@ -222,54 +221,42 @@ void validate_token_stream(device_span d_input, auto validate_strings = [data = d_input.data(), - allow_unquoted_control_chars = + allow_unquoted_control_chars = options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, SymbolOffsetT end) -> bool { // This validates a quoted string. A string must match https://www.json.org/json-en.html // but we already know that it has a starting and ending " and all white space has been // stripped out. Also the base CUDF validation makes sure escaped chars are correct - // so we only need to worry about unquoted control chars + // so we only need to worry about unquoted control chars - auto state = string_state::normal; + auto state = string_state::normal; auto u_count = 0; for (SymbolOffsetT idx = start + 1; idx < end; idx++) { auto c = data[idx]; - if (!allow_unquoted_control_chars && c >= 0 && c < 32) { - return false; - } + if (!allow_unquoted_control_chars && c >= 0 && c < 32) { return false; } switch (state) { case string_state::normal: - if (c == '\\') { - state = string_state::escaped; - } + if (c == '\\') { state = string_state::escaped; } break; case string_state::escaped: // in Spark you can allow any char to be escaped, but CUDF // validates it in some cases so we need to also validate it. if (c == 'u') { - state = string_state::escaped_u; + state = string_state::escaped_u; u_count = 0; - } else if (c == '"' || - c == '\\' || - c == '/' || - c == 'b' || - c == 'f' || - c == 'n' || - c == 'r' || - c == 't') { + } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || + c == 'r' || c == 't') { state = string_state::normal; } else { return false; } break; case string_state::escaped_u: - if ((c >= '0' && c <= '9') || - (c >= 'a' && c <= 'f') || - (c >= 'A' && c <= 'F')) { + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { u_count++; if (u_count == 4) { - state = string_state::normal; + state = string_state::normal; u_count = 0; } } else { @@ -289,8 +276,7 @@ void validate_token_stream(device_span d_input, validate_strings] __device__(auto i) -> bool { if (tokens[i] == token_t::ValueEnd) { return !validate_values(token_indices[i - 1], token_indices[i]); - } else if (tokens[i] == token_t::FieldNameEnd || - tokens[i] == token_t::StringEnd) { + } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { return !validate_strings(token_indices[i - 1], token_indices[i]); } return false; @@ -318,10 +304,10 @@ void validate_token_stream(device_span d_input, count_it + num_tokens, error.begin(), predicate); // in-place scan - //printf("error:"); - //for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) - // printf("%d ", tk); - //printf("\n"); + // printf("error:"); + // for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) + // printf("%d ", tk); + // printf("\n"); thrust::transform_inclusive_scan(rmm::exec_policy(stream), count_it, @@ -330,10 +316,11 @@ void validate_token_stream(device_span d_input, transform_op, binary_op); // in-place scan } - //printf("pre_process_token:"); - //for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), stream)) - // printf("%d ", tk); - //printf("\n"); + // printf("pre_process_token:"); + // for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), + // stream)) + // printf("%d ", tk); + // printf("\n"); // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE // 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index cf8a11783e2..b67db835a23 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1648,21 +1648,22 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, CATCH_STD(env, 0); } -JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, - jclass, - jlong buffer, - jlong buffer_length, - jboolean day_first, - jboolean lines, - jboolean recover_with_null, - jboolean normalize_single_quotes, - jboolean normalize_whitespace, - jboolean mixed_types_as_string, - jboolean keep_quotes, - jboolean strict_validation, - jboolean allow_leading_zeros, - jboolean allow_nonnumeric_numbers, - jboolean allow_unquoted_control) +JNIEXPORT jlong JNICALL +Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, + jclass, + jlong buffer, + jlong buffer_length, + jboolean day_first, + jboolean lines, + jboolean recover_with_null, + jboolean normalize_single_quotes, + jboolean normalize_whitespace, + jboolean mixed_types_as_string, + jboolean keep_quotes, + jboolean strict_validation, + jboolean allow_leading_zeros, + jboolean allow_nonnumeric_numbers, + jboolean allow_unquoted_control) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); if (buffer_length <= 0) { From 23072c05b776203df415f08e68c88786c9768c98 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Fri, 9 Aug 2024 09:03:05 -0500 Subject: [PATCH 05/33] Update json normalization to take device_buffer * change device_uvector to device_buffer * update tests --------- Co-authored-by: Karthikeyan Natarajan --- cpp/include/cudf/io/detail/json.hpp | 4 ++-- cpp/src/io/json/json_normalization.cu | 23 +++++++++++-------- cpp/src/io/json/read_json.cu | 16 ++++++------- .../io/json/json_quote_normalization_test.cpp | 9 ++++---- .../json_whitespace_normalization_test.cu | 7 +++--- 5 files changed, 30 insertions(+), 29 deletions(-) diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 42b10a78ce8..38ba4f675c3 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -61,7 +61,7 @@ void write_json(data_sink* sink, * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ -void normalize_single_quotes(datasource::owning_buffer>& indata, +void normalize_single_quotes(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -72,7 +72,7 @@ void normalize_single_quotes(datasource::owning_buffer * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ -void normalize_whitespace(datasource::owning_buffer>& indata, +void normalize_whitespace(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); } // namespace io::json::detail diff --git a/cpp/src/io/json/json_normalization.cu b/cpp/src/io/json/json_normalization.cu index 760b2214365..2cdb967018a 100644 --- a/cpp/src/io/json/json_normalization.cu +++ b/cpp/src/io/json/json_normalization.cu @@ -16,6 +16,7 @@ #include "io/fst/lookup_tables.cuh" +#include #include #include @@ -298,10 +299,11 @@ struct TransduceToNormalizedWS { namespace detail { -void normalize_single_quotes(datasource::owning_buffer>& indata, +void normalize_single_quotes(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; auto parser = @@ -311,25 +313,26 @@ void normalize_single_quotes(datasource::owning_buffer outbuf(indata.size() * 2, stream, mr); + rmm::device_buffer outbuf(indata.size() * 2, stream, mr); rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(indata.data(), + parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), - outbuf.data(), + static_cast(outbuf.data()), thrust::make_discard_iterator(), outbuf_size.data(), normalize_quotes::start_state, stream); outbuf.resize(outbuf_size.value(stream), stream); - datasource::owning_buffer> outdata(std::move(outbuf)); + datasource::owning_buffer outdata(std::move(outbuf)); std::swap(indata, outdata); } -void normalize_whitespace(datasource::owning_buffer>& indata, +void normalize_whitespace(datasource::owning_buffer& indata, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { + CUDF_FUNC_RANGE(); static constexpr std::int32_t min_out = 0; static constexpr std::int32_t max_out = 2; auto parser = @@ -339,18 +342,18 @@ void normalize_whitespace(datasource::owning_buffer normalize_whitespace::TransduceToNormalizedWS{}), stream); - rmm::device_uvector outbuf(indata.size(), stream, mr); + rmm::device_buffer outbuf(indata.size(), stream, mr); rmm::device_scalar outbuf_size(stream, mr); - parser.Transduce(indata.data(), + parser.Transduce(reinterpret_cast(indata.data()), static_cast(indata.size()), - outbuf.data(), + static_cast(outbuf.data()), thrust::make_discard_iterator(), outbuf_size.data(), normalize_whitespace::start_state, stream); outbuf.resize(outbuf_size.value(stream), stream); - datasource::owning_buffer> outdata(std::move(outbuf)); + datasource::owning_buffer outdata(std::move(outbuf)); std::swap(indata, outdata); } diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index 590f70864b1..e0d0497e0a2 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -168,7 +168,7 @@ size_t estimate_size_per_subchunk(size_t chunk_size) * @param stream CUDA stream used for device memory operations and kernel launches * @returns Data source owning buffer enclosing the bytes read */ -datasource::owning_buffer> get_record_range_raw_input( +datasource::owning_buffer get_record_range_raw_input( host_span> sources, json_reader_options const& reader_opts, rmm::cuda_stream_view stream) @@ -200,8 +200,8 @@ datasource::owning_buffer> get_record_range_raw_input( ? total_source_size * estimated_compression_ratio + header_size : std::min(total_source_size, chunk_size + num_subchunks_prealloced * size_per_subchunk) + num_extra_delimiters; - rmm::device_uvector buffer(buffer_size, stream); - device_span bufspan(buffer); + rmm::device_buffer buffer(buffer_size, stream); + device_span bufspan(reinterpret_cast(buffer.data()), buffer.size()); // Offset within buffer indicating first read position std::int64_t buffer_offset = 0; @@ -213,8 +213,8 @@ datasource::owning_buffer> get_record_range_raw_input( chunk_offset == 0 ? 0 : find_first_delimiter(readbufspan, '\n', stream); if (first_delim_pos == -1) { // return empty owning datasource buffer - auto empty_buf = rmm::device_uvector(0, stream); - return datasource::owning_buffer>(std::move(empty_buf)); + auto empty_buf = rmm::device_buffer(0, stream); + return datasource::owning_buffer(std::move(empty_buf)); } else if (!should_load_all_sources) { // Find next delimiter std::int64_t next_delim_pos = -1; @@ -232,12 +232,12 @@ datasource::owning_buffer> get_record_range_raw_input( } if (next_delim_pos < buffer_offset) next_delim_pos = buffer_offset + readbufspan.size(); - return datasource::owning_buffer>( + return datasource::owning_buffer( std::move(buffer), reinterpret_cast(buffer.data()) + first_delim_pos + shift_for_nonzero_offset, next_delim_pos - first_delim_pos - shift_for_nonzero_offset); } - return datasource::owning_buffer>( + return datasource::owning_buffer( std::move(buffer), reinterpret_cast(buffer.data()) + first_delim_pos + shift_for_nonzero_offset, readbufspan.size() - first_delim_pos - shift_for_nonzero_offset); @@ -249,7 +249,7 @@ table_with_metadata read_batch(host_span> sources, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - datasource::owning_buffer> bufview = + datasource::owning_buffer bufview = get_record_range_raw_input(sources, reader_opts, stream); // If input JSON buffer has single quotes and option to normalize single quotes is enabled, diff --git a/cpp/tests/io/json/json_quote_normalization_test.cpp b/cpp/tests/io/json/json_quote_normalization_test.cpp index 55ad0afe499..3a9ba8d9f3b 100644 --- a/cpp/tests/io/json/json_quote_normalization_test.cpp +++ b/cpp/tests/io/json/json_quote_normalization_test.cpp @@ -26,7 +26,7 @@ #include #include -#include +#include #include #include @@ -42,12 +42,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou std::make_shared(); auto stream_view = cudf::test::get_default_stream(); - auto device_input = cudf::detail::make_device_uvector_async( - host_input, stream_view, rmm::mr::get_current_device_resource()); + auto device_input = rmm::device_buffer( + host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource()); // Preprocessing FST - cudf::io::datasource::owning_buffer> device_data( - std::move(device_input)); + cudf::io::datasource::owning_buffer device_data(std::move(device_input)); cudf::io::json::detail::normalize_single_quotes(device_data, stream_view, rsc.get()); std::string preprocessed_host_output(device_data.size(), 0); diff --git a/cpp/tests/io/json/json_whitespace_normalization_test.cu b/cpp/tests/io/json/json_whitespace_normalization_test.cu index 8ed5fa81b12..01dd17fab98 100644 --- a/cpp/tests/io/json/json_whitespace_normalization_test.cu +++ b/cpp/tests/io/json/json_whitespace_normalization_test.cu @@ -38,12 +38,11 @@ void run_test(std::string const& host_input, std::string const& expected_host_ou // Prepare cuda stream for data transfers & kernels auto stream_view = cudf::test::get_default_stream(); - auto device_input = cudf::detail::make_device_uvector_async( - host_input, stream_view, rmm::mr::get_current_device_resource()); + auto device_input = rmm::device_buffer( + host_input.c_str(), host_input.size(), stream_view, rmm::mr::get_current_device_resource()); // Preprocessing FST - cudf::io::datasource::owning_buffer> device_data( - std::move(device_input)); + cudf::io::datasource::owning_buffer device_data(std::move(device_input)); cudf::io::json::detail::normalize_whitespace( device_data, stream_view, rmm::mr::get_current_device_resource()); From a8853401bde56017f319eadd2382034fdbfa65e3 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Fri, 9 Aug 2024 20:32:18 +0000 Subject: [PATCH 06/33] fix char comparison error --- cpp/src/io/json/process_tokens.cu | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 906b5169064..3b225130ba2 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -19,6 +19,7 @@ // #include "tabulate_output_iterator.cuh" #include "output_writer_iterator.h" +#include #include #include @@ -26,11 +27,7 @@ #include #include -#include #include -#include - -#include namespace cudf::io::json { namespace detail { @@ -92,6 +89,7 @@ void validate_token_stream(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); if (options.is_strict_validation()) { using token_t = cudf::io::json::token_t; auto validate_values = @@ -233,7 +231,7 @@ void validate_token_stream(device_span d_input, auto u_count = 0; for (SymbolOffsetT idx = start + 1; idx < end; idx++) { auto c = data[idx]; - if (!allow_unquoted_control_chars && c >= 0 && c < 32) { return false; } + if (!allow_unquoted_control_chars && c >= '\0' && c < '\32') { return false; } switch (state) { case string_state::normal: From ab1385df894ec8642279e52e5abb7722264880cd Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 15 Aug 2024 13:08:37 -0500 Subject: [PATCH 07/33] update char comparison --- cpp/src/io/json/process_tokens.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 3b225130ba2..85efaa191d2 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -231,7 +231,7 @@ void validate_token_stream(device_span d_input, auto u_count = 0; for (SymbolOffsetT idx = start + 1; idx < end; idx++) { auto c = data[idx]; - if (!allow_unquoted_control_chars && c >= '\0' && c < '\32') { return false; } + if (!allow_unquoted_control_chars && c >= 0 && c < 32) { return false; } switch (state) { case string_state::normal: From f2e2b448e6fc02ba178aa8eef7f243d31e628e4a Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 26 Aug 2024 15:35:26 +0000 Subject: [PATCH 08/33] rename to tabulate_output_iterator.cuh --- cpp/src/io/json/process_tokens.cu | 7 ++-- ...terator.h => tabulate_output_iterator.cuh} | 40 +++++++++---------- 2 files changed, 23 insertions(+), 24 deletions(-) rename cpp/src/io/json/{output_writer_iterator.h => tabulate_output_iterator.cuh} (70%) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 85efaa191d2..a7c2d264880 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -16,8 +16,7 @@ */ #include "nested_json.hpp" -// #include "tabulate_output_iterator.cuh" -#include "output_writer_iterator.h" +#include "tabulate_output_iterator.cuh" #include #include @@ -231,7 +230,7 @@ void validate_token_stream(device_span d_input, auto u_count = 0; for (SymbolOffsetT idx = start + 1; idx < end; idx++) { auto c = data[idx]; - if (!allow_unquoted_control_chars && c >= 0 && c < 32) { return false; } + if (!allow_unquoted_control_chars && c < 32) { return false; } switch (state) { case string_state::normal: @@ -285,7 +284,7 @@ void validate_token_stream(device_span d_input, // auto conditional_output_it = tokens.begin(); // auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write); auto conditional_output_it = - thrust::make_output_writer_iterator(thrust::make_counting_iterator(0), conditional_write); + thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0), conditional_write); auto transform_op = cuda::proclaim_return_type( [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; diff --git a/cpp/src/io/json/output_writer_iterator.h b/cpp/src/io/json/tabulate_output_iterator.cuh similarity index 70% rename from cpp/src/io/json/output_writer_iterator.h rename to cpp/src/io/json/tabulate_output_iterator.cuh index f6276771af4..fe457bfb5a7 100644 --- a/cpp/src/io/json/output_writer_iterator.h +++ b/cpp/src/io/json/tabulate_output_iterator.cuh @@ -14,7 +14,7 @@ * limitations under the License. */ -// Output writer iterator +// Tabulate Output iterator #pragma once #include @@ -24,14 +24,14 @@ namespace detail { // Proxy reference that calls BinaryFunction with Iterator value and the rhs of assignment operator template -class output_writer_iterator_proxy { +class tabulate_output_iterator_proxy { public: - __host__ __device__ output_writer_iterator_proxy(const Iterator& index_iter, BinaryFunction fun) + __host__ __device__ tabulate_output_iterator_proxy(const Iterator& index_iter, BinaryFunction fun) : index_iter(index_iter), fun(fun) { } template - __host__ __device__ output_writer_iterator_proxy operator=(const T& x) + __host__ __device__ tabulate_output_iterator_proxy operator=(const T& x) { fun(*index_iter, x); return *this; @@ -42,19 +42,19 @@ class output_writer_iterator_proxy { BinaryFunction fun; }; -// Register output_writer_iterator_proxy with 'is_proxy_reference' from +// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from // type_traits to enable its use with algorithms. template -struct is_proxy_reference> +struct is_proxy_reference> : public thrust::detail::true_type {}; } // namespace detail /** - * @brief Transform output iterator with custom writer binary function which takes index and value. + * @brief Transform output iterator with custom binary function which takes index and value. * * @code {.cpp} - * #include + * #include * #include * #include * #include @@ -80,7 +80,7 @@ struct is_proxy_reference * }; * * thrust::device_vector v(1, 0x00000000); - * auto result_begin = thrust::make_output_writer_iterator(thrust::make_counting_iterator(0), + * auto result_begin = thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0), * set_bits_field{v.data().get()}); * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0), * [] __device__ (int x) { return x%2; }); @@ -94,27 +94,27 @@ struct is_proxy_reference * @tparam Iterator iterator type that acts as index of the output. */ template -class output_writer_iterator +class tabulate_output_iterator : public thrust::iterator_adaptor< - output_writer_iterator, + tabulate_output_iterator, Iterator, thrust::use_default, thrust::use_default, thrust::use_default, - thrust::detail::output_writer_iterator_proxy> { + thrust::detail::tabulate_output_iterator_proxy> { public: // parent class. typedef thrust::iterator_adaptor< - output_writer_iterator, + tabulate_output_iterator, Iterator, thrust::use_default, thrust::use_default, thrust::use_default, - thrust::detail::output_writer_iterator_proxy> + thrust::detail::tabulate_output_iterator_proxy> super_t; // friend thrust::iterator_core_access to allow it access to the private interface dereference() friend class thrust::iterator_core_access; - __host__ __device__ output_writer_iterator(Iterator const& x, BinaryFunction fun) + __host__ __device__ tabulate_output_iterator(Iterator const& x, BinaryFunction fun) : super_t(x), fun(fun) { } @@ -125,15 +125,15 @@ class output_writer_iterator // thrust::iterator_core_access accesses this function __host__ __device__ typename super_t::reference dereference() const { - return thrust::detail::output_writer_iterator_proxy( + return thrust::detail::tabulate_output_iterator_proxy( this->base_reference(), fun); } }; template -output_writer_iterator __host__ __device__ -make_output_writer_iterator(Iterator out, BinaryFunction fun) +tabulate_output_iterator __host__ __device__ +make_tabulate_output_iterator(Iterator out, BinaryFunction fun) { - return output_writer_iterator(out, fun); -} // end make_output_writer_iterator + return tabulate_output_iterator(out, fun); +} // end make_tabulate_output_iterator THRUST_NAMESPACE_END From 09632189c976ae247b9aa36bc3fd4a451739f693 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 26 Aug 2024 15:58:49 +0000 Subject: [PATCH 09/33] absorb counting_iterator to tabulate_output_iterator Signed-off-by: Karthikeyan Natarajan --- cpp/src/io/json/process_tokens.cu | 11 ++-- cpp/src/io/json/tabulate_output_iterator.cuh | 56 +++++++++----------- 2 files changed, 30 insertions(+), 37 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index a7c2d264880..224d90e0b24 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -279,13 +279,10 @@ void validate_token_stream(device_span d_input, return false; }; - using scan_type = write_if::scan_type; - auto conditional_write = write_if{tokens.begin(), num_tokens}; - // auto conditional_output_it = tokens.begin(); - // auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write); - auto conditional_output_it = - thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0), conditional_write); - auto transform_op = cuda::proclaim_return_type( + using scan_type = write_if::scan_type; + auto conditional_write = write_if{tokens.begin(), num_tokens}; + auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write); + auto transform_op = cuda::proclaim_return_type( [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh index fe457bfb5a7..8c677696a26 100644 --- a/cpp/src/io/json/tabulate_output_iterator.cuh +++ b/cpp/src/io/json/tabulate_output_iterator.cuh @@ -17,35 +17,36 @@ // Tabulate Output iterator #pragma once +#include #include THRUST_NAMESPACE_BEGIN namespace detail { -// Proxy reference that calls BinaryFunction with Iterator value and the rhs of assignment operator -template +// Proxy reference that calls BinaryFunction with index value and the rhs of assignment operator +template class tabulate_output_iterator_proxy { public: - __host__ __device__ tabulate_output_iterator_proxy(const Iterator& index_iter, BinaryFunction fun) - : index_iter(index_iter), fun(fun) + __host__ __device__ tabulate_output_iterator_proxy(const IndexT index, BinaryFunction fun) + : index(index), fun(fun) { } template - __host__ __device__ tabulate_output_iterator_proxy operator=(const T& x) + __host__ __device__ tabulate_output_iterator_proxy operator=(const T& rhs_value) { - fun(*index_iter, x); + fun(index, rhs_value); return *this; } private: - Iterator index_iter; + IndexT index; BinaryFunction fun; }; // Register tabulate_output_iterator_proxy with 'is_proxy_reference' from // type_traits to enable its use with algorithms. -template -struct is_proxy_reference> +template +struct is_proxy_reference> : public thrust::detail::true_type {}; } // namespace detail @@ -80,8 +81,7 @@ struct is_proxy_reference v(1, 0x00000000); - * auto result_begin = thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0), - * set_bits_field{v.data().get()}); + * auto result_begin = thrust::make_tabulate_output_iterator(set_bits_field{v.data().get()}); * auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0), * [] __device__ (int x) { return x%2; }); * thrust::copy(thrust::device, value, value+32, result_begin); @@ -93,31 +93,27 @@ struct is_proxy_reference +template class tabulate_output_iterator : public thrust::iterator_adaptor< - tabulate_output_iterator, - Iterator, + tabulate_output_iterator, + counting_iterator, thrust::use_default, thrust::use_default, thrust::use_default, - thrust::detail::tabulate_output_iterator_proxy> { + thrust::detail::tabulate_output_iterator_proxy> { public: // parent class. - typedef thrust::iterator_adaptor< - tabulate_output_iterator, - Iterator, + using super_t = thrust::iterator_adaptor< + tabulate_output_iterator, + counting_iterator, thrust::use_default, thrust::use_default, thrust::use_default, - thrust::detail::tabulate_output_iterator_proxy> - super_t; + thrust::detail::tabulate_output_iterator_proxy>; // friend thrust::iterator_core_access to allow it access to the private interface dereference() friend class thrust::iterator_core_access; - __host__ __device__ tabulate_output_iterator(Iterator const& x, BinaryFunction fun) - : super_t(x), fun(fun) - { - } + __host__ __device__ tabulate_output_iterator(BinaryFunction fun) : fun(fun) {} private: BinaryFunction fun; @@ -125,15 +121,15 @@ class tabulate_output_iterator // thrust::iterator_core_access accesses this function __host__ __device__ typename super_t::reference dereference() const { - return thrust::detail::tabulate_output_iterator_proxy( - this->base_reference(), fun); + return thrust::detail::tabulate_output_iterator_proxy(*this->base(), + fun); } }; -template -tabulate_output_iterator __host__ __device__ -make_tabulate_output_iterator(Iterator out, BinaryFunction fun) +template +tabulate_output_iterator __host__ __device__ +make_tabulate_output_iterator(BinaryFunction fun) { - return tabulate_output_iterator(out, fun); + return tabulate_output_iterator(fun); } // end make_tabulate_output_iterator THRUST_NAMESPACE_END From be7402c4d685fbf3f5ca747b219adc85add3ae38 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 26 Aug 2024 18:12:59 +0000 Subject: [PATCH 10/33] update documentation --- cpp/include/cudf/io/json.hpp | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index faf8cccf552..433bb57941f 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -323,6 +323,8 @@ class json_reader_options { * @brief Whether leading zeros are allowed in numeric values. strict validation * must be enabled for this to work. * + * @note: strict_validation must be enabled for this to work. + * * @return true if leading zeros are allowed in numeric values */ [[nodiscard]] bool is_allowed_numeric_leading_zeros() const @@ -334,6 +336,8 @@ class json_reader_options { * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity, * and -Infinity. strict validation must be enabled for this to work. * + * @note: strict_validation must be enabled for this to work. + * * @return true if leading zeros are allowed in numeric values */ [[nodiscard]] bool is_allowed_nonnumeric_numbers() const { return _allow_nonnumeric_numbers; } @@ -342,6 +346,8 @@ class json_reader_options { * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 * be allowed without some form of escaping. Strict validation must be enabled for this to work. * + * @note: strict_validation must be enabled for this to work. + * * @return true if unquoted control chars are allowed. */ [[nodiscard]] bool is_allowed_unquoted_control_chars() const @@ -497,6 +503,8 @@ class json_reader_options { * @brief Set whether leading zeros are allowed in numeric values. strict validation * must be enabled for this to work. * + * @note: strict_validation must be enabled for this to work. + * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values */ void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; } @@ -505,6 +513,8 @@ class json_reader_options { * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, * Infinity, and -Infinity. strict validation must be enabled for this to work. * + * @note: strict_validation must be enabled for this to work. + * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values */ void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; } @@ -514,6 +524,8 @@ class json_reader_options { * and less than 32 be allowed without some form of escaping. Strict validation must * be enabled for this to work. * + * @note: strict_validation must be enabled for this to work. + * * @param val true to indicate whether unquoted control chars are allowed. */ void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; } @@ -751,6 +763,8 @@ class json_reader_options_builder { * @brief Set Whether leading zeros are allowed in numeric values. strict validation must * be enabled for this to have any effect. * + * @note: strict_validation must be enabled for this to work. + * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values * @return this for chaining */ @@ -765,6 +779,8 @@ class json_reader_options_builder { * +INF, -INF, +Infinity, Infinity, and -Infinity. * strict validation must be enabled for this to have any effect. * + * @note: strict_validation must be enabled for this to work. + * * @param val Boolean value to indicate if unquoted nonnumeric values are valid json or not. * @return this for chaining */ @@ -778,6 +794,8 @@ class json_reader_options_builder { * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without * some form of escaping. strict validation must be enabled for this to have any effect. * + * @note: strict_validation must be enabled for this to work. + * * @param val Boolean value to indicate if unquoted control chars are allowed or not. * @return this for chaining */ From b114401843d5af5e631a7419fe225e8bb1d86267 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 26 Aug 2024 18:19:03 +0000 Subject: [PATCH 11/33] add na_values to validation --- cpp/src/io/json/process_tokens.cu | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 224d90e0b24..c78a8fd818d 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -15,6 +15,7 @@ * limitations under the License. */ +#include "io/utilities/trie.cuh" #include "nested_json.hpp" #include "tabulate_output_iterator.cuh" @@ -91,8 +92,12 @@ void validate_token_stream(device_span d_input, CUDF_FUNC_RANGE(); if (options.is_strict_validation()) { using token_t = cudf::io::json::token_t; + cudf::detail::optional_trie trie_na = + cudf::detail::create_serialized_trie(options.get_na_values(), stream); + auto trie_na_view = cudf::detail::make_trie_view(trie_na); auto validate_values = [data = d_input.data(), + trie_na = trie_na_view, allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), allow_nonnumeric = options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, @@ -100,8 +105,11 @@ void validate_token_stream(device_span d_input, // This validates an unquoted value. A value must match https://www.json.org/json-en.html // but the leading and training whitespace should already have been removed, and is not // a string - auto c = data[start]; - if ('n' == c) { + auto c = data[start]; + auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); + if (is_null_literal) { + return true; + } else if ('n' == c) { return substr_eq(data, start, end, 4, "null"); } else if ('t' == c) { return substr_eq(data, start, end, 4, "true"); From a1e9afcc9055fbc516cdda4dd216a8b9c02695bf Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 26 Aug 2024 18:19:16 +0000 Subject: [PATCH 12/33] add strict validation to test --- cpp/tests/io/json/json_test.cpp | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index f06053b44a7..7316026d4d7 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2169,14 +2169,15 @@ TEST_F(JsonReaderTest, ValueValidation) cudf::io::json_reader_options in_options = cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) .lines(true) - .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .strict_validation(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 4); EXPECT_EQ(result.tbl->num_rows(), 8); - auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); - auto a_column = - int64_wrapper{{-2, 0, 0, 1, 4, 5, 6, 0}, {true, false, false, true, true, true, true, false}}; + auto b_a_col = int64_wrapper({0, 0, 3, 0, 0, 0, 0, 0}); + auto a_column = int64_wrapper{{-2, 0, 0, 0, 4, 5, 6, 0}, + {true, false, false, false, true, true, true, false}}; auto b_column = cudf::test::structs_column_wrapper( {b_a_col}, {false, false, true, false, false, false, false, false}); auto c_column = float64_wrapper({0.0, 0.0, 0.0, 0.0, 1.23, 0.0, 0.0, 0.0}, @@ -2192,7 +2193,8 @@ TEST_F(JsonReaderTest, ValueValidation) .lines(true) .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) .numeric_leading_zeros(false) - .na_values({"nan"}); + .na_values({"nan"}) + .strict_validation(true); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 4); From ec78ef959dde124a56784b0591c1d49a5763f2d8 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 26 Aug 2024 18:42:02 +0000 Subject: [PATCH 13/33] rename tabulate_output_iterator namespace --- cpp/src/io/json/process_tokens.cu | 2 +- cpp/src/io/json/tabulate_output_iterator.cuh | 52 ++++++++++---------- 2 files changed, 26 insertions(+), 28 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index c78a8fd818d..e1a50dd6070 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -289,7 +289,7 @@ void validate_token_stream(device_span d_input, using scan_type = write_if::scan_type; auto conditional_write = write_if{tokens.begin(), num_tokens}; - auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write); + auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); auto transform_op = cuda::proclaim_return_type( [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh index 8c677696a26..46887beefbd 100644 --- a/cpp/src/io/json/tabulate_output_iterator.cuh +++ b/cpp/src/io/json/tabulate_output_iterator.cuh @@ -20,7 +20,7 @@ #include #include -THRUST_NAMESPACE_BEGIN +namespace cudf { namespace detail { // Proxy reference that calls BinaryFunction with index value and the rhs of assignment operator @@ -43,19 +43,11 @@ class tabulate_output_iterator_proxy { BinaryFunction fun; }; -// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from -// type_traits to enable its use with algorithms. -template -struct is_proxy_reference> - : public thrust::detail::true_type {}; - -} // namespace detail - /** * @brief Transform output iterator with custom binary function which takes index and value. * * @code {.cpp} - * #include + * #include "tabulate_output_iterator.cuh" * #include * #include * #include @@ -95,22 +87,20 @@ struct is_proxy_reference */ template class tabulate_output_iterator - : public thrust::iterator_adaptor< - tabulate_output_iterator, - counting_iterator, - thrust::use_default, - thrust::use_default, - thrust::use_default, - thrust::detail::tabulate_output_iterator_proxy> { + : public thrust::iterator_adaptor, + thrust::counting_iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + tabulate_output_iterator_proxy> { public: // parent class. - using super_t = thrust::iterator_adaptor< - tabulate_output_iterator, - counting_iterator, - thrust::use_default, - thrust::use_default, - thrust::use_default, - thrust::detail::tabulate_output_iterator_proxy>; + using super_t = thrust::iterator_adaptor, + thrust::counting_iterator, + thrust::use_default, + thrust::use_default, + thrust::use_default, + tabulate_output_iterator_proxy>; // friend thrust::iterator_core_access to allow it access to the private interface dereference() friend class thrust::iterator_core_access; __host__ __device__ tabulate_output_iterator(BinaryFunction fun) : fun(fun) {} @@ -121,8 +111,7 @@ class tabulate_output_iterator // thrust::iterator_core_access accesses this function __host__ __device__ typename super_t::reference dereference() const { - return thrust::detail::tabulate_output_iterator_proxy(*this->base(), - fun); + return tabulate_output_iterator_proxy(*this->base(), fun); } }; @@ -132,4 +121,13 @@ make_tabulate_output_iterator(BinaryFunction fun) { return tabulate_output_iterator(fun); } // end make_tabulate_output_iterator -THRUST_NAMESPACE_END + +} // namespace detail +} // namespace cudf + +// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from +// type_traits to enable its use with algorithms. +template +struct thrust::detail::is_proxy_reference< + cudf::detail::tabulate_output_iterator_proxy> + : public thrust::detail::true_type {}; From a225ce077e9c8ba70aaab8ba4f590e44b5b27840 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 26 Aug 2024 18:56:06 +0000 Subject: [PATCH 14/33] remove comments and notes --- cpp/src/io/json/process_tokens.cu | 53 ------------------------------- 1 file changed, 53 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index e1a50dd6070..6336ca64928 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -300,16 +300,6 @@ void validate_token_stream(device_span d_input, auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); }); - rmm::device_uvector error(num_tokens, stream); - thrust::transform(rmm::exec_policy(stream), - count_it, - count_it + num_tokens, - error.begin(), - predicate); // in-place scan - // printf("error:"); - // for (auto tk : cudf::detail::make_std_vector_sync(error, stream)) - // printf("%d ", tk); - // printf("\n"); thrust::transform_inclusive_scan(rmm::exec_policy(stream), count_it, @@ -318,49 +308,6 @@ void validate_token_stream(device_span d_input, transform_op, binary_op); // in-place scan } - // printf("pre_process_token:"); - // for (auto tk : cudf::detail::make_std_vector_sync(device_span(tokens), - // stream)) - // printf("%d ", tk); - // printf("\n"); - - // LE SB FB FE VB VE SE LE SB ER LE SB LB VB VE SE LE LE - // 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 - // 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 3 4 - // auto unary_op = [] __device__ (auto) -> token_t { return token_t::ErrorBegin; }; - // thrust::transform_if(rmm::exec_policy(stream), count_it, count_it + num_tokens, tokens.begin(), - // unary_op, predicate); auto num_rows = thrust::count(rmm::exec_policy(stream), tokens.begin(), - // tokens.end(), token_t::LineEnd); rmm::device_uvector row_is_error(num_rows, stream); - // rmm::device_uvector row_index(num_tokens, stream); - // auto is_LineEnd = [] __device__ (auto token) -> SymbolOffsetT { return token == - // token_t::LineEnd; }; thrust::transform_inclusive_scan(rmm::exec_policy(stream), - // tokens.begin(), tokens.end(), row_index.begin(), is_LineEnd, thrust::plus{}); - // auto is_error_it = thrust::make_transform_iterator(tokens.begin(), [] __device__ (auto token) - // -> bool { return token == token_t::ErrorBegin; }); - // thrust::reduce_by_key(rmm::exec_policy(stream), row_index.begin(), row_index.end(), - // is_error_it, thrust::make_discard_iterator(), row_is_error.begin()); - - // if current == ErrorBegin and tokens[i+1]==LE or i==n-1) then write ErrorBegin to tokens[i], - // else nothing. if VB, or SB, then if validate(token[i], token[i+1])==false, - // - // Transform_if to errors tokens - // Count LE (num_rows) - // create int vector [num_rows], bool[num_rows] - // TODO: fuse them together with single scan algorithm. - // LE==1, to scan to row_index. - // reduce_by_key, to check if it has any error, and number of non-errors tokens. - // reduce them to get output tokens count. - // copy_if -> use cub cub::DeviceSelect::If(d_data, d_num_selected_out, num_items, select_op) } - -// corner cases: empty LE, -// alternate Decoupled look back idea: -// count LineEnd, allocate bool[num_rows] as is_error. -// decoupled look back for LineEnd tokens for row indices, -// transform_if to error tokens and write to bool[row_index] atomically (reduce and write within -// warp/block?) -// decoupled look back for LineEnd tokens for row indices, -// (if not error row & not lineEnd token) -> decoupled look back for output indices, -// CopyIf (if not error row & not lineEnd token) write to output. } // namespace detail } // namespace cudf::io::json From 875a72b0cef208157aeb53cb4c1e8bf254c74399 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 3 Sep 2024 23:08:40 +0000 Subject: [PATCH 15/33] fix unsigned/signed issue with ARM systems --- cpp/src/io/json/process_tokens.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 6336ca64928..ec1f82e1792 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -238,7 +238,7 @@ void validate_token_stream(device_span d_input, auto u_count = 0; for (SymbolOffsetT idx = start + 1; idx < end; idx++) { auto c = data[idx]; - if (!allow_unquoted_control_chars && c < 32) { return false; } + if (!allow_unquoted_control_chars && c <= char{0} && c < char{32}) { return false; } switch (state) { case string_state::normal: From be7f17e722522c4723ae1405a08fdd569953c8be Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 3 Sep 2024 23:11:43 +0000 Subject: [PATCH 16/33] remove comments --- java/src/main/java/ai/rapids/cudf/JSONOptions.java | 2 -- 1 file changed, 2 deletions(-) diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 50cce4590c1..4fcaf1e4a6b 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -158,8 +158,6 @@ public Builder withUnquotedControlChars(boolean isAllowed) { return this; } - // TODO need to finish this for other configs... - /** * Whether to parse dates as DD/MM versus MM/DD * @param dayFirst true: DD/MM, false, MM/DD From fb628774d6cd8a19a2411053ac658468a12ee519 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Wed, 4 Sep 2024 23:33:32 +0000 Subject: [PATCH 17/33] fix condition --- cpp/src/io/json/process_tokens.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index ec1f82e1792..61f4e559cbf 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -238,7 +238,7 @@ void validate_token_stream(device_span d_input, auto u_count = 0; for (SymbolOffsetT idx = start + 1; idx < end; idx++) { auto c = data[idx]; - if (!allow_unquoted_control_chars && c <= char{0} && c < char{32}) { return false; } + if (!allow_unquoted_control_chars && c >= char{0} && c < char{32}) { return false; } switch (state) { case string_state::normal: From e4f7d047fab12063eceb386e5e8f5143e7243ee7 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 5 Sep 2024 21:23:51 +0000 Subject: [PATCH 18/33] fix char issue with typecast --- cpp/src/io/json/process_tokens.cu | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 61f4e559cbf..59ea0dd1583 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -238,7 +238,9 @@ void validate_token_stream(device_span d_input, auto u_count = 0; for (SymbolOffsetT idx = start + 1; idx < end; idx++) { auto c = data[idx]; - if (!allow_unquoted_control_chars && c >= char{0} && c < char{32}) { return false; } + if (!allow_unquoted_control_chars && static_cast(c) >= 0 && static_cast(c) < 32) { + return false; + } switch (state) { case string_state::normal: From 851fe3e58fa9d0a4fa2d2190506c4784acf4fbf9 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:26:31 -0500 Subject: [PATCH 19/33] Update cpp/include/cudf/io/json.hpp Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/include/cudf/io/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 433bb57941f..e28cb978606 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -500,7 +500,7 @@ class json_reader_options { void set_strict_validation(bool val) { _strict_validation = val; } /** - * @brief Set whether leading zeros are allowed in numeric values. strict validation + * @brief Set whether leading zeros are allowed in numeric values. Strict validation * must be enabled for this to work. * * @note: strict_validation must be enabled for this to work. From 35e4b892841ed47e76637636312d775bd256e3b1 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Thu, 5 Sep 2024 16:26:37 -0500 Subject: [PATCH 20/33] Update cpp/include/cudf/io/json.hpp Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/include/cudf/io/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index e28cb978606..fe2cd8603f0 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -511,7 +511,7 @@ class json_reader_options { /** * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, - * Infinity, and -Infinity. strict validation must be enabled for this to work. + * Infinity, and -Infinity. Strict validation must be enabled for this to work. * * @note: strict_validation must be enabled for this to work. * From 368182327d6b274584502fc4c0b479a53b5b33cb Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 5 Sep 2024 21:36:37 +0000 Subject: [PATCH 21/33] address review comments --- cpp/src/io/json/process_tokens.cu | 433 +++++++++++++++--------------- 1 file changed, 215 insertions(+), 218 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 59ea0dd1583..839d9e88c9a 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -43,28 +43,27 @@ struct write_if { if (i == n - 1 or tokens[i + 1] == token_t::LineEnd) { if (x.first == token_t::ErrorBegin and tokens[i] != token_t::ErrorBegin) { tokens[i] = token_t::ErrorBegin; - // printf("writing\n"); } } } }; enum class number_state { - start = 0, - saw_neg, // not a complete state - leading_zero, - whole, - saw_radix, // not a complete state - fraction, - start_exponent, // not a complete state - after_sign_exponent, // not a complete state - exponent + START = 0, + SAW_NEG, // not a complete state + LEADING_ZERO, + WHOLE, + SAW_RADIX, // not a complete state + FRACTION, + START_EXPONENT, // not a complete state + AFTER_SIGN_EXPONENT, // not a complete state + EXPONENT }; enum class string_state { - normal = 0, - escaped, // not a complete state - escaped_u // not a complete state + NORMAL = 0, + ESCAPED, // not a complete state + ESCAPED_U // not a complete state }; __device__ inline bool substr_eq(const char* data, @@ -90,226 +89,224 @@ void validate_token_stream(device_span d_input, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); - if (options.is_strict_validation()) { - using token_t = cudf::io::json::token_t; - cudf::detail::optional_trie trie_na = - cudf::detail::create_serialized_trie(options.get_na_values(), stream); - auto trie_na_view = cudf::detail::make_trie_view(trie_na); - auto validate_values = - [data = d_input.data(), - trie_na = trie_na_view, - allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), - allow_nonnumeric = - options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, - SymbolOffsetT end) -> bool { - // This validates an unquoted value. A value must match https://www.json.org/json-en.html - // but the leading and training whitespace should already have been removed, and is not - // a string - auto c = data[start]; - auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); - if (is_null_literal) { - return true; - } else if ('n' == c) { - return substr_eq(data, start, end, 4, "null"); - } else if ('t' == c) { - return substr_eq(data, start, end, 4, "true"); - } else if ('f' == c) { - return substr_eq(data, start, end, 5, "false"); - } else if (allow_nonnumeric && c == 'N') { - return substr_eq(data, start, end, 3, "NaN"); - } else if (allow_nonnumeric && c == 'I') { - return substr_eq(data, start, end, 8, "Infinity"); - } else if (allow_nonnumeric && c == '+') { - return substr_eq(data, start, end, 4, "+INF") || - substr_eq(data, start, end, 9, "+Infinity"); - } else if ('-' == c || c <= '9' && 'c' >= '0') { - // number - auto num_state = number_state::start; - for (auto at = start; at < end; at++) { - c = data[at]; - switch (num_state) { - case number_state::start: - if ('-' == c) { - num_state = number_state::saw_neg; - } else if ('0' == c) { - num_state = number_state::leading_zero; - } else if (c >= '1' && c <= '9') { - num_state = number_state::whole; - } else { - return false; - } - break; - case number_state::saw_neg: - if ('0' == c) { - num_state = number_state::leading_zero; - } else if (c >= '1' && c <= '9') { - num_state = number_state::whole; - } else if (allow_nonnumeric && 'I' == c) { - return substr_eq(data, start, end, 4, "-INF") || - substr_eq(data, start, end, 9, "-Infinity"); - } else { - return false; - } - break; - case number_state::leading_zero: - if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { - num_state = number_state::whole; - } else if ('.' == c) { - num_state = number_state::saw_radix; - } else if ('e' == c || 'E' == c) { - num_state = number_state::start_exponent; - } else { - return false; - } - break; - case number_state::whole: - if (c >= '0' && c <= '9') { - num_state = number_state::whole; - } else if ('.' == c) { - num_state = number_state::saw_radix; - } else if ('e' == c || 'E' == c) { - num_state = number_state::start_exponent; - } else { - return false; - } - break; - case number_state::saw_radix: - if (c >= '0' && c <= '9') { - num_state = number_state::fraction; - } else if ('e' == c || 'E' == c) { - num_state = number_state::start_exponent; - } else { - return false; - } - break; - case number_state::fraction: - if (c >= '0' && c <= '9') { - num_state = number_state::fraction; - } else if ('e' == c || 'E' == c) { - num_state = number_state::start_exponent; - } else { - return false; - } - break; - case number_state::start_exponent: - if ('+' == c || '-' == c) { - num_state = number_state::after_sign_exponent; - } else if (c >= '0' && c <= '9') { - num_state = number_state::exponent; - } else { - return false; - } - break; - case number_state::after_sign_exponent: - if (c >= '0' && c <= '9') { - num_state = number_state::exponent; - } else { - return false; - } - break; - case number_state::exponent: - if (c >= '0' && c <= '9') { - num_state = number_state::exponent; - } else { - return false; - } - break; - } - } - return num_state != number_state::after_sign_exponent && - num_state != number_state::start_exponent && num_state != number_state::saw_neg && - num_state != number_state::saw_radix; - } else { - return false; - } - }; - - auto validate_strings = - [data = d_input.data(), - allow_unquoted_control_chars = - options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, - SymbolOffsetT end) -> bool { - // This validates a quoted string. A string must match https://www.json.org/json-en.html - // but we already know that it has a starting and ending " and all white space has been - // stripped out. Also the base CUDF validation makes sure escaped chars are correct - // so we only need to worry about unquoted control chars - - auto state = string_state::normal; - auto u_count = 0; - for (SymbolOffsetT idx = start + 1; idx < end; idx++) { - auto c = data[idx]; - if (!allow_unquoted_control_chars && static_cast(c) >= 0 && static_cast(c) < 32) { - return false; - } - - switch (state) { - case string_state::normal: - if (c == '\\') { state = string_state::escaped; } + if (!options.is_strict_validation()) { return; } + using token_t = cudf::io::json::token_t; + cudf::detail::optional_trie trie_na = + cudf::detail::create_serialized_trie(options.get_na_values(), stream); + auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto validate_values = + [data = d_input.data(), + trie_na = trie_na_view, + allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), + allow_nonnumeric = + options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates an unquoted value. A value must match https://www.json.org/json-en.html + // but the leading and training whitespace should already have been removed, and is not + // a string + auto c = data[start]; + auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); + if (is_null_literal) { + return true; + } else if ('n' == c) { + return substr_eq(data, start, end, 4, "null"); + } else if ('t' == c) { + return substr_eq(data, start, end, 4, "true"); + } else if ('f' == c) { + return substr_eq(data, start, end, 5, "false"); + } else if (allow_nonnumeric && c == 'N') { + return substr_eq(data, start, end, 3, "NaN"); + } else if (allow_nonnumeric && c == 'I') { + return substr_eq(data, start, end, 8, "Infinity"); + } else if (allow_nonnumeric && c == '+') { + return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity"); + } else if ('-' == c || c <= '9' && 'c' >= '0') { + // number + auto num_state = number_state::START; + for (auto at = start; at < end; at++) { + c = data[at]; + switch (num_state) { + case number_state::START: + if ('-' == c) { + num_state = number_state::SAW_NEG; + } else if ('0' == c) { + num_state = number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else { + return false; + } break; - case string_state::escaped: - // in Spark you can allow any char to be escaped, but CUDF - // validates it in some cases so we need to also validate it. - if (c == 'u') { - state = string_state::escaped_u; - u_count = 0; - } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || - c == 'r' || c == 't') { - state = string_state::normal; + case number_state::SAW_NEG: + if ('0' == c) { + num_state = number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else if (allow_nonnumeric && 'I' == c) { + return substr_eq(data, start, end, 4, "-INF") || + substr_eq(data, start, end, 9, "-Infinity"); + } else { + return false; + } + break; + case number_state::LEADING_ZERO: + if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.' == c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::WHOLE: + if (c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.' == c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::SAW_RADIX: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::FRACTION: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; } else { return false; } break; - case string_state::escaped_u: - if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { - u_count++; - if (u_count == 4) { - state = string_state::normal; - u_count = 0; - } + case number_state::START_EXPONENT: + if ('+' == c || '-' == c) { + num_state = number_state::AFTER_SIGN_EXPONENT; + } else if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::AFTER_SIGN_EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; } else { return false; } break; } } - return string_state::normal == state; - }; + return num_state != number_state::AFTER_SIGN_EXPONENT && + num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG && + num_state != number_state::SAW_RADIX; + } else { + return false; + } + }; + + auto validate_strings = + [data = d_input.data(), + allow_unquoted_control_chars = + options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, + SymbolOffsetT end) -> bool { + // This validates a quoted string. A string must match https://www.json.org/json-en.html + // but we already know that it has a starting and ending " and all white space has been + // stripped out. Also the base CUDF validation makes sure escaped chars are correct + // so we only need to worry about unquoted control chars + + auto state = string_state::NORMAL; + auto u_count = 0; + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (!allow_unquoted_control_chars && static_cast(c) >= 0 && static_cast(c) < 32) { + return false; + } - auto num_tokens = tokens.size(); - auto count_it = thrust::make_counting_iterator(0); - auto predicate = [tokens = tokens.begin(), - token_indices = token_indices.begin(), - validate_values, - validate_strings] __device__(auto i) -> bool { - if (tokens[i] == token_t::ValueEnd) { - return !validate_values(token_indices[i - 1], token_indices[i]); - } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { - return !validate_strings(token_indices[i - 1], token_indices[i]); + switch (state) { + case string_state::NORMAL: + if (c == '\\') { state = string_state::ESCAPED; } + break; + case string_state::ESCAPED: + // in Spark you can allow any char to be escaped, but CUDF + // validates it in some cases so we need to also validate it. + if (c == 'u') { + state = string_state::ESCAPED_U; + u_count = 0; + } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || + c == 'r' || c == 't') { + state = string_state::NORMAL; + } else { + return false; + } + break; + case string_state::ESCAPED_U: + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + u_count++; + if (u_count == 4) { + state = string_state::NORMAL; + u_count = 0; + } + } else { + return false; + } + break; } - return false; - }; + } + return string_state::NORMAL == state; + }; + + auto num_tokens = tokens.size(); + auto count_it = thrust::make_counting_iterator(0); + auto predicate = [tokens = tokens.begin(), + token_indices = token_indices.begin(), + validate_values, + validate_strings] __device__(auto i) -> bool { + if (tokens[i] == token_t::ValueEnd) { + return !validate_values(token_indices[i - 1], token_indices[i]); + } else if (tokens[i] == token_t::FieldNameEnd || tokens[i] == token_t::StringEnd) { + return !validate_strings(token_indices[i - 1], token_indices[i]); + } + return false; + }; - using scan_type = write_if::scan_type; - auto conditional_write = write_if{tokens.begin(), num_tokens}; - auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); - auto transform_op = cuda::proclaim_return_type( - [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { - if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; - return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; - }); - auto binary_op = cuda::proclaim_return_type( - [] __device__(scan_type prev, scan_type curr) -> scan_type { - auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); - return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); - }); + using scan_type = write_if::scan_type; + auto conditional_write = write_if{tokens.begin(), num_tokens}; + auto conditional_output_it = cudf::detail::make_tabulate_output_iterator(conditional_write); + auto transform_op = cuda::proclaim_return_type( + [predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type { + if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd}; + return {static_cast(tokens[i]), tokens[i] == token_t::LineEnd}; + }); + auto binary_op = cuda::proclaim_return_type( + [] __device__(scan_type prev, scan_type curr) -> scan_type { + auto op_result = (prev.first == token_t::ErrorBegin ? prev.first : curr.first); + return scan_type((curr.second ? curr.first : op_result), prev.second | curr.second); + }); - thrust::transform_inclusive_scan(rmm::exec_policy(stream), - count_it, - count_it + num_tokens, - conditional_output_it, - transform_op, - binary_op); // in-place scan - } + thrust::transform_inclusive_scan(rmm::exec_policy(stream), + count_it, + count_it + num_tokens, + conditional_output_it, + transform_op, + binary_op); // in-place scan } } // namespace detail } // namespace cudf::io::json From 1d897f710e2dad2431bab98f68ee0b8ce09a2696 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 5 Sep 2024 21:37:09 +0000 Subject: [PATCH 22/33] fix doc --- cpp/src/io/json/tabulate_output_iterator.cuh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/src/io/json/tabulate_output_iterator.cuh b/cpp/src/io/json/tabulate_output_iterator.cuh index 46887beefbd..7cf3655e259 100644 --- a/cpp/src/io/json/tabulate_output_iterator.cuh +++ b/cpp/src/io/json/tabulate_output_iterator.cuh @@ -14,7 +14,6 @@ * limitations under the License. */ -// Tabulate Output iterator #pragma once #include @@ -44,7 +43,7 @@ class tabulate_output_iterator_proxy { }; /** - * @brief Transform output iterator with custom binary function which takes index and value. + * @brief Tabulate output iterator with custom binary function which takes index and value. * * @code {.cpp} * #include "tabulate_output_iterator.cuh" From 6bf4d3f3f6e52510f1defd31e70cb3fd7781333f Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Thu, 5 Sep 2024 21:38:24 +0000 Subject: [PATCH 23/33] address review comments --- cpp/src/io/json/process_tokens.cu | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 839d9e88c9a..856fdbdb865 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -72,12 +72,9 @@ __device__ inline bool substr_eq(const char* data, SymbolOffsetT const expected_len, const char* expected) { - if (end - start != expected_len) { - return false; - } else { - for (auto idx = 0; idx < expected_len; idx++) { - if (data[start + idx] != expected[idx]) { return false; } - } + if (end - start != expected_len) { return false; } + for (auto idx = 0; idx < expected_len; idx++) { + if (data[start + idx] != expected[idx]) { return false; } } return true; } From e093d641a8b081a8ceafd4394354af3c0ca9dd86 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 9 Sep 2024 21:31:53 +0000 Subject: [PATCH 24/33] address review comments --- cpp/include/cudf/io/json.hpp | 27 ++++++++++++++++++++------- cpp/src/io/json/process_tokens.cu | 8 ++++---- cpp/tests/io/json/json_test.cpp | 4 ++-- 3 files changed, 26 insertions(+), 13 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index fe2cd8603f0..21466216a86 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -20,6 +20,7 @@ #include #include +#include #include #include @@ -131,7 +132,7 @@ class json_reader_options { json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; // Validation checks for spark - // Should the json validation be strict of not + // Should the json validation be strict or not bool _strict_validation = false; // Allow leading zeros for numeric values. bool _allow_numeric_leading_zeros = true; @@ -503,32 +504,44 @@ class json_reader_options { * @brief Set whether leading zeros are allowed in numeric values. Strict validation * must be enabled for this to work. * - * @note: strict_validation must be enabled for this to work. + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values */ - void allow_numeric_leading_zeros(bool val) { _allow_numeric_leading_zeros = val; } + void allow_numeric_leading_zeros(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_numeric_leading_zeros = val; + } /** * @brief Set whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, * Infinity, and -Infinity. Strict validation must be enabled for this to work. * - * @note: strict_validation must be enabled for this to work. + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values */ - void allow_nonnumeric_numbers(bool val) { _allow_nonnumeric_numbers = val; } + void allow_nonnumeric_numbers(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_nonnumeric_numbers = val; + } /** * @brief Set whether in a quoted string should characters greater than or equal to 0 * and less than 32 be allowed without some form of escaping. Strict validation must * be enabled for this to work. * - * @note: strict_validation must be enabled for this to work. + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. * * @param val true to indicate whether unquoted control chars are allowed. */ - void allow_unquoted_control_chars(bool val) { _allow_unquoted_control_chars = val; } + void allow_unquoted_control_chars(bool val) + { + CUDF_EXPECTS(_strict_validation, "Strict validation must be enabled for this to work."); + _allow_unquoted_control_chars = val; + } /** * @brief Sets additional values to recognize as null values. diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 856fdbdb865..4558dd6014b 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -91,7 +91,7 @@ void validate_token_stream(device_span d_input, cudf::detail::optional_trie trie_na = cudf::detail::create_serialized_trie(options.get_na_values(), stream); auto trie_na_view = cudf::detail::make_trie_view(trie_na); - auto validate_values = + auto validate_values = cuda::proclaim_return_type( [data = d_input.data(), trie_na = trie_na_view, allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), @@ -217,9 +217,9 @@ void validate_token_stream(device_span d_input, } else { return false; } - }; + }); - auto validate_strings = + auto validate_strings = cuda::proclaim_return_type( [data = d_input.data(), allow_unquoted_control_chars = options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, @@ -268,7 +268,7 @@ void validate_token_stream(device_span d_input, } } return string_state::NORMAL == state; - }; + }); auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 212538fd891..960c19fce2e 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2239,9 +2239,9 @@ TEST_F(JsonReaderTest, ValueValidation) cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) .lines(true) .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .strict_validation(true) .numeric_leading_zeros(false) - .na_values({"nan"}) - .strict_validation(true); + .na_values({"nan"}); cudf::io::table_with_metadata result = cudf::io::read_json(in_options); EXPECT_EQ(result.tbl->num_columns(), 4); From 00ef690834b62fc165b5800c31f393034e6d0401 Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Mon, 9 Sep 2024 21:35:16 +0000 Subject: [PATCH 25/33] rename lambda name --- cpp/src/io/json/process_tokens.cu | 321 +++++++++++++++--------------- 1 file changed, 161 insertions(+), 160 deletions(-) diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index 4558dd6014b..db30bd9d7e2 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -90,185 +90,186 @@ void validate_token_stream(device_span d_input, using token_t = cudf::io::json::token_t; cudf::detail::optional_trie trie_na = cudf::detail::create_serialized_trie(options.get_na_values(), stream); - auto trie_na_view = cudf::detail::make_trie_view(trie_na); - auto validate_values = cuda::proclaim_return_type( + auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto validate_numeric_values = cuda::proclaim_return_type( [data = d_input.data(), trie_na = trie_na_view, allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), allow_nonnumeric = options.is_allowed_nonnumeric_numbers()] __device__(SymbolOffsetT start, SymbolOffsetT end) -> bool { - // This validates an unquoted value. A value must match https://www.json.org/json-en.html - // but the leading and training whitespace should already have been removed, and is not - // a string - auto c = data[start]; - auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); - if (is_null_literal) { - return true; - } else if ('n' == c) { - return substr_eq(data, start, end, 4, "null"); - } else if ('t' == c) { - return substr_eq(data, start, end, 4, "true"); - } else if ('f' == c) { - return substr_eq(data, start, end, 5, "false"); - } else if (allow_nonnumeric && c == 'N') { - return substr_eq(data, start, end, 3, "NaN"); - } else if (allow_nonnumeric && c == 'I') { - return substr_eq(data, start, end, 8, "Infinity"); - } else if (allow_nonnumeric && c == '+') { - return substr_eq(data, start, end, 4, "+INF") || substr_eq(data, start, end, 9, "+Infinity"); - } else if ('-' == c || c <= '9' && 'c' >= '0') { - // number - auto num_state = number_state::START; - for (auto at = start; at < end; at++) { - c = data[at]; - switch (num_state) { - case number_state::START: - if ('-' == c) { - num_state = number_state::SAW_NEG; - } else if ('0' == c) { - num_state = number_state::LEADING_ZERO; - } else if (c >= '1' && c <= '9') { - num_state = number_state::WHOLE; - } else { - return false; - } - break; - case number_state::SAW_NEG: - if ('0' == c) { - num_state = number_state::LEADING_ZERO; - } else if (c >= '1' && c <= '9') { - num_state = number_state::WHOLE; - } else if (allow_nonnumeric && 'I' == c) { - return substr_eq(data, start, end, 4, "-INF") || - substr_eq(data, start, end, 9, "-Infinity"); - } else { - return false; - } - break; - case number_state::LEADING_ZERO: - if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { - num_state = number_state::WHOLE; - } else if ('.' == c) { - num_state = number_state::SAW_RADIX; - } else if ('e' == c || 'E' == c) { - num_state = number_state::START_EXPONENT; - } else { - return false; - } - break; - case number_state::WHOLE: - if (c >= '0' && c <= '9') { - num_state = number_state::WHOLE; - } else if ('.' == c) { - num_state = number_state::SAW_RADIX; - } else if ('e' == c || 'E' == c) { - num_state = number_state::START_EXPONENT; - } else { - return false; - } - break; - case number_state::SAW_RADIX: - if (c >= '0' && c <= '9') { - num_state = number_state::FRACTION; - } else if ('e' == c || 'E' == c) { - num_state = number_state::START_EXPONENT; - } else { - return false; - } - break; - case number_state::FRACTION: - if (c >= '0' && c <= '9') { - num_state = number_state::FRACTION; - } else if ('e' == c || 'E' == c) { - num_state = number_state::START_EXPONENT; - } else { - return false; - } - break; - case number_state::START_EXPONENT: - if ('+' == c || '-' == c) { - num_state = number_state::AFTER_SIGN_EXPONENT; - } else if (c >= '0' && c <= '9') { - num_state = number_state::EXPONENT; - } else { - return false; - } - break; - case number_state::AFTER_SIGN_EXPONENT: - if (c >= '0' && c <= '9') { - num_state = number_state::EXPONENT; - } else { - return false; - } - break; - case number_state::EXPONENT: - if (c >= '0' && c <= '9') { - num_state = number_state::EXPONENT; - } else { - return false; - } - break; + // This validates an unquoted value. A value must match https://www.json.org/json-en.html + // but the leading and training whitespace should already have been removed, and is not + // a string + auto c = data[start]; + auto is_null_literal = serialized_trie_contains(trie_na, {data + start, end - start}); + if (is_null_literal) { + return true; + } else if ('n' == c) { + return substr_eq(data, start, end, 4, "null"); + } else if ('t' == c) { + return substr_eq(data, start, end, 4, "true"); + } else if ('f' == c) { + return substr_eq(data, start, end, 5, "false"); + } else if (allow_nonnumeric && c == 'N') { + return substr_eq(data, start, end, 3, "NaN"); + } else if (allow_nonnumeric && c == 'I') { + return substr_eq(data, start, end, 8, "Infinity"); + } else if (allow_nonnumeric && c == '+') { + return substr_eq(data, start, end, 4, "+INF") || + substr_eq(data, start, end, 9, "+Infinity"); + } else if ('-' == c || c <= '9' && 'c' >= '0') { + // number + auto num_state = number_state::START; + for (auto at = start; at < end; at++) { + c = data[at]; + switch (num_state) { + case number_state::START: + if ('-' == c) { + num_state = number_state::SAW_NEG; + } else if ('0' == c) { + num_state = number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else { + return false; + } + break; + case number_state::SAW_NEG: + if ('0' == c) { + num_state = number_state::LEADING_ZERO; + } else if (c >= '1' && c <= '9') { + num_state = number_state::WHOLE; + } else if (allow_nonnumeric && 'I' == c) { + return substr_eq(data, start, end, 4, "-INF") || + substr_eq(data, start, end, 9, "-Infinity"); + } else { + return false; + } + break; + case number_state::LEADING_ZERO: + if (allow_numeric_leading_zeros && c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.' == c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::WHOLE: + if (c >= '0' && c <= '9') { + num_state = number_state::WHOLE; + } else if ('.' == c) { + num_state = number_state::SAW_RADIX; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::SAW_RADIX: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::FRACTION: + if (c >= '0' && c <= '9') { + num_state = number_state::FRACTION; + } else if ('e' == c || 'E' == c) { + num_state = number_state::START_EXPONENT; + } else { + return false; + } + break; + case number_state::START_EXPONENT: + if ('+' == c || '-' == c) { + num_state = number_state::AFTER_SIGN_EXPONENT; + } else if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::AFTER_SIGN_EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + case number_state::EXPONENT: + if (c >= '0' && c <= '9') { + num_state = number_state::EXPONENT; + } else { + return false; + } + break; + } } + return num_state != number_state::AFTER_SIGN_EXPONENT && + num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG && + num_state != number_state::SAW_RADIX; + } else { + return false; } - return num_state != number_state::AFTER_SIGN_EXPONENT && - num_state != number_state::START_EXPONENT && num_state != number_state::SAW_NEG && - num_state != number_state::SAW_RADIX; - } else { - return false; - } - }); + }); auto validate_strings = cuda::proclaim_return_type( [data = d_input.data(), allow_unquoted_control_chars = options.is_allowed_unquoted_control_chars()] __device__(SymbolOffsetT start, SymbolOffsetT end) -> bool { - // This validates a quoted string. A string must match https://www.json.org/json-en.html - // but we already know that it has a starting and ending " and all white space has been - // stripped out. Also the base CUDF validation makes sure escaped chars are correct - // so we only need to worry about unquoted control chars + // This validates a quoted string. A string must match https://www.json.org/json-en.html + // but we already know that it has a starting and ending " and all white space has been + // stripped out. Also the base CUDF validation makes sure escaped chars are correct + // so we only need to worry about unquoted control chars - auto state = string_state::NORMAL; - auto u_count = 0; - for (SymbolOffsetT idx = start + 1; idx < end; idx++) { - auto c = data[idx]; - if (!allow_unquoted_control_chars && static_cast(c) >= 0 && static_cast(c) < 32) { - return false; - } + auto state = string_state::NORMAL; + auto u_count = 0; + for (SymbolOffsetT idx = start + 1; idx < end; idx++) { + auto c = data[idx]; + if (!allow_unquoted_control_chars && static_cast(c) >= 0 && static_cast(c) < 32) { + return false; + } - switch (state) { - case string_state::NORMAL: - if (c == '\\') { state = string_state::ESCAPED; } - break; - case string_state::ESCAPED: - // in Spark you can allow any char to be escaped, but CUDF - // validates it in some cases so we need to also validate it. - if (c == 'u') { - state = string_state::ESCAPED_U; - u_count = 0; - } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || - c == 'r' || c == 't') { - state = string_state::NORMAL; - } else { - return false; - } - break; - case string_state::ESCAPED_U: - if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { - u_count++; - if (u_count == 4) { - state = string_state::NORMAL; + switch (state) { + case string_state::NORMAL: + if (c == '\\') { state = string_state::ESCAPED; } + break; + case string_state::ESCAPED: + // in Spark you can allow any char to be escaped, but CUDF + // validates it in some cases so we need to also validate it. + if (c == 'u') { + state = string_state::ESCAPED_U; u_count = 0; + } else if (c == '"' || c == '\\' || c == '/' || c == 'b' || c == 'f' || c == 'n' || + c == 'r' || c == 't') { + state = string_state::NORMAL; + } else { + return false; } - } else { - return false; - } - break; + break; + case string_state::ESCAPED_U: + if ((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')) { + u_count++; + if (u_count == 4) { + state = string_state::NORMAL; + u_count = 0; + } + } else { + return false; + } + break; + } } - } - return string_state::NORMAL == state; - }); + return string_state::NORMAL == state; + }); auto num_tokens = tokens.size(); auto count_it = thrust::make_counting_iterator(0); From cecb42fe276783582f1f958c4fa88afce612e5ad Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 9 Sep 2024 22:05:03 -0500 Subject: [PATCH 26/33] Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/include/cudf/io/json.hpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 21466216a86..faa871638ba 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -133,10 +133,11 @@ class json_reader_options { // Validation checks for spark // Should the json validation be strict or not + // Note: strict validation enforces the JSON specification https://www.json.org/json-en.html bool _strict_validation = false; // Allow leading zeros for numeric values. bool _allow_numeric_leading_zeros = true; - // Allow nonnumeric numbers. NaN/Inf + // Allow non-numeric numbers: NaN, +INF, -INF, +Infinity, Infinity, -Infinity bool _allow_nonnumeric_numbers = true; // Allow unquoted control characters bool _allow_unquoted_control_chars = true; @@ -321,10 +322,9 @@ class json_reader_options { [[nodiscard]] bool is_strict_validation() const { return _strict_validation; } /** - * @brief Whether leading zeros are allowed in numeric values. strict validation - * must be enabled for this to work. + * @brief Whether leading zeros are allowed in numeric values. * - * @note: strict_validation must be enabled for this to work. + * @note: This validation is enforced only if strict validation is enabled. * * @return true if leading zeros are allowed in numeric values */ @@ -335,9 +335,9 @@ class json_reader_options { /** * @brief Whether unquoted number values should be allowed NaN, +INF, -INF, +Infinity, Infinity, - * and -Infinity. strict validation must be enabled for this to work. + * and -Infinity. * - * @note: strict_validation must be enabled for this to work. + * @note: This validation is enforced only if strict validation is enabled. * * @return true if leading zeros are allowed in numeric values */ @@ -345,9 +345,9 @@ class json_reader_options { /** * @brief Whether in a quoted string should characters greater than or equal to 0 and less than 32 - * be allowed without some form of escaping. Strict validation must be enabled for this to work. + * be allowed without some form of escaping. * - * @note: strict_validation must be enabled for this to work. + * @note: This validation is enforced only if strict validation is enabled. * * @return true if unquoted control chars are allowed. */ From 9cd30984ccdc93d2bb1e8192b6166bfc53bd96ac Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Mon, 9 Sep 2024 22:05:21 -0500 Subject: [PATCH 27/33] Apply suggestions from code review Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- java/src/main/java/ai/rapids/cudf/JSONOptions.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 4fcaf1e4a6b..c8308ca17ec 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -34,8 +34,8 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean normalizeWhitespace; private final boolean mixedTypesAsStrings; private final boolean keepStringQuotes; - private final boolean allowLeadingZeros; private final boolean strictValidation; + private final boolean allowLeadingZeros; private final boolean allowNonNumericNumbers; private final boolean allowUnquotedControlChars; From c816c73acb71cd8f6dfe64bccfd4290fbe6946aa Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 10 Sep 2024 03:06:56 +0000 Subject: [PATCH 28/33] update docs --- cpp/include/cudf/io/json.hpp | 6 +++--- cpp/src/io/json/process_tokens.cu | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index faa871638ba..d88f1c0f74c 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -776,7 +776,7 @@ class json_reader_options_builder { * @brief Set Whether leading zeros are allowed in numeric values. strict validation must * be enabled for this to have any effect. * - * @note: strict_validation must be enabled for this to work. + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. * * @param val Boolean value to indicate whether leading zeros are allowed in numeric values * @return this for chaining @@ -792,7 +792,7 @@ class json_reader_options_builder { * +INF, -INF, +Infinity, Infinity, and -Infinity. * strict validation must be enabled for this to have any effect. * - * @note: strict_validation must be enabled for this to work. + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. * * @param val Boolean value to indicate if unquoted nonnumeric values are valid json or not. * @return this for chaining @@ -807,7 +807,7 @@ class json_reader_options_builder { * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without * some form of escaping. strict validation must be enabled for this to have any effect. * - * @note: strict_validation must be enabled for this to work. + * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. * * @param val Boolean value to indicate if unquoted control chars are allowed or not. * @return this for chaining diff --git a/cpp/src/io/json/process_tokens.cu b/cpp/src/io/json/process_tokens.cu index db30bd9d7e2..83c7b663980 100644 --- a/cpp/src/io/json/process_tokens.cu +++ b/cpp/src/io/json/process_tokens.cu @@ -90,8 +90,8 @@ void validate_token_stream(device_span d_input, using token_t = cudf::io::json::token_t; cudf::detail::optional_trie trie_na = cudf::detail::create_serialized_trie(options.get_na_values(), stream); - auto trie_na_view = cudf::detail::make_trie_view(trie_na); - auto validate_numeric_values = cuda::proclaim_return_type( + auto trie_na_view = cudf::detail::make_trie_view(trie_na); + auto validate_values = cuda::proclaim_return_type( [data = d_input.data(), trie_na = trie_na_view, allow_numeric_leading_zeros = options.is_allowed_numeric_leading_zeros(), From 53db70330afeb9b880f98fc708daac61ec74f1aa Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 9 Sep 2024 22:06:14 -0600 Subject: [PATCH 29/33] Update cpp/include/cudf/io/json.hpp --- cpp/include/cudf/io/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index d88f1c0f74c..6482bb7e7fd 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -773,7 +773,7 @@ class json_reader_options_builder { } /** - * @brief Set Whether leading zeros are allowed in numeric values. strict validation must + * @brief Set Whether leading zeros are allowed in numeric values. Strict validation must * be enabled for this to have any effect. * * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. From c3832b6cfa8aa96f8a8fc024fc5540d354c87e55 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 9 Sep 2024 22:06:21 -0600 Subject: [PATCH 30/33] Update cpp/include/cudf/io/json.hpp --- cpp/include/cudf/io/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 6482bb7e7fd..7606ed1db83 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -790,7 +790,7 @@ class json_reader_options_builder { /** * @brief Set whether specific unquoted number values are valid JSON. The values are NaN, * +INF, -INF, +Infinity, Infinity, and -Infinity. - * strict validation must be enabled for this to have any effect. + * Strict validation must be enabled for this to have any effect. * * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. * From 070263e86d86e21a3bc0a584087187a583192758 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Mon, 9 Sep 2024 22:06:26 -0600 Subject: [PATCH 31/33] Update cpp/include/cudf/io/json.hpp --- cpp/include/cudf/io/json.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 7606ed1db83..f52ec926f49 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -805,7 +805,7 @@ class json_reader_options_builder { /** * @brief Set whether chars >= 0 and < 32 are allowed in a quoted string without - * some form of escaping. strict validation must be enabled for this to have any effect. + * some form of escaping. Strict validation must be enabled for this to have any effect. * * @throw cudf::logic_error if `strict_validation` is not enabled before setting this option. * From fb0e85f53dd569dde12e6321ea5f5655cdaa100d Mon Sep 17 00:00:00 2001 From: Karthikeyan Natarajan Date: Tue, 10 Sep 2024 19:50:47 +0000 Subject: [PATCH 32/33] fix strict_validation dependent options with if --- java/src/main/native/src/TableJni.cpp | 33 ++++++++++++++++----------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 2238195fa9d..f265547d6a7 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1646,11 +1646,12 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) .strict_validation(strict_validation) - .numeric_leading_zeros(allow_leading_zeros) - .nonnumeric_numbers(allow_nonnumeric_numbers) - .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); - + if (strict_validation) { + opt.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1697,11 +1698,13 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .normalize_single_quotes(static_cast(normalize_single_quotes)) .normalize_whitespace(static_cast(normalize_whitespace)) .strict_validation(strict_validation) - .numeric_leading_zeros(allow_leading_zeros) - .nonnumeric_numbers(allow_nonnumeric_numbers) - .unquoted_control_chars(allow_unquoted_control) .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); + if (strict_validation) { + opt.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } auto result = std::make_unique(cudf::io::read_json(opts.build())); @@ -1845,10 +1848,12 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) .strict_validation(strict_validation) - .numeric_leading_zeros(allow_leading_zeros) - .nonnumeric_numbers(allow_nonnumeric_numbers) - .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); + if (strict_validation) { + opt.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { @@ -1952,10 +1957,12 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .normalize_whitespace(static_cast(normalize_whitespace)) .mixed_types_as_string(mixed_types_as_string) .strict_validation(strict_validation) - .numeric_leading_zeros(allow_leading_zeros) - .nonnumeric_numbers(allow_nonnumeric_numbers) - .unquoted_control_chars(allow_unquoted_control) .keep_quotes(keep_quotes); + if (strict_validation) { + opt.numeric_leading_zeros(allow_leading_zeros) + .nonnumeric_numbers(allow_nonnumeric_numbers) + .unquoted_control_chars(allow_unquoted_control); + } if (!n_types.is_null()) { if (n_types.size() != n_scales.size()) { From 252c38b660be40bb37cc93cabba005cf4a344db1 Mon Sep 17 00:00:00 2001 From: Karthikeyan <6488848+karthikeyann@users.noreply.github.com> Date: Tue, 10 Sep 2024 17:03:04 -0500 Subject: [PATCH 33/33] fix typo --- java/src/main/native/src/TableJni.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index a687f0bdb79..40a111209b0 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1649,7 +1649,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSONFromDataSource(JNIEnv* env, .strict_validation(strict_validation) .keep_quotes(keep_quotes); if (strict_validation) { - opt.numeric_leading_zeros(allow_leading_zeros) + opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) .unquoted_control_chars(allow_unquoted_control); } @@ -1702,7 +1702,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .mixed_types_as_string(mixed_types_as_string) .keep_quotes(keep_quotes); if (strict_validation) { - opt.numeric_leading_zeros(allow_leading_zeros) + opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) .unquoted_control_chars(allow_unquoted_control); } @@ -1851,7 +1851,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .strict_validation(strict_validation) .keep_quotes(keep_quotes); if (strict_validation) { - opt.numeric_leading_zeros(allow_leading_zeros) + opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) .unquoted_control_chars(allow_unquoted_control); } @@ -1960,7 +1960,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .strict_validation(strict_validation) .keep_quotes(keep_quotes); if (strict_validation) { - opt.numeric_leading_zeros(allow_leading_zeros) + opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) .unquoted_control_chars(allow_unquoted_control); }