diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 5e8d13aa32d..5ab68bb8e9d 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -382,6 +382,7 @@ add_library( src/io/json/read_json.cu src/io/json/legacy/json_gpu.cu src/io/json/legacy/reader_impl.cu + src/io/json/parser_features.cpp src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 593dd044d51..1f2628deea7 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -333,6 +333,7 @@ class json_reader_options { /** * @brief Set whether to parse mixed types as a string column. + * Also enables forcing a struct to be read as a string column when the schema gives it a string type. * * @param val Boolean value to enable/disable parsing mixed types as a string column */ @@ -491,6 +492,7 @@ class json_reader_options_builder { /** * @brief Set whether to parse mixed types as a string column. + * Also enables forcing a struct to be read as a string column when the schema gives it a string type. * * @param val Boolean value to enable/disable parsing mixed types as a string column * @return this for chaining diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index 10646fad354..1cf4b55120f 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -496,15 +496,16 @@ void make_device_json_column(device_span input, rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); NodeIndexT const row_array_parent_col_id = [&]() { - if (!is_array_of_arrays) return parent_node_sentinel; - auto const list_node_index = is_enabled_lines ? 0 : 1; - NodeIndexT value; - CUDF_CUDA_TRY(cudaMemcpyAsync(&value, - col_ids.data() + list_node_index, - sizeof(NodeIndexT), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); + NodeIndexT value = parent_node_sentinel; + if (!col_ids.empty()) { + auto const list_node_index = is_enabled_lines ? 
0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + col_ids.data() + list_node_index, + sizeof(NodeIndexT), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + } return value; }(); @@ -592,6 +593,12 @@ void make_device_json_column(device_span input, col.column_order.clear(); }; + path_from_tree tree_path{column_categories, + column_parent_ids, + column_names, + is_array_of_arrays, + row_array_parent_col_id}; + // 2. generate nested columns tree and its device_memory // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. auto h_range_col_id_it = @@ -642,6 +649,7 @@ void make_device_json_column(device_span input, ignore_vals[this_col_id] = 1; continue; } + // If the child is already found, // replace if this column is a nested column and the existing was a value column // ignore this column if this column is a value column and the existing was a nested column @@ -700,6 +708,17 @@ void make_device_json_column(device_span input, "A mix of lists and structs within the same column is not supported"); } } + if (is_enabled_mixed_types_as_string) { + // Get the path of this column; if the schema types a struct at that path as STRING, read it as a string + auto nt = tree_path.get_path(this_col_id); + std::optional user_dt = get_path_data_type(nt, options); + if (column_categories[this_col_id] == NC_STRUCT and user_dt.has_value() and + user_dt.value().id() == type_id::STRING) { + is_mixed_type_column[this_col_id] = 1; + column_categories[this_col_id] = NC_STR; + } + } + CUDF_EXPECTS(parent_col.child_columns.count(name) == 0, "duplicate column name: " + name); // move into parent device_json_column col(stream, mr); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index f41b024bb1e..8191c13e9f6 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -307,6 +307,32 @@ table_with_metadata device_parse_nested_json(device_span input, rmm::cuda_stream_view stream, 
rmm::mr::device_memory_resource* mr); +/** + * @brief Get the data type of a column, looked up by its path, if present in the input schema + * + * @param path Path of the column (front is the leaf, back is the root) + * @param options JSON reader options which hold the schema + * @return Data type of the column if present in the schema, otherwise an empty optional + */ +std::optional get_path_data_type( + host_span> path, + cudf::io::json_reader_options const& options); + +/** + * @brief Helper class to get the path of a column, by column id, from the reduced column tree + * + */ +struct path_from_tree { + host_span column_categories; + host_span column_parent_ids; + host_span column_names; + bool is_array_of_arrays; + NodeIndexT const row_array_parent_col_id; + + using path_rep = std::pair; + std::vector get_path(NodeIndexT this_col_id); +}; + /** * @brief Parses the given JSON string and generates table from the given input. * diff --git a/cpp/src/io/json/parser_features.cpp b/cpp/src/io/json/parser_features.cpp new file mode 100644 index 00000000000..740b7523cc1 --- /dev/null +++ b/cpp/src/io/json/parser_features.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "nested_json.hpp" + +#include + +#include +#include +#include + +namespace cudf::io::json::detail { + +std::optional child_schema_element(std::string const& col_name, + cudf::io::json_reader_options const& options) +{ + return std::visit( + cudf::detail::visitor_overload{ + [col_name](std::vector const& user_dtypes) -> std::optional { + auto column_index = atol(col_name.data()); + return (static_cast(column_index) < user_dtypes.size()) + ? std::optional{{user_dtypes[column_index]}} + : std::optional{}; + }, + [col_name]( + std::map const& user_dtypes) -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? std::optional{{user_dtypes.find(col_name)->second}} + : std::optional{}; + }, + [col_name]( + std::map const& user_dtypes) -> std::optional { + return (user_dtypes.find(col_name) != std::end(user_dtypes)) + ? user_dtypes.find(col_name)->second + : std::optional{}; + }}, + options.get_dtypes()); +} + +// Example schemas and the corresponding paths. +// "a": int {"a", int} +// "a": [ int ] {"a", list}, {"element", int} +// "a": { "b": int} {"a", struct}, {"b", int} +// "a": [ {"b": int }] {"a", list}, {"element", struct}, {"b", int} +// "a": [ null] {"a", list}, {"element", str} +// path.back() is the root. +// path.front() is the leaf. +std::optional get_path_data_type( + host_span> path, schema_element const& root) +{ + if (path.empty() || path.size() == 1) { + return root.type; + } else { + if (path.back().second == NC_STRUCT && root.type.id() == type_id::STRUCT) { + auto const child_name = path.first(path.size() - 1).back().first; + auto const child_schema_it = root.child_types.find(child_name); + return (child_schema_it != std::end(root.child_types)) + ? 
get_path_data_type(path.first(path.size() - 1), child_schema_it->second) + : std::optional{}; + } else if (path.back().second == NC_LIST && root.type.id() == type_id::LIST) { + auto const child_schema_it = root.child_types.find(list_child_name); + return (child_schema_it != std::end(root.child_types)) + ? get_path_data_type(path.first(path.size() - 1), child_schema_it->second) + : std::optional{}; + } + return std::optional{}; + } +} + +std::optional get_path_data_type( + host_span> path, + cudf::io::json_reader_options const& options) +{ + if (path.empty()) return {}; + std::optional col_schema = child_schema_element(path.back().first, options); + // If the root column has a schema entry, resolve the rest of the path recursively. + if (col_schema.has_value()) { + return get_path_data_type(path, col_schema.value()); + } else { + return {}; + } +} + +// Idea: memoize the paths (using a template and a lambda?), then build them recursively. +std::vector path_from_tree::get_path(NodeIndexT this_col_id) +{ + std::vector path; + // TODO: Need to stop at the row root. So, how do we find the row root? + while (this_col_id != parent_node_sentinel) { + auto type = column_categories[this_col_id]; + std::string name = ""; + // TODO: Move this if/else into a separate lambda function, along with parent_col_id. 
+ auto parent_col_id = column_parent_ids[this_col_id]; + if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { + if (is_array_of_arrays && parent_col_id == row_array_parent_col_id) { + name = column_names[this_col_id]; + } else { + name = list_child_name; + } + } else if (column_categories[parent_col_id] == NC_FN) { + auto field_name_col_id = parent_col_id; + parent_col_id = column_parent_ids[parent_col_id]; + name = column_names[field_name_col_id]; + } + // Record this node as a {name, type} pair + path.emplace_back(name, type); + this_col_id = parent_col_id; + if (this_col_id == row_array_parent_col_id) return path; + } + return {}; +} + +} // namespace cudf::io::json::detail diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 450ea550e99..0b70e5e3f93 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -2239,4 +2239,56 @@ TEST_F(JsonReaderTest, MixedTypes) expected_list); } +TEST_F(JsonReaderTest, MapTypes) +{ + using cudf::type_id; + // Test helper for mixed types in JSON (for the Spark JSON reader) + auto test_fn = [](std::string_view json_string, bool lines, std::vector types) { + std::map dtype_schema{ + {"foo1", {data_type{type_id::STRING}}}, // a list is not forced to a string + {"foo2", {data_type{type_id::STRING}}}, // a struct is forced to a string + {"1", {data_type{type_id::STRING}}}, + {"2", {data_type{type_id::STRING}}}, + {"bar", {dtype()}}, + }; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.data(), json_string.size()}) + .dtypes(dtype_schema) + .mixed_types_as_string(true) + .lines(lines); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + EXPECT_EQ(result.tbl->num_columns(), types.size()); + int i = 0; + for (auto& col : result.tbl->view()) { + EXPECT_EQ(col.type().id(), types[i]) << "column[" << i << "].type"; + i++; + } + std::cout << "\n"; + }; + + // JSON: 
array of objects + test_fn(R"([{ "foo1": [1,2,3], 
"bar": 123 }, + { "foo2": { "a": 1 }, "bar": 456 }])", + false, + {type_id::LIST, type_id::INT32, type_id::STRING}); + // JSON Lines: one object per line + test_fn(R"( { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 })", + true, + {type_id::LIST, type_id::INT32, type_id::STRING}); + // JSON Lines: one array per line + test_fn(R"([123, [1,2,3]] + [456, null, { "a": 1 }])", + true, + {type_id::INT64, type_id::LIST, type_id::STRING}); + // JSON: array of arrays + test_fn(R"([[[1,2,3], null, 123], + [null, { "a": 1 }, 456 ]])", + false, + {type_id::LIST, type_id::STRING, type_id::STRING}); +} + CUDF_TEST_PROGRAM_MAIN()