Skip to content

Commit

Permalink
JSON - Parse mixed types as string in JSON reader (#14572)
Browse files Browse the repository at this point in the history
Addresses #14239




This PR adds an option to read mixed types as string columns.
It also adds the related functional changes to the nested JSON reader (libcudf, cuDF-python, Java).

Details:
- Added new option `mixed_types_as_string` bool in json_reader_options
- This feature requires two things: finding the end of struct/list nodes, and parsing struct/list types as strings.
- For Struct and List nodes, node_range_end was previously set to node_range_begin+1 (since it was not used anywhere). It is now calculated properly: only the struct and list tokens are copied, and their node_range_end is computed from them. (Since the end token is a child of the begin token, scattering the end token's index to the node_range_end of the node corresponding to its parent token yields the node_range_end of List and Struct nodes.)
- In `reduce_to_column_tree()` (which infers the schema), the node_range_end of list and struct nodes is reset to node_range_begin+1 so that the entire list/struct strings are not copied to host for column names.
- `reinitialize_as_string` reinitializes an initialized column as string.
- Mixed type columns are parsed as strings since their column category is changed to `NC_STR`.
- Added tests

Authors:
  - Karthikeyan (https://github.com/karthikeyann)
  - Andy Grove (https://github.com/andygrove)

Approvers:
  - Andy Grove (https://github.com/andygrove)
  - Jason Lowe (https://github.com/jlowe)
  - Elias Stehle (https://github.com/elstehle)
  - Bradley Dice (https://github.com/bdice)
  - Shruti Shivakumar (https://github.com/shrshi)

URL: #14572
  • Loading branch information
karthikeyann authored Jan 22, 2024
1 parent d1c0e25 commit 8fdc62b
Show file tree
Hide file tree
Showing 15 changed files with 472 additions and 47 deletions.
30 changes: 29 additions & 1 deletion cpp/include/cudf/io/json.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -98,6 +98,8 @@ class json_reader_options {

// Read the file as a json object per line
bool _lines = false;
// Parse mixed types as a string column
bool _mixed_types_as_string = false;

// Bytes to skip from the start
size_t _byte_range_offset = 0;
Expand Down Expand Up @@ -225,6 +227,13 @@ class json_reader_options {
*/
bool is_enabled_lines() const { return _lines; }

/**
 * @brief Reports whether columns holding mixed types are read as a string column.
 *
 * @return `true` when parsing mixed types as a string column is enabled, `false` otherwise
 */
bool is_enabled_mixed_types_as_string() const
{
  return _mixed_types_as_string;
}

/**
* @brief Whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -302,6 +311,13 @@ class json_reader_options {
*/
void enable_lines(bool val) { _lines = val; }

/**
 * @brief Enables or disables reading mixed types as a string column.
 *
 * @param val `true` to parse mixed types as a string column, `false` to turn the behavior off
 */
void enable_mixed_types_as_string(bool val)
{
  _mixed_types_as_string = val;
}

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down Expand Up @@ -437,6 +453,18 @@ class json_reader_options_builder {
return *this;
}

/**
 * @brief Chooses whether mixed types are read as a string column.
 *
 * @param val `true` to parse mixed types as a string column, `false` to turn the behavior off
 * @return Reference to this builder, so that calls can be chained
 */
json_reader_options_builder& mixed_types_as_string(bool val)
{
  // Route through the public setter so the builder and the options object share one code path.
  options.enable_mixed_types_as_string(val);
  return *this;
}

/**
* @brief Set whether to parse dates as DD/MM versus MM/DD.
*
Expand Down
109 changes: 96 additions & 13 deletions cpp/src/io/json/json_column.cu
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,16 @@ reduce_to_column_tree(tree_meta_t& tree,
return is_non_list_parent(parent_col_id);
});

// For Struct and List (to avoid copying entire strings when mixed type as string is enabled)
thrust::transform_if(
rmm::exec_policy(stream),
col_range_begin.begin(),
col_range_begin.end(),
column_categories.begin(),
col_range_end.begin(),
[] __device__(auto i) { return i + 1; },
[] __device__(NodeT type) { return type == NC_STRUCT || type == NC_LIST; });

return std::tuple{tree_meta_t{std::move(column_categories),
std::move(parent_col_ids),
std::move(column_levels),
Expand Down Expand Up @@ -407,6 +417,7 @@ struct json_column_data {
* @param root Root node of the `d_json_column` tree
* @param is_array_of_arrays Whether the tree is an array of arrays
* @param is_enabled_lines Whether the input is a line-delimited JSON
* @param is_enabled_mixed_types_as_string Whether to enable reading mixed types as string
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the device memory
* of child_offsets and validity members of `d_json_column`
Expand All @@ -418,6 +429,7 @@ void make_device_json_column(device_span<SymbolT const> input,
device_json_column& root,
bool is_array_of_arrays,
bool is_enabled_lines,
bool is_enabled_mixed_types_as_string,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
Expand Down Expand Up @@ -516,6 +528,19 @@ void make_device_json_column(device_span<SymbolT const> input,
col.type = to_json_col_type(column_categories[i]);
};

// Re-purposes an already initialized column `col` (column id `i`) as a string column.
// The string offset/length buffers are resized to the column's row count and passed through
// `init_to_zero`, the validity mask is recreated as all-null, and nested children are dropped.
auto reinitialize_as_string = [&](auto i, auto& col) {
  // Row count for this column: one past its largest row offset.
  auto const row_count = max_row_offsets[i] + 1;

  col.string_offsets.resize(row_count, stream);
  col.string_lengths.resize(row_count, stream);
  init_to_zero(col.string_offsets);
  init_to_zero(col.string_lengths);

  col.num_rows = row_count;
  col.validity =
    cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr);
  col.type = json_col_t::StringColumn;

  // The column is no longer nested, so forget its children and their ordering.
  // Any references to those children held elsewhere should be deleted too.
  col.child_columns.clear();
  col.column_order.clear();
};

// 2. generate nested columns tree and its device_memory
// reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order.
auto h_range_col_id_it =
Expand All @@ -530,6 +555,7 @@ void make_device_json_column(device_span<SymbolT const> input,
std::map<std::pair<NodeIndexT, std::string>, NodeIndexT> mapped_columns;
// find column_ids which are values, but should be ignored in validity
std::vector<uint8_t> ignore_vals(num_columns, 0);
std::vector<uint8_t> is_mixed_type_column(num_columns, 0);
columns.try_emplace(parent_node_sentinel, std::ref(root));

for (auto const this_col_id : unique_col_ids) {
Expand All @@ -552,6 +578,13 @@ void make_device_json_column(device_span<SymbolT const> input,
} else {
CUDF_FAIL("Unexpected parent column category");
}

if (parent_col_id != parent_node_sentinel && is_mixed_type_column[parent_col_id] == 1) {
// if parent is mixed type column, ignore this column.
is_mixed_type_column[this_col_id] = 1;
ignore_vals[this_col_id] = 1;
continue;
}
// If the child is already found,
// replace if this column is a nested column and the existing was a value column
// ignore this column if this column is a value column and the existing was a nested column
Expand All @@ -560,6 +593,24 @@ void make_device_json_column(device_span<SymbolT const> input,
auto& parent_col = it->second.get();
bool replaced = false;
if (mapped_columns.count({parent_col_id, name}) > 0) {
// If mixed type as string is enabled, make both of them strings and merge them.
// All child columns will be ignored when parsing.
if (is_enabled_mixed_types_as_string) {
// VAL/STR or STRUCT or LIST
auto old_col_id = mapped_columns[{parent_col_id, name}];

is_mixed_type_column[this_col_id] = 1;
is_mixed_type_column[old_col_id] = 1;
// if old col type (not cat) is list or struct, replace with string.
auto& col = columns.at(old_col_id).get();
if (col.type == json_col_t::ListColumn or col.type == json_col_t::StructColumn) {
reinitialize_as_string(old_col_id, col);
// all its children (which are already inserted) are ignored later.
}
columns.try_emplace(this_col_id, columns.at(old_col_id));
continue;
}

if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) {
ignore_vals[this_col_id] = 1;
continue;
Expand Down Expand Up @@ -592,6 +643,28 @@ void make_device_json_column(device_span<SymbolT const> input,
columns.try_emplace(this_col_id, std::ref(parent_col.child_columns.at(name)));
mapped_columns.try_emplace(std::make_pair(parent_col_id, name), this_col_id);
}

if (is_enabled_mixed_types_as_string) {
  // Post-process the column tree once every column has been visited:
  //  - every child of a mixed type column is itself flagged as mixed, excluded from value
  //    handling (ignore_vals) and removed from `columns`;
  //  - a mixed type column whose parent is NOT mixed gets its category switched to NC_STR,
  //    so only that top-most mixed column is converted to string (so to copy), not its children.
  for (auto const this_col_id : unique_col_ids) {
    auto const parent_col_id = column_parent_ids[this_col_id];
    // Columns without a parent have nothing to inherit and are never re-categorized here.
    if (parent_col_id == parent_node_sentinel) { continue; }

    if (is_mixed_type_column[parent_col_id] == 1) {
      is_mixed_type_column[this_col_id] = 1;
      ignore_vals[this_col_id]          = 1;
      columns.erase(this_col_id);
    } else if (is_mixed_type_column[parent_col_id] == 0 and
               is_mixed_type_column[this_col_id] == 1) {
      column_categories[this_col_id] = NC_STR;
    }
  }
  // Push the updated host-side categories into d_column_tree.node_categories.
  // The copy's status is checked instead of being dropped, so a failed transfer surfaces here
  // rather than as corrupted categories in later kernels.
  CUDF_CUDA_TRY(cudaMemcpyAsync(d_column_tree.node_categories.begin(),
                                column_categories.data(),
                                column_categories.size() * sizeof(column_categories[0]),
                                cudaMemcpyDefault,
                                stream.value()));
}

// restore unique_col_ids order
std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) {
return thrust::get<1>(a) < thrust::get<1>(b);
Expand All @@ -617,14 +690,16 @@ void make_device_json_column(device_span<SymbolT const> input,
rmm::exec_policy(stream),
thrust::counting_iterator<size_type>(0),
num_nodes,
[node_categories = tree.node_categories.begin(),
col_ids = col_ids.begin(),
row_offsets = row_offsets.begin(),
range_begin = tree.node_range_begin.begin(),
range_end = tree.node_range_end.begin(),
d_ignore_vals = d_ignore_vals.begin(),
d_columns_data = d_columns_data.begin()] __device__(size_type i) {
switch (node_categories[i]) {
[column_categories = d_column_tree.node_categories.begin(),
col_ids = col_ids.begin(),
row_offsets = row_offsets.begin(),
range_begin = tree.node_range_begin.begin(),
range_end = tree.node_range_end.begin(),
d_ignore_vals = d_ignore_vals.begin(),
d_columns_data = d_columns_data.begin()] __device__(size_type i) {
if (d_ignore_vals[col_ids[i]]) return;
auto const node_category = column_categories[col_ids[i]];
switch (node_category) {
case NC_STRUCT: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
case NC_LIST: set_bit(d_columns_data[col_ids[i]].validity, row_offsets[i]); break;
case NC_STR: [[fallthrough]];
Expand Down Expand Up @@ -662,10 +737,14 @@ void make_device_json_column(device_span<SymbolT const> input,
num_nodes,
thrust::make_counting_iterator<size_type>(0),
thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()),
[node_categories = tree.node_categories.begin(),
parent_node_ids = tree.parent_node_ids.begin()] __device__(size_type node_id) {
[d_ignore_vals = d_ignore_vals.begin(),
parent_node_ids = tree.parent_node_ids.begin(),
column_categories = d_column_tree.node_categories.begin(),
col_ids = col_ids.begin()] __device__(size_type node_id) {
auto parent_node_id = parent_node_ids[node_id];
return parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST;
return parent_node_id != parent_node_sentinel and
column_categories[col_ids[parent_node_id]] == NC_LIST and
(!d_ignore_vals[col_ids[parent_node_id]]);
});

auto const num_list_children =
Expand Down Expand Up @@ -896,8 +975,11 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
const auto [tokens_gpu, token_indices_gpu] =
get_token_stream(d_input, options, stream, rmm::mr::get_current_device_resource());
// gpu tree generation
return get_tree_representation(
tokens_gpu, token_indices_gpu, stream, rmm::mr::get_current_device_resource());
return get_tree_representation(tokens_gpu,
token_indices_gpu,
options.is_enabled_mixed_types_as_string(),
stream,
rmm::mr::get_current_device_resource());
}(); // IILE used to free memory of token data.
#ifdef NJP_DEBUG_PRINT
auto h_input = cudf::detail::make_host_vector_async(d_input, stream);
Expand Down Expand Up @@ -941,6 +1023,7 @@ table_with_metadata device_parse_nested_json(device_span<SymbolT const> d_input,
root_column,
is_array_of_arrays,
options.is_enabled_lines(),
options.is_enabled_mixed_types_as_string(),
stream,
mr);

Expand Down
Loading

0 comments on commit 8fdc62b

Please sign in to comment.