Skip to content

Commit

Permalink
Resolve path parsing issues in get_json_object (#15082)
Browse files Browse the repository at this point in the history
This PR addresses a parsing issue related to JSONPath by implementing distinct parsing rules for values inside and outside brackets. For instance, in `{ "A.B": 2, "'A": { "B'": 3 } }`, `$.'A.B'` differs from `$['A.B']`.  (See [Assertible JSON Path Documentation](https://assertible.com/docs/guide/json-path))

The fix ensures accurate parsing of JSONPath values containing quotes. For example in `{ "A.B": 2, "'A": { "B'": 3 } }`


| JSONPath      | Before Fix                                            | Spark        | After Fix           |
|---------------|-------------------------------------------------------|----------------------|---------------------|
| $.'A.B'       | 2                                                     | 3                    | 3                   |
| $.'A          | CUDF_FAIL("Encountered invalid JSONPath input string")| {"B'": 3}            | {"B'": 3}           |



Resolves [12483](#12483).

Authors:
  - Suraj Aralihalli (https://github.com/SurajAralihalli)
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - Robert (Bobby) Evans (https://github.com/revans2)
  - Mike Wilson (https://github.com/hyperbolic2346)
  - Nghia Truong (https://github.com/ttnghia)
  - Bradley Dice (https://github.com/bdice)

URL: #15082
  • Loading branch information
SurajAralihalli authored Feb 29, 2024
1 parent c1e26a6 commit a4f1118
Show file tree
Hide file tree
Showing 4 changed files with 81 additions and 7 deletions.
24 changes: 18 additions & 6 deletions cpp/src/json/json_path.cu
Original file line number Diff line number Diff line change
Expand Up @@ -521,6 +521,14 @@ struct path_operator {
int index{-1}; // index for subscript operator
};

/**
* @brief Enum to specify whether parsing values enclosed within brackets, like `['book']`.
*/
enum class bracket_state : bool {
INSIDE, ///< Parsing inside brackets
OUTSIDE ///< Parsing outside brackets
};

/**
* @brief Parsing class that holds the current state of the JSONPath string to be parsed
* and provides functions for navigating through it. This is only called on the host
Expand All @@ -541,7 +549,7 @@ class path_state : private parser {
case '.': {
path_operator op;
string_view term{".[", 2};
if (parse_path_name(op.name, term)) {
if (parse_path_name(op.name, term, bracket_state::OUTSIDE)) {
// this is another potential use case for __SPARK_BEHAVIORS / configurability
// Spark currently only handles the wildcard operator inside [*], it does
// not handle .*
Expand All @@ -564,7 +572,7 @@ class path_state : private parser {
path_operator op;
string_view term{"]", 1};
bool const is_string = *pos == '\'';
if (parse_path_name(op.name, term)) {
if (parse_path_name(op.name, term, bracket_state::INSIDE)) {
pos++;
if (op.name.size_bytes() == 1 && op.name.data()[0] == '*') {
op.type = path_operator_type::CHILD_WILDCARD;
Expand Down Expand Up @@ -600,7 +608,8 @@ class path_state : private parser {
private:
cudf::io::parse_options_view json_opts{',', '\n', '\"', '.'};

bool parse_path_name(string_view& name, string_view const& terminators)
// b_state is set to INSIDE while parsing values enclosed within [ ], otherwise OUTSIDE
bool parse_path_name(string_view& name, string_view const& terminators, bracket_state b_state)
{
switch (*pos) {
case '*':
Expand All @@ -609,8 +618,11 @@ class path_state : private parser {
break;

case '\'':
if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
break;
if (b_state == bracket_state::INSIDE) {
if (parse_string(name, false, '\'') != parse_result::SUCCESS) { return false; }
break;
}
// if not inside the [ ] -> go to default

default: {
size_t const chars_left = input_len - (pos - input);
Expand Down Expand Up @@ -656,7 +668,7 @@ std::pair<thrust::optional<rmm::device_uvector<path_operator>>, int> build_comma
do {
op = p_state.get_next_operator();
if (op.type == path_operator_type::ERROR) {
CUDF_FAIL("Encountered invalid JSONPath input string");
CUDF_FAIL("Encountered invalid JSONPath input string", std::invalid_argument);
}
if (op.type == path_operator_type::CHILD_WILDCARD) { max_stack_depth++; }
// convert pointer to device pointer
Expand Down
38 changes: 38 additions & 0 deletions cpp/tests/json/json_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -588,6 +588,15 @@ TEST_F(JsonPathTests, GetJsonObjectIllegalQuery)
};
EXPECT_THROW(query(), std::invalid_argument);
}

{
auto const input = cudf::test::strings_column_wrapper{R"({"a": "b"})"};
auto const json_path = std::string{"${a}"};
auto const query = [&]() {
auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path);
};
EXPECT_THROW(query(), std::invalid_argument);
}
}

// queries that are legal, but reference invalid parts of the input
Expand Down Expand Up @@ -1018,4 +1027,33 @@ TEST_F(JsonPathTests, MissingFieldsAsNulls)
do_test("$.tup[*].a.x", "[\"5\"]", "[null,null,null,\"5\"]");
}

TEST_F(JsonPathTests, QueriesContainingQuotes)
{
std::string input_string = R"({"AB": 1, "A.B": 2, "'A": {"B'": 3}, "A": {"B": 4} })";

auto do_test = [&input_string](auto const& json_path_string,
auto const& expected_string,
bool const& expect_null = false) {
auto const input = cudf::test::strings_column_wrapper{input_string};
auto const json_path = std::string{json_path_string};
cudf::get_json_object_options options;
options.set_allow_single_quotes(true);
auto const result = cudf::get_json_object(cudf::strings_column_view(input), json_path, options);
auto const expected =
cudf::test::strings_column_wrapper{std::initializer_list<std::string>{expected_string},
std::initializer_list<bool>{!expect_null}};

CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*result, expected);
};

// Set 1
do_test(R"($.AB)", "1");
do_test(R"($['A.B'])", "2");
do_test(R"($.'A.B')", "3");
do_test(R"($.A.B)", "4");

// Set 2
do_test(R"($.'A)", R"({"B'": 3})");
}

CUDF_TEST_PROGRAM_MAIN()
10 changes: 9 additions & 1 deletion java/src/main/native/src/ColumnViewJni.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2452,7 +2452,15 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_ColumnView_getJSONObject(
options.set_allow_single_quotes(allow_single_quotes);
options.set_strip_quotes_from_single_strings(strip_quotes_from_single_strings);
options.set_missing_fields_as_nulls(missing_fields_as_nulls);
return release_as_jlong(cudf::get_json_object(n_strings_col_view, *n_scalar_path, options));
auto result_col_ptr = [&]() {
try {
return cudf::get_json_object(n_strings_col_view, *n_scalar_path, options);
} catch (std::invalid_argument const &err) {
auto const null_scalar = cudf::string_scalar(std::string(""), false);
return cudf::make_column_from_scalar(null_scalar, n_strings_col_view.size());
} catch (...) { throw; }
}();
return release_as_jlong(result_col_ptr);
}
CATCH_STD(env, 0)
}
Expand Down
16 changes: 16 additions & 0 deletions java/src/test/java/ai/rapids/cudf/ColumnVectorTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -6405,6 +6405,22 @@ void testGetJSONObjectWithSingleQuotes() {
}
}

@Test
void testGetJSONObjectWithInvalidQueries() {
String jsonString = "{" +
"\'a\': \'A\"\'" +
"}";

GetJsonObjectOptions options = GetJsonObjectOptions.builder().allowSingleQuotes(true).build();
try (ColumnVector json = ColumnVector.fromStrings(jsonString, jsonString);
Scalar nullString = Scalar.fromString(null);
ColumnVector expectedAuthors = ColumnVector.fromScalar(nullString, 2);
Scalar path = Scalar.fromString(".");
ColumnVector gotAuthors = json.getJSONObject(path, options)) {
assertColumnsAreEqual(expectedAuthors, gotAuthors);
}
}

@Test
void testMakeStructEmpty() {
final int numRows = 10;
Expand Down

0 comments on commit a4f1118

Please sign in to comment.