From 873d997ab040f29900bbe0732650a6b2496675fe Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 17 Jul 2024 05:26:12 +0800 Subject: [PATCH 1/3] Update submodule cudf to e2b7e4370c8513811e9c72b30f499a5614b49f7c (#2231) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/rapids-cmake.sha | 2 +- thirdparty/cudf-pins/versions.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index 669db3ea4a..e2b7e4370c 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit 669db3ea4a0c24a343c5619dd00904ad22ea215b +Subproject commit e2b7e4370c8513811e9c72b30f499a5614b49f7c diff --git a/thirdparty/cudf-pins/rapids-cmake.sha b/thirdparty/cudf-pins/rapids-cmake.sha index a02bd5d56a..072e6efd6a 100644 --- a/thirdparty/cudf-pins/rapids-cmake.sha +++ b/thirdparty/cudf-pins/rapids-cmake.sha @@ -1 +1 @@ -351fc88100ce381434f9dc16debbd32d2fc82a07 +fddf1c05f96f3a3ff50c358fdf634ba067001850 diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 599a204f7a..915a004aff 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -47,7 +47,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "363e2d65c02b216baf33e4219f27c0102936edee", + "git_tag" : "ff94539ff1569d990bbd6abb9c678b4d1991e5ba", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.08" }, @@ -132,7 +132,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "429960808c3bccea2e7e1da177439546481bb684", + "git_tag" : "f91ca6f22195868e6493e38975f5c07029bd1977", "git_url" : "https://github.com/rapidsai/rmm.git", "version" : "24.08" }, From adda312e8420e5ad791304d2c21e09b9d556d2c0 Mon Sep 17 00:00:00 2001 From: Jenkins Automation <70000568+nvauto@users.noreply.github.com> Date: Wed, 17 Jul 2024 11:27:14 +0800 Subject: [PATCH 2/3] Update submodule cudf to 093bcc94ccf156a7e39339a7c4bb7e86543187de (#2234) Signed-off-by: spark-rapids automation <70000568+nvauto@users.noreply.github.com> --- thirdparty/cudf | 2 +- thirdparty/cudf-pins/versions.json | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/thirdparty/cudf b/thirdparty/cudf index e2b7e4370c..093bcc94cc 160000 --- a/thirdparty/cudf +++ b/thirdparty/cudf @@ -1 +1 @@ -Subproject commit e2b7e4370c8513811e9c72b30f499a5614b49f7c +Subproject commit 093bcc94ccf156a7e39339a7c4bb7e86543187de diff --git a/thirdparty/cudf-pins/versions.json b/thirdparty/cudf-pins/versions.json index 915a004aff..61e3bfe97e 100644 --- a/thirdparty/cudf-pins/versions.json +++ b/thirdparty/cudf-pins/versions.json @@ -47,7 +47,7 @@ { "always_download" : true, "git_shallow" : false, - "git_tag" : "ff94539ff1569d990bbd6abb9c678b4d1991e5ba", + "git_tag" : "ab3778c0462c2f713125a4e458f46bcd654b9d54", "git_url" : "https://github.com/rapidsai/kvikio.git", "version" : "24.08" }, From aff3696693bb2910b03c83dc1b5a8dfb28b49c86 Mon Sep 17 00:00:00 2001 From: Nghia Truong <7416935+ttnghia@users.noreply.github.com> Date: Tue, 16 Jul 2024 23:15:01 -0700 Subject: [PATCH 3/3] Cleanup `get_json_object` (#2232) * Rewrite `json_generator` * Debug print * Cleanup header * Add `const` and `static` qualifiers * Cleanup Signed-off-by: Nghia Truong * Faster name concatenation Signed-off-by: Nghia Truong * Add case path enum Signed-off-by: Nghia Truong * Fix `try_skip_children` Signed-off-by: Nghia Truong * Construct path command async * More cleanup Signed-off-by: Nghia Truong * Add comment Signed-off-by: Nghia Truong * Fix format Signed-off-by: Nghia Truong --------- Signed-off-by: Nghia Truong --- src/main/cpp/src/get_json_object.cu | 529 +++++++++++++-------------- src/main/cpp/src/get_json_object.hpp | 9 +- src/main/cpp/src/json_parser.cuh | 91 ++--- 3 files changed, 297 insertions(+), 332 deletions(-) diff --git a/src/main/cpp/src/get_json_object.cu b/src/main/cpp/src/get_json_object.cu index 47b1245307..b52754da4b 100644 --- a/src/main/cpp/src/get_json_object.cu +++ b/src/main/cpp/src/get_json_object.cu @@ -58,7 +58,7 @@ constexpr int max_path_depth = 16; /** * write JSON style */ -enum class write_style { RAW, QUOTED, FLATTEN }; +enum class write_style : int8_t { RAW, QUOTED, FLATTEN }; /** * path instruction @@ -66,13 +66,13 @@ enum class write_style { RAW, QUOTED, FLATTEN }; struct path_instruction { __device__ inline path_instruction(path_instruction_type _type) : type(_type) {} - path_instruction_type type; - // used when type is named type cudf::string_view name; // used when type is index int index{-1}; + + path_instruction_type type; }; /** @@ -83,30 +83,24 @@ struct path_instruction { */ class json_generator { public: - __device__ json_generator(char* _output) : output(_output), output_len(0) {} - __device__ json_generator() : output(nullptr), output_len(0) {} + __device__ json_generator(int _offset = 0) : offset(_offset), output_len(0) {} // create a nested child generator based on this parent generator, // child generator is a view, parent and child share the same byte array - __device__ json_generator new_child_generator() + __device__ json_generator new_child_generator() const { - if (nullptr == output) { - return json_generator(); - } else { - return json_generator(output + output_len); - } + return json_generator(offset + output_len); } // write [ // add an extra comma if needed, // e.g.: when JSON content is: [[1,2,3] // writing a new [ should result: [[1,2,3],[ - __device__ void write_start_array() + __device__ void write_start_array(char* out_begin) { - try_write_comma(); - - if (output) { *(output + output_len) = '['; } + try_write_comma(out_begin); + out_begin[offset + output_len] = '['; output_len++; array_depth++; // new array is empty @@ -114,14 +108,12 @@ class json_generator { } // write ] - __device__ void write_end_array() + __device__ void write_end_array(char* out_begin) { - if (output) { *(output + output_len) = ']'; } + out_begin[offset + output_len] = ']'; output_len++; - // point to parent array array_depth--; - // set parent array as non-empty because already had a closed child item. is_curr_array_empty = false; } @@ -137,16 +129,16 @@ class json_generator { } // return true if it's in a array context and it's not writing the first item. - __device__ inline bool need_comma() { return (array_depth > 0 && !is_curr_array_empty); } + __device__ inline bool need_comma() const { return (array_depth > 0 && !is_curr_array_empty); } /** * write comma accroding to current generator state */ - __device__ void try_write_comma() + __device__ void try_write_comma(char* out_begin) { if (need_comma()) { // in array context and writes first item - if (output) { *(output + output_len) = ','; } + out_begin[offset + output_len] = ','; output_len++; } } @@ -156,24 +148,16 @@ class json_generator { * object/array, then copy to corresponding matched end object/array. return * false if JSON format is invalid return true if JSON format is valid */ - __device__ bool copy_current_structure(json_parser& parser) + __device__ bool copy_current_structure(json_parser& parser, char* out_begin) { // first try add comma - try_write_comma(); + try_write_comma(out_begin); if (array_depth > 0) { is_curr_array_empty = false; } - if (nullptr != output) { - auto copy_to = output + output_len; - auto [b, copy_len] = parser.copy_current_structure(copy_to); - output_len += copy_len; - return b; - } else { - char* copy_to = nullptr; - auto [b, copy_len] = parser.copy_current_structure(copy_to); - output_len += copy_len; - return b; - } + auto [b, copy_len] = parser.copy_current_structure(out_begin + offset + output_len); + output_len += copy_len; + return b; } /** @@ -183,17 +167,12 @@ class json_generator { * then can not return a pointer and length pair (char *, len), * For number token, JSON parser can return a pair (char *, len) */ - __device__ void write_raw(json_parser& parser) + __device__ void write_raw(json_parser& parser, char* out_begin) { if (array_depth > 0) { is_curr_array_empty = false; } - if (nullptr != output) { - auto copied = parser.write_unescaped_text(output + output_len); - output_len += copied; - } else { - auto len = parser.compute_unescaped_len(); - output_len += len; - } + auto copied = parser.write_unescaped_text(out_begin + offset + output_len); + output_len += copied; } /** @@ -227,34 +206,32 @@ class json_generator { * block */ __device__ void write_child_raw_value(char* child_block_begin, - size_t child_block_len, + int child_block_len, bool write_outer_array_tokens) { bool insert_comma = need_comma(); if (array_depth > 0) { is_curr_array_empty = false; } - if (nullptr != output) { - if (write_outer_array_tokens) { - if (insert_comma) { - *(child_block_begin + child_block_len + 2) = ']'; - move_forward(child_block_begin, child_block_len, 2); - *(child_block_begin + 1) = '['; - *(child_block_begin) = ','; - } else { - *(child_block_begin + child_block_len + 1) = ']'; - move_forward(child_block_begin, child_block_len, 1); - *(child_block_begin) = '['; - } + if (write_outer_array_tokens) { + if (insert_comma) { + *(child_block_begin + child_block_len + 2) = ']'; + move_forward(child_block_begin, child_block_len, 2); + *(child_block_begin + 1) = '['; + *(child_block_begin) = ','; } else { - if (insert_comma) { - move_forward(child_block_begin, child_block_len, 1); - *(child_block_begin) = ','; - } else { - // do not need comma && do not need write outer array tokens - // do nothing, because child generator buff is directly after the - // parent generator - } + *(child_block_begin + child_block_len + 1) = ']'; + move_forward(child_block_begin, child_block_len, 1); + *(child_block_begin) = '['; + } + } else { + if (insert_comma) { + move_forward(child_block_begin, child_block_len, 1); + *(child_block_begin) = ','; + } else { + // do not need comma && do not need write outer array tokens + // do nothing, because child generator buff is directly after the + // parent generator } } @@ -270,7 +247,7 @@ class json_generator { // e.g.: memory is: 1 2 0 0, begin is 1, len is 1, after moving, // memory is: 1 1 2 0. // Note: should move from end to begin to avoid overwrite buffer - __device__ void move_forward(char* begin, size_t len, int forward) + static __device__ void move_forward(char* begin, size_t len, int forward) { // TODO copy by 8 bytes char* pos = begin + len + forward - 1; @@ -281,9 +258,8 @@ class json_generator { } } - __device__ inline size_t get_output_len() const { return output_len; } - __device__ inline char* get_output_start_position() const { return output; } - __device__ inline char* get_current_output_position() const { return output + output_len; } + __device__ inline int get_offset() const { return offset; } + __device__ inline int get_output_len() const { return output_len; } /** * generator may contain trash output, e.g.: generator writes some output, @@ -294,13 +270,14 @@ class json_generator { __device__ inline void set_output_len(size_t len) { output_len = len; } private: - char* output; - size_t output_len; + int offset; // offset from the global output buffer + int output_len; + + int array_depth = 0; // whether already worte a item in current array // used to decide whether add a comma before writing out a new item. bool is_curr_array_empty; - int array_depth = 0; }; /** @@ -357,84 +334,95 @@ __device__ inline thrust::tuple path_match_index_wildcard( } } -/** - * - * This function is rewritten from above commented recursive function. - * this function is equivalent to the above commented recursive function. - */ -__device__ bool evaluate_path(json_parser& p, - json_generator& root_g, - write_style root_style, - cudf::device_span root_path) -{ - // manually maintained context stack in lieu of calling evaluate_path recursively. - struct context { - // current token - json_token token; +enum class evaluation_case_path : int8_t { + INVALID = -1, + START_ARRAY___EMPTY_PATH___FLATTEN_STYLE = 2, + START_OBJECT___MATCHED_NAME_PATH = 4, + START_ARRAY___MATCHED_DOUBLE_WILDCARD = 5, + START_ARRAY___MATCHED_WILDCARD___STYLE_NOT_QUOTED = 6, + START_ARRAY___MATCHED_WILDCARD = 7, + START_ARRAY___MATCHED_INDEX_AND_WILDCARD = 8, + START_ARRAY___MATCHED_INDEX = 9 +}; - // which case path that this task is from - int case_path; +struct context { + // used to save current generator + json_generator g; - // used to save current generator - json_generator g; + // used to save child JSON generator for case path 6 + json_generator child_g; - write_style style; + cudf::device_span path; - cudf::device_span path; - // is this context task is done - bool task_is_done; + // whether written output + // if dirty > 0, indicates success + int dirty; - // whether written output - // if dirty > 0, indicates success - int dirty; + // which case path that this task is from + evaluation_case_path case_path; - // for some case paths - bool is_first_enter; + // current token + json_token token; - // used to save child JSON generator for case path 8 - json_generator child_g; - }; + write_style style; + + // for some case paths + bool is_first_enter; + + // is this context task is done + bool task_is_done; +}; + +/** + * @brief Parse a single json string using the provided command buffer. + * + * @param input The incoming json string + * @param path_commands The command buffer to be applied to the string + * @param out_buf Buffer user to store the string resulted from the query + * @return A pair containing the result code and the output size + */ +__device__ thrust::pair evaluate_path( + char_range input, cudf::device_span path_commands, char* out_buf) +{ + json_parser p{input}; + p.next_token(); + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // define stack; plus 1 indicates root context task needs an extra memory context stack[max_path_depth + 1]; - int stack_pos = 0; + int stack_size = 0; // push context function - auto push_context = [&stack, &stack_pos](json_token _token, - int _case_path, - json_generator _g, - write_style _style, - cudf::device_span _path) { + auto push_context = [&p, &stack, &stack_size](evaluation_case_path _case_path, + json_generator _g, + write_style _style, + cudf::device_span _path) { // no need to check stack is full // because Spark-Rapids already checked maximum length of `path_instruction` - auto& ctx = stack[stack_pos]; - ctx.token = _token; - ctx.case_path = _case_path; + auto& ctx = stack[stack_size++]; ctx.g = _g; - ctx.style = _style; ctx.path = _path; - ctx.task_is_done = false; ctx.dirty = 0; + ctx.case_path = _case_path; + ctx.token = p.get_current_token(); + ctx.style = _style; ctx.is_first_enter = true; - - stack_pos++; + ctx.task_is_done = false; }; // put the first context task - push_context(p.get_current_token(), -1, root_g, root_style, root_path); + push_context(evaluation_case_path::INVALID, json_generator{}, write_style::RAW, path_commands); - while (stack_pos > 0) { - auto& ctx = stack[stack_pos - 1]; + while (stack_size > 0) { + auto& ctx = stack[stack_size - 1]; if (!ctx.task_is_done) { - // task is not done. - // case (VALUE_STRING, Nil) if style == RawStyle // case path 1 if (json_token::VALUE_STRING == ctx.token && path_is_empty(ctx.path.size()) && ctx.style == write_style::RAW) { // there is no array wildcard or slice parent, emit this string without // quotes write current string in parser to generator - ctx.g.write_raw(p); + ctx.g.write_raw(p, out_buf); ctx.dirty = 1; ctx.task_is_done = true; } @@ -445,10 +433,13 @@ __device__ bool evaluate_path(json_parser& p, // flatten this array into the parent if (json_token::END_ARRAY != p.next_token()) { // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // push back task // add child task - push_context(p.get_current_token(), 2, ctx.g, ctx.style, {nullptr, 0}); + push_context(evaluation_case_path::START_ARRAY___EMPTY_PATH___FLATTEN_STYLE, + ctx.g, + ctx.style, + {nullptr, 0}); } else { // END_ARRAY ctx.task_is_done = true; @@ -458,9 +449,9 @@ __device__ bool evaluate_path(json_parser& p, // case path 3 else if (path_is_empty(ctx.path.size())) { // general case: just copy the child tree verbatim - if (!(ctx.g.copy_current_structure(p))) { + if (!(ctx.g.copy_current_structure(p, out_buf))) { // JSON validation check - return false; + return {false, 0}; } ctx.dirty = 1; ctx.task_is_done = true; @@ -475,17 +466,17 @@ __device__ bool evaluate_path(json_parser& p, if (ctx.dirty > 0) { while (json_token::END_OBJECT != p.next_token()) { // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // skip FIELD_NAME token p.next_token(); // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // skip value of FIELD_NAME if (!p.try_skip_children()) { // JSON validation check - return false; + return {false, 0}; } } } @@ -498,7 +489,7 @@ __device__ bool evaluate_path(json_parser& p, bool found_expected_child = false; while (json_token::END_OBJECT != p.next_token()) { // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // need to try more children auto match_named = path_match_named(ctx.path); @@ -508,13 +499,12 @@ __device__ bool evaluate_path(json_parser& p, // skip FIELD_NAME token p.next_token(); // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // meets null token, it's not expected, return false - if (json_token::VALUE_NULL == p.get_current_token()) { return false; } + if (json_token::VALUE_NULL == p.get_current_token()) { return {false, 0}; } // push sub task; sub task will update the result of path 4 - push_context(p.get_current_token(), - 4, + push_context(evaluation_case_path::START_OBJECT___MATCHED_NAME_PATH, ctx.g, ctx.style, {ctx.path.data() + 1, ctx.path.size() - 1}); @@ -524,12 +514,12 @@ __device__ bool evaluate_path(json_parser& p, // skip FIELD_NAME token p.next_token(); // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // current child is not expected, skip current child if (!p.try_skip_children()) { // JSON validation check - return false; + return {false, 0}; } } } @@ -549,19 +539,18 @@ __device__ bool evaluate_path(json_parser& p, // behavior in Hive if (ctx.is_first_enter) { ctx.is_first_enter = false; - ctx.g.write_start_array(); + ctx.g.write_start_array(out_buf); } if (p.next_token() != json_token::END_ARRAY) { // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } - push_context(p.get_current_token(), - 5, + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } + push_context(evaluation_case_path::START_ARRAY___MATCHED_DOUBLE_WILDCARD, ctx.g, write_style::FLATTEN, {ctx.path.data() + 2, ctx.path.size() - 2}); } else { - ctx.g.write_end_array(); + ctx.g.write_end_array(out_buf); ctx.task_is_done = true; } } @@ -594,17 +583,16 @@ __device__ bool evaluate_path(json_parser& p, if (p.next_token() != json_token::END_ARRAY) { // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // track the number of array elements and only emit an outer array if // we've written more than one element, this matches Hive's behavior - push_context(p.get_current_token(), - 6, + push_context(evaluation_case_path::START_ARRAY___MATCHED_WILDCARD___STYLE_NOT_QUOTED, child_g, next_style, {ctx.path.data() + 1, ctx.path.size() - 1}); } else { - char* child_g_start = child_g.get_output_start_position(); - size_t child_g_len = child_g.get_output_len(); + char* child_g_start = out_buf + child_g.get_offset(); + int child_g_len = child_g.get_output_len(); if (ctx.dirty > 1) { // add outer array tokens ctx.g.write_child_raw_value( @@ -625,21 +613,20 @@ __device__ bool evaluate_path(json_parser& p, path_match_element(ctx.path, path_instruction_type::WILDCARD)) { if (ctx.is_first_enter) { ctx.is_first_enter = false; - ctx.g.write_start_array(); + ctx.g.write_start_array(out_buf); } if (p.next_token() != json_token::END_ARRAY) { // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // wildcards can have multiple matches, continually update the dirty // count - push_context(p.get_current_token(), - 7, + push_context(evaluation_case_path::START_ARRAY___MATCHED_WILDCARD, ctx.g, write_style::QUOTED, {ctx.path.data() + 1, ctx.path.size() - 1}); } else { - ctx.g.write_end_array(); + ctx.g.write_end_array(out_buf); ctx.task_is_done = true; } } @@ -651,28 +638,27 @@ __device__ bool evaluate_path(json_parser& p, p.next_token(); // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } ctx.is_first_enter = false; int i = idx; while (i > 0) { if (p.get_current_token() == json_token::END_ARRAY) { // terminate, nothing has been written - return false; + return {false, 0}; } - if (!p.try_skip_children()) { return false; } + if (!p.try_skip_children()) { return {false, 0}; } p.next_token(); // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } --i; } // i == 0 - push_context(p.get_current_token(), - 8, + push_context(evaluation_case_path::START_ARRAY___MATCHED_INDEX_AND_WILDCARD, ctx.g, write_style::QUOTED, {ctx.path.data() + 1, ctx.path.size() - 1}); @@ -684,176 +670,122 @@ __device__ bool evaluate_path(json_parser& p, p.next_token(); // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } int i = idx; while (i > 0) { if (p.get_current_token() == json_token::END_ARRAY) { // terminate, nothing has been written - return false; + return {false, 0}; } - if (!p.try_skip_children()) { return false; } + if (!p.try_skip_children()) { return {false, 0}; } p.next_token(); // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } --i; } // i == 0 - push_context( - p.get_current_token(), 9, ctx.g, ctx.style, {ctx.path.data() + 1, ctx.path.size() - 1}); + push_context(evaluation_case_path::START_ARRAY___MATCHED_INDEX, + ctx.g, + ctx.style, + {ctx.path.data() + 1, ctx.path.size() - 1}); } // case _ => // case path 12 else { - if (!p.try_skip_children()) { return false; } + if (!p.try_skip_children()) { return {false, 0}; } // default case path, return false for this task ctx.dirty = 0; ctx.task_is_done = true; } - } else { - // current context is done. - + } // if (!ctx.task_is_done) + else { // current context is done. // pop current top context - stack_pos--; + stack_size--; + + // has no parent task, stack is empty, will exit + if (stack_size == 0) { break; } - // pop parent task + // peek parent context task // update parent task info according to current task result - if (stack_pos > 0) { - // peek parent context task - auto& p_ctx = stack[stack_pos - 1]; - - // case (VALUE_STRING, Nil) if style == RawStyle - // case path 1 - if (1 == ctx.case_path) { - // never happen - } - // path 2: case (START_ARRAY, Nil) if style == FlattenStyle - // path 5: case (START_ARRAY, Wildcard :: Wildcard :: xs) - // path 7: case (START_ARRAY, Wildcard :: xs) - else if (2 == ctx.case_path || 5 == ctx.case_path || 7 == ctx.case_path) { + auto& p_ctx = stack[stack_size - 1]; + + switch (ctx.case_path) { + // path 2: case (START_ARRAY, Nil) if style == FlattenStyle + // path 5: case (START_ARRAY, Wildcard :: Wildcard :: xs) + // path 7: case (START_ARRAY, Wildcard :: xs) + case evaluation_case_path::START_ARRAY___EMPTY_PATH___FLATTEN_STYLE: + case evaluation_case_path::START_ARRAY___MATCHED_DOUBLE_WILDCARD: + case evaluation_case_path::START_ARRAY___MATCHED_WILDCARD: { // collect result from child task p_ctx.dirty += ctx.dirty; // copy generator states to parent task; p_ctx.g = ctx.g; + + break; } - // case (START_OBJECT, Named :: xs) - // case path 4 - else if (4 == ctx.case_path) { + + // case (START_OBJECT, Named :: xs) + // case path 4 + case evaluation_case_path::START_OBJECT___MATCHED_NAME_PATH: { p_ctx.dirty = ctx.dirty; // copy generator states to parent task; p_ctx.g = ctx.g; + + break; } - // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle - // case path 6 - else if (6 == ctx.case_path) { + + // case (START_ARRAY, Wildcard :: xs) if style != QuotedStyle + // case path 6 + case evaluation_case_path::START_ARRAY___MATCHED_WILDCARD___STYLE_NOT_QUOTED: { // collect result from child task p_ctx.dirty += ctx.dirty; // update child generator for parent task p_ctx.child_g = ctx.g; + + break; } - /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ - // case path 8 - // case (START_ARRAY, Index(idx) :: xs) - // case path 9 - else if (8 == ctx.case_path || 9 == ctx.case_path) { + + /* case (START_ARRAY, Index(idx) :: (xs@Wildcard :: _)) */ + // case path 8 + // case (START_ARRAY, Index(idx) :: xs) + // case path 9 + case evaluation_case_path::START_ARRAY___MATCHED_INDEX_AND_WILDCARD: + case evaluation_case_path::START_ARRAY___MATCHED_INDEX: { // collect result from child task p_ctx.dirty += ctx.dirty; // post logic: while (p.next_token() != json_token::END_ARRAY) { // JSON validation check - if (json_token::ERROR == p.get_current_token()) { return false; } + if (json_token::ERROR == p.get_current_token()) { return {false, 0}; } // advance the token stream to the end of the array - if (!p.try_skip_children()) { return false; } + if (!p.try_skip_children()) { return {false, 0}; } } // task is done p_ctx.task_is_done = true; // copy generator states to parent task; p_ctx.g = ctx.g; - } - // case path 3: case (_, Nil) - // case path 12: case _ => - // others - else { - // never happen - } - } else { - // has no parent task, stack is empty, will exit - } - } - } - - // copy output len - root_g.set_output_len(stack[0].g.get_output_len()); - return stack[0].dirty > 0; -} -rmm::device_uvector construct_path_commands( - std::vector> const& instructions, - cudf::string_scalar const& all_names_scalar, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - int name_pos = 0; - - // construct the path commands - std::vector path_commands; - for (auto const& inst : instructions) { - auto const& [type, name, index] = inst; - switch (type) { - case path_instruction_type::WILDCARD: - path_commands.emplace_back(path_instruction{path_instruction_type::WILDCARD}); - break; - case path_instruction_type::INDEX: - path_commands.emplace_back(path_instruction{path_instruction_type::INDEX}); - path_commands.back().index = index; - break; - case path_instruction_type::NAMED: - path_commands.emplace_back(path_instruction{path_instruction_type::NAMED}); - path_commands.back().name = - cudf::string_view(all_names_scalar.data() + name_pos, name.size()); - name_pos += name.size(); - break; - default: CUDF_FAIL("Invalid path instruction type"); - } - } - // convert to uvector - return cudf::detail::make_device_uvector_sync(path_commands, stream, mr); -} - -/** - * @brief Parse a single json string using the provided command buffer - * - * - * @param input The incoming json string - * @param path_commands The command buffer to be applied to the string - * @param out_buf Buffer user to store the string resulted from the query - * @returns A pair containing the result code and the output buffer - */ -__device__ thrust::pair get_json_object_single( - char_range input, cudf::device_span path_commands, char* out_buf) -{ - json_parser j_parser(input); - j_parser.next_token(); - // JSON validation check - if (json_token::ERROR == j_parser.get_current_token()) { return {false, 0}; } + break; + } - json_generator generator(out_buf); + default:; // Never happens! + } // end switch (ctx.case_path) - bool const success = evaluate_path(j_parser, generator, write_style::RAW, path_commands); + } // ctx.task_is_done + } // while (stack_size > 0) - if (!success) { - // generator may contain trash output, e.g.: generator writes some output, - // then JSON format is invalid, the previous output becomes trash. - // set output as zero to tell second step - generator.set_output_len_zero(); - } + auto const success = stack[0].dirty > 0; - return {success, static_cast(generator.get_output_len())}; + // generator may contain trash output, e.g.: generator writes some output, + // then JSON format is invalid, the previous output becomes trash. + // We need to return output size as zero. + return {success, success ? stack[0].g.get_output_len() : 0}; } /** @@ -895,11 +827,9 @@ __launch_bounds__(block_size, 1) CUDF_KERNEL auto const str = input.element(tid); if (str.size_bytes() > 0) { - auto const max_size = offsets[tid + 1] - offsets[tid]; + thrust::tie(is_valid, out_size) = evaluate_path(char_range{str}, path_commands, dst); - // If `max_size == 0`, do not pass in the dst pointer to prevent writing garbage data. - thrust::tie(is_valid, out_size) = - get_json_object_single(str, path_commands, max_size != 0 ? dst : nullptr); + auto const max_size = offsets[tid + 1] - offsets[tid]; if (out_size > max_size) { *has_out_of_bound = true; } } @@ -910,6 +840,35 @@ __launch_bounds__(block_size, 1) CUDF_KERNEL } } +std::pair, std::unique_ptr>> +construct_path_commands( + std::vector> const& instructions, + cudf::string_scalar const& all_names_scalar, + rmm::cuda_stream_view stream) +{ + std::size_t name_pos{0}; + auto h_path_commands = std::make_unique>(); + h_path_commands->reserve(instructions.size()); + for (auto const& [type, name, index] : instructions) { + h_path_commands->emplace_back(path_instruction{type}); + + if (type == path_instruction_type::INDEX) { + h_path_commands->back().index = index; + } else if (type == path_instruction_type::NAMED) { + h_path_commands->back().name = + cudf::string_view(all_names_scalar.data() + name_pos, name.size()); + name_pos += name.size(); + } else if (type != path_instruction_type::WILDCARD) { + CUDF_FAIL("Invalid path instruction type"); + } + } + + // h_path_commands needs to be kept alive outside of this function due to async copy. + return {cudf::detail::make_device_uvector_async( + *h_path_commands, stream, rmm::mr::get_current_device_resource()), + std::move(h_path_commands)}; +} + std::unique_ptr get_json_object( cudf::strings_column_view const& input, std::vector> const& instructions, @@ -919,13 +878,23 @@ std::unique_ptr get_json_object( if (instructions.size() > max_path_depth) { CUDF_FAIL("JSONPath query exceeds maximum depth"); } if (input.is_empty()) { return cudf::make_empty_column(cudf::type_id::STRING); } - std::string all_names; - for (auto const& inst : instructions) { - all_names += std::get<1>(inst); - } + auto const all_names = [&] { + std::size_t length{0}; + for (auto const& inst : instructions) { + length += (std::get<1>(inst)).length(); + } + + std::string all_names; + all_names.reserve(length); + for (auto const& inst : instructions) { + all_names += std::get<1>(inst); + } + return all_names; + }(); + auto const all_names_scalar = cudf::string_scalar(all_names, true, stream); - auto const path_commands = construct_path_commands( - instructions, all_names_scalar, stream, rmm::mr::get_current_device_resource()); + auto const [d_path_commands, h_path_commands] = + construct_path_commands(instructions, all_names_scalar, stream); auto const d_input_ptr = cudf::column_device_view::create(input.parent(), stream); auto const in_offsets = cudf::detail::offsetalator_factory::make_input_iterator(input.offsets()); @@ -966,7 +935,7 @@ std::unique_ptr get_json_object( get_json_object_kernel <<>>(*d_input_ptr, in_offsets, - path_commands, + d_path_commands, out_stringviews.data(), output_scratch.data(), has_out_of_bound.data()); @@ -1009,7 +978,7 @@ std::unique_ptr get_json_object( get_json_object_kernel <<>>(*d_input_ptr, out_offsets, - path_commands, + d_path_commands, nullptr /*out_stringviews*/, chars.data(), has_out_of_bound.data()); diff --git a/src/main/cpp/src/get_json_object.hpp b/src/main/cpp/src/get_json_object.hpp index bb3294b424..e13c1e42d7 100644 --- a/src/main/cpp/src/get_json_object.hpp +++ b/src/main/cpp/src/get_json_object.hpp @@ -16,18 +16,11 @@ #pragma once -#include #include #include -#include -#include -#include -#include - #include -#include #include namespace spark_rapids_jni { @@ -35,7 +28,7 @@ namespace spark_rapids_jni { /** * path instruction type */ -enum class path_instruction_type { WILDCARD, INDEX, NAMED }; +enum class path_instruction_type : int8_t { WILDCARD, INDEX, NAMED }; /** * Extracts json object from a json string based on json path specified, and diff --git a/src/main/cpp/src/json_parser.cuh b/src/main/cpp/src/json_parser.cuh index 217ec0047b..10ad2e4fcc 100644 --- a/src/main/cpp/src/json_parser.cuh +++ b/src/main/cpp/src/json_parser.cuh @@ -61,7 +61,7 @@ constexpr int max_num_len = 1000; /** * JSON token enum */ -enum class json_token { +enum class json_token : int8_t { // start token INIT = 0, @@ -228,7 +228,7 @@ class json_parser { /** * @brief get the bit value for specified bit from a int64 number */ - __device__ inline bool get_bit_value(int64_t number, int bitIndex) + static __device__ inline bool get_bit_value(int64_t number, int bitIndex) { // Shift the number right by the bitIndex to bring the desired bit to the rightmost position long shifted = number >> bitIndex; @@ -242,7 +242,7 @@ class json_parser { /** * @brief set the bit value for specified bit to a int64 number */ - __device__ inline void set_bit_value(int64_t& number, int bit_index, bool bit_value) + static __device__ inline void set_bit_value(int64_t& number, int bit_index, bool bit_value) { // Create a mask with a 1 at the desired bit index long mask = 1L << bit_index; @@ -265,7 +265,7 @@ class json_parser { /** * is hex digits: 0-9, A-F, a-f */ - __device__ inline bool is_hex_digit(char c) const + static __device__ inline bool is_hex_digit(char c) { return (c >= '0' && c <= '9') || (c >= 'A' && c <= 'F') || (c >= 'a' && c <= 'f'); } @@ -273,12 +273,12 @@ class json_parser { /** * is 0 to 9 digit */ - __device__ inline bool is_digit(char c) const { return (c >= '0' && c <= '9'); } + static __device__ inline bool is_digit(char c) { return (c >= '0' && c <= '9'); } /** * is white spaces: ' ', '\t', '\n' '\r' */ - __device__ inline bool is_whitespace(char c) const + static __device__ inline bool is_whitespace(char c) { return c == ' ' || c == '\t' || c == '\n' || c == '\r'; } @@ -296,7 +296,7 @@ class json_parser { /** * check current char, if it's expected, then plus the position */ - __device__ inline bool try_skip(char_range_reader& reader, char expected) + static __device__ inline bool try_skip(char_range_reader& reader, char expected) { if (!reader.eof() && reader.current_char() == expected) { reader.next(); @@ -305,7 +305,7 @@ class json_parser { return false; } - __device__ inline bool try_skip(cudf::size_type& pos, char expected) + __device__ inline bool try_skip(cudf::size_type& pos, char expected) const { if (!eof(pos) && chars[pos] == expected) { pos++; @@ -343,7 +343,7 @@ class json_parser { * true is object, false is array * only has two contexts: object or array */ - __device__ inline bool is_object_context() + __device__ inline bool is_object_context() const { return get_bit_value(context_stack, stack_size - 1); } @@ -356,7 +356,7 @@ class json_parser { /** * is context stack is empty */ - __device__ inline bool is_context_stack_empty() { return stack_size == 0; } + __device__ inline bool is_context_stack_empty() const { return stack_size == 0; } __device__ inline void set_current_error() { current_token = json_token::ERROR; } @@ -431,7 +431,7 @@ class json_parser { /** * transform int value from [0, 15] to hex char */ - __device__ inline char to_hex_char(unsigned int v) + static __device__ inline char to_hex_char(unsigned int v) { if (v < 10) return '0' + v; @@ -446,7 +446,7 @@ class json_parser { * @param char to be escaped, c should in range [0, 31) * @param[out] escape output */ - __device__ inline int escape_char(unsigned char c, char* output) + static __device__ inline int escape_char(unsigned char c, char* output) { if (nullptr == output) { switch (c) { @@ -499,9 +499,9 @@ class json_parser { } } - __device__ inline int write_string(char_range_reader& str, - char* copy_destination, - escape_style w_style) + static __device__ inline int write_string(char_range_reader& str, + char* copy_destination, + escape_style w_style) { if (str.eof()) { return 0; } char const quote_char = str.current_char(); @@ -619,7 +619,7 @@ class json_parser { * is valid and length is the number of bytes needed to encode the string * in the given style. */ - __device__ inline std::pair try_parse_string( + static __device__ inline std::pair try_parse_string( char_range_reader& str, char_range_reader to_match = char_range_reader(char_range::null()), escape_style w_style = escape_style::UNESCAPED) @@ -689,7 +689,7 @@ class json_parser { return std::make_pair(false, 0); } - __device__ inline bool try_match_char(char_range_reader& reader, char c) + static __device__ inline bool try_match_char(char_range_reader& reader, char c) { if (!reader.is_null()) { if (!reader.eof() && reader.current_char() == c) { @@ -708,11 +708,11 @@ class json_parser { * skip the HEX chars in \u HEX HEX HEX HEX. * @return positive escaped ASCII value if success, -1 otherwise */ - __device__ inline bool try_skip_escape_part(char_range_reader& str, - char_range_reader& to_match, - char*& copy_dest, - escape_style w_style, - int& output_size_bytes) + static __device__ inline bool try_skip_escape_part(char_range_reader& str, + char_range_reader& to_match, + char*& copy_dest, + escape_style w_style, + int& output_size_bytes) { // already skipped the first '\' // try skip second part @@ -853,7 +853,7 @@ class json_parser { * : ~ ["\\\u0000-\u001F] * ; */ - __device__ inline bool try_skip_safe_code_point(char_range_reader& str, char c) + static __device__ inline bool try_skip_safe_code_point(char_range_reader& str, char c) { // 1 the char is not quoted(' or ") char, here satisfy, do not need to check // again @@ -873,7 +873,7 @@ class json_parser { /** * convert chars 0-9, a-f, A-F to int value */ - __device__ inline uint8_t hex_value(char c) + static __device__ inline uint8_t hex_value(char c) { if (c >= '0' && c <= '9') return c - '0'; if (c >= 'a' && c <= 'f') return c - 'a' + 10; @@ -887,7 +887,7 @@ class json_parser { * @param character Single character * @return Number of bytes */ - __device__ cudf::size_type bytes_in_char_utf8(cudf::char_utf8 character) + static __device__ cudf::size_type bytes_in_char_utf8(cudf::char_utf8 character) { return 1 + static_cast((character & 0x0000'FF00u) > 0) + static_cast((character & 0x00FF'0000u) > 0) + @@ -900,7 +900,7 @@ class json_parser { * @param unchr Character code-point to convert. * @return Single UTF-8 character. */ - __device__ cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) + static __device__ cudf::char_utf8 codepoint_to_utf8(uint32_t unchr) { cudf::char_utf8 utf8 = 0; if (unchr < 0x0000'0080) { @@ -935,7 +935,7 @@ class json_parser { * @param[out] str Output array. * @return The number of bytes in the character */ - __device__ cudf::size_type from_char_utf8(cudf::char_utf8 character, char* str) + static __device__ cudf::size_type from_char_utf8(cudf::char_utf8 character, char* str) { cudf::size_type const chr_width = bytes_in_char_utf8(character); for (cudf::size_type idx = 0; idx < chr_width; ++idx) { @@ -949,10 +949,10 @@ class json_parser { * try skip 4 HEX chars * in pattern: '\\' 'u' HEX HEX HEX HEX, it's a code point of unicode */ - __device__ bool try_skip_unicode(char_range_reader& str, - char_range_reader& to_match, - char*& copy_dest, - int& output_size_bytes) + static __device__ bool try_skip_unicode(char_range_reader& str, + char_range_reader& to_match, + char*& copy_dest, + int& output_size_bytes) { // already parsed \u // now we expect 4 hex chars. @@ -1042,7 +1042,7 @@ class json_parser { * verify max number digits length if enabled * e.g.: +1.23e-45 length is 5 */ - __device__ inline bool check_max_num_len(int number_digits_length) + static __device__ inline bool check_max_num_len(int number_digits_length) { return // disabled num len check @@ -1362,10 +1362,10 @@ class json_parser { /** * get current token */ - __device__ json_token get_current_token() { return current_token; } + __device__ json_token get_current_token() const { return current_token; } // TODO make this go away!!!! - __device__ inline char_range current_range() + __device__ inline char_range current_range() const { return chars.slice(current_token_start_pos, curr_pos - current_token_start_pos); } @@ -1386,9 +1386,10 @@ class json_parser { return true; } + json_token t; int open = 1; - while (true) { - json_token t = next_token(); + do { + t = next_token(); if (t == json_token::START_OBJECT || t == json_token::START_ARRAY) { ++open; } else if (t == json_token::END_OBJECT || t == json_token::END_ARRAY) { @@ -1396,10 +1397,11 @@ class json_parser { } else if (t == json_token::ERROR) { return false; } - } + } while (t != json_token::SUCCESS); + return false; } - __device__ cudf::size_type compute_unescaped_len() { return write_unescaped_text(nullptr); } + __device__ cudf::size_type compute_unescaped_len() const { return write_unescaped_text(nullptr); } /** * unescape current token text, then write to destination @@ -1408,7 +1410,7 @@ class json_parser { * writes 6 utf8 bytes: -28 -72 -83 -27 -101 -67 * For number, write verbatim without normalization */ - __device__ cudf::size_type write_unescaped_text(char* destination) + __device__ cudf::size_type write_unescaped_text(char* destination) const { switch (current_token) { case json_token::VALUE_STRING: { @@ -1490,7 +1492,7 @@ class json_parser { return 0; } - __device__ cudf::size_type compute_escaped_len() { return write_escaped_text(nullptr); } + __device__ cudf::size_type compute_escaped_len() const { return write_escaped_text(nullptr); } /** * escape current token text, then write to destination * e.g.: '"' is a string with 1 char '"', writes out 4 chars '"' '\' '\"' '"' @@ -1498,7 +1500,7 @@ class json_parser { * writes 8 utf8 bytes: '"' -28 -72 -83 -27 -101 -67 '"' * For number, write verbatim without normalization */ - __device__ cudf::size_type write_escaped_text(char* destination) + __device__ cudf::size_type write_escaped_text(char* destination) const { switch (current_token) { case json_token::VALUE_STRING: { @@ -1580,7 +1582,7 @@ class json_parser { * return true if current token is FIELD_NAME and match successfully. * return false otherwise, */ - __device__ bool match_current_field_name(cudf::string_view name) + __device__ bool match_current_field_name(cudf::string_view name) const { return match_current_field_name(char_range(name)); } @@ -1588,7 +1590,7 @@ class json_parser { /** * match current field name */ - __device__ bool match_current_field_name(char_range name) + __device__ bool match_current_field_name(char_range name) const { if (json_token::FIELD_NAME == current_token) { char_range_reader reader(current_range()); @@ -1689,7 +1691,6 @@ class json_parser { private: char_range const chars; cudf::size_type curr_pos; - json_token current_token; // 64 bits long saves the nested object/array contexts // true(bit value 1) is JSON object context @@ -1704,6 +1705,8 @@ class json_parser { // TODO remove if possible // used to store number token length cudf::size_type number_token_len; + + json_token current_token; }; } // namespace spark_rapids_jni