From e4a16ae2550f5a7481887b28c7a60fc14fea2f5c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 18 Jan 2022 06:31:14 -0800 Subject: [PATCH] Implement mixed equality/conditional joins (#9917) This PR implements mixed equality/inequality joins for inner, left, and full joins. This resolves #9696 and contributes to #5401. For the moment, all APIs are functional only, but an object-oriented API is planned to support caching of the hash table. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Robert Maynard (https://github.com/robertmaynard) - Yunsong Wang (https://github.com/PointKernel) - Jason Lowe (https://github.com/jlowe) - Jake Hemstad (https://github.com/jrhemstad) URL: https://github.com/rapidsai/cudf/pull/9917 --- cpp/CMakeLists.txt | 1 + .../cudf/ast/detail/expression_evaluator.cuh | 4 +- cpp/include/cudf/join.hpp | 267 +++++++- cpp/include/cudf/table/row_operators.cuh | 7 +- cpp/src/join/conditional_join.cu | 21 +- cpp/src/join/conditional_join_kernels.cuh | 6 +- cpp/src/join/hash_join.cu | 43 -- cpp/src/join/hash_join.cuh | 44 +- cpp/src/join/join_common_utils.cuh | 11 + cpp/src/join/join_common_utils.hpp | 9 + cpp/src/join/mixed_join.cu | 557 +++++++++++++++ cpp/src/join/mixed_join_kernels.cuh | 322 +++++++++ cpp/tests/CMakeLists.txt | 2 +- cpp/tests/join/conditional_join_tests.cu | 13 +- cpp/tests/join/mixed_join_tests.cu | 643 ++++++++++++++++++ 15 files changed, 1884 insertions(+), 66 deletions(-) create mode 100644 cpp/src/join/mixed_join.cu create mode 100644 cpp/src/join/mixed_join_kernels.cuh create mode 100644 cpp/tests/join/mixed_join_tests.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index a8100fb3f92..2f51f582e12 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -322,6 +322,7 @@ add_library( src/jit/parser.cpp src/jit/type.cpp src/join/conditional_join.cu + src/join/mixed_join.cu src/join/cross_join.cu src/join/hash_join.cu src/join/join.cu diff --git 
a/cpp/include/cudf/ast/detail/expression_evaluator.cuh b/cpp/include/cudf/ast/detail/expression_evaluator.cuh index 0b739482c4d..ecd46ec2c23 100644 --- a/cpp/include/cudf/ast/detail/expression_evaluator.cuh +++ b/cpp/include/cudf/ast/detail/expression_evaluator.cuh @@ -429,7 +429,7 @@ struct expression_evaluator { __device__ __forceinline__ void evaluate( expression_result& output_object, cudf::size_type const row_index, - IntermediateDataType* thread_intermediate_storage) + IntermediateDataType* thread_intermediate_storage) const { evaluate(output_object, row_index, row_index, row_index, thread_intermediate_storage); } @@ -452,7 +452,7 @@ struct expression_evaluator { cudf::size_type const left_row_index, cudf::size_type const right_row_index, cudf::size_type const output_row_index, - IntermediateDataType* thread_intermediate_storage) + IntermediateDataType* thread_intermediate_storage) const { cudf::size_type operator_source_index{0}; for (cudf::size_type operator_index = 0; operator_index < plan.operators.size(); diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index 8ea6bd1a6cc..30400074c50 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -19,6 +19,7 @@ #include #include #include +#include #include #include @@ -701,7 +702,7 @@ conditional_inner_join( * The first returned vector contains all the row indices from the left * table (in unspecified order). The corresponding value in the * second returned vector is either (1) the row index of the matched row - * from the right table, if there is a match or (2) an unspecified + * from the right table, if there is a match or (2) an unspecified * out-of-bounds value. 
* * If the provided predicate returns NULL for a pair of rows @@ -858,6 +859,270 @@ std::unique_ptr> conditional_left_anti_join( std::optional output_size = {}, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a pair of row index vectors corresponding to all pairs of + * rows between the specified tables where the columns of the equality table + * are equal and the predicate evaluates to true on the conditional tables. + * + * The first returned vector contains the row indices from the left + * table that have a match in the right table (in unspecified order). + * The corresponding values in the second returned vector are + * the matched row indices from the right table. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. It is the user's + * responsibility to choose a suitable compare_nulls value AND use appropriate + * null-safe operators in the expression. + * + * If the provided output size or per-row counts are incorrect, behavior is undefined. + * + * @code{.pseudo} + * left_equality: {{0, 1, 2}} + * right_equality: {{1, 2, 3}} + * left_conditional: {{4, 4, 4}} + * right_conditional: {{3, 4, 5}} + * Expression: Left.Column_0 > Right.Column_0 + * Result: {{1}, {0}} + * @endcode + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join.
+ * @param compare_nulls Whether or not null values join to each other. + * @param output_size_data An optional pair of values indicating the exact output size and the + * number of matches for each row in the larger of the two input tables, left or right (may be + * precomputed using the corresponding mixed_inner_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a mixed inner join between the four input tables. + */ +std::pair>, + std::unique_ptr>> +mixed_inner_join( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional>> output_size_data = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a pair of row index vectors corresponding to all pairs of + * rows between the specified tables where the columns of the equality table + * are equal and the predicate evaluates to true on the conditional tables, + * or null matches for rows in left that have no match in right. + * + * The first returned vector contains all the row indices from the + * left tables (in unspecified order). + * The corresponding value in the second returned vector is either (1) + * the row index of the matched row from the right tables, or (2) an + * unspecified out-of-bounds value. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. It is the user's + * responsibility to choose a suitable compare_nulls value AND use appropriate + * null-safe operators in the expression.
+ * + * If the provided output size or per-row counts are incorrect, behavior is undefined. + * + * @code{.pseudo} + * left_equality: {{0, 1, 2}} + * right_equality: {{1, 2, 3}} + * left_conditional: {{4, 4, 4}} + * right_conditional: {{3, 4, 5}} + * Expression: Left.Column_0 > Right.Column_0 + * Result: {{0, 1, 2}, {None, 0, None}} + * @endcode + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other or not. + * @param output_size_data An optional pair of values indicating the exact output size and the + * number of matches for each row in the larger of the two input tables, left or right (may be + * precomputed using the corresponding mixed_left_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a mixed left join between the four input tables. 
+ */ +std::pair>, + std::unique_ptr>> +mixed_left_join( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional>> output_size_data = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns a pair of row index vectors corresponding to all pairs of + * rows between the specified tables where the columns of the equality table + * are equal and the predicate evaluates to true on the conditional tables, + * or null matches for rows in either pair of tables that have no matches in + * the other pair. + * + * Taken pairwise, the values from the returned vectors are one of: + * (1) row indices corresponding to matching rows from the left and + * right tables, (2) a row index and an unspecified out-of-bounds value, + * representing a row from one table without a match in the other. + * + * If the provided predicate returns NULL for a pair of rows + * (left, right), that pair is not included in the output. It is the user's + * responsibility to choose a suitable compare_nulls value AND use appropriate + * null-safe operators in the expression. + * + * If the provided output size or per-row counts are incorrect, behavior is undefined. + * + * @code{.pseudo} + * left_equality: {{0, 1, 2}} + * right_equality: {{1, 2, 3}} + * left_conditional: {{4, 4, 4}} + * right_conditional: {{3, 4, 5}} + * Expression: Left.Column_0 > Right.Column_0 + * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}} + * @endcode + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match.
+ * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other. + * @param output_size_data An optional pair of values indicating the exact output size and the + * number of matches for each row in the larger of the two input tables, left or right (may be + * precomputed using the corresponding mixed_full_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair of vectors [`left_indices`, `right_indices`] that can be used to construct + * the result of performing a mixed full join between the four input tables. + */ +std::pair>, + std::unique_ptr>> +mixed_full_join( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + std::optional>> output_size_data = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the exact number of matches (rows) when performing a + * mixed inner join between the specified tables where the columns of the + * equality table are equal and the predicate evaluates to true on the + * conditional tables. + * + * If the provided predicate returns NULL for a pair of rows (left, right), + * that pair is not included in the output. It is the user's responsibility to + * choose a suitable compare_nulls value AND use appropriate null-safe + * operators in the expression. + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result.
+ * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other or not. + * @param output_size An optional pair of values indicating the exact output size and the number of + * matches for each row in the larger of the two input tables, left or right (may be precomputed + * using the corresponding mixed_inner_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair containing the size that would result from performing the + * requested join and the number of matches for each row in one of the two + * tables. Which of the two tables is an implementation detail and should not + * be relied upon, simply passed to the corresponding `mixed_inner_join` API as + * is. + */ +std::pair>> mixed_inner_join_size( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Returns the exact number of matches (rows) when performing a + * mixed left join between the specified tables where the columns of the + * equality table are equal and the predicate evaluates to true on the + * conditional tables. 
+ * + * If the provided predicate returns NULL for a pair of rows (left, right), + * that pair is not included in the output. It is the user's responsibility to + * choose a suitable compare_nulls value AND use appropriate null-safe + * operators in the expression. + * + * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not + * match. + * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not + * match. + * + * @param left_equality The left table used for the equality join. + * @param right_equality The right table used for the equality join. + * @param left_conditional The left table used for the conditional join. + * @param right_conditional The right table used for the conditional join. + * @param binary_predicate The condition on which to join. + * @param compare_nulls Whether or not null values join to each other. + * @param output_size An optional pair of values indicating the exact output size and the number of + * matches for each row in the larger of the two input tables, left or right (may be precomputed + * using the corresponding mixed_inner_join_size API). + * @param mr Device memory resource used to allocate the returned table and columns' device memory + * + * @return A pair containing the size that would result from performing the + * requested join and the number of matches for each row in one of the two + * tables. Which of the two tables is an implementation detail and should not + * be relied upon, simply passed to the corresponding `mixed_left_join` API as + * is.
+ */ +std::pair>> mixed_left_join_size( + table_view const& left_equality, + table_view const& right_equality, + table_view const& left_conditional, + table_view const& right_conditional, + ast::expression const& binary_predicate, + null_equality compare_nulls = null_equality::EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the exact number of matches (rows) when performing a * conditional inner join between the specified tables where the predicate diff --git a/cpp/include/cudf/table/row_operators.cuh b/cpp/include/cudf/table/row_operators.cuh index 32ddd1ef49a..a3b08fda15d 100644 --- a/cpp/include/cudf/table/row_operators.cuh +++ b/cpp/include/cudf/table/row_operators.cuh @@ -492,8 +492,11 @@ template