Skip to content

Commit

Permalink
Minor: improve join / join_on docs (#7813)
Browse files Browse the repository at this point in the history
* Minor: improve join_on docs

* Update datafusion/expr/src/logical_plan/builder.rs

Co-authored-by: jakevin <jakevingoo@gmail.com>

---------

Co-authored-by: jakevin <jakevingoo@gmail.com>
  • Loading branch information
alamb and jackwener authored Oct 18, 2023
1 parent ba2039a commit 57f2675
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 18 deletions.
33 changes: 26 additions & 7 deletions datafusion/core/src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -582,12 +582,21 @@ impl DataFrame {
Ok(DataFrame::new(self.session_state, plan))
}

/// Join this DataFrame with another DataFrame using the specified columns as join keys.
/// Join this `DataFrame` with another `DataFrame` using explicitly specified
/// columns and an optional filter expression.
///
/// Filter expression expected to contain non-equality predicates that can not be pushed
/// down to any of join inputs.
/// In case of outer join, filter applied to only matched rows.
/// See [`join_on`](Self::join_on) for a more concise way to specify the
/// join condition. Since DataFusion will automatically identify and
/// optimize equality predicates, there is no performance difference between
/// this function and `join_on`.
///
/// `left_cols` and `right_cols` are used to form "equijoin" predicates (see
/// example below), which are then combined with the optional `filter`
/// expression.
///
/// Note that in the case of an outer join, the `filter` is applied only to matched rows.
///
/// # Example
/// ```
/// # use datafusion::prelude::*;
/// # use datafusion::error::Result;
Expand All @@ -600,11 +609,14 @@ impl DataFrame {
/// col("a").alias("a2"),
/// col("b").alias("b2"),
/// col("c").alias("c2")])?;
/// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = b2)`
/// // finding all pairs of rows from `left` and `right` where `a = a2` and `b = b2`.
/// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?;
/// let batches = join.collect().await?;
/// # Ok(())
/// # }
/// ```
///
pub fn join(
self,
right: DataFrame,
Expand All @@ -624,10 +636,13 @@ impl DataFrame {
Ok(DataFrame::new(self.session_state, plan))
}

/// Join this DataFrame with another DataFrame using the specified expressions.
/// Join this `DataFrame` with another `DataFrame` using the specified
/// expressions.
///
/// Simply a thin wrapper over [`join`](Self::join) where the join keys are not provided,
/// and the provided expressions are AND'ed together to form the filter expression.
/// Note that DataFusion automatically optimizes joins, including
/// identifying and optimizing equality predicates.
///
/// # Example
///
/// ```
/// # use datafusion::prelude::*;
Expand All @@ -646,6 +661,10 @@ impl DataFrame {
/// col("b").alias("b2"),
/// col("c").alias("c2"),
/// ])?;
///
/// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)`
/// // finding all pairs of rows from `left` and `right`
/// // where `a != a2` and `b != b2`.
/// let join_on = left.join_on(
/// right,
/// JoinType::Inner,
Expand Down
44 changes: 33 additions & 11 deletions datafusion/expr/src/logical_plan/builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -611,11 +611,19 @@ impl LogicalPlanBuilder {
})))
}

/// Apply a join with on constraint.
/// Apply a join to `right` using explicitly specified columns and an
/// optional filter expression.
///
/// Filter expression expected to contain non-equality predicates that can not be pushed
/// down to any of join inputs.
/// In case of outer join, filter applied to only matched rows.
/// See [`join_on`](Self::join_on) for a more concise way to specify the
/// join condition. Since DataFusion will automatically identify and
/// optimize equality predicates, there is no performance difference between
/// this function and `join_on`.
///
/// The columns given in `join_keys` are used to form "equijoin" predicates,
/// which are then combined with the optional `filter` expression.
///
/// Note that in the case of an outer join, the `filter` is applied only to matched rows.
pub fn join(
self,
right: LogicalPlan,
Expand All @@ -626,11 +634,12 @@ impl LogicalPlanBuilder {
self.join_detailed(right, join_type, join_keys, filter, false)
}

/// Apply a join with on constraint.
/// Apply a join using the specified expressions.
///
/// Note that DataFusion automatically optimizes joins, including
/// identifying and optimizing equality predicates.
///
/// The `ExtractEquijoinPredicate` optimizer pass has the ability to split join predicates into
/// equijoin predicates and (other) filter predicates. Therefore, if you prefer not to manually split the
/// join predicates, it is recommended to use the `join_on` method instead of the `join` method.
/// # Example
///
/// ```
/// # use datafusion_expr::{Expr, col, LogicalPlanBuilder,
Expand All @@ -650,8 +659,15 @@ impl LogicalPlanBuilder {
///
/// let right_plan = LogicalPlanBuilder::scan("right", right_table, None)?.build()?;
///
/// let exprs = vec![col("left.a").eq(col("right.a")), col("left.b").not_eq(col("right.b"))];
/// // Form the expression `(left.a = right.a)` AND `(left.b != right.b)`
/// let exprs = vec![
/// col("left.a").eq(col("right.a")),
/// col("left.b").not_eq(col("right.b"))
/// ];
///
/// // Perform the equivalent of
/// // `left INNER JOIN right ON (left.a = right.a AND left.b != right.b)`,
/// // finding all pairs of rows from `left` and `right`
/// // where `left.a = right.a` and `left.b != right.b`.
/// let plan = LogicalPlanBuilder::scan("left", left_table, None)?
/// .join_on(right_plan, JoinType::Inner, exprs)?
/// .build()?;
Expand Down Expand Up @@ -688,8 +704,14 @@ impl LogicalPlanBuilder {
)
}

/// Apply a join with on constraint and specified null equality
/// If null_equals_null is true then null == null, else null != null
/// Apply a join with an `ON` constraint and the specified null equality.
///
/// The behavior is the same as [`join`](Self::join) except that it allows
/// specifying the null equality behavior.
///
/// If `null_equals_null=true`, rows where both join keys are `null` will be
/// emitted. Otherwise rows where either or both join keys are `null` will be
/// omitted.
pub fn join_detailed(
self,
right: LogicalPlan,
Expand Down

0 comments on commit 57f2675

Please sign in to comment.