-
Notifications
You must be signed in to change notification settings - Fork 212
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: update arrow to 51, datafusion to 37 #2240
Changes from 1 commit
2eb43d8
84d683f
2298a71
370694d
98b2049
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,12 +17,12 @@ use datafusion::{ | |
TaskContext, | ||
}, | ||
physical_plan::{ | ||
streaming::PartitionStream, DisplayAs, DisplayFormatType, ExecutionPlan, | ||
streaming::PartitionStream, DisplayAs, DisplayFormatType, ExecutionPlan, PlanProperties, | ||
SendableRecordBatchStream, | ||
}, | ||
}; | ||
use datafusion_common::DataFusionError; | ||
use datafusion_physical_expr::Partitioning; | ||
use datafusion_common::{DataFusionError, Statistics}; | ||
use datafusion_physical_expr::{EquivalenceProperties, Partitioning}; | ||
|
||
use lance_arrow::SchemaExt; | ||
use lance_core::Result; | ||
|
@@ -32,11 +32,15 @@ use log::{info, warn}; | |
/// | ||
/// It can only be used once, and will return the stream. After that the node | ||
/// is exhuasted. | ||
/// | ||
/// Note: the stream should be finite, otherwise we will report datafusion properties | ||
/// incorrectly. | ||
pub struct OneShotExec { | ||
stream: Mutex<Option<SendableRecordBatchStream>>, | ||
// We save off a copy of the schema to speed up formatting and so ExecutionPlan::schema & display_as | ||
// can still function after exhuasted | ||
schema: Arc<ArrowSchema>, | ||
properties: PlanProperties, | ||
} | ||
|
||
impl OneShotExec { | ||
|
@@ -45,7 +49,12 @@ impl OneShotExec { | |
let schema = stream.schema().clone(); | ||
Self { | ||
stream: Mutex::new(Some(stream)), | ||
schema, | ||
schema: schema.clone(), | ||
properties: PlanProperties::new( | ||
EquivalenceProperties::new(schema), | ||
Partitioning::RoundRobinBatch(1), | ||
datafusion::physical_plan::ExecutionMode::Bounded, | ||
), | ||
} | ||
Comment on lines
+52
to
58
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The |
||
} | ||
} | ||
|
@@ -96,14 +105,6 @@ impl ExecutionPlan for OneShotExec { | |
self.schema.clone() | ||
} | ||
|
||
fn output_partitioning(&self) -> datafusion_physical_expr::Partitioning { | ||
Partitioning::RoundRobinBatch(1) | ||
} | ||
|
||
fn output_ordering(&self) -> Option<&[datafusion_physical_expr::PhysicalSortExpr]> { | ||
None | ||
} | ||
|
||
fn children(&self) -> Vec<Arc<dyn ExecutionPlan>> { | ||
vec![] | ||
} | ||
|
@@ -135,7 +136,11 @@ impl ExecutionPlan for OneShotExec { | |
} | ||
|
||
fn statistics(&self) -> datafusion_common::Result<datafusion_common::Statistics> { | ||
todo!() | ||
Ok(Statistics::new_unknown(&self.schema)) | ||
} | ||
|
||
fn properties(&self) -> &datafusion::physical_plan::PlanProperties { | ||
&self.properties | ||
} | ||
} | ||
|
||
|
@@ -194,7 +199,7 @@ pub fn execute_plan( | |
let session_state = SessionState::new_with_config_rt(session_config, runtime_env); | ||
// NOTE: we are only executing the first partition here. Therefore, if | ||
// the plan has more than one partition, we will be missing data. | ||
assert_eq!(plan.output_partitioning().partition_count(), 1); | ||
assert_eq!(plan.properties().partitioning.partition_count(), 1); | ||
Ok(plan.execute(0, session_state.task_ctx())?) | ||
} | ||
|
||
|
Original file line number | Diff line number | Diff line change | ||||
---|---|---|---|---|---|---|
|
@@ -3,15 +3,19 @@ | |||||
|
||||||
//! Extends logical expression. | ||||||
|
||||||
use std::sync::Arc; | ||||||
|
||||||
use arrow_schema::DataType; | ||||||
|
||||||
use datafusion::logical_expr::ScalarFunctionDefinition; | ||||||
use datafusion::logical_expr::ScalarUDF; | ||||||
use datafusion::logical_expr::ScalarUDFImpl; | ||||||
use datafusion::logical_expr::{ | ||||||
expr::ScalarFunction, BinaryExpr, BuiltinScalarFunction, GetFieldAccess, GetIndexedField, | ||||||
Operator, | ||||||
expr::ScalarFunction, BinaryExpr, GetFieldAccess, GetIndexedField, Operator, | ||||||
}; | ||||||
use datafusion::prelude::*; | ||||||
use datafusion::scalar::ScalarValue; | ||||||
use datafusion_functions::core::getfield::GetFieldFunc; | ||||||
use lance_arrow::DataTypeExt; | ||||||
use lance_datafusion::expr::safe_coerce_scalar; | ||||||
|
||||||
|
@@ -34,6 +38,45 @@ | |||||
} | ||||||
} | ||||||
|
||||||
/// A simple helper function that interprets an Expr as a string scalar | ||||||
/// or returns None if it is not. | ||||||
pub fn get_as_string_scalar_opt(expr: &Expr) -> Option<&String> { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. nit:
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Changed to |
||||||
match expr { | ||||||
Expr::Literal(ScalarValue::Utf8(Some(s))) => Some(s), | ||||||
_ => None, | ||||||
} | ||||||
} | ||||||
|
||||||
// As part of the DF 37 release there are now two different ways to | ||||||
// represent a nested field access in `Expr`. The old way is to use | ||||||
// `Expr::field` which returns a `GetStructField` and the new way is | ||||||
// to use `Expr::ScalarFunction` with a `GetFieldFunc` UDF. | ||||||
// | ||||||
// Currently, the old path leads to bugs in DF. This is probably a | ||||||
// bug and will probably be fixed in a future version. In the meantime | ||||||
// we need to make sure we are always using the new way to avoid this | ||||||
// bug. This trait adds field_newstyle which lets us easily create | ||||||
// logical `Expr` that use the new style. | ||||||
pub trait ExprExt { | ||||||
// Helper function to replace Expr::field in DF 37 since DF | ||||||
// confuses itself with the GetStructField returned by Expr::field | ||||||
fn field_newstyle(&self, name: &str) -> Expr; | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This was the most complicated change. The comment explains it ok. I expect this may eventually go away (e.g. when apache/datafusion#10181 is resolved) |
||||||
} | ||||||
|
||||||
impl ExprExt for Expr { | ||||||
fn field_newstyle(&self, name: &str) -> Expr { | ||||||
Expr::ScalarFunction(ScalarFunction { | ||||||
func_def: ScalarFunctionDefinition::UDF(Arc::new(ScalarUDF::new_from_impl( | ||||||
GetFieldFunc::default(), | ||||||
))), | ||||||
args: vec![ | ||||||
self.clone(), | ||||||
Expr::Literal(ScalarValue::Utf8(Some(name.to_string()))), | ||||||
], | ||||||
}) | ||||||
} | ||||||
} | ||||||
|
||||||
/// Given a Expr::Column or Expr::GetIndexedField, get the data type of referenced | ||||||
/// field in the schema. | ||||||
/// | ||||||
|
@@ -49,6 +92,15 @@ | |||||
field_path.push(c.name.as_str()); | ||||||
break; | ||||||
} | ||||||
Expr::ScalarFunction(udf) => { | ||||||
if udf.name() == GetFieldFunc::default().name() { | ||||||
let name = get_as_string_scalar_opt(&udf.args[1])?; | ||||||
field_path.push(&name); | ||||||
current_expr = &udf.args[0]; | ||||||
} else { | ||||||
return None; | ||||||
} | ||||||
} | ||||||
Expr::GetIndexedField(GetIndexedField { expr, field }) => { | ||||||
if let GetFieldAccess::NamedStructField { | ||||||
name: ScalarValue::Utf8(Some(name)), | ||||||
|
@@ -87,52 +139,41 @@ | |||||
match expr { | ||||||
Expr::BinaryExpr(BinaryExpr { left, op, right }) => { | ||||||
if matches!(op, Operator::And | Operator::Or) { | ||||||
return Ok(Expr::BinaryExpr(BinaryExpr { | ||||||
Ok(Expr::BinaryExpr(BinaryExpr { | ||||||
left: Box::new(resolve_expr(left.as_ref(), schema)?), | ||||||
op: *op, | ||||||
right: Box::new(resolve_expr(right.as_ref(), schema)?), | ||||||
})); | ||||||
} | ||||||
match (left.as_ref(), right.as_ref()) { | ||||||
(Expr::Column(_) | Expr::GetIndexedField(_), Expr::Literal(_)) => { | ||||||
if let Some(resolved_type) = resolve_column_type(left.as_ref(), schema) { | ||||||
Ok(Expr::BinaryExpr(BinaryExpr { | ||||||
left: left.clone(), | ||||||
op: *op, | ||||||
right: Box::new(resolve_value(right.as_ref(), &resolved_type)?), | ||||||
})) | ||||||
} else { | ||||||
Ok(expr.clone()) | ||||||
} | ||||||
} | ||||||
(Expr::Literal(_), Expr::Column(_) | Expr::GetIndexedField(_)) => { | ||||||
if let Some(resolved_type) = resolve_column_type(right.as_ref(), schema) { | ||||||
Ok(Expr::BinaryExpr(BinaryExpr { | ||||||
left: Box::new(resolve_value(left.as_ref(), &resolved_type)?), | ||||||
op: *op, | ||||||
right: right.clone(), | ||||||
})) | ||||||
} else { | ||||||
Ok(expr.clone()) | ||||||
} | ||||||
})) | ||||||
} else if let Some(left_type) = dbg!(resolve_column_type(left.as_ref(), schema)) { | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Left over
Suggested change
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Good catch. Removed. |
||||||
match right.as_ref() { | ||||||
Expr::Literal(_) => Ok(Expr::BinaryExpr(BinaryExpr { | ||||||
left: left.clone(), | ||||||
op: *op, | ||||||
right: Box::new(resolve_value(right.as_ref(), &left_type)?), | ||||||
})), | ||||||
// For cases complex expressions (not just literals) on right hand side like x = 1 + 1 + -2*2 | ||||||
Expr::BinaryExpr(r) => Ok(Expr::BinaryExpr(BinaryExpr { | ||||||
left: left.clone(), | ||||||
op: *op, | ||||||
right: Box::new(Expr::BinaryExpr(BinaryExpr { | ||||||
left: coerce_expr(&r.left, &left_type).map(Box::new)?, | ||||||
op: r.op, | ||||||
right: coerce_expr(&r.right, &left_type).map(Box::new)?, | ||||||
})), | ||||||
})), | ||||||
_ => Ok(expr.clone()), | ||||||
} | ||||||
// For cases complex expressions (not just literals) on right hand side like x = 1 + 1 + -2*2 | ||||||
(Expr::Column(_) | Expr::GetIndexedField(_), Expr::BinaryExpr(r)) => { | ||||||
if let Some(resolved_type) = resolve_column_type(left.as_ref(), schema) { | ||||||
Ok(Expr::BinaryExpr(BinaryExpr { | ||||||
left: left.clone(), | ||||||
op: *op, | ||||||
right: Box::new(Expr::BinaryExpr(BinaryExpr { | ||||||
left: coerce_expr(&r.left, &resolved_type).map(Box::new)?, | ||||||
op: r.op, | ||||||
right: coerce_expr(&r.right, &resolved_type).map(Box::new)?, | ||||||
})), | ||||||
})) | ||||||
} else { | ||||||
Ok(expr.clone()) | ||||||
} | ||||||
} else if let Some(right_type) = dbg!(resolve_column_type(right.as_ref(), schema)) { | ||||||
match left.as_ref() { | ||||||
Expr::Literal(_) => Ok(Expr::BinaryExpr(BinaryExpr { | ||||||
left: Box::new(resolve_value(left.as_ref(), &right_type)?), | ||||||
op: *op, | ||||||
right: right.clone(), | ||||||
})), | ||||||
_ => Ok(expr.clone()), | ||||||
} | ||||||
_ => Ok(expr.clone()), | ||||||
} else { | ||||||
Ok(expr.clone()) | ||||||
} | ||||||
} | ||||||
Expr::InList(in_list) => { | ||||||
|
@@ -189,14 +230,19 @@ | |||||
/// | ||||||
/// - *expr*: a datafusion logical expression | ||||||
pub fn coerce_filter_type_to_boolean(expr: Expr) -> Result<Expr> { | ||||||
match expr { | ||||||
match &expr { | ||||||
// TODO: consider making this dispatch more generic, i.e. fun.output_type -> coerce | ||||||
// instead of hardcoding coerce method for each function | ||||||
Expr::ScalarFunction(ScalarFunction { | ||||||
func_def: ScalarFunctionDefinition::BuiltIn(BuiltinScalarFunction::RegexpMatch), | ||||||
func_def: ScalarFunctionDefinition::UDF(udf), | ||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There are no more builtin functions. So we need to look for |
||||||
.. | ||||||
}) => Ok(Expr::IsNotNull(Box::new(expr))), | ||||||
|
||||||
}) => { | ||||||
if udf.name() == "regexp_match" { | ||||||
Ok(Expr::IsNotNull(Box::new(expr))) | ||||||
} else { | ||||||
Ok(expr) | ||||||
} | ||||||
} | ||||||
_ => Ok(expr), | ||||||
} | ||||||
} | ||||||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Datafusion is on 37.1 https://docs.rs/datafusion/latest/datafusion/
Should we bump to datafusion 37.1?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh yea, maybe just
37
for simplicity.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Actually, probably best to do 37.1 just to avoid issues. It doesn't change that often.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Updated.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
If we don't have a reason to require
>=37.1
, I feel like it would be nice to keep at37.0
. Then users can use any37.X
version.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Do you mean
37
? Or does37.0
allow37.1
?Scanning through apache/datafusion#9904 I don't see any defects that would impact us.