Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Remove scan_csv methods from LogicalPlanBuilder #2537

Merged
merged 8 commits into from
May 16, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
283 changes: 91 additions & 192 deletions ballista/rust/core/src/serde/logical_plan/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1016,19 +1016,16 @@ macro_rules! into_logical_plan {
mod roundtrip_tests {

use super::super::{super::error::Result, protobuf};
use crate::error::BallistaError;
use crate::serde::{AsLogicalPlan, BallistaCodec};
use async_trait::async_trait;
use core::panic;
use datafusion::common::DFSchemaRef;
use datafusion::logical_plan::source_as_provider;
use datafusion::{
arrow::datatypes::{DataType, Field, Schema},
datafusion_data_access::{
self,
object_store::{
local::LocalFileSystem, FileMetaStream, ListEntryStream, ObjectReader,
ObjectStore,
},
object_store::{FileMetaStream, ListEntryStream, ObjectReader, ObjectStore},
SizedFile,
},
datasource::listing::ListingTable,
Expand Down Expand Up @@ -1142,26 +1139,11 @@ mod roundtrip_tests {
let test_expr: Vec<Expr> =
vec![col("c1") + col("c2"), Expr::Literal((4.0).into())];

let schema = Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::Int32, false),
]);

let plan = std::sync::Arc::new(
LogicalPlanBuilder::scan_csv(
Arc::new(LocalFileSystem {}),
"employee.csv",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![3, 4]),
4,
)
.await
.and_then(|plan| plan.sort(vec![col("salary")]))
.and_then(|plan| plan.build())
.map_err(BallistaError::DataFusionError)?,
test_scan_csv("employee.csv", Some(vec![3, 4]))
.await?
.sort(vec![col("salary")])?
.build()?,
);

for partition_count in test_partition_counts.iter() {
Expand Down Expand Up @@ -1198,13 +1180,7 @@ mod roundtrip_tests {

#[test]
fn roundtrip_create_external_table() -> Result<()> {
let schema = Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::Int32, false),
]);
let schema = test_schema();

let df_schema_ref = schema.to_dfschema_ref()?;

Expand Down Expand Up @@ -1236,39 +1212,17 @@ mod roundtrip_tests {

#[tokio::test]
async fn roundtrip_analyze() -> Result<()> {
let schema = Schema::new(vec![
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nice cleanups

Field::new("id", DataType::Int32, false),
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::Int32, false),
]);

let verbose_plan = LogicalPlanBuilder::scan_csv(
Arc::new(LocalFileSystem {}),
"employee.csv",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![3, 4]),
4,
)
.await
.and_then(|plan| plan.sort(vec![col("salary")]))
.and_then(|plan| plan.explain(true, true))
.and_then(|plan| plan.build())
.map_err(BallistaError::DataFusionError)?;

let plan = LogicalPlanBuilder::scan_csv(
Arc::new(LocalFileSystem {}),
"employee.csv",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![3, 4]),
4,
)
.await
.and_then(|plan| plan.sort(vec![col("salary")]))
.and_then(|plan| plan.explain(false, true))
.and_then(|plan| plan.build())
.map_err(BallistaError::DataFusionError)?;
let verbose_plan = test_scan_csv("employee.csv", Some(vec![3, 4]))
.await?
.sort(vec![col("salary")])?
.explain(true, true)?
.build()?;

let plan = test_scan_csv("employee.csv", Some(vec![3, 4]))
.await?
.sort(vec![col("salary")])?
.explain(false, true)?
.build()?;

roundtrip_test!(plan);

Expand All @@ -1279,39 +1233,17 @@ mod roundtrip_tests {

#[tokio::test]
async fn roundtrip_explain() -> Result<()> {
let schema = Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::Int32, false),
]);

let verbose_plan = LogicalPlanBuilder::scan_csv(
Arc::new(LocalFileSystem {}),
"employee.csv",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![3, 4]),
4,
)
.await
.and_then(|plan| plan.sort(vec![col("salary")]))
.and_then(|plan| plan.explain(true, false))
.and_then(|plan| plan.build())
.map_err(BallistaError::DataFusionError)?;

let plan = LogicalPlanBuilder::scan_csv(
Arc::new(LocalFileSystem {}),
"employee.csv",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![3, 4]),
4,
)
.await
.and_then(|plan| plan.sort(vec![col("salary")]))
.and_then(|plan| plan.explain(false, false))
.and_then(|plan| plan.build())
.map_err(BallistaError::DataFusionError)?;
let verbose_plan = test_scan_csv("employee.csv", Some(vec![3, 4]))
.await?
.sort(vec![col("salary")])?
.explain(true, false)?
.build()?;

let plan = test_scan_csv("employee.csv", Some(vec![3, 4]))
.await?
.sort(vec![col("salary")])?
.explain(false, false)?
.build()?;

roundtrip_test!(plan);

Expand All @@ -1322,78 +1254,37 @@ mod roundtrip_tests {

#[tokio::test]
async fn roundtrip_join() -> Result<()> {
let schema = Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::Int32, false),
]);

let scan_plan = LogicalPlanBuilder::scan_csv(
Arc::new(LocalFileSystem {}),
"employee1",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![0, 3, 4]),
4,
)
.await?
.build()
.map_err(BallistaError::DataFusionError)?;

let plan = LogicalPlanBuilder::scan_csv(
Arc::new(LocalFileSystem {}),
"employee2",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![0, 3, 4]),
4,
)
.await
.and_then(|plan| plan.join(&scan_plan, JoinType::Inner, (vec!["id"], vec!["id"])))
.and_then(|plan| plan.build())
.map_err(BallistaError::DataFusionError)?;
let scan_plan = test_scan_csv("employee1", Some(vec![0, 3, 4]))
.await?
.build()?;

let plan = test_scan_csv("employee2", Some(vec![0, 3, 4]))
.await?
.join(&scan_plan, JoinType::Inner, (vec!["id"], vec!["id"]))?
.build()?;

roundtrip_test!(plan);
Ok(())
}

#[tokio::test]
async fn roundtrip_sort() -> Result<()> {
let schema = Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::Int32, false),
]);

let plan = LogicalPlanBuilder::scan_csv(
Arc::new(LocalFileSystem {}),
"employee.csv",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![3, 4]),
4,
)
.await
.and_then(|plan| plan.sort(vec![col("salary")]))
.and_then(|plan| plan.build())
.map_err(BallistaError::DataFusionError)?;
let plan = test_scan_csv("employee.csv", Some(vec![3, 4]))
.await?
.sort(vec![col("salary")])?
.build()?;
roundtrip_test!(plan);

Ok(())
}

#[tokio::test]
async fn roundtrip_empty_relation() -> Result<()> {
let plan_false = LogicalPlanBuilder::empty(false)
.build()
.map_err(BallistaError::DataFusionError)?;
let plan_false = LogicalPlanBuilder::empty(false).build()?;

roundtrip_test!(plan_false);

let plan_true = LogicalPlanBuilder::empty(true)
.build()
.map_err(BallistaError::DataFusionError)?;
let plan_true = LogicalPlanBuilder::empty(true).build()?;

roundtrip_test!(plan_true);

Expand All @@ -1402,31 +1293,17 @@ mod roundtrip_tests {

#[tokio::test]
async fn roundtrip_logical_plan() -> Result<()> {
let schema = Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::Int32, false),
]);

let plan = LogicalPlanBuilder::scan_csv(
Arc::new(LocalFileSystem {}),
"employee.csv",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![3, 4]),
4,
)
.await
.and_then(|plan| plan.aggregate(vec![col("state")], vec![max(col("salary"))]))
.and_then(|plan| plan.build())
.map_err(BallistaError::DataFusionError)?;
let plan = test_scan_csv("employee.csv", Some(vec![3, 4]))
.await?
.aggregate(vec![col("state")], vec![max(col("salary"))])?
.build()?;

roundtrip_test!(plan);

Ok(())
}

#[ignore] // see https://github.com/apache/arrow-datafusion/issues/2546
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

#[tokio::test]
async fn roundtrip_logical_plan_custom_ctx() -> Result<()> {
let ctx = SessionContext::new();
Expand All @@ -1436,28 +1313,18 @@ mod roundtrip_tests {
ctx.runtime_env()
.register_object_store("test", custom_object_store.clone());

let (os, _) = ctx.runtime_env().object_store("test://foo.csv")?;

println!("Object Store {:?}", os);
let (os, uri) = ctx.runtime_env().object_store("test://foo.csv")?;
assert_eq!("TestObjectStore", &format!("{:?}", os));
assert_eq!("foo.csv", uri);

let schema = Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::Int32, false),
]);

let plan = LogicalPlanBuilder::scan_csv(
custom_object_store.clone(),
"test://employee.csv",
CsvReadOptions::new().schema(&schema).has_header(true),
Some(vec![3, 4]),
4,
)
.await
.and_then(|plan| plan.build())
.map_err(BallistaError::DataFusionError)?;
let schema = test_schema();
let plan = ctx
.read_csv(
"test://employee.csv",
CsvReadOptions::new().schema(&schema).has_header(true),
)
.await?
.to_logical_plan()?;

let proto: protobuf::LogicalPlanNode =
protobuf::LogicalPlanNode::try_from_logical_plan(
Expand Down Expand Up @@ -1488,4 +1355,36 @@ mod roundtrip_tests {

Ok(())
}

/// Builds the five-column employee schema shared by the roundtrip tests
/// (id, first_name, last_name, state, salary), replacing the copies that
/// were previously inlined into each test.
fn test_schema() -> Schema {
Schema::new(vec![
Field::new("id", DataType::Int32, false),
Field::new("first_name", DataType::Utf8, false),
Field::new("last_name", DataType::Utf8, false),
Field::new("state", DataType::Utf8, false),
Field::new("salary", DataType::Int32, false),
])
}

/// Builds a `LogicalPlanBuilder` holding a CSV `TableScan` of the test
/// schema, as a replacement for the removed `LogicalPlanBuilder::scan_csv`.
///
/// Reads `table_name` as CSV through a fresh `SessionContext`, then rewrites
/// the resulting `TableScan` node so that:
/// - its `projection` is overridden with the caller-supplied column indices;
/// - the qualifier on `projected_schema` is replaced with `table_name`
///   (presumably to match the naming the old `scan_csv` produced —
///   TODO(review): confirm against the pre-refactor plans).
///
/// Panics via `unimplemented!` if the DataFrame's plan root is not a
/// `TableScan` (not expected for a plain `read_csv`).
async fn test_scan_csv(
table_name: &str,
projection: Option<Vec<usize>>,
) -> Result<LogicalPlanBuilder> {
let schema = test_schema();
let ctx = SessionContext::new();
let options = CsvReadOptions::new().schema(&schema);
let df = ctx.read_csv(table_name, options).await?;
let plan = match df.to_logical_plan()? {
LogicalPlan::TableScan(ref scan) => {
// Clone the scan node so we can patch its projection/schema in place.
let mut scan = scan.clone();
scan.projection = projection;
// NOTE(review): projection and projected_schema are set independently
// here — assumes callers pass a projection consistent with the schema.
let mut projected_schema = scan.projected_schema.as_ref().clone();
projected_schema = projected_schema.replace_qualifier(table_name);
scan.projected_schema = DFSchemaRef::new(projected_schema);
LogicalPlan::TableScan(scan)
}
_ => unimplemented!(),
};
Ok(LogicalPlanBuilder::from(plan))
}
}
Loading