diff --git a/Cargo.toml b/Cargo.toml index 124747999041..ae6c1df98f9c 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -69,22 +69,22 @@ version = "41.0.0" ahash = { version = "0.8", default-features = false, features = [ "runtime-rng", ] } -arrow = { version = "52.2.0", features = [ +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94", features = [ "prettyprint", ] } -arrow-array = { version = "52.2.0", default-features = false, features = [ +arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94", default-features = false, features = [ "chrono-tz", ] } -arrow-buffer = { version = "52.2.0", default-features = false } -arrow-flight = { version = "52.2.0", features = [ +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94", default-features = false } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94", features = [ "flight-sql-experimental", ] } -arrow-ipc = { version = "52.2.0", default-features = false, features = [ +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94", default-features = false, features = [ "lz4", ] } -arrow-ord = { version = "52.2.0", default-features = false } -arrow-schema = { version = "52.2.0", default-features = false } -arrow-string = { version = "52.2.0", default-features = false } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94", default-features = false } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94", default-features = false } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94", default-features = false } async-trait = "0.1.73" bigdecimal = "=0.4.1" bytes = "1.4" @@ -123,9 +123,9 @@ indexmap = "2.0.0" itertools = "0.13" log = "^0.4" num_cpus = "1.13.0" -object_store = { version = "0.10.2", default-features = false } +object_store = { version = "0.11", default-features = false } parking_lot = "0.12" -parquet = { version = "52.2.0", default-features = false, features = [ +parquet = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94", default-features = false, features = [ "arrow", "async", "object_store", @@ -134,7 +134,7 @@ rand = "0.8" regex = "1.8" rstest = "0.22.0" serde_json = "1" -sqlparser = { version = "0.50.0", features = ["visitor"] } +sqlparser = { git = "https://github.com/sqlparser-rs/sqlparser-rs.git", rev = "fab834d", features = ["visitor"] } tempfile = "3" thiserror = "1.0.44" tokio = { version = "1.36", features = ["macros", "rt", "sync"] } @@ -165,3 +165,15 @@ large_futures = "warn" [workspace.lints.rust] unexpected_cfgs = { level = "warn", check-cfg = ["cfg(tarpaulin)"] } unused_imports = "deny" + +[patch.crates-io] +arrow = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } +arrow-array = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } +arrow-buffer = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } +arrow-data = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } +arrow-flight = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } +arrow-ipc = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } +arrow-ord = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } +arrow-schema = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } +arrow-select = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } +arrow-string = { git = "https://github.com/apache/arrow-rs.git", rev = "2795b94" } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index e35eb3906b9a..882b8c303096 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -174,13 +174,11 @@ checksum = "96d30a06541fbafbc7f82ed10c06164cfbd2c401138f6addd8404629c4b16711" [[package]] name = "arrow" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05048a8932648b63f21c37d88b552ccc8a65afb6dfe9fc9f30ce79174c2e7a85" dependencies = [ "arrow-arith", "arrow-array", "arrow-buffer", - "arrow-cast", + "arrow-cast 52.2.0", "arrow-csv", "arrow-data", "arrow-ipc", @@ -195,8 +193,6 @@ dependencies = [ [[package]] name = "arrow-arith" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d8a57966e43bfe9a3277984a14c24ec617ad874e4c0e1d2a1b083a39cfbf22c" dependencies = [ "arrow-array", "arrow-buffer", @@ -210,8 +206,6 @@ dependencies = [ [[package]] name = "arrow-array" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16f4a9468c882dc66862cef4e1fd8423d47e67972377d85d80e022786427768c" dependencies = [ "ahash", "arrow-buffer", @@ -227,14 +221,31 @@ dependencies = [ [[package]] name = "arrow-buffer" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c975484888fc95ec4a632cdc98be39c085b1bb518531b0c80c5d462063e5daa1" dependencies = [ "bytes", "half", "num", ] +[[package]] +name = "arrow-cast" +version = "52.2.0" +dependencies = [ + "arrow-array", + "arrow-buffer", + "arrow-data", + "arrow-schema", + "arrow-select", + "atoi", + "base64 0.22.1", + "chrono", + "comfy-table", + "half", + "lexical-core", + "num", + "ryu", +] + [[package]] name = "arrow-cast" version = "52.2.0" @@ -249,7 +260,6 @@ dependencies = [ "atoi", "base64 0.22.1", "chrono", - "comfy-table", "half", "lexical-core", "num", @@ -259,12 +269,10 @@ dependencies = [ [[package]] name = "arrow-csv" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13c36dc5ddf8c128df19bab27898eea64bf9da2b555ec1cd17a8ff57fba9ec2" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", + "arrow-cast 52.2.0", "arrow-data", "arrow-schema", "chrono", @@ -278,8 +286,6 @@ dependencies = [ [[package]] name = "arrow-data" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dd9d6f18c65ef7a2573ab498c374d8ae364b4a4edf67105357491c031f716ca5" dependencies = [ "arrow-buffer", "arrow-schema", @@ -290,12 +296,10 @@ dependencies = [ [[package]] name = "arrow-ipc" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e786e1cdd952205d9a8afc69397b317cfbb6e0095e445c69cda7e8da5c1eeb0f" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", + "arrow-cast 52.2.0", "arrow-data", "arrow-schema", "flatbuffers", @@ -305,12 +309,10 @@ dependencies = [ [[package]] name = "arrow-json" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fb22284c5a2a01d73cebfd88a33511a3234ab45d66086b2ca2d1228c3498e445" dependencies = [ "arrow-array", "arrow-buffer", - "arrow-cast", + "arrow-cast 52.2.0", "arrow-data", "arrow-schema", "chrono", @@ -325,8 +327,6 @@ dependencies = [ [[package]] name = "arrow-ord" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "42745f86b1ab99ef96d1c0bcf49180848a64fe2c7a7a0d945bc64fa2b21ba9bc" dependencies = [ "arrow-array", "arrow-buffer", @@ -340,8 +340,6 @@ dependencies = [ [[package]] name = "arrow-row" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4cd09a518c602a55bd406bcc291a967b284cfa7a63edfbf8b897ea4748aad23c" dependencies = [ "ahash", "arrow-array", @@ -354,14 +352,10 @@ dependencies = [ [[package]] name = "arrow-schema" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9e972cd1ff4a4ccd22f86d3e53e835c2ed92e0eea6a3e8eadb72b4f1ac802cf8" [[package]] name = "arrow-select" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "600bae05d43483d216fb3494f8c32fdbefd8aa4e1de237e790dbb3d9f44690a3" dependencies = [ "ahash", "arrow-array", @@ -374,8 +368,6 @@ dependencies = [ [[package]] name = "arrow-string" version = "52.2.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f0dc1985b67cb45f6606a248ac2b4a288849f196bab8c657ea5589f47cdd55e6" dependencies = [ "arrow-array", "arrow-buffer", @@ -2713,7 +2705,7 @@ dependencies = [ "ahash", "arrow-array", "arrow-buffer", - "arrow-cast", + "arrow-cast 52.2.0 (registry+https://github.com/rust-lang/crates.io-index)", "arrow-data", "arrow-ipc", "arrow-schema", @@ -4456,3 +4448,7 @@ dependencies = [ "cc", "pkg-config", ] + +[[patch.unused]] +name = "arrow-flight" +version = "52.2.0" diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 0a4523a1c04e..77c20c6ea633 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -62,3 +62,15 @@ assert_cmd = "2.0" ctor = "0.2.0" predicates = "3.0" rstest = "0.17" + +[patch.crates-io] +arrow = { path = "../../arrow-rs/arrow" } +arrow-array = { path = "../../arrow-rs/arrow-array" } +arrow-buffer = { path = "../../arrow-rs/arrow-buffer" } +arrow-data = { path = "../../arrow-rs/arrow-data" } +arrow-flight = { path = "../../arrow-rs/arrow-flight" } +arrow-ipc = { path = "../../arrow-rs/arrow-ipc" } +arrow-ord = { path = "../../arrow-rs/arrow-ord" } +arrow-schema = { path = "../../arrow-rs/arrow-schema" } +arrow-select = { path = "../../arrow-rs/arrow-select" } +arrow-string = { path = "../../arrow-rs/arrow-string" } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 8435d0632576..4a9e0ae1c3bd 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -61,7 +61,7 @@ num_cpus = { workspace = true } object_store = { workspace = true, optional = true } parquet = { workspace = true, optional = true, default-features = true } paste = "1.0.15" -pyo3 = { version = "0.21.0", optional = true } +pyo3 = { version = "0.22.2", optional = true } sqlparser = { workspace = true } [target.'cfg(target_family = "wasm")'.dependencies] diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index f233f3842c8c..92b2bf068594 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -2002,7 +2002,7 @@ mod tests { let int_col_offset = offset_index.get(4).unwrap(); // 325 pages in int_col - assert_eq!(int_col_offset.len(), 325); + assert_eq!(int_col_offset.page_locations.len(), 325); match int_col_index { Index::INT32(index) => { assert_eq!(index.indexes.len(), 325); diff --git a/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs index e4d26a460ecd..a0ff922f77aa 100644 --- a/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs +++ b/datafusion/core/src/datasource/physical_plan/parquet/page_filter.rs @@ -28,7 +28,7 @@ use datafusion_physical_expr::{split_conjunction, PhysicalExpr}; use log::{debug, trace}; use parquet::arrow::arrow_reader::statistics::StatisticsConverter; use parquet::file::metadata::{ParquetColumnIndex, ParquetOffsetIndex}; -use parquet::format::PageLocation; +use parquet::file::page_index::offset_index::OffsetIndexMetaData; use parquet::schema::types::SchemaDescriptor; use parquet::{ arrow::arrow_reader::{RowSelection, RowSelector}, @@ -362,7 +362,7 @@ struct PagesPruningStatistics<'a> { converter: StatisticsConverter<'a>, column_index: &'a ParquetColumnIndex, offset_index: &'a ParquetOffsetIndex, - page_offsets: &'a Vec, + page_offsets: &'a OffsetIndexMetaData, } impl<'a> PagesPruningStatistics<'a> { diff --git a/datafusion/functions/src/datetime/date_part.rs b/datafusion/functions/src/datetime/date_part.rs index e24b11aeb71f..d8e1d24ab973 100644 --- a/datafusion/functions/src/datetime/date_part.rs +++ b/datafusion/functions/src/datetime/date_part.rs @@ -18,13 +18,14 @@ use std::any::Any; use std::sync::Arc; -use arrow::array::{Array, ArrayRef, Float64Array}; +use arrow::array::{Array, ArrayRef, Float64Array, PrimitiveArray}; use arrow::compute::{binary, cast, date_part, DatePart}; use arrow::datatypes::DataType::{ - Date32, Date64, Float64, Time32, Time64, Timestamp, Utf8, Utf8View, + Date32, Date64, Duration, Float64, Interval, Time32, Time64, Timestamp, Utf8, Utf8View, }; +use arrow::datatypes::IntervalUnit::{YearMonth, DayTime, MonthDayNano}; use arrow::datatypes::TimeUnit::{Microsecond, Millisecond, Nanosecond, Second}; -use arrow::datatypes::{DataType, TimeUnit}; +use arrow::datatypes::{DataType, Int32Type, TimeUnit}; use datafusion_common::cast::{ as_date32_array, as_date64_array, as_int32_array, as_time32_millisecond_array, @@ -107,6 +108,20 @@ impl DatePartFunc { Exact(vec![Utf8View, Time64(Microsecond)]), Exact(vec![Utf8, Time64(Nanosecond)]), Exact(vec![Utf8View, Time64(Nanosecond)]), + Exact(vec![Utf8, Interval(YearMonth)]), + Exact(vec![Utf8View, Interval(YearMonth)]), + Exact(vec![Utf8, Interval(DayTime)]), + Exact(vec![Utf8View, Interval(DayTime)]), + Exact(vec![Utf8, Interval(MonthDayNano)]), + Exact(vec![Utf8View, Interval(MonthDayNano)]), + Exact(vec![Utf8, Duration(Second)]), + Exact(vec![Utf8View, Duration(Second)]), + Exact(vec![Utf8, Duration(Millisecond)]), + Exact(vec![Utf8View, Duration(Millisecond)]), + Exact(vec![Utf8, Duration(Microsecond)]), + Exact(vec![Utf8View, Duration(Microsecond)]), + Exact(vec![Utf8, Duration(Nanosecond)]), + Exact(vec![Utf8View, Duration(Nanosecond)]), ], Volatility::Immutable, ), @@ -211,9 +226,16 @@ fn seconds(array: &dyn Array, unit: TimeUnit) -> Result { let secs = as_int32_array(secs.as_ref())?; let subsecs = date_part(array, DatePart::Nanosecond)?; let subsecs = as_int32_array(subsecs.as_ref())?; + // REVIEW: there has got to be a better way to do this: I want to treat null as 0, I was expecting + // some kind of map function (or even better map_if_null) on PrimitiveArray. Instead I have just + // discarded the null mask, which feels bad because if I was meant to be able to do this, I'd + // expect there to be a function to do it. I think it is also possible that a null-masked value in + // the array will not be 0 (I don't think that happens right now, but I think we're one optimisation + // away from disaster). + let subsecs: PrimitiveArray = PrimitiveArray::new(subsecs.values().clone(), None); - let r: Float64Array = binary(secs, subsecs, |secs, subsecs| { - (secs as f64 + (subsecs as f64 / 1_000_000_000_f64)) * sf + let r: Float64Array = binary(secs, &subsecs, |secs, subsecs| { + (secs as f64 + ((subsecs % 1_000_000_000) as f64 / 1_000_000_000_f64)) * sf })?; Ok(Arc::new(r)) } @@ -244,7 +266,7 @@ fn epoch(array: &dyn Array) -> Result { Time64(Nanosecond) => { as_time64_nanosecond_array(array)?.unary(|x| x as f64 / 1_000_000_000_f64) } - d => return exec_err!("Can not convert {d:?} to epoch"), + d => return exec_err!("Cannot convert {d:?} to epoch"), }; Ok(Arc::new(f)) }