diff --git a/benchmarks/Cargo.toml b/benchmarks/Cargo.toml index d54d15b80f53..43955ae8cc63 100644 --- a/benchmarks/Cargo.toml +++ b/benchmarks/Cargo.toml @@ -33,14 +33,14 @@ simd = ["datafusion/simd"] snmalloc = ["snmalloc-rs"] [dependencies] -arrow = "27.0.0" +arrow = "28.0.0" datafusion = { path = "../datafusion/core", version = "14.0.0", features = ["scheduler"] } env_logger = "0.10" futures = "0.3" mimalloc = { version = "0.1", optional = true, default-features = false } num_cpus = "1.13.0" object_store = "0.5.0" -parquet = "27.0.0" +parquet = "28.0.0" parquet-test-utils = { path = "../parquet-test-utils/", version = "0.1.0" } rand = "0.8.4" serde = { version = "1.0.136", features = ["derive"] } diff --git a/datafusion-cli/Cargo.lock b/datafusion-cli/Cargo.lock index 52888d07e044..f09e8be214d5 100644 --- a/datafusion-cli/Cargo.lock +++ b/datafusion-cli/Cargo.lock @@ -68,9 +68,9 @@ checksum = "8da52d66c7071e2e3fa2a1e5c6d088fec47b593032b254f5e980de8ea54454d6" [[package]] name = "arrow" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b329393dcb0f1d7b11179bb4bdf1ba03e65f0c3aa09a70c7d75a889cb4e2be48" +checksum = "aed9849f86164fad5cb66ce4732782b15f1bc97f8febab04e782c20cce9d4b6c" dependencies = [ "ahash", "arrow-array", @@ -94,9 +94,9 @@ dependencies = [ [[package]] name = "arrow-array" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "05cc1efb944a4309b73fd8035f283724ec767b632fc3048e2da7c87cfcd92b08" +checksum = "6b8504cf0a6797e908eecf221a865e7d339892720587f87c8b90262863015b08" dependencies = [ "ahash", "arrow-buffer", @@ -110,9 +110,9 @@ dependencies = [ [[package]] name = "arrow-buffer" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3b1e46f50d5a7adcd14465605a3f07f0f11982a0fcf6a5a3562b94ea53af4546" +checksum = "d6de64a27cea684b24784647d9608314bc80f7c4d55acb44a425e05fab39d916" dependencies = [ "half", "num", @@ -120,9 +120,9 @@ dependencies = [ [[package]] name = "arrow-cast" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "563ceb60e814e9f89c3a1e20ce13f388af362e7ca9c1e75493e047d82cbcb773" +checksum = "bec4a54502eefe05923c385c90a005d69474fa06ca7aa2a2b123c9f9532f6178" dependencies = [ "arrow-array", "arrow-buffer", @@ -136,9 +136,9 @@ dependencies = [ [[package]] name = "arrow-csv" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c2c61a79a865fbb7a209623513824d73ecd5b860b8dcfedb271df664e843e6ae" +checksum = "e7902bbf8127eac48554fe902775303377047ad49a9fd473c2b8cb399d092080" dependencies = [ "arrow-array", "arrow-buffer", @@ -154,9 +154,9 @@ dependencies = [ [[package]] name = "arrow-data" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e17bc1336039023ba297cccea8fe9dc6c8d03a980414e938274a3a3f21922200" +checksum = "7e4882efe617002449d5c6b5de9ddb632339074b36df8a96ea7147072f1faa8a" dependencies = [ "arrow-buffer", "arrow-schema", @@ -166,9 +166,9 @@ dependencies = [ [[package]] name = "arrow-ipc" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4728d966e9e79d55e27d354fe8bad38b59106143230d80723219457c34de0259" +checksum = "fa0703a6de2785828561b03a4d7793ecd333233e1b166316b4bfc7cfce55a4a7" dependencies = [ "arrow-array", "arrow-buffer", @@ -180,9 +180,9 @@ dependencies = [ [[package]] name = "arrow-json" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "290444e50fde2e38e7611b50886ab4329787c792b7563c5541af1236cdbc3a5f" +checksum = "3bd23fc8c6d251f96cd63b96fece56bbb9710ce5874a627cb786e2600673595a" dependencies = [ "arrow-array", "arrow-buffer", @@ -198,15 +198,15 @@ dependencies = [ [[package]] name = "arrow-schema" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c564654550c1e197d07660391e32607fac1ce9ca6748a949c36f372bb715f451" +checksum = "da9f143882a80be168538a60e298546314f50f11f2a288c8d73e11108da39d26" [[package]] name = "arrow-select" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "816c229e4489b66a5512d589406fa816b774005b7365cd9b6af8f160806cceb0" +checksum = "520406331d4ad60075359524947ebd804e479816439af82bcb17f8d280d9b38c" dependencies = [ "arrow-array", "arrow-buffer", @@ -1680,9 +1680,9 @@ checksum = "86f0b0d4bf799edbc74508c1e8bf170ff5f41238e5f8225603ca7caaae2b7860" [[package]] name = "ordered-float" -version = "1.1.1" +version = "2.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3305af35278dd29f46fcdd139e0b1fbfae2153f0e5928b39b035542dd31e37b7" +checksum = "7940cf2ca942593318d07fcf2596cdca60a85c9e7fab408a5e21a4f9dcd40d87" dependencies = [ "num-traits", ] @@ -1718,9 +1718,9 @@ dependencies = [ [[package]] name = "parquet" -version = "27.0.0" +version = "28.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "54bd654e4ff294122f6d64938f400a433aee755433e6d9cb28b1861781ea1a3b" +checksum = "21433e9209111bb3720b747f2f137e0d115af1af0420a7a1c26b6e88227fa353" dependencies = [ "ahash", "arrow-array", @@ -1740,10 +1740,12 @@ dependencies = [ "lz4", "num", "num-bigint", + "paste", "seq-macro", "snap", "thrift", "tokio", + "twox-hash", "zstd", ] @@ -2322,9 +2324,9 @@ dependencies = [ [[package]] name = "thrift" -version = "0.16.0" +version = "0.17.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "09678c4cdbb4eed72e18b7c2af1329c69825ed16fcbac62d083fc3e2b0590ff0" +checksum = "7e54bc85fc7faa8bc175c4bab5b92ba8d9a3ce893d0e9f42cc455c8ab16a9e09" dependencies = [ "byteorder", "integer-encoding", @@ -2465,6 +2467,16 @@ version = "0.2.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "59547bce71d9c38b83d9c0e92b6066c4253371f15005def0c30d9657f50c7642" +[[package]] +name = "twox-hash" +version = "1.6.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "97fee6b57c6a41524a810daee9286c02d7752c4253064d0b05472833a438f675" +dependencies = [ + "cfg-if", + "static_assertions", +] + [[package]] name = "typenum" version = "1.15.0" @@ -2772,18 +2784,18 @@ dependencies = [ [[package]] name = "zstd" -version = "0.11.2+zstd.1.5.2" +version = "0.12.0+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" +checksum = "b8148aa921e9d53217ab9322f8553bd130f7ae33489db68b381d76137d2e6374" dependencies = [ "zstd-safe", ] [[package]] name = "zstd-safe" -version = "5.0.2+zstd.1.5.2" +version = "6.0.2+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1d2a5585e04f9eea4b2a3d1eca508c4dee9592a89ef6f450c11719da0726f4db" +checksum = "a6cf39f730b440bab43da8fb5faf5f254574462f73f260f85f7987f32154ff17" dependencies = [ "libc", "zstd-sys", diff --git a/datafusion-cli/Cargo.toml b/datafusion-cli/Cargo.toml index 09e7eba50b01..3b23b8c75116 100644 --- a/datafusion-cli/Cargo.toml +++ b/datafusion-cli/Cargo.toml @@ -29,7 +29,7 @@ rust-version = "1.62" readme = "README.md" [dependencies] -arrow = "27.0.0" +arrow = "28.0.0" clap = { version = "3", features = ["derive", "cargo"] } datafusion = { path = "../datafusion/core", version = "14.0.0" } dirs = "4.0.0" diff --git a/datafusion-examples/Cargo.toml b/datafusion-examples/Cargo.toml index 5f40ed336b29..cdb77d152f62 100644 --- a/datafusion-examples/Cargo.toml +++ b/datafusion-examples/Cargo.toml @@ -34,8 +34,8 @@ path = "examples/avro_sql.rs" required-features = ["datafusion/avro"] [dev-dependencies] -arrow = "27.0.0" -arrow-flight = "27.0.0" +arrow = "28.0.0" +arrow-flight = "28.0.0" async-trait = "0.1.41" datafusion = { path = "../datafusion/core" } datafusion-common = { path = "../datafusion/common" } diff --git a/datafusion/common/Cargo.toml b/datafusion/common/Cargo.toml index 7284f3f370d4..6b24cba560ca 100644 --- a/datafusion/common/Cargo.toml +++ b/datafusion/common/Cargo.toml @@ -40,10 +40,10 @@ pyarrow = ["pyo3", "arrow/pyarrow"] [dependencies] apache-avro = { version = "0.14", default-features = false, features = ["snappy"], optional = true } -arrow = { version = "27.0.0", default-features = false } +arrow = { version = "28.0.0", default-features = false } chrono = { version = "0.4", default-features = false } cranelift-module = { version = "0.89.0", optional = true } object_store = { version = "0.5.0", default-features = false, optional = true } -parquet = { version = "27.0.0", default-features = false, optional = true } +parquet = { version = "28.0.0", default-features = false, optional = true } pyo3 = { version = "0.17.1", optional = true } sqlparser = "0.27" diff --git a/datafusion/common/src/dfschema.rs b/datafusion/common/src/dfschema.rs index 1110423dbffd..005488153699 100644 --- a/datafusion/common/src/dfschema.rs +++ b/datafusion/common/src/dfschema.rs @@ -625,7 +625,6 @@ impl DFField { mod tests { use super::*; use arrow::datatypes::DataType; - use std::collections::BTreeMap; #[test] fn qualifier_in_name() -> Result<()> { @@ -673,8 +672,8 @@ mod tests { fn from_qualified_schema_into_arrow_schema() -> Result<()> { let schema = DFSchema::try_from_qualified_schema("t1", &test_schema_1())?; let arrow_schema: Schema = schema.into(); - let expected = "Field { name: \"c0\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }, \ - Field { name: \"c1\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: None }"; + let expected = "Field { name: \"c0\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }, \ + Field { name: \"c1\", data_type: Boolean, nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {} }"; assert_eq!(expected, arrow_schema.to_string()); Ok(()) } @@ -798,7 +797,7 @@ mod tests { field1_i16_t .field() .clone() - .with_metadata(Some(test_bmetadata_n(2))), + .with_metadata(test_metadata_n(2)), ); let field1_i16_t_qualified = DFField::from_qualified("foo", field1_i16_t.field().clone()); @@ -947,12 +946,12 @@ mod tests { fn test_dfschema_to_schema_convertion() { let mut a: DFField = DFField::new(Some("table1"), "a", DataType::Int64, false); let mut b: DFField = DFField::new(Some("table1"), "b", DataType::Int64, false); - let mut a_metadata = BTreeMap::new(); + let mut a_metadata = HashMap::new(); a_metadata.insert("key".to_string(), "value".to_string()); - a.field.set_metadata(Some(a_metadata)); - let mut b_metadata = BTreeMap::new(); + a.field.set_metadata(a_metadata); + let mut b_metadata = HashMap::new(); b_metadata.insert("key".to_string(), "value".to_string()); - b.field.set_metadata(Some(b_metadata)); + b.field.set_metadata(b_metadata); let df_schema = Arc::new( DFSchema::new_with_metadata([a, b].to_vec(), HashMap::new()).unwrap(), @@ -980,11 +979,4 @@ mod tests { .map(|i| (format!("k{}", i), format!("v{}", i))) .collect() } - - fn test_bmetadata_n(n: usize) -> BTreeMap { - (0..n) - .into_iter() - .map(|i| (format!("k{}", i), format!("v{}", i))) - .collect() - } } diff --git a/datafusion/common/src/scalar.rs b/datafusion/common/src/scalar.rs index 0c3e1d832c65..79bfcad65c80 100644 --- a/datafusion/common/src/scalar.rs +++ b/datafusion/common/src/scalar.rs @@ -59,7 +59,7 @@ pub enum ScalarValue { /// 64bit float Float64(Option), /// 128bit decimal, using the i128 to represent the decimal, precision scale - Decimal128(Option, u8, u8), + Decimal128(Option, u8, i8), /// signed 8bit int Int8(Option), /// signed 16bit int @@ -977,9 +977,9 @@ macro_rules! eq_array_primitive { impl ScalarValue { /// Create a decimal Scalar from value/precision and scale. - pub fn try_new_decimal128(value: i128, precision: u8, scale: u8) -> Result { + pub fn try_new_decimal128(value: i128, precision: u8, scale: i8) -> Result { // make sure the precision and scale is valid - if precision <= DECIMAL128_MAX_PRECISION && scale <= precision { + if precision <= DECIMAL128_MAX_PRECISION && scale.unsigned_abs() <= precision { return Ok(ScalarValue::Decimal128(Some(value), precision, scale)); } Err(DataFusionError::Internal(format!( @@ -1561,7 +1561,7 @@ impl ScalarValue { _ => unreachable!("Invalid dictionary keys type: {:?}", key_type), } } - DataType::FixedSizeBinary(_) => { + DataType::FixedSizeBinary(size) => { let array = scalars .map(|sv| { if let ScalarValue::FixedSizeBinary(_, v) = sv { @@ -1575,8 +1575,10 @@ impl ScalarValue { } }) .collect::>>()?; - let array = - FixedSizeBinaryArray::try_from_sparse_iter(array.into_iter())?; + let array = FixedSizeBinaryArray::try_from_sparse_iter_with_size( + array.into_iter(), + *size, + )?; Arc::new(array) } // explicitly enumerate unsupported types so newly added @@ -1620,7 +1622,7 @@ impl ScalarValue { fn iter_to_decimal_array( scalars: impl IntoIterator, precision: u8, - scale: u8, + scale: i8, ) -> Result { let array = scalars .into_iter() @@ -1701,7 +1703,7 @@ impl ScalarValue { fn build_decimal_array( value: Option, precision: u8, - scale: u8, + scale: i8, size: usize, ) -> Decimal128Array { std::iter::repeat(value) @@ -1796,16 +1798,18 @@ impl ScalarValue { Arc::new(repeat(None::<&str>).take(size).collect::()) } }, - ScalarValue::FixedSizeBinary(_, e) => match e { + ScalarValue::FixedSizeBinary(s, e) => match e { Some(value) => Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter( + FixedSizeBinaryArray::try_from_sparse_iter_with_size( repeat(Some(value.as_slice())).take(size), + *s, ) .unwrap(), ), None => Arc::new( - FixedSizeBinaryArray::try_from_sparse_iter( + FixedSizeBinaryArray::try_from_sparse_iter_with_size( repeat(None::<&[u8]>).take(size), + *s, ) .unwrap(), ), @@ -1961,7 +1965,7 @@ impl ScalarValue { array: &ArrayRef, index: usize, precision: u8, - scale: u8, + scale: i8, ) -> Result { let array = as_decimal128_array(array)?; if array.is_null(index) { @@ -2158,7 +2162,7 @@ impl ScalarValue { index: usize, value: &Option, precision: u8, - scale: u8, + scale: i8, ) -> Result { let array = as_decimal128_array(array)?; if array.precision() != precision || array.scale() != scale { diff --git a/datafusion/core/Cargo.toml b/datafusion/core/Cargo.toml index ec870c436ed4..228da1eb4583 100644 --- a/datafusion/core/Cargo.toml +++ b/datafusion/core/Cargo.toml @@ -57,7 +57,7 @@ unicode_expressions = ["datafusion-physical-expr/regex_expressions", "datafusion [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } apache-avro = { version = "0.14", optional = true } -arrow = { version = "27.0.0", features = ["prettyprint"] } +arrow = { version = "28.0.0", features = ["prettyprint"] } async-compression = { version = "0.3.14", features = ["bzip2", "gzip", "xz", "futures-io", "tokio"], optional = true } async-trait = "0.1.41" bytes = "1.1" @@ -82,7 +82,7 @@ num-traits = { version = "0.2", optional = true } num_cpus = "1.13.0" object_store = "0.5.0" parking_lot = "0.12" -parquet = { version = "27.0.0", features = ["arrow", "async"] } +parquet = { version = "28.0.0", features = ["arrow", "async"] } paste = "^1.0" percent-encoding = "2.2.0" pin-project-lite = "^0.2.7" @@ -100,7 +100,7 @@ uuid = { version = "1.0", features = ["v4"] } xz2 = { version = "0.1", optional = true } [dev-dependencies] -arrow = { version = "27.0.0", features = ["prettyprint", "dyn_cmp_dict"] } +arrow = { version = "28.0.0", features = ["prettyprint", "dyn_cmp_dict"] } async-trait = "0.1.53" criterion = "0.4" csv = "1.1.6" diff --git a/datafusion/core/src/avro_to_arrow/schema.rs b/datafusion/core/src/avro_to_arrow/schema.rs index a2582451b3a9..c71db3c15612 100644 --- a/datafusion/core/src/avro_to_arrow/schema.rs +++ b/datafusion/core/src/avro_to_arrow/schema.rs @@ -21,7 +21,7 @@ use apache_avro::schema::{Alias, Name}; use apache_avro::types::Value; use apache_avro::Schema as AvroSchema; use arrow::datatypes::Field; -use std::collections::BTreeMap; +use std::collections::HashMap; use std::convert::TryFrom; /// Converts an avro schema to an arrow schema @@ -34,7 +34,7 @@ pub fn to_arrow_schema(avro_schema: &apache_avro::Schema) -> Result { &field.schema, Some(&field.name), false, - Some(&external_props(&field.schema)), + Some(external_props(&field.schema)), )?) } } @@ -50,14 +50,14 @@ fn schema_to_field( name: Option<&str>, nullable: bool, ) -> Result { - schema_to_field_with_props(schema, name, nullable, None) + schema_to_field_with_props(schema, name, nullable, Default::default()) } fn schema_to_field_with_props( schema: &AvroSchema, name: Option<&str>, nullable: bool, - props: Option<&BTreeMap>, + props: Option>, ) -> Result { let mut nullable = nullable; let field_type: DataType = match schema { @@ -112,7 +112,7 @@ fn schema_to_field_with_props( let fields: Result> = fields .iter() .map(|field| { - let mut props = BTreeMap::new(); + let mut props = HashMap::new(); if let Some(doc) = &field.doc { props.insert("avro::doc".to_string(), doc.clone()); } @@ -123,7 +123,7 @@ fn schema_to_field_with_props( &field.schema, Some(&format!("{}.{}", name.fullname(None), field.name)), false, - Some(&props), + Some(props), ) }) .collect(); @@ -141,7 +141,7 @@ fn schema_to_field_with_props( AvroSchema::Fixed { size, .. } => DataType::FixedSizeBinary(*size as i32), AvroSchema::Decimal { precision, scale, .. - } => DataType::Decimal128(*precision as u8, *scale as u8), + } => DataType::Decimal128(*precision as u8, *scale as i8), AvroSchema::Uuid => DataType::FixedSizeBinary(16), AvroSchema::Date => DataType::Date32, AvroSchema::TimeMillis => DataType::Time32(TimeUnit::Millisecond), @@ -155,7 +155,7 @@ fn schema_to_field_with_props( let name = name.unwrap_or_else(|| default_field_name(&data_type)); let mut field = Field::new(name, field_type, nullable); - field.set_metadata(props.cloned()); + field.set_metadata(props.unwrap_or_default()); Ok(field) } @@ -234,8 +234,8 @@ fn index_type(len: usize) -> DataType { } } -fn external_props(schema: &AvroSchema) -> BTreeMap { - let mut props = BTreeMap::new(); +fn external_props(schema: &AvroSchema) -> HashMap { + let mut props = HashMap::new(); match &schema { AvroSchema::Record { doc: Some(ref doc), .. @@ -280,16 +280,6 @@ fn external_props(schema: &AvroSchema) -> BTreeMap { props } -#[allow(dead_code)] -fn get_metadata( - _schema: AvroSchema, - props: BTreeMap, -) -> BTreeMap { - let mut metadata: BTreeMap = Default::default(); - metadata.extend(props); - metadata -} - /// Returns the fully qualified name for a field pub fn aliased( alias: &Alias, diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 715a29190cf2..a1b5307dd54a 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -118,7 +118,7 @@ fn clear_metadata( .fields() .iter() .map(|field| { - field.clone().with_metadata(None) // clear meta + field.clone().with_metadata(Default::default()) // clear meta }) .collect::>(); Schema::new(fields) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index ce525d2cf96a..df616574ecb8 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -912,7 +912,7 @@ mod tests { min: impl IntoIterator>, max: impl IntoIterator>, precision: u8, - scale: u8, + scale: i8, ) -> Self { Self { min: Arc::new( diff --git a/datafusion/core/src/physical_plan/file_format/parquet.rs b/datafusion/core/src/physical_plan/file_format/parquet.rs index cfd1c2194116..596d2b09e950 100644 --- a/datafusion/core/src/physical_plan/file_format/parquet.rs +++ b/datafusion/core/src/physical_plan/file_format/parquet.rs @@ -701,12 +701,12 @@ pub(crate) fn parquet_to_arrow_decimal_type( let type_ptr = parquet_column.self_type_ptr(); match type_ptr.get_basic_info().logical_type() { Some(LogicalType::Decimal { scale, precision }) => { - Some(DataType::Decimal128(precision as u8, scale as u8)) + Some(DataType::Decimal128(precision as u8, scale as i8)) } _ => match type_ptr.get_basic_info().converted_type() { ConvertedType::DECIMAL => Some(DataType::Decimal128( type_ptr.get_precision() as u8, - type_ptr.get_scale() as u8, + type_ptr.get_scale() as i8, )), _ => None, }, diff --git a/datafusion/core/src/physical_plan/planner.rs b/datafusion/core/src/physical_plan/planner.rs index 19d3c3e0867c..e325b361b233 100644 --- a/datafusion/core/src/physical_plan/planner.rs +++ b/datafusion/core/src/physical_plan/planner.rs @@ -1997,7 +1997,7 @@ mod tests { nullable: false, \ dict_id: 0, \ dict_is_ordered: false, \ - metadata: None } }\ + metadata: {} } }\ ], metadata: {} }, \ ExecutionPlan schema: Schema { fields: [\ Field { \ @@ -2006,7 +2006,7 @@ mod tests { nullable: false, \ dict_id: 0, \ dict_is_ordered: false, \ - metadata: None }\ + metadata: {} }\ ], metadata: {} }"; match plan { Ok(_) => panic!("Expected planning failure"), @@ -2053,7 +2053,7 @@ mod tests { .build()?; let e = plan(&logical_plan).await.unwrap_err().to_string(); - assert_contains!(&e, "The data type inlist should be same, the value type is Boolean, one of list expr type is Struct([Field { name: \"foo\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None }])"); + assert_contains!(&e, "The data type inlist should be same, the value type is Boolean, one of list expr type is Struct([Field { name: \"foo\", data_type: Boolean, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }])"); Ok(()) } diff --git a/datafusion/core/src/physical_plan/projection.rs b/datafusion/core/src/physical_plan/projection.rs index 692880a83a00..3a2765e7f78d 100644 --- a/datafusion/core/src/physical_plan/projection.rs +++ b/datafusion/core/src/physical_plan/projection.rs @@ -21,7 +21,7 @@ //! projection expressions. `SELECT` without `FROM` will only evaluate expressions. use std::any::Any; -use std::collections::{BTreeMap, HashMap}; +use std::collections::HashMap; use std::pin::Pin; use std::sync::Arc; use std::task::{Context, Poll}; @@ -79,7 +79,9 @@ impl ProjectionExec { e.data_type(&input_schema)?, e.nullable(&input_schema)?, ); - field.set_metadata(get_field_metadata(e, &input_schema)); + field.set_metadata( + get_field_metadata(e, &input_schema).unwrap_or_default(), + ); Ok(field) }) @@ -268,7 +270,7 @@ impl ExecutionPlan for ProjectionExec { fn get_field_metadata( e: &Arc, input_schema: &Schema, -) -> Option> { +) -> Option> { let name = if let Some(column) = e.as_any().downcast_ref::() { column.name() } else { @@ -278,7 +280,7 @@ fn get_field_metadata( input_schema .field_with_name(name) .ok() - .and_then(|f| f.metadata().cloned()) + .map(|f| f.metadata().clone()) } fn stats_projection( @@ -384,7 +386,7 @@ mod tests { ProjectionExec::try_new(vec![(col("c1", &schema)?, "c1".to_string())], csv)?; let col_field = projection.schema.field(0); - let col_metadata = col_field.metadata().unwrap().clone(); + let col_metadata = col_field.metadata(); let data: &str = &col_metadata["testing"]; assert_eq!(data, "test"); diff --git a/datafusion/core/src/physical_plan/sorts/sort.rs b/datafusion/core/src/physical_plan/sorts/sort.rs index 0b3be090672d..b6c37d109859 100644 --- a/datafusion/core/src/physical_plan/sorts/sort.rs +++ b/datafusion/core/src/physical_plan/sorts/sort.rs @@ -954,7 +954,7 @@ mod tests { use arrow::datatypes::*; use datafusion_common::cast::{as_primitive_array, as_string_array}; use futures::FutureExt; - use std::collections::{BTreeMap, HashMap}; + use std::collections::HashMap; #[tokio::test] async fn test_in_mem_sort() -> Result<()> { @@ -1151,7 +1151,7 @@ mod tests { async fn test_sort_metadata() -> Result<()> { let session_ctx = SessionContext::new(); let task_ctx = session_ctx.task_ctx(); - let field_metadata: BTreeMap = + let field_metadata: HashMap = vec![("foo".to_string(), "bar".to_string())] .into_iter() .collect(); @@ -1161,7 +1161,7 @@ mod tests { .collect(); let mut field = Field::new("field_name", DataType::UInt64, true); - field.set_metadata(Some(field_metadata.clone())); + field.set_metadata(field_metadata.clone()); let schema = Schema::new_with_metadata(vec![field], schema_metadata.clone()); let schema = Arc::new(schema); @@ -1192,10 +1192,7 @@ mod tests { assert_eq!(&vec![expected_batch], &result); // explicitlty ensure the metadata is present - assert_eq!( - result[0].schema().fields()[0].metadata(), - Some(&field_metadata) - ); + assert_eq!(result[0].schema().fields()[0].metadata(), &field_metadata); assert_eq!(result[0].schema().metadata(), &schema_metadata); Ok(()) diff --git a/datafusion/core/src/test_util.rs b/datafusion/core/src/test_util.rs index 3f67915d492b..089cc5a23600 100644 --- a/datafusion/core/src/test_util.rs +++ b/datafusion/core/src/test_util.rs @@ -18,7 +18,7 @@ //! Utility functions to make testing DataFusion based crates easier use std::any::Any; -use std::collections::BTreeMap; +use std::collections::HashMap; use std::{env, error::Error, path::PathBuf, sync::Arc}; use crate::datasource::datasource::TableProviderFactory; @@ -229,9 +229,9 @@ pub fn scan_empty_with_partitions( /// Get the schema for the aggregate_test_* csv files pub fn aggr_test_schema() -> SchemaRef { let mut f1 = Field::new("c1", DataType::Utf8, false); - f1.set_metadata(Some(BTreeMap::from_iter( + f1.set_metadata(HashMap::from_iter( vec![("testing".into(), "test".into())].into_iter(), - ))); + )); let schema = Schema::new(vec![ f1, Field::new("c2", DataType::UInt32, false), @@ -254,9 +254,9 @@ pub fn aggr_test_schema() -> SchemaRef { /// Get the schema for the aggregate_test_* csv files with an additional filed not present in the files. pub fn aggr_test_schema_with_missing_col() -> SchemaRef { let mut f1 = Field::new("c1", DataType::Utf8, false); - f1.set_metadata(Some(BTreeMap::from_iter( + f1.set_metadata(HashMap::from_iter( vec![("testing".into(), "test".into())].into_iter(), - ))); + )); let schema = Schema::new(vec![ f1, Field::new("c2", DataType::UInt32, false), diff --git a/datafusion/core/tests/parquet/mod.rs b/datafusion/core/tests/parquet/mod.rs index 255df515c791..54397d5a0ad0 100644 --- a/datafusion/core/tests/parquet/mod.rs +++ b/datafusion/core/tests/parquet/mod.rs @@ -364,7 +364,7 @@ fn make_f64_batch(v: Vec) -> RecordBatch { /// /// Columns are named /// "decimal_col" -> DecimalArray -fn make_decimal_batch(v: Vec, precision: u8, scale: u8) -> RecordBatch { +fn make_decimal_batch(v: Vec, precision: u8, scale: i8) -> RecordBatch { let schema = Arc::new(Schema::new(vec![Field::new( "decimal_col", DataType::Decimal128(precision, scale), diff --git a/datafusion/core/tests/sql/expr.rs b/datafusion/core/tests/sql/expr.rs index f82be2a5b95a..301cebc11aa4 100644 --- a/datafusion/core/tests/sql/expr.rs +++ b/datafusion/core/tests/sql/expr.rs @@ -849,11 +849,11 @@ async fn test_interval_expressions() -> Result<()> { // day nano intervals test_expression!( "interval '1'", - "0 years 0 mons 0 days 0 hours 0 mins 1.00 secs" + "0 years 0 mons 0 days 0 hours 0 mins 1.000 secs" ); test_expression!( "interval '1 second'", - "0 years 0 mons 0 days 0 hours 0 mins 1.00 secs" + "0 years 0 mons 0 days 0 hours 0 mins 1.000 secs" ); test_expression!( "interval '500 milliseconds'", @@ -861,47 +861,47 @@ async fn test_interval_expressions() -> Result<()> { ); test_expression!( "interval '5 second'", - "0 years 0 mons 0 days 0 hours 0 mins 5.00 secs" + "0 years 0 mons 0 days 0 hours 0 mins 5.000 secs" ); test_expression!( "interval '0.5 minute'", - "0 years 0 mons 0 days 0 hours 0 mins 30.00 secs" + "0 years 0 mons 0 days 0 hours 0 mins 30.000 secs" ); test_expression!( "interval '.5 minute'", - "0 years 0 mons 0 days 0 hours 0 mins 30.00 secs" + "0 years 0 mons 0 days 0 hours 0 mins 30.000 secs" ); test_expression!( "interval '5 minute'", - "0 years 0 mons 0 days 0 hours 5 mins 0.00 secs" + "0 years 0 mons 0 days 0 hours 5 mins 0.000 secs" ); test_expression!( "interval '5 minute 1 second'", - "0 years 0 mons 0 days 0 hours 5 mins 1.00 secs" + "0 years 0 mons 0 days 0 hours 5 mins 1.000 secs" ); test_expression!( "interval '1 hour'", - "0 years 0 mons 0 days 1 hours 0 mins 0.00 secs" + "0 years 0 mons 0 days 1 hours 0 mins 0.000 secs" ); test_expression!( "interval '5 hour'", - "0 years 0 mons 0 days 5 hours 0 mins 0.00 secs" + "0 years 0 mons 0 days 5 hours 0 mins 0.000 secs" ); test_expression!( "interval '1 day'", - "0 years 0 mons 1 days 0 hours 0 mins 0.00 secs" + "0 years 0 mons 1 days 0 hours 0 mins 0.000 secs" ); test_expression!( "interval '1 week'", - "0 years 0 mons 7 days 0 hours 0 mins 0.00 secs" + "0 years 0 mons 7 days 0 hours 0 mins 0.000 secs" ); test_expression!( "interval '2 weeks'", - "0 years 0 mons 14 days 0 hours 0 mins 0.00 secs" + "0 years 0 mons 14 days 0 hours 0 mins 0.000 secs" ); test_expression!( "interval '1 day 1'", - "0 years 0 mons 1 days 0 hours 0 mins 1.00 secs" + "0 years 0 mons 1 days 0 hours 0 mins 1.000 secs" ); test_expression!( "interval '0.5'", @@ -909,11 +909,11 @@ async fn test_interval_expressions() -> Result<()> { ); test_expression!( "interval '0.5 day 1'", - "0 years 0 mons 0 days 12 hours 0 mins 1.00 secs" + "0 years 0 mons 0 days 12 hours 0 mins 1.000 secs" ); test_expression!( "interval '0.49 day'", - "0 years 0 mons 0 days 11 hours 45 mins 36.00 secs" + "0 years 0 mons 0 days 11 hours 45 mins 36.000 secs" ); test_expression!( "interval '0.499 day'", @@ -933,12 +933,12 @@ async fn test_interval_expressions() -> Result<()> { ); test_expression!( "interval '5 day'", - "0 years 0 mons 5 days 0 hours 0 mins 0.00 secs" + "0 years 0 mons 5 days 0 hours 0 mins 0.000 secs" ); // Hour is ignored, this matches PostgreSQL test_expression!( "interval '5 day' hour", - "0 years 0 mons 5 days 0 hours 0 mins 0.00 secs" + "0 years 0 mons 5 days 0 hours 0 mins 0.000 secs" ); test_expression!( "interval '5 day 4 hours 3 minutes 2 seconds 100 milliseconds'", @@ -947,11 +947,11 @@ async fn test_interval_expressions() -> Result<()> { // month intervals test_expression!( "interval '0.5 month'", - "0 years 0 mons 15 days 0 hours 0 mins 0.00 secs" + "0 years 0 mons 15 days 0 hours 0 mins 0.000 secs" ); test_expression!( "interval '0.5' month", - "0 years 0 mons 15 days 0 hours 0 mins 0.00 secs" + "0 years 0 mons 15 days 0 hours 0 mins 0.000 secs" ); test_expression!( "interval '1 month'", @@ -1000,19 +1000,19 @@ async fn test_interval_expressions() -> Result<()> { // complex test_expression!( "interval '1 year 1 day'", - "0 years 12 mons 1 days 0 hours 0 mins 0.00 secs" + "0 years 12 mons 1 days 0 hours 0 mins 0.000000000 secs" ); test_expression!( "interval '1 year 1 day 1 hour'", - "0 years 12 mons 1 days 1 hours 0 mins 0.00 secs" + "0 years 12 mons 1 days 1 hours 0 mins 0.000000000 secs" ); test_expression!( "interval '1 year 1 day 1 hour 1 minute'", - "0 years 12 mons 1 days 1 hours 1 mins 0.00 secs" + "0 years 12 mons 1 days 1 hours 1 mins 0.000000000 secs" ); test_expression!( "interval '1 year 1 day 1 hour 1 minute 1 second'", - "0 years 12 mons 1 days 1 hours 1 mins 1.00 secs" + "0 years 12 mons 1 days 1 hours 1 mins 1.000000000 secs" ); Ok(()) diff --git a/datafusion/core/tests/sql/parquet_schema.rs b/datafusion/core/tests/sql/parquet_schema.rs index 9c56ab1472c8..3e7bfcc2cc7b 100644 --- a/datafusion/core/tests/sql/parquet_schema.rs +++ b/datafusion/core/tests/sql/parquet_schema.rs @@ -16,11 +16,7 @@ // under the License. //! Tests for parquet schema handling -use std::{ - collections::{BTreeMap, HashMap}, - fs, - path::Path, -}; +use std::{collections::HashMap, fs, path::Path}; use ::parquet::arrow::ArrowWriter; use tempfile::TempDir; @@ -48,12 +44,12 @@ async fn schema_merge_ignores_metadata_by_default() { Schema::new(vec![f1.clone(), f2.clone()]), // field level metadata Schema::new(vec![ - f1.clone().with_metadata(make_b_meta("blarg", "bar")), + f1.clone().with_metadata(make_meta("blarg", "bar")), f2.clone(), ]), // incompatible field level metadata Schema::new(vec![ - f1.clone().with_metadata(make_b_meta("blarg", "baz")), + f1.clone().with_metadata(make_meta("blarg", "baz")), f2.clone(), ]), // schema with no meta @@ -166,17 +162,6 @@ fn make_meta(k: impl Into, v: impl Into) -> HashMap, - v: impl Into, -) -> Option> { - let mut meta = BTreeMap::new(); - meta.insert(k.into(), v.into()); - Some(meta) -} - /// Writes individual files with the specified schemas to temp_path) /// /// Assumes each schema has an int32 and a string column diff --git a/datafusion/expr/Cargo.toml b/datafusion/expr/Cargo.toml index 4527b2bd3f29..acf9e2a04495 100644 --- a/datafusion/expr/Cargo.toml +++ b/datafusion/expr/Cargo.toml @@ -36,7 +36,7 @@ path = "src/lib.rs" [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "27.0.0", default-features = false } +arrow = { version = "28.0.0", default-features = false } datafusion-common = { path = "../common", version = "14.0.0" } log = "^0.4" sqlparser = "0.27" diff --git a/datafusion/expr/src/type_coercion/binary.rs b/datafusion/expr/src/type_coercion/binary.rs index 70e09f2fbdf2..6ca0c4e0445a 100644 --- a/datafusion/expr/src/type_coercion/binary.rs +++ b/datafusion/expr/src/type_coercion/binary.rs @@ -287,8 +287,8 @@ fn get_wider_decimal_type( (DataType::Decimal128(p1, s1), DataType::Decimal128(p2, s2)) => { // max(s1, s2) + max(p1-s1, p2-s2), max(s1, s2) let s = *s1.max(s2); - let range = (p1 - s1).max(p2 - s2); - Some(create_decimal_type(range + s, s)) + let range = (*p1 as i8 - s1).max(*p2 as i8 - s2); + Some(create_decimal_type((range + s) as u8, s)) } (_, _) => None, } @@ -377,7 +377,7 @@ fn mathematics_numerical_coercion( } } -fn create_decimal_type(precision: u8, scale: u8) -> DataType { +fn create_decimal_type(precision: u8, scale: i8) -> DataType { DataType::Decimal128( DECIMAL128_MAX_PRECISION.min(precision), DECIMAL128_MAX_SCALE.min(scale), @@ -399,8 +399,9 @@ fn coercion_decimal_mathematics_type( // max(s1, s2) let result_scale = *s1.max(s2); // max(s1, s2) + max(p1-s1, p2-s2) + 1 - let result_precision = result_scale + (*p1 - *s1).max(*p2 - *s2) + 1; - Some(create_decimal_type(result_precision, result_scale)) + let result_precision = + result_scale + (*p1 as i8 - *s1).max(*p2 as i8 - *s2) + 1; + Some(create_decimal_type(result_precision as u8, result_scale)) } Operator::Multiply => { // s1 + s2 @@ -411,17 +412,18 @@ fn coercion_decimal_mathematics_type( } Operator::Divide => { // max(6, s1 + p2 + 1) - let result_scale = 6.max(*s1 + *p2 + 1); + let result_scale = 6.max(*s1 + *p2 as i8 + 1); // p1 - s1 + s2 + max(6, s1 + p2 + 1) - let result_precision = result_scale + *p1 - *s1 + *s2; - Some(create_decimal_type(result_precision, result_scale)) + let result_precision = result_scale + *p1 as i8 - *s1 + *s2; + Some(create_decimal_type(result_precision as u8, result_scale)) } Operator::Modulo => { // max(s1, s2) let result_scale = *s1.max(s2); // min(p1-s1, p2-s2) + max(s1, s2) - let result_precision = result_scale + (*p1 - *s1).min(*p2 - *s2); - Some(create_decimal_type(result_precision, result_scale)) + let result_precision = + result_scale + (*p1 as i8 - *s1).min(*p2 as i8 - *s2); + Some(create_decimal_type(result_precision as u8, result_scale)) } _ => None, } diff --git a/datafusion/jit/Cargo.toml b/datafusion/jit/Cargo.toml index e4bd5fc2e472..8a923e661301 100644 --- a/datafusion/jit/Cargo.toml +++ b/datafusion/jit/Cargo.toml @@ -36,7 +36,7 @@ path = "src/lib.rs" jit = [] [dependencies] -arrow = { version = "27.0.0", default-features = false } +arrow = { version = "28.0.0", default-features = false } cranelift = "0.89.0" cranelift-jit = "0.89.0" cranelift-module = "0.89.0" diff --git a/datafusion/optimizer/Cargo.toml b/datafusion/optimizer/Cargo.toml index 4ee13b9aebc3..c31abeb59d96 100644 --- a/datafusion/optimizer/Cargo.toml +++ b/datafusion/optimizer/Cargo.toml @@ -37,7 +37,7 @@ default = ["unicode_expressions"] unicode_expressions = [] [dependencies] -arrow = { version = "27.0.0", features = ["prettyprint"] } +arrow = { version = "28.0.0", features = ["prettyprint"] } async-trait = "0.1.41" chrono = { version = "0.4.23", default-features = false } datafusion-common = { path = "../common", version = "14.0.0" } diff --git a/datafusion/optimizer/src/optimizer.rs b/datafusion/optimizer/src/optimizer.rs index afa391d41696..5340f4f806b3 100644 --- a/datafusion/optimizer/src/optimizer.rs +++ b/datafusion/optimizer/src/optimizer.rs @@ -299,7 +299,6 @@ mod tests { use datafusion_common::{DFField, DFSchema, DFSchemaRef, DataFusionError}; use datafusion_expr::logical_plan::EmptyRelation; use datafusion_expr::{col, LogicalPlan, LogicalPlanBuilder, Projection}; - use std::collections::BTreeMap; use std::sync::Arc; #[test] @@ -345,9 +344,9 @@ mod tests { "Internal error: Optimizer rule 'get table_scan rule' failed, due to generate a different schema, \ original schema: DFSchema { fields: [], metadata: {} }, \ new schema: DFSchema { fields: [\ - DFField { qualifier: Some(\"test\"), field: Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None } }, \ - DFField { qualifier: Some(\"test\"), field: Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None } }, \ - DFField { qualifier: Some(\"test\"), field: Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: None } }], \ + DFField { qualifier: Some(\"test\"), field: Field { name: \"a\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ + DFField { qualifier: Some(\"test\"), field: Field { name: \"b\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }, \ + DFField { qualifier: Some(\"test\"), field: Field { name: \"c\", data_type: UInt32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} } }], \ metadata: {} }. \ This was likely caused by a bug in DataFusion's code \ and we would welcome that you file an bug report in our issue tracker", @@ -385,11 +384,11 @@ mod tests { .iter() .enumerate() .map(|(i, f)| { - let metadata: BTreeMap<_, _> = [("key".into(), format!("value {}", i))] + let metadata = [("key".into(), format!("value {}", i))] .into_iter() .collect(); - let new_arrow_field = f.field().clone().with_metadata(Some(metadata)); + let new_arrow_field = f.field().clone().with_metadata(metadata); if let Some(qualifier) = f.qualifier() { DFField::from_qualified(qualifier, new_arrow_field) } else { diff --git a/datafusion/optimizer/src/simplify_expressions/utils.rs b/datafusion/optimizer/src/simplify_expressions/utils.rs index c6c13c0334ae..94f6e86b37a9 100644 --- a/datafusion/optimizer/src/simplify_expressions/utils.rs +++ b/datafusion/optimizer/src/simplify_expressions/utils.rs @@ -17,8 +17,6 @@ //! Utility functions for expression simplification -use arrow::datatypes::DECIMAL128_MAX_PRECISION; - use datafusion_common::{DataFusionError, Result, ScalarValue}; use datafusion_expr::{ expr::{Between, BinaryExpr}, @@ -108,8 +106,12 @@ pub fn is_one(s: &Expr) -> bool { | Expr::Literal(ScalarValue::UInt64(Some(1))) => true, Expr::Literal(ScalarValue::Float32(Some(v))) if *v == 1. => true, Expr::Literal(ScalarValue::Float64(Some(v))) if *v == 1. => true, - Expr::Literal(ScalarValue::Decimal128(Some(v), _p, _s)) => { - *_s < DECIMAL128_MAX_PRECISION && POWS_OF_TEN[*_s as usize] == *v + Expr::Literal(ScalarValue::Decimal128(Some(v), _p, s)) => { + *s >= 0 + && POWS_OF_TEN + .get(*s as usize) + .map(|x| x == v) + .unwrap_or_default() } _ => false, } diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 5f542d7498b9..8e17c184eeab 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -752,7 +752,7 @@ mod tests { lit(ScalarValue::Int64(None)) } - fn lit_decimal(value: i128, precision: u8, scale: u8) -> Expr { + fn lit_decimal(value: i128, precision: u8, scale: i8) -> Expr { lit(ScalarValue::Decimal128(Some(value), precision, scale)) } @@ -765,7 +765,7 @@ mod tests { lit(ScalarValue::TimestampNanosecond(Some(ts), utc)) } - fn null_decimal(precision: u8, scale: u8) -> Expr { + fn null_decimal(precision: u8, scale: i8) -> Expr { lit(ScalarValue::Decimal128(None, precision, scale)) } diff --git a/datafusion/physical-expr/Cargo.toml b/datafusion/physical-expr/Cargo.toml index 21afaebe8449..0ba813110612 100644 --- a/datafusion/physical-expr/Cargo.toml +++ b/datafusion/physical-expr/Cargo.toml @@ -40,9 +40,9 @@ unicode_expressions = ["unicode-segmentation"] [dependencies] ahash = { version = "0.8", default-features = false, features = ["runtime-rng"] } -arrow = { version = "27.0.0", features = ["prettyprint", "dyn_cmp_dict"] } -arrow-buffer = "27.0.0" -arrow-schema = "27.0.0" +arrow = { version = "28.0.0", features = ["prettyprint", "dyn_cmp_dict"] } +arrow-buffer = "28.0.0" +arrow-schema = "28.0.0" blake2 = { version = "^0.10.2", optional = true } blake3 = { version = "1.0", optional = true } chrono = { version = "0.4.23", default-features = false } diff --git a/datafusion/physical-expr/src/aggregate/sum.rs b/datafusion/physical-expr/src/aggregate/sum.rs index 32259bc3061d..b330455a1855 100644 --- a/datafusion/physical-expr/src/aggregate/sum.rs +++ b/datafusion/physical-expr/src/aggregate/sum.rs @@ -161,7 +161,7 @@ macro_rules! typed_sum_delta_batch { // TODO implement this in arrow-rs with simd // https://github.com/apache/arrow-rs/issues/1010 -fn sum_decimal_batch(values: &ArrayRef, precision: u8, scale: u8) -> Result { +fn sum_decimal_batch(values: &ArrayRef, precision: u8, scale: i8) -> Result { let array = downcast_value!(values, Decimal128Array); if array.null_count() == array.len() { diff --git a/datafusion/physical-expr/src/expressions/binary.rs b/datafusion/physical-expr/src/expressions/binary.rs index dc8ee635f8cb..c7aa7f014254 100644 --- a/datafusion/physical-expr/src/expressions/binary.rs +++ b/datafusion/physical-expr/src/expressions/binary.rs @@ -2290,7 +2290,7 @@ mod tests { fn create_decimal_array( array: &[Option], precision: u8, - scale: u8, + scale: i8, ) -> Decimal128Array { let mut decimal_builder = Decimal128Builder::with_capacity(array.len()); for value in array.iter().copied() { diff --git a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs index 0edb7f162455..1d86d171f37e 100644 --- a/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs +++ b/datafusion/physical-expr/src/expressions/binary/kernels_arrow.rs @@ -377,7 +377,7 @@ mod tests { fn create_decimal_array( array: &[Option], precision: u8, - scale: u8, + scale: i8, ) -> Decimal128Array { let mut decimal_builder = Decimal128Builder::with_capacity(array.len()); diff --git a/datafusion/physical-expr/src/expressions/try_cast.rs b/datafusion/physical-expr/src/expressions/try_cast.rs index b36243b91777..9fa6fc0d9b3d 100644 --- a/datafusion/physical-expr/src/expressions/try_cast.rs +++ b/datafusion/physical-expr/src/expressions/try_cast.rs @@ -547,7 +547,7 @@ mod tests { } // create decimal array with the specified precision and scale - fn create_decimal_array(array: &[i128], precision: u8, scale: u8) -> Decimal128Array { + fn create_decimal_array(array: &[i128], precision: u8, scale: i8) -> Decimal128Array { let mut decimal_builder = Decimal128Builder::with_capacity(array.len()); for value in array { decimal_builder.append_value(*value); diff --git a/datafusion/proto/Cargo.toml b/datafusion/proto/Cargo.toml index 95c7bf0f6bdd..e3ede179ed6b 100644 --- a/datafusion/proto/Cargo.toml +++ b/datafusion/proto/Cargo.toml @@ -40,7 +40,7 @@ default = [] json = ["pbjson", "serde", "serde_json"] [dependencies] -arrow = "27.0.0" +arrow = "28.0.0" datafusion = { path = "../core", version = "14.0.0" } datafusion-common = { path = "../common", version = "14.0.0" } datafusion-expr = { path = "../expr", version = "14.0.0" } diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index b1ba0ec2d95e..8b2c21f413bf 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -705,8 +705,9 @@ enum IntervalUnit{ } message Decimal{ - uint64 whole = 1; - uint64 fractional = 2; + reserved 1, 2; + uint32 precision = 3; + int32 scale = 4; } message List{ diff --git a/datafusion/proto/src/from_proto.rs b/datafusion/proto/src/from_proto.rs index 1de84ad94aa7..a30e7b323aba 100644 --- a/datafusion/proto/src/from_proto.rs +++ b/datafusion/proto/src/from_proto.rs @@ -265,9 +265,9 @@ impl TryFrom<&protobuf::arrow_type::ArrowTypeEnum> for DataType { protobuf::IntervalUnit::try_from(interval_unit)?.into(), ), arrow_type::ArrowTypeEnum::Decimal(protobuf::Decimal { - whole, - fractional, - }) => DataType::Decimal128(*whole as u8, *fractional as u8), + precision, + scale, + }) => DataType::Decimal128(*precision as u8, *scale as i8), arrow_type::ArrowTypeEnum::List(list) => { let list_type = list.as_ref().field_type.as_deref().required("field_type")?; @@ -579,7 +579,7 @@ impl TryFrom<&protobuf::ScalarValue> for ScalarValue { Self::Decimal128( Some(i128::from_be_bytes(array)), val.p as u8, - val.s as u8, + val.s as i8, ) } Value::Date64Value(v) => Self::Date64(Some(*v)), diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index ee0142011b9a..3752c64d2d99 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -3316,18 +3316,18 @@ impl serde::Serialize for Decimal { { use serde::ser::SerializeStruct; let mut len = 0; - if self.whole != 0 { + if self.precision != 0 { len += 1; } - if self.fractional != 0 { + if self.scale != 0 { len += 1; } let mut struct_ser = serializer.serialize_struct("datafusion.Decimal", len)?; - if self.whole != 0 { - struct_ser.serialize_field("whole", ToString::to_string(&self.whole).as_str())?; + if self.precision != 0 { + struct_ser.serialize_field("precision", &self.precision)?; } - if self.fractional != 0 { - struct_ser.serialize_field("fractional", ToString::to_string(&self.fractional).as_str())?; + if self.scale != 0 { + struct_ser.serialize_field("scale", &self.scale)?; } struct_ser.end() } @@ -3339,14 +3339,14 @@ impl<'de> serde::Deserialize<'de> for Decimal { D: serde::Deserializer<'de>, { const FIELDS: &[&str] = &[ - "whole", - "fractional", + "precision", + "scale", ]; #[allow(clippy::enum_variant_names)] enum GeneratedField { - Whole, - Fractional, + Precision, + Scale, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -3368,8 +3368,8 @@ impl<'de> serde::Deserialize<'de> for Decimal { E: serde::de::Error, { match value { - "whole" => Ok(GeneratedField::Whole), - "fractional" => Ok(GeneratedField::Fractional), + "precision" => Ok(GeneratedField::Precision), + "scale" => Ok(GeneratedField::Scale), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -3389,31 +3389,31 @@ impl<'de> serde::Deserialize<'de> for Decimal { where V: serde::de::MapAccess<'de>, { - let mut whole__ = None; - let mut fractional__ = None; + let mut precision__ = None; + let mut scale__ = None; while let Some(k) = map.next_key()? { match k { - GeneratedField::Whole => { - if whole__.is_some() { - return Err(serde::de::Error::duplicate_field("whole")); + GeneratedField::Precision => { + if precision__.is_some() { + return Err(serde::de::Error::duplicate_field("precision")); } - whole__ = + precision__ = Some(map.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) ; } - GeneratedField::Fractional => { - if fractional__.is_some() { - return Err(serde::de::Error::duplicate_field("fractional")); + GeneratedField::Scale => { + if scale__.is_some() { + return Err(serde::de::Error::duplicate_field("scale")); } - fractional__ = + scale__ = Some(map.next_value::<::pbjson::private::NumberDeserialize<_>>()?.0) ; } } } Ok(Decimal { - whole: whole__.unwrap_or_default(), - fractional: fractional__.unwrap_or_default(), + precision: precision__.unwrap_or_default(), + scale: scale__.unwrap_or_default(), }) } } diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index 25297787a166..dfce7f4b67c3 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -801,10 +801,10 @@ pub struct Timestamp { } #[derive(Clone, PartialEq, ::prost::Message)] pub struct Decimal { - #[prost(uint64, tag = "1")] - pub whole: u64, - #[prost(uint64, tag = "2")] - pub fractional: u64, + #[prost(uint32, tag = "3")] + pub precision: u32, + #[prost(int32, tag = "4")] + pub scale: i32, } #[derive(Clone, PartialEq, ::prost::Message)] pub struct List { diff --git a/datafusion/proto/src/lib.rs b/datafusion/proto/src/lib.rs index da61188f0a9f..e4b1c65d26f2 100644 --- a/datafusion/proto/src/lib.rs +++ b/datafusion/proto/src/lib.rs @@ -812,7 +812,7 @@ mod roundtrip_tests { DataType::LargeBinary, DataType::Utf8, DataType::LargeUtf8, - DataType::Decimal128(123, 234), + DataType::Decimal128(7, 12), // Recursive list tests DataType::List(new_box_field("Level1", DataType::Binary, true)), DataType::List(new_box_field( diff --git a/datafusion/proto/src/to_proto.rs b/datafusion/proto/src/to_proto.rs index 6e152cc0c3b8..a73565ae6bae 100644 --- a/datafusion/proto/src/to_proto.rs +++ b/datafusion/proto/src/to_proto.rs @@ -209,9 +209,9 @@ impl TryFrom<&DataType> for protobuf::arrow_type::ArrowTypeEnum { value: Some(Box::new(value_type.as_ref().try_into()?)), })) } - DataType::Decimal128(whole, fractional) => Self::Decimal(protobuf::Decimal { - whole: *whole as u64, - fractional: *fractional as u64, + DataType::Decimal128(precision, scale) => Self::Decimal(protobuf::Decimal { + precision: *precision as u32, + scale: *scale as i32, }), DataType::Decimal256(_, _) => { return Err(Error::General("Proto serialization error: The Decimal256 data type is not yet supported".to_owned())) diff --git a/datafusion/row/Cargo.toml b/datafusion/row/Cargo.toml index a6934cb6f5d3..2ca850aaf12e 100644 --- a/datafusion/row/Cargo.toml +++ b/datafusion/row/Cargo.toml @@ -37,7 +37,7 @@ path = "src/lib.rs" jit = ["datafusion-jit"] [dependencies] -arrow = "27.0.0" +arrow = "28.0.0" datafusion-common = { path = "../common", version = "14.0.0" } datafusion-jit = { path = "../jit", version = "14.0.0", optional = true } paste = "^1.0" diff --git a/datafusion/sql/Cargo.toml b/datafusion/sql/Cargo.toml index ef421702168c..294ee53a6d53 100644 --- a/datafusion/sql/Cargo.toml +++ b/datafusion/sql/Cargo.toml @@ -37,7 +37,7 @@ default = ["unicode_expressions"] unicode_expressions = [] [dependencies] -arrow = { version = "27.0.0", default-features = false } +arrow = { version = "28.0.0", default-features = false } datafusion-common = { path = "../common", version = "14.0.0" } datafusion-expr = { path = "../expr", version = "14.0.0" } sqlparser = "0.27" diff --git a/datafusion/sql/src/planner.rs b/datafusion/sql/src/planner.rs index 36b45390e47a..d52dfa08aa97 100644 --- a/datafusion/sql/src/planner.rs +++ b/datafusion/sql/src/planner.rs @@ -2754,7 +2754,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Ok(Expr::Literal(ScalarValue::Decimal128( Some(n), p as u8, - s as u8, + s as i8, ))) } else { let number = n.parse::().map_err(|_| { @@ -3213,7 +3213,7 @@ mod tests { let sql = "SELECT CAST(10 AS DECIMAL(0))"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - r##"Internal("Decimal(precision = 0, scale = 0) should satisty `0 < precision <= 38`, and `scale <= precision`.")"##, + r##"Internal("Decimal(precision = 0, scale = 0) should satisfy `0 < precision <= 38`, and `scale <= precision`.")"##, format!("{:?}", err) ); } @@ -3222,7 +3222,7 @@ mod tests { let sql = "SELECT CAST(10 AS DECIMAL(39))"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - r##"Internal("Decimal(precision = 39, scale = 0) should satisty `0 < precision <= 38`, and `scale <= precision`.")"##, + r##"Internal("Decimal(precision = 39, scale = 0) should satisfy `0 < precision <= 38`, and `scale <= precision`.")"##, format!("{:?}", err) ); } @@ -3231,7 +3231,7 @@ mod tests { let sql = "SELECT CAST(10 AS DECIMAL(5, 10))"; let err = logical_plan(sql).expect_err("query should have failed"); assert_eq!( - r##"Internal("Decimal(precision = 5, scale = 10) should satisty `0 < precision <= 38`, and `scale <= precision`.")"##, + r##"Internal("Decimal(precision = 5, scale = 10) should satisfy `0 < precision <= 38`, and `scale <= precision`.")"##, format!("{:?}", err) ); } diff --git a/datafusion/sql/src/utils.rs b/datafusion/sql/src/utils.rs index 3b6eaba2a806..f12578fbb75f 100644 --- a/datafusion/sql/src/utils.rs +++ b/datafusion/sql/src/utils.rs @@ -511,7 +511,7 @@ pub(crate) fn make_decimal_type( ) -> Result { // postgres like behavior let (precision, scale) = match (precision, scale) { - (Some(p), Some(s)) => (p as u8, s as u8), + (Some(p), Some(s)) => (p as u8, s as i8), (Some(p), None) => (p as u8, 0), (None, Some(_)) => { return Err(DataFusionError::Internal( @@ -522,9 +522,12 @@ pub(crate) fn make_decimal_type( }; // Arrow decimal is i128 meaning 38 maximum decimal digits - if precision == 0 || precision > DECIMAL128_MAX_PRECISION || scale > precision { + if precision == 0 + || precision > DECIMAL128_MAX_PRECISION + || scale.unsigned_abs() > precision + { Err(DataFusionError::Internal(format!( - "Decimal(precision = {}, scale = {}) should satisty `0 < precision <= 38`, and `scale <= precision`.", + "Decimal(precision = {}, scale = {}) should satisfy `0 < precision <= 38`, and `scale <= precision`.", precision, scale ))) } else { diff --git a/parquet-test-utils/Cargo.toml b/parquet-test-utils/Cargo.toml index 48c7678adbcf..b6e9dc045c2b 100644 --- a/parquet-test-utils/Cargo.toml +++ b/parquet-test-utils/Cargo.toml @@ -25,4 +25,4 @@ edition = "2021" [dependencies] datafusion = { path = "../datafusion/core" } object_store = "0.5.0" -parquet = "27.0.0" +parquet = "28.0.0" diff --git a/test-utils/Cargo.toml b/test-utils/Cargo.toml index ea0a18c5edb2..2024716bf8c5 100644 --- a/test-utils/Cargo.toml +++ b/test-utils/Cargo.toml @@ -23,7 +23,7 @@ edition = "2021" # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html [dependencies] -arrow = { version = "27.0.0", features = ["prettyprint"] } +arrow = { version = "28.0.0", features = ["prettyprint"] } datafusion-common = { path = "../datafusion/common", version = "14.0.0" } env_logger = "0.10.0" rand = "0.8"