From 877fcb6c3629e24055369e374def81a05585206f Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Aug 2023 16:23:11 +0200 Subject: [PATCH 01/13] add test for valid metadata v2 --- crates/iceberg/src/spec/snapshot.rs | 2 +- crates/iceberg/src/spec/table_metadata.rs | 150 +++++++++++++++++- .../table_metadata/TableMetadataV2Valid.json | 122 ++++++++++++++ 3 files changed, 269 insertions(+), 5 deletions(-) create mode 100644 crates/iceberg/testdata/table_metadata/TableMetadataV2Valid.json diff --git a/crates/iceberg/src/spec/snapshot.rs b/crates/iceberg/src/spec/snapshot.rs index 9a802883b..f38a605cf 100644 --- a/crates/iceberg/src/spec/snapshot.rs +++ b/crates/iceberg/src/spec/snapshot.rs @@ -77,7 +77,7 @@ pub struct Snapshot { /// A string map that summarizes the snapshot changes, including operation. summary: Summary, /// ID of the table’s current schema when the snapshot was created. - #[builder(setter(strip_option))] + #[builder(setter(strip_option), default = "None")] schema_id: Option, } diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index ebf7cca16..441536b01 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -653,7 +653,7 @@ pub struct SnapshotLog { #[cfg(test)] mod tests { - use std::{collections::HashMap, sync::Arc}; + use std::{collections::HashMap, fs, sync::Arc}; use anyhow::Result; use uuid::Uuid; @@ -661,9 +661,9 @@ mod tests { use pretty_assertions::assert_eq; use crate::spec::{ - table_metadata::TableMetadata, ManifestList, NestedField, Operation, PartitionField, - PartitionSpec, PrimitiveType, Schema, Snapshot, SnapshotReference, SnapshotRetention, - SortOrder, Summary, Transform, Type, + table_metadata::TableMetadata, ManifestList, NestedField, NullOrder, Operation, + PartitionField, PartitionSpec, PrimitiveType, Schema, Snapshot, SnapshotReference, + SnapshotRetention, SortDirection, SortField, SortOrder, Summary, Transform, Type, }; use super::{FormatVersion, MetadataLog, SnapshotLog}; @@ -972,4 +972,146 @@ mod tests { assert!(serde_json::from_str::(data).is_err()); Ok(()) } + + #[test] + fn test_table_metadata_v2_file_valid() { + let metadata = + fs::read_to_string("testdata/table_metadata/TableMetadataV2Valid.json").unwrap(); + + let schema1 = Schema::builder() + .with_schema_id(0) + .with_fields(vec![Arc::new(NestedField::required( + 1, + "x", + Type::Primitive(PrimitiveType::Long), + ))]) + .build() + .unwrap(); + + let schema2 = Schema::builder() + .with_schema_id(1) + .with_fields(vec![ + Arc::new(NestedField::required( + 1, + "x", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new( + NestedField::required(2, "y", Type::Primitive(PrimitiveType::Long)) + .with_doc("comment"), + ), + Arc::new(NestedField::required( + 3, + "z", + Type::Primitive(PrimitiveType::Long), + )), + ]) + .with_identifier_field_ids(vec![1, 2]) + .build() + .unwrap(); + + let partition_spec = PartitionSpec::builder() + .with_spec_id(0) + .with_partition_field(PartitionField { + name: "x".to_string(), + transform: Transform::Identity, + source_id: 1, + field_id: 1000, + }) + .build() + .unwrap(); + + let sort_order = SortOrder::builder() + .with_order_id(3) + .with_sort_field(SortField { + source_id: 2, + transform: Transform::Identity, + direction: SortDirection::Ascending, + null_order: NullOrder::First, + }) + .with_sort_field(SortField { + source_id: 3, + transform: Transform::Bucket(4), + direction: SortDirection::Descending, + null_order: NullOrder::Last, + }) + .build() + .unwrap(); + + let snapshot1 = Snapshot::builder() + .with_snapshot_id(3051729675574597004) + .with_timestamp_ms(1515100955770) + .with_sequence_number(0) + .with_manifest_list(ManifestList::ManifestListFile( + "s3://a/b/1.avro".to_string(), + )) + .with_summary(Summary { + operation: Operation::Append, + other: HashMap::new(), + }) + .build() + .unwrap(); + + let snapshot2 = Snapshot::builder() + .with_snapshot_id(3055729675574597004) + .with_parent_snapshot_id(Some(3051729675574597004)) + .with_timestamp_ms(1555100955770) + .with_sequence_number(1) + .with_schema_id(1) + .with_manifest_list(ManifestList::ManifestListFile( + "s3://a/b/2.avro".to_string(), + )) + .with_summary(Summary { + operation: Operation::Append, + other: HashMap::new(), + }) + .build() + .unwrap(); + + let expected = TableMetadata { + format_version: FormatVersion::V2, + table_uuid: Uuid::parse_str("9c12d441-03fe-4693-9a96-a0705ddf69c1").unwrap(), + location: "s3://bucket/test/location".to_string(), + last_updated_ms: 1602638573590, + last_column_id: 3, + schemas: HashMap::from_iter(vec![(0, Arc::new(schema1)), (1, Arc::new(schema2))]), + current_schema_id: 1, + partition_specs: HashMap::from_iter(vec![(0, partition_spec)]), + default_spec_id: 0, + last_partition_id: 1000, + default_sort_order_id: 3, + sort_orders: HashMap::from_iter(vec![(3, sort_order)]), + snapshots: Some(HashMap::from_iter(vec![ + (3051729675574597004, Arc::new(snapshot1)), + (3055729675574597004, Arc::new(snapshot2)), + ])), + current_snapshot_id: Some(3055729675574597004), + last_sequence_number: 34, + properties: HashMap::new(), + snapshot_log: vec![ + SnapshotLog { + snapshot_id: 3051729675574597004, + timestamp_ms: 1515100955770, + }, + SnapshotLog { + snapshot_id: 3055729675574597004, + timestamp_ms: 1555100955770, + }, + ], + metadata_log: Vec::new(), + refs: HashMap::from_iter(vec![( + "main".to_string(), + SnapshotReference { + snapshot_id: 3055729675574597004, + retention: SnapshotRetention::Branch { + min_snapshots_to_keep: None, + max_snapshot_age_ms: None, + max_ref_age_ms: None, + }, + }, + )]), + }; + + check_table_metadata_serde(&metadata, expected); + } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataV2Valid.json b/crates/iceberg/testdata/table_metadata/TableMetadataV2Valid.json new file mode 100644 index 000000000..0dc89de58 --- /dev/null +++ b/crates/iceberg/testdata/table_metadata/TableMetadataV2Valid.json @@ -0,0 +1,122 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 1, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + }, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [ + 1, + 2 + ], + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/1.avro" + }, + { + "snapshot-id": 3055729675574597004, + "parent-snapshot-id": 3051729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 1 + } + ], + "snapshot-log": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770 + }, + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770 + } + ], + "metadata-log": [] +} \ No newline at end of file From cd6e9d01d2dbb69c2d5d0c91544861373e6c49b5 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Aug 2023 16:31:16 +0200 Subject: [PATCH 02/13] add test for minimal valid table metadata v2 --- crates/iceberg/src/spec/table_metadata.rs | 79 +++++++++++++++++++ .../TableMetadataV2ValidMinimal.json | 71 +++++++++++++++++ 2 files changed, 150 insertions(+) create mode 100644 crates/iceberg/testdata/table_metadata/TableMetadataV2ValidMinimal.json diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 441536b01..743079485 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1114,4 +1114,83 @@ mod tests { check_table_metadata_serde(&metadata, expected); } + + #[test] + fn test_table_metadata_v2_file_valid_minimal() { + let metadata = + fs::read_to_string("testdata/table_metadata/TableMetadataV2ValidMinimal.json").unwrap(); + + let schema = Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + Arc::new(NestedField::required( + 1, + "x", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new( + NestedField::required(2, "y", Type::Primitive(PrimitiveType::Long)) + .with_doc("comment"), + ), + Arc::new(NestedField::required( + 3, + "z", + Type::Primitive(PrimitiveType::Long), + )), + ]) + .build() + .unwrap(); + + let partition_spec = PartitionSpec::builder() + .with_spec_id(0) + .with_partition_field(PartitionField { + name: "x".to_string(), + transform: Transform::Identity, + source_id: 1, + field_id: 1000, + }) + .build() + .unwrap(); + + let sort_order = SortOrder::builder() + .with_order_id(3) + .with_sort_field(SortField { + source_id: 2, + transform: Transform::Identity, + direction: SortDirection::Ascending, + null_order: NullOrder::First, + }) + .with_sort_field(SortField { + source_id: 3, + transform: Transform::Bucket(4), + direction: SortDirection::Descending, + null_order: NullOrder::Last, + }) + .build() + .unwrap(); + + let expected = TableMetadata { + format_version: FormatVersion::V2, + table_uuid: Uuid::parse_str("9c12d441-03fe-4693-9a96-a0705ddf69c1").unwrap(), + location: "s3://bucket/test/location".to_string(), + last_updated_ms: 1602638573590, + last_column_id: 3, + schemas: HashMap::from_iter(vec![(0, Arc::new(schema))]), + current_schema_id: 0, + partition_specs: HashMap::from_iter(vec![(0, partition_spec)]), + default_spec_id: 0, + last_partition_id: 1000, + default_sort_order_id: 3, + sort_orders: HashMap::from_iter(vec![(3, sort_order)]), + snapshots: None, + current_snapshot_id: None, + last_sequence_number: 34, + properties: HashMap::new(), + snapshot_log: vec![], + metadata_log: Vec::new(), + refs: HashMap::new(), + }; + + check_table_metadata_serde(&metadata, expected); + } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataV2ValidMinimal.json b/crates/iceberg/testdata/table_metadata/TableMetadataV2ValidMinimal.json new file mode 100644 index 000000000..529b10d1f --- /dev/null +++ b/crates/iceberg/testdata/table_metadata/TableMetadataV2ValidMinimal.json @@ -0,0 +1,71 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ] +} \ No newline at end of file From 93ec8b357ac2c3e12fdc1c8df03b8daec29a9285 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Aug 2023 16:47:38 +0200 Subject: [PATCH 03/13] test valied v1 table metadata --- crates/iceberg/src/spec/table_metadata.rs | 95 +++++++++++++++++-- .../table_metadata/TableMetadataV1Valid.json | 42 ++++++++ 2 files changed, 128 insertions(+), 9 deletions(-) create mode 100644 crates/iceberg/testdata/table_metadata/TableMetadataV1Valid.json diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 743079485..98b5a51e7 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -39,6 +39,7 @@ use _serde::TableMetadataEnum; static MAIN_BRANCH: &str = "main"; static DEFAULT_SPEC_ID: i32 = 0; +static DEFAULT_SORT_ORDER_ID: i64 = 0; #[derive(Debug, PartialEq, Serialize, Deserialize, Eq, Clone)] #[serde(try_from = "TableMetadataEnum", into = "TableMetadataEnum")] @@ -215,7 +216,8 @@ pub(super) mod _serde { }; use super::{ - FormatVersion, MetadataLog, SnapshotLog, TableMetadata, DEFAULT_SPEC_ID, MAIN_BRANCH, + FormatVersion, MetadataLog, SnapshotLog, TableMetadata, DEFAULT_SORT_ORDER_ID, + DEFAULT_SPEC_ID, MAIN_BRANCH, }; #[derive(Debug, Serialize, Deserialize, PartialEq, Eq)] @@ -288,8 +290,8 @@ pub(super) mod _serde { pub snapshot_log: Option>, #[serde(skip_serializing_if = "Option::is_none")] pub metadata_log: Option>, - pub sort_orders: Vec, - pub default_sort_order_id: i64, + pub sort_orders: Option>, + pub default_sort_order_id: Option, } /// Helper to serialize and deserialize the format version. @@ -479,10 +481,13 @@ pub(super) mod _serde { .transpose()?, snapshot_log: value.snapshot_log.unwrap_or_default(), metadata_log: value.metadata_log.unwrap_or_default(), - sort_orders: HashMap::from_iter( - value.sort_orders.into_iter().map(|x| (x.order_id, x)), - ), - default_sort_order_id: value.default_sort_order_id, + sort_orders: match value.sort_orders { + Some(sort_orders) => { + HashMap::from_iter(sort_orders.into_iter().map(|x| (x.order_id, x))) + } + None => HashMap::new(), + }, + default_sort_order_id: value.default_sort_order_id.unwrap_or(DEFAULT_SORT_ORDER_ID), refs: HashMap::from_iter(vec![( MAIN_BRANCH.to_string(), SnapshotReference { @@ -613,8 +618,8 @@ pub(super) mod _serde { } else { Some(v.metadata_log) }, - sort_orders: v.sort_orders.into_values().collect(), - default_sort_order_id: v.default_sort_order_id, + sort_orders: Some(v.sort_orders.into_values().collect()), + default_sort_order_id: Some(v.default_sort_order_id), } } } @@ -1193,4 +1198,76 @@ mod tests { check_table_metadata_serde(&metadata, expected); } + + #[test] + fn test_table_metadata_v1_file_valid() { + let metadata = + fs::read_to_string("testdata/table_metadata/TableMetadataV1Valid.json").unwrap(); + + let schema = Schema::builder() + .with_schema_id(0) + .with_fields(vec![ + Arc::new(NestedField::required( + 1, + "x", + Type::Primitive(PrimitiveType::Long), + )), + Arc::new( + NestedField::required(2, "y", Type::Primitive(PrimitiveType::Long)) + .with_doc("comment"), + ), + Arc::new(NestedField::required( + 3, + "z", + Type::Primitive(PrimitiveType::Long), + )), + ]) + .build() + .unwrap(); + + let partition_spec = PartitionSpec::builder() + .with_spec_id(0) + .with_partition_field(PartitionField { + name: "x".to_string(), + transform: Transform::Identity, + source_id: 1, + field_id: 1000, + }) + .build() + .unwrap(); + + let expected = TableMetadata { + format_version: FormatVersion::V1, + table_uuid: Uuid::parse_str("d20125c8-7284-442c-9aea-15fee620737c").unwrap(), + location: "s3://bucket/test/location".to_string(), + last_updated_ms: 1602638573874, + last_column_id: 3, + schemas: HashMap::from_iter(vec![(0, Arc::new(schema))]), + current_schema_id: 0, + partition_specs: HashMap::from_iter(vec![(0, partition_spec)]), + default_spec_id: 0, + last_partition_id: 0, + default_sort_order_id: 0, + sort_orders: HashMap::new(), + snapshots: Some(HashMap::new()), + current_snapshot_id: None, + last_sequence_number: 0, + properties: HashMap::new(), + snapshot_log: vec![], + metadata_log: Vec::new(), + refs: HashMap::from_iter(vec![( + "main".to_string(), + SnapshotReference { + snapshot_id: -1, + retention: SnapshotRetention::Branch { + min_snapshots_to_keep: None, + max_snapshot_age_ms: None, + max_ref_age_ms: None, + }, + }, + )]), + }; + + check_table_metadata_serde(&metadata, expected); + } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataV1Valid.json b/crates/iceberg/testdata/table_metadata/TableMetadataV1Valid.json new file mode 100644 index 000000000..0b55d51ba --- /dev/null +++ b/crates/iceberg/testdata/table_metadata/TableMetadataV1Valid.json @@ -0,0 +1,42 @@ +{ + "format-version": 1, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": "s3://bucket/test/location", + "last-updated-ms": 1602638573874, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "partition-spec": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [] +} \ No newline at end of file From b563e27601dbe8a922ec97d17f6bfc0ebcfda863 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Tue, 29 Aug 2023 17:09:53 +0200 Subject: [PATCH 04/13] Test for invalid schema id --- crates/iceberg/src/spec/table_metadata.rs | 40 ++++-- .../TableMetadataV2CurrentSchemaNotFound.json | 122 ++++++++++++++++++ 2 files changed, 153 insertions(+), 9 deletions(-) create mode 100644 crates/iceberg/testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 98b5a51e7..8fd25fdc2 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -203,6 +203,7 @@ pub(super) mod _serde { /// [TableMetadataV1] and [TableMetadataV2] are internal struct that are only used for serialization and deserialization. use std::{collections::HashMap, sync::Arc}; + use itertools::Itertools; use serde::{Deserialize, Serialize}; use uuid::Uuid; @@ -212,7 +213,7 @@ pub(super) mod _serde { snapshot::_serde::{SnapshotV1, SnapshotV2}, PartitionField, PartitionSpec, Schema, SnapshotReference, SnapshotRetention, SortOrder, }, - Error, + Error, ErrorKind, }; use super::{ @@ -348,6 +349,14 @@ pub(super) mod _serde { } else { value.current_snapshot_id }; + let schemas = HashMap::from_iter( + value + .schemas + .into_iter() + .map(|schema| Ok((schema.schema_id, Arc::new(schema.try_into()?)))) + .collect::, Error>>()? + .into_iter(), + ); Ok(TableMetadata { format_version: FormatVersion::V2, table_uuid: value.table_uuid, @@ -355,14 +364,15 @@ pub(super) mod _serde { last_sequence_number: value.last_sequence_number, last_updated_ms: value.last_updated_ms, last_column_id: value.last_column_id, - schemas: HashMap::from_iter( - value - .schemas - .into_iter() - .map(|schema| Ok((schema.schema_id, Arc::new(schema.try_into()?)))) - .collect::, Error>>()?, - ), - current_schema_id: value.current_schema_id, + current_schema_id: if schemas.keys().contains(&value.current_schema_id) { + Ok(value.current_schema_id) + } else { + Err(self::Error::new( + ErrorKind::DataInvalid, + "No schema exists with the current schema id.", + )) + }?, + schemas, partition_specs: HashMap::from_iter( value.partition_specs.into_iter().map(|x| (x.spec_id, x)), ), @@ -1270,4 +1280,16 @@ mod tests { check_table_metadata_serde(&metadata, expected); } + + #[test] + #[should_panic] + fn test_table_metadata_v2_schema_not_found() { + let metadata = + fs::read_to_string("testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json") + .unwrap(); + + let desered: Result = serde_json::from_str(&metadata); + + desered.unwrap(); + } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json b/crates/iceberg/testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json new file mode 100644 index 000000000..d176195f0 --- /dev/null +++ b/crates/iceberg/testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json @@ -0,0 +1,122 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 2, + "schemas": [ + { + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + } + ] + }, + { + "type": "struct", + "schema-id": 1, + "identifier-field-ids": [ + 1, + 2 + ], + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + } + ], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": 3055729675574597004, + "snapshots": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770, + "sequence-number": 0, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/1.avro" + }, + { + "snapshot-id": 3055729675574597004, + "parent-snapshot-id": 3051729675574597004, + "timestamp-ms": 1555100955770, + "sequence-number": 1, + "summary": { + "operation": "append" + }, + "manifest-list": "s3://a/b/2.avro", + "schema-id": 1 + } + ], + "snapshot-log": [ + { + "snapshot-id": 3051729675574597004, + "timestamp-ms": 1515100955770 + }, + { + "snapshot-id": 3055729675574597004, + "timestamp-ms": 1555100955770 + } + ], + "metadata-log": [] +} \ No newline at end of file From 08f11edc2d6e6f88b16f1a0c2e8a0f178853980c Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 1 Sep 2023 12:26:41 +0200 Subject: [PATCH 05/13] improve failing test --- crates/iceberg/src/spec/table_metadata.rs | 8 +++- .../TableMetadataV2CurrentSchemaNotFound.json | 40 ++----------------- 2 files changed, 9 insertions(+), 39 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 8fd25fdc2..be39f86c2 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -681,6 +681,8 @@ mod tests { SnapshotRetention, SortDirection, SortField, SortOrder, Summary, Transform, Type, }; + use crate::error::{Error, ErrorKind}; + use super::{FormatVersion, MetadataLog, SnapshotLog}; fn check_table_metadata_serde(json: &str, expected_type: TableMetadata) { @@ -1282,7 +1284,6 @@ mod tests { } #[test] - #[should_panic] fn test_table_metadata_v2_schema_not_found() { let metadata = fs::read_to_string("testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json") @@ -1290,6 +1291,9 @@ mod tests { let desered: Result = serde_json::from_str(&metadata); - desered.unwrap(); + assert_eq!( + desered.unwrap_err().to_string(), + "DataInvalid => No schema exists with the current schema id." + ) } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json b/crates/iceberg/testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json index d176195f0..d010785b3 100644 --- a/crates/iceberg/testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json +++ b/crates/iceberg/testdata/table_metadata/TableMetadataV2CurrentSchemaNotFound.json @@ -22,10 +22,6 @@ { "type": "struct", "schema-id": 1, - "identifier-field-ids": [ - 1, - 2 - ], "fields": [ { "id": 1, @@ -85,38 +81,8 @@ } ], "properties": {}, - "current-snapshot-id": 3055729675574597004, - "snapshots": [ - { - "snapshot-id": 3051729675574597004, - "timestamp-ms": 1515100955770, - "sequence-number": 0, - "summary": { - "operation": "append" - }, - "manifest-list": "s3://a/b/1.avro" - }, - { - "snapshot-id": 3055729675574597004, - "parent-snapshot-id": 3051729675574597004, - "timestamp-ms": 1555100955770, - "sequence-number": 1, - "summary": { - "operation": "append" - }, - "manifest-list": "s3://a/b/2.avro", - "schema-id": 1 - } - ], - "snapshot-log": [ - { - "snapshot-id": 3051729675574597004, - "timestamp-ms": 1515100955770 - }, - { - "snapshot-id": 3055729675574597004, - "timestamp-ms": 1555100955770 - } - ], + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], "metadata-log": [] } \ No newline at end of file From 078e0ddefc1b4ade31598140e136c47a3c651eea Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 8 Sep 2023 09:21:10 +0200 Subject: [PATCH 06/13] missing sort oder test --- crates/iceberg/src/spec/table_metadata.rs | 14 +++++ .../TableMetadataV2MissingSortOrder.json | 54 +++++++++++++++++++ 2 files changed, 68 insertions(+) create mode 100644 crates/iceberg/testdata/table_metadata/TableMetadataV2MissingSortOrder.json diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index be39f86c2..ddc262f0c 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1296,4 +1296,18 @@ mod tests { "DataInvalid => No schema exists with the current schema id." ) } + + #[test] + fn test_table_metadata_v2_missing_sort_order() { + let metadata = + fs::read_to_string("testdata/table_metadata/TableMetadataV2MissingSortOrder.json") + .unwrap(); + + let desered: Result = serde_json::from_str(&metadata); + + assert_eq!( + desered.unwrap_err().to_string(), + "data did not match any variant of untagged enum TableMetadataEnum" + ) + } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingSortOrder.json b/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingSortOrder.json new file mode 100644 index 000000000..fbbcf415d --- /dev/null +++ b/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingSortOrder.json @@ -0,0 +1,54 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "last-partition-id": 1000, + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} \ No newline at end of file From 6d86746ba88c65a11076be9a28aafa3b914249e1 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 8 Sep 2023 09:24:34 +0200 Subject: [PATCH 07/13] test for missing partition spec --- crates/iceberg/src/spec/table_metadata.rs | 14 ++++ .../TableMetadataV2MissingPartitionSpecs.json | 67 +++++++++++++++++++ 2 files changed, 81 insertions(+) create mode 100644 crates/iceberg/testdata/table_metadata/TableMetadataV2MissingPartitionSpecs.json diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index ddc262f0c..d47b29c13 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1310,4 +1310,18 @@ mod tests { "data did not match any variant of untagged enum TableMetadataEnum" ) } + + #[test] + fn test_table_metadata_v2_missing_partition_specs() { + let metadata = + fs::read_to_string("testdata/table_metadata/TableMetadataV2MissingPartitionSpecs.json") + .unwrap(); + + let desered: Result = serde_json::from_str(&metadata); + + assert_eq!( + desered.unwrap_err().to_string(), + "data did not match any variant of untagged enum TableMetadataEnum" + ) + } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingPartitionSpecs.json b/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingPartitionSpecs.json new file mode 100644 index 000000000..3ab0a7a1e --- /dev/null +++ b/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingPartitionSpecs.json @@ -0,0 +1,67 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }], + "partition-spec": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} \ No newline at end of file From b628d0c1ebbb83c2847320b06e480fe91679d779 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 8 Sep 2023 09:26:10 +0200 Subject: [PATCH 08/13] test missing partition id --- crates/iceberg/src/spec/table_metadata.rs | 15 ++++ ...TableMetadataV2MissingLastPartitionId.json | 73 +++++++++++++++++++ 2 files changed, 88 insertions(+) create mode 100644 crates/iceberg/testdata/table_metadata/TableMetadataV2MissingLastPartitionId.json diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index d47b29c13..38f650e63 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1324,4 +1324,19 @@ mod tests { "data did not match any variant of untagged enum TableMetadataEnum" ) } + + #[test] + fn test_table_metadata_v2_missing_last_partition_id() { + let metadata = fs::read_to_string( + "testdata/table_metadata/TableMetadataV2MissingLastPartitionId.json", + ) + .unwrap(); + + let desered: Result = serde_json::from_str(&metadata); + + assert_eq!( + desered.unwrap_err().to_string(), + "data did not match any variant of untagged enum TableMetadataEnum" + ) + } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingLastPartitionId.json b/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingLastPartitionId.json new file mode 100644 index 000000000..31c2b4caf --- /dev/null +++ b/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingLastPartitionId.json @@ -0,0 +1,73 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "current-schema-id": 0, + "schemas": [{ + "type": "struct", + "schema-id": 0, + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }], + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} \ No newline at end of file From 45c892e0bc98db7b25e132fb8fc453dd80c62739 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 8 Sep 2023 09:28:09 +0200 Subject: [PATCH 09/13] test missing schemas --- crates/iceberg/src/spec/table_metadata.rs | 14 ++++ .../TableMetadataV2MissingSchemas.json | 71 +++++++++++++++++++ 2 files changed, 85 insertions(+) create mode 100644 crates/iceberg/testdata/table_metadata/TableMetadataV2MissingSchemas.json diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 38f650e63..56f624913 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1339,4 +1339,18 @@ mod tests { "data did not match any variant of untagged enum TableMetadataEnum" ) } + + #[test] + fn test_table_metadata_v2_missing_schemas() { + let metadata = + fs::read_to_string("testdata/table_metadata/TableMetadataV2MissingSchemas.json") + .unwrap(); + + let desered: Result = serde_json::from_str(&metadata); + + assert_eq!( + desered.unwrap_err().to_string(), + "data did not match any variant of untagged enum TableMetadataEnum" + ) + } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingSchemas.json b/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingSchemas.json new file mode 100644 index 000000000..3754354ed --- /dev/null +++ b/crates/iceberg/testdata/table_metadata/TableMetadataV2MissingSchemas.json @@ -0,0 +1,71 @@ +{ + "format-version": 2, + "table-uuid": "9c12d441-03fe-4693-9a96-a0705ddf69c1", + "location": "s3://bucket/test/location", + "last-sequence-number": 34, + "last-updated-ms": 1602638573590, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "default-spec-id": 0, + "partition-specs": [ + { + "spec-id": 0, + "fields": [ + { + "name": "x", + "transform": "identity", + "source-id": 1, + "field-id": 1000 + } + ] + } + ], + "default-sort-order-id": 3, + "sort-orders": [ + { + "order-id": 3, + "fields": [ + { + "transform": "identity", + "source-id": 2, + "direction": "asc", + "null-order": "nulls-first" + }, + { + "transform": "bucket[4]", + "source-id": 3, + "direction": "desc", + "null-order": "nulls-last" + } + ] + } + ], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [], + "snapshot-log": [], + "metadata-log": [] +} \ No newline at end of file From 2321b647c008e467351ea91bcc579f76246fb8bb Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 8 Sep 2023 09:34:54 +0200 Subject: [PATCH 10/13] test unsupported version --- crates/iceberg/src/spec/table_metadata.rs | 14 ++++++++ .../TableMetadataUnsupportedVersion.json | 36 +++++++++++++++++++ 2 files changed, 50 insertions(+) create mode 100644 crates/iceberg/testdata/table_metadata/TableMetadataUnsupportedVersion.json diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 56f624913..27399fa53 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -1353,4 +1353,18 @@ mod tests { "data did not match any variant of untagged enum TableMetadataEnum" ) } + + #[test] + fn test_table_metadata_v2_unsupported_version() { + let metadata = + fs::read_to_string("testdata/table_metadata/TableMetadataUnsupportedVersion.json") + .unwrap(); + + let desered: Result = serde_json::from_str(&metadata); + + assert_eq!( + desered.unwrap_err().to_string(), + "data did not match any variant of untagged enum TableMetadataEnum" + ) + } } diff --git a/crates/iceberg/testdata/table_metadata/TableMetadataUnsupportedVersion.json b/crates/iceberg/testdata/table_metadata/TableMetadataUnsupportedVersion.json new file mode 100644 index 000000000..0633a71d2 --- /dev/null +++ b/crates/iceberg/testdata/table_metadata/TableMetadataUnsupportedVersion.json @@ -0,0 +1,36 @@ +{ + "format-version": 3, + "table-uuid": "d20125c8-7284-442c-9aea-15fee620737c", + "location": "s3://bucket/test/location", + "last-updated-ms": 1602638573874, + "last-sequence-number": 0, + "last-column-id": 3, + "schema": { + "type": "struct", + "fields": [ + { + "id": 1, + "name": "x", + "required": true, + "type": "long" + }, + { + "id": 2, + "name": "y", + "required": true, + "type": "long", + "doc": "comment" + }, + { + "id": 3, + "name": "z", + "required": true, + "type": "long" + } + ] + }, + "partition-spec": [], + "properties": {}, + "current-snapshot-id": -1, + "snapshots": [] +} \ No newline at end of file From a33934e00acf940e981e08d2a43e5a1e1cd48f1f Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 8 Sep 2023 09:48:13 +0200 Subject: [PATCH 11/13] fix changes --- crates/iceberg/src/spec/table_metadata.rs | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 27399fa53..890d01209 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -354,8 +354,7 @@ pub(super) mod _serde { .schemas .into_iter() .map(|schema| Ok((schema.schema_id, Arc::new(schema.try_into()?)))) - .collect::, Error>>()? - .into_iter(), + .collect::, Error>>()?, ); Ok(TableMetadata { format_version: FormatVersion::V2, @@ -681,8 +680,6 @@ mod tests { SnapshotRetention, SortDirection, SortField, SortOrder, Summary, Transform, Type, }; - use crate::error::{Error, ErrorKind}; - use super::{FormatVersion, MetadataLog, SnapshotLog}; fn check_table_metadata_serde(json: &str, expected_type: TableMetadata) { From f57c0ed7f53488decfd0835f7dc46e544db49de0 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 8 Sep 2023 14:59:54 +0200 Subject: [PATCH 12/13] improve error message --- crates/iceberg/src/spec/table_metadata.rs | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 890d01209..134a88c72 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -368,7 +368,10 @@ pub(super) mod _serde { } else { Err(self::Error::new( ErrorKind::DataInvalid, - "No schema exists with the current schema id.", + &format!( + "No schema exists with the current schema id {}.", + value.current_schema_id + ), )) }?, schemas, @@ -1290,7 +1293,7 @@ mod tests { assert_eq!( desered.unwrap_err().to_string(), - "DataInvalid => No schema exists with the current schema id." + "DataInvalid => No schema exists with the current schema id 2." ) } From c02e8821685ba1f04c1845345a13cebf6721cd00 Mon Sep 17 00:00:00 2001 From: Jan Kaul Date: Fri, 8 Sep 2023 15:01:51 +0200 Subject: [PATCH 13/13] fix clippy warnings --- crates/iceberg/src/spec/table_metadata.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/iceberg/src/spec/table_metadata.rs b/crates/iceberg/src/spec/table_metadata.rs index 134a88c72..f40b63e2e 100644 --- a/crates/iceberg/src/spec/table_metadata.rs +++ b/crates/iceberg/src/spec/table_metadata.rs @@ -368,7 +368,7 @@ pub(super) mod _serde { } else { Err(self::Error::new( ErrorKind::DataInvalid, - &format!( + format!( "No schema exists with the current schema id {}.", value.current_schema_id ),