-
Notifications
You must be signed in to change notification settings - Fork 178
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat: support UnboundPartitionSpec #106
Changes from all commits
eb5f549
ec74d1f
c858172
b0c1997
4c044dc
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -18,4 +18,5 @@ | |
/target | ||
/Cargo.lock | ||
.idea | ||
.vscode | ||
**/.DS_Store |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -26,9 +26,9 @@ use super::transform::Transform; | |
|
||
/// Reference to [`PartitionSpec`]. | ||
pub type PartitionSpecRef = Arc<PartitionSpec>; | ||
/// Partition fields capture the transform from table data to partition values. | ||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, TypedBuilder)] | ||
#[serde(rename_all = "kebab-case")] | ||
/// Partition fields capture the transform from table data to partition values. | ||
pub struct PartitionField { | ||
/// A source column id from the table’s schema | ||
pub source_id: i32, | ||
|
@@ -41,10 +41,10 @@ pub struct PartitionField { | |
pub transform: Transform, | ||
} | ||
|
||
/// Partition spec that defines how to produce a tuple of partition values from a record. | ||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, Default, Builder)] | ||
#[serde(rename_all = "kebab-case")] | ||
#[builder(setter(prefix = "with"))] | ||
/// Partition spec that defines how to produce a tuple of partition values from a record. | ||
pub struct PartitionSpec { | ||
/// Identifier for PartitionSpec | ||
pub spec_id: i32, | ||
|
@@ -60,13 +60,51 @@ impl PartitionSpec { | |
} | ||
} | ||
|
||
/// Reference to [`UnboundPartitionSpec`]. | ||
pub type UnboundPartitionSpecRef = Arc<UnboundPartitionSpec>; | ||
/// Unbound partition field can be built without a schema and later bound to a schema. | ||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, TypedBuilder)] | ||
#[serde(rename_all = "kebab-case")] | ||
pub struct UnboundPartitionField { | ||
/// A source column id from the table’s schema | ||
pub source_id: i32, | ||
/// A partition field id that is used to identify a partition field and is unique within a partition spec. | ||
/// In v2 table metadata, it is unique across all partition specs. | ||
#[builder(default, setter(strip_option))] | ||
pub partition_id: Option<i32>, | ||
/// A partition name. | ||
pub name: String, | ||
/// A transform that is applied to the source column to produce a partition value. | ||
pub transform: Transform, | ||
} | ||
|
||
/// Unbound partition spec can be built without a schema and later bound to a schema. | ||
#[derive(Debug, Serialize, Deserialize, PartialEq, Eq, Clone, Default, Builder)] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't find a neat way to add |
||
#[serde(rename_all = "kebab-case")] | ||
#[builder(setter(prefix = "with"))] | ||
pub struct UnboundPartitionSpec { | ||
my-vegetable-has-exploded marked this conversation as resolved.
Show resolved
Hide resolved
|
||
/// Identifier for PartitionSpec | ||
#[builder(default, setter(strip_option))] | ||
pub spec_id: Option<i32>, | ||
/// Details of the partition spec | ||
#[builder(setter(each(name = "with_unbound_partition_field")))] | ||
pub fields: Vec<UnboundPartitionField>, | ||
} | ||
|
||
impl UnboundPartitionSpec { | ||
/// Create unbound partition spec builer | ||
pub fn builder() -> UnboundPartitionSpecBuilder { | ||
UnboundPartitionSpecBuilder::default() | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod tests { | ||
use super::*; | ||
|
||
#[test] | ||
fn partition_spec() { | ||
let sort_order = r#" | ||
fn test_partition_spec() { | ||
let spec = r#" | ||
{ | ||
"spec-id": 1, | ||
"fields": [ { | ||
|
@@ -88,7 +126,7 @@ mod tests { | |
} | ||
"#; | ||
|
||
let partition_spec: PartitionSpec = serde_json::from_str(sort_order).unwrap(); | ||
let partition_spec: PartitionSpec = serde_json::from_str(spec).unwrap(); | ||
assert_eq!(4, partition_spec.fields[0].source_id); | ||
assert_eq!(1000, partition_spec.fields[0].field_id); | ||
assert_eq!("ts_day", partition_spec.fields[0].name); | ||
|
@@ -104,4 +142,64 @@ mod tests { | |
assert_eq!("id_truncate", partition_spec.fields[2].name); | ||
assert_eq!(Transform::Truncate(4), partition_spec.fields[2].transform); | ||
} | ||
|
||
#[test] | ||
fn test_unbound_partition_spec() { | ||
let spec = r#" | ||
{ | ||
"spec-id": 1, | ||
"fields": [ { | ||
"source-id": 4, | ||
"partition-id": 1000, | ||
"name": "ts_day", | ||
"transform": "day" | ||
}, { | ||
"source-id": 1, | ||
"partition-id": 1001, | ||
"name": "id_bucket", | ||
"transform": "bucket[16]" | ||
}, { | ||
"source-id": 2, | ||
"partition-id": 1002, | ||
"name": "id_truncate", | ||
"transform": "truncate[4]" | ||
} ] | ||
} | ||
"#; | ||
|
||
let partition_spec: UnboundPartitionSpec = serde_json::from_str(spec).unwrap(); | ||
assert_eq!(Some(1), partition_spec.spec_id); | ||
|
||
assert_eq!(4, partition_spec.fields[0].source_id); | ||
assert_eq!(Some(1000), partition_spec.fields[0].partition_id); | ||
assert_eq!("ts_day", partition_spec.fields[0].name); | ||
assert_eq!(Transform::Day, partition_spec.fields[0].transform); | ||
|
||
assert_eq!(1, partition_spec.fields[1].source_id); | ||
assert_eq!(Some(1001), partition_spec.fields[1].partition_id); | ||
assert_eq!("id_bucket", partition_spec.fields[1].name); | ||
assert_eq!(Transform::Bucket(16), partition_spec.fields[1].transform); | ||
|
||
assert_eq!(2, partition_spec.fields[2].source_id); | ||
assert_eq!(Some(1002), partition_spec.fields[2].partition_id); | ||
assert_eq!("id_truncate", partition_spec.fields[2].name); | ||
assert_eq!(Transform::Truncate(4), partition_spec.fields[2].transform); | ||
|
||
let spec = r#" | ||
{ | ||
"fields": [ { | ||
"source-id": 4, | ||
"name": "ts_day", | ||
"transform": "day" | ||
} ] | ||
} | ||
"#; | ||
let partition_spec: UnboundPartitionSpec = serde_json::from_str(spec).unwrap(); | ||
assert_eq!(None, partition_spec.spec_id); | ||
|
||
assert_eq!(4, partition_spec.fields[0].source_id); | ||
assert_eq!(None, partition_spec.fields[0].partition_id); | ||
assert_eq!("ts_day", partition_spec.fields[0].name); | ||
assert_eq!(Transform::Day, partition_spec.fields[0].transform); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why remove these? They are part of the
PartitionField
: https://github.com/apache/iceberg/blob/main/open-api/rest-catalog-open-api.py#L131-L135There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think this
field_id
in request is optional?There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The open-api spec is a bit unclear, but it isn't right now:
https://github.com/apache/iceberg/blob/af9522ac7e8e25dc044622c566b66301b6df9581/open-api/rest-catalog-open-api.py#L283-L284
References:
https://github.com/apache/iceberg/blob/af9522ac7e8e25dc044622c566b66301b6df9581/open-api/rest-catalog-open-api.py#L138-L140
References:
https://github.com/apache/iceberg/blob/af9522ac7e8e25dc044622c566b66301b6df9581/open-api/rest-catalog-open-api.py#L131-L135
And there it is non-optional.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
On Java:
https://github.com/apache/iceberg/blob/af9522ac7e8e25dc044622c566b66301b6df9581/core/src/main/java/org/apache/iceberg/MetadataUpdateParser.java#L447-L451
And there it is optional:
https://github.com/apache/iceberg/blob/af9522ac7e8e25dc044622c566b66301b6df9581/core/src/main/java/org/apache/iceberg/PartitionSpecParser.java#L128-L145
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It looks like it is for historical reasons. For old specs they can be missing, and they'll just auto-increment.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
BTW, cleaned up the spec a bit: apache/iceberg#9240
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Thanks for the reference, I saw the comments now. But I think in the rest api request, it's reasonable to make it optional, and ask the server to assign a meaningful field id to it?