-
Notifications
You must be signed in to change notification settings - Fork 3
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix: Parquet format arrays #71
Changes from all commits
c7abe58
d6b10a8
fbf22fb
41ac118
1822a0a
8f9d7db
673da8e
0c0cdad
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,56 +8,84 @@ import ( | |
pschema "github.com/xitongsys/parquet-go/schema" | ||
) | ||
|
||
func makeSchema(cols schema.ColumnList) string { | ||
func makeSchema(tableName string, cols schema.ColumnList) string { | ||
s := pschema.JSONSchemaItemType{ | ||
Tag: `name=parquet_go_root, repetitiontype=REQUIRED`, | ||
Tag: `name=` + tableName + `_root, repetitiontype=REQUIRED`, | ||
} | ||
|
||
for i := range cols { | ||
tag := `name=` + cols[i].Name | ||
if opts := structOptsForColumn(cols[i]); len(opts) > 0 { | ||
tag += ", " + strings.Join(opts, ", ") | ||
for _, col := range cols { | ||
var subFields []*pschema.JSONSchemaItemType | ||
|
||
tag := []string{`name=` + col.Name} | ||
|
||
switch col.Type { | ||
case schema.TypeTimestamp: | ||
tag = append(tag, "type=INT64", "convertedtype=TIMESTAMP_MILLIS") | ||
case schema.TypeJSON, schema.TypeString, schema.TypeUUID, schema.TypeCIDR, schema.TypeInet, schema.TypeMacAddr: | ||
tag = append(tag, "type=BYTE_ARRAY", "convertedtype=UTF8") | ||
case schema.TypeFloat: | ||
tag = append(tag, "type=DOUBLE") | ||
case schema.TypeInt: | ||
tag = append(tag, "type=INT64") | ||
case schema.TypeByteArray: | ||
tag = append(tag, "type=BYTE_ARRAY") | ||
case schema.TypeBool: | ||
tag = append(tag, "type=BOOLEAN") | ||
case schema.TypeIntArray: | ||
tag = append(tag, "type=LIST", "repetitiontype=OPTIONAL") | ||
subFields = []*pschema.JSONSchemaItemType{ | ||
{ | ||
Tag: "name=element, type=INT64, repetitiontype=OPTIONAL", | ||
}, | ||
} | ||
case schema.TypeStringArray, schema.TypeUUIDArray, schema.TypeCIDRArray, schema.TypeInetArray, schema.TypeMacAddrArray: | ||
tag = append(tag, "type=LIST", "repetitiontype=OPTIONAL") | ||
subFields = []*pschema.JSONSchemaItemType{ | ||
{ | ||
Tag: "name=element, type=BYTE_ARRAY, convertedtype=UTF8, repetitiontype=OPTIONAL", | ||
}, | ||
} | ||
default: | ||
panic("unhandled type: " + col.Type.String()) | ||
} | ||
s.Fields = append(s.Fields, &pschema.JSONSchemaItemType{Tag: tag}) | ||
|
||
if !isArray(col.Type) { // array types are handled differently, see above | ||
if col.CreationOptions.PrimaryKey || col.CreationOptions.IncrementalKey { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I don't think we enforce constraints on incremental keys in other destinations right now, but it seems like a good idea, so I'm fine with this. |
||
tag = append(tag, "repetitiontype=REQUIRED") | ||
} else { | ||
tag = append(tag, "repetitiontype=OPTIONAL") | ||
} | ||
} | ||
|
||
s.Fields = append(s.Fields, &pschema.JSONSchemaItemType{ | ||
Tag: strings.Join(tag, ", "), | ||
Fields: subFields, | ||
}) | ||
} | ||
|
||
b, _ := json.Marshal(s) | ||
return string(b) | ||
} | ||
|
||
func structOptsForColumn(col schema.Column) []string { | ||
opts := []string{} | ||
|
||
switch col.Type { | ||
case schema.TypeJSON: | ||
opts = append(opts, "type=BYTE_ARRAY", "convertedtype=UTF8") | ||
case schema.TypeTimestamp: | ||
opts = append(opts, "type=INT64", "convertedtype=TIMESTAMP_MILLIS") | ||
case schema.TypeString, schema.TypeUUID, schema.TypeCIDR, schema.TypeInet, schema.TypeMacAddr, | ||
schema.TypeStringArray, schema.TypeUUIDArray, schema.TypeCIDRArray, schema.TypeInetArray, schema.TypeMacAddrArray: | ||
opts = append(opts, "type=BYTE_ARRAY", "convertedtype=UTF8") | ||
case schema.TypeFloat: | ||
opts = append(opts, "type=DOUBLE") | ||
case schema.TypeInt, schema.TypeIntArray: | ||
opts = append(opts, "type=INT64") | ||
case schema.TypeByteArray: | ||
opts = append(opts, "type=BYTE_ARRAY") | ||
case schema.TypeBool: | ||
opts = append(opts, "type=BOOLEAN") | ||
default: | ||
panic("unhandled type: " + col.Type.String()) | ||
} | ||
func isArray(t schema.ValueType) bool { | ||
return arrayElement(t) != schema.TypeInvalid | ||
} | ||
|
||
switch col.Type { | ||
case schema.TypeStringArray, schema.TypeIntArray, schema.TypeUUIDArray, schema.TypeCIDRArray, schema.TypeInetArray, schema.TypeMacAddrArray: | ||
opts = append(opts, "repetitiontype=REPEATED") | ||
func arrayElement(t schema.ValueType) schema.ValueType { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We could use this method in the plugin-sdk in the future. |
||
switch t { | ||
case schema.TypeIntArray: | ||
return schema.TypeInt | ||
case schema.TypeStringArray: | ||
return schema.TypeString | ||
case schema.TypeUUIDArray: | ||
return schema.TypeUUID | ||
case schema.TypeCIDRArray: | ||
return schema.TypeCIDR | ||
case schema.TypeInetArray: | ||
return schema.TypeInet | ||
case schema.TypeMacAddrArray: | ||
return schema.TypeMacAddr | ||
default: | ||
if col.CreationOptions.PrimaryKey || col.CreationOptions.IncrementalKey { | ||
opts = append(opts, "repetitiontype=REQUIRED") | ||
} else { | ||
opts = append(opts, "repetitiontype=OPTIONAL") | ||
} | ||
return schema.TypeInvalid | ||
} | ||
|
||
return opts | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -177,6 +177,7 @@ func (ReverseTransformer) ReverseTransformValues(table *schema.Table, values []a | |
if err := t.Set(v); err != nil { | ||
return nil, fmt.Errorf("failed to convert value %v to type %s: %w", v, table.Columns[i].Type, err) | ||
} | ||
res[i] = t | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Without this, the read-write test panics if there's a type error, instead of failing with a proper (and helpful) error message. |
||
continue | ||
} | ||
|
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The item name here doesn't seem to have any significance: `parquet_go_root` is what the library uses, but it's not a constant in the Parquet world, so I opted to add the table name here so that files are identifiable (with parquet-tools etc.) even if they lose their filenames.