-
Notifications
You must be signed in to change notification settings - Fork 4.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add support for tuple ClickHouse #29715
Changes from 8 commits
3bbbe2f
a3a1850
53e4e81
a1d5182
a3cafa6
120bb2b
256187f
c1b3f98
a477cbc
ccfdc37
67434ef
f27b943
b4ce111
748dca6
57cb6a5
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,6 +27,7 @@ | |
import java.sql.SQLException; | ||
import java.sql.Statement; | ||
import java.util.ArrayList; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
import java.util.Properties; | ||
import java.util.stream.Collectors; | ||
|
@@ -112,7 +113,8 @@ | |
* </table> | ||
* | ||
* Nullable row columns are supported through Nullable type in ClickHouse. Low cardinality hint is | ||
* supported through LowCardinality DataType in ClickHouse. | ||
* supported through LowCardinality DataType in ClickHouse supported through Tuple DataType in | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Consider put the documentation Tuple to the table above. Broken sentence here currently. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Please check this sentence |
||
* ClickHouse see: Example in ClickHouseIOTest.testComplexTupleType. | ||
* | ||
* <p>Nested rows should be unnested using {@link Select#flattenedSchema()}. Type casting should be | ||
* done using {@link org.apache.beam.sdk.schemas.transforms.Cast} before {@link ClickHouseIO}. | ||
|
@@ -475,6 +477,15 @@ abstract static class Builder<T> { | |
} | ||
} | ||
|
||
private static String tuplePreprocessing(String payload) { | ||
List<String> l = | ||
Arrays.stream(payload.trim().split(",")) | ||
.map(s -> s.trim().replaceAll(" +", "':;")) | ||
.collect(Collectors.toList()); | ||
String content = | ||
String.join(",", l).trim().replaceAll("Tuple\\(", "Tuple('").replaceAll(",", ",'"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The caller has a condition There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The condition is on with lower case, but the actual parsing is on the original string There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I mean below when " if (type.toLowerCase().trim().startsWith("tuple(")) {" the this preprocessing will be executed. So Tuple( (with backslash) is considered here but not the condition in the caller there. Coild this cause issue? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You're right. Would you mind adding some comments about the proprocessing rule and example, for example, sth like |
||
return content; | ||
} | ||
/** | ||
* Returns {@link TableSchema} for a given table. | ||
* | ||
|
@@ -498,7 +509,13 @@ public static TableSchema getTableSchema(String jdbcUrl, String table) { | |
String defaultTypeStr = rs.getString("default_type"); | ||
String defaultExpression = rs.getString("default_expression"); | ||
|
||
ColumnType columnType = ColumnType.parse(type); | ||
ColumnType columnType = null; | ||
if (type.toLowerCase().trim().startsWith("tuple(")) { | ||
String content = tuplePreprocessing(type); | ||
columnType = ColumnType.parse(content); | ||
} else { | ||
columnType = ColumnType.parse(type); | ||
} | ||
DefaultType defaultType = DefaultType.parse(defaultTypeStr).orElse(null); | ||
|
||
Object defaultValue; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -24,6 +24,7 @@ | |
import java.util.List; | ||
import java.util.Map; | ||
import java.util.Optional; | ||
import java.util.stream.Collectors; | ||
import org.apache.beam.sdk.schemas.Schema; | ||
import org.apache.beam.sdk.schemas.logicaltypes.FixedBytes; | ||
import org.checkerframework.checker.nullness.qual.Nullable; | ||
|
@@ -111,6 +112,14 @@ public static Schema.FieldType getEquivalentFieldType(ColumnType columnType) { | |
return Schema.FieldType.STRING; | ||
case BOOL: | ||
return Schema.FieldType.BOOLEAN; | ||
case TUPLE: | ||
List<Schema.Field> fields = | ||
columnType.tupleTypes().entrySet().stream() | ||
.map(x -> Schema.Field.of(x.getKey(), Schema.FieldType.DATETIME)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi @mzitnik, There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, This quick adaptation seems to work:
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Hi, Also, it seems that This seems to fix the issue : There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
.collect(Collectors.toList()); | ||
Schema.Field[] array = fields.toArray(new Schema.Field[fields.size()]); | ||
Schema schema = Schema.of(array); | ||
return Schema.FieldType.row(schema); | ||
} | ||
|
||
// not possible, errorprone checks for exhaustive switch | ||
|
@@ -168,7 +177,9 @@ public enum TypeName { | |
// Composite type | ||
ARRAY, | ||
// Primitive type | ||
BOOL | ||
BOOL, | ||
// Composite type | ||
TUPLE | ||
} | ||
|
||
/** | ||
|
@@ -208,6 +219,7 @@ public abstract static class ColumnType implements Serializable { | |
public static final ColumnType UINT32 = ColumnType.of(TypeName.UINT32); | ||
public static final ColumnType UINT64 = ColumnType.of(TypeName.UINT64); | ||
public static final ColumnType BOOL = ColumnType.of(TypeName.BOOL); | ||
public static final ColumnType TUPLE = ColumnType.of(TypeName.TUPLE); | ||
|
||
// ClickHouse doesn't allow nested nullables, so boolean flag is enough | ||
public abstract boolean nullable(); | ||
|
@@ -220,6 +232,8 @@ public abstract static class ColumnType implements Serializable { | |
|
||
public abstract @Nullable ColumnType arrayElementType(); | ||
|
||
public abstract @Nullable Map<String, ColumnType> tupleTypes(); | ||
|
||
public ColumnType withNullable(boolean nullable) { | ||
return toBuilder().nullable(nullable).build(); | ||
} | ||
|
@@ -265,6 +279,14 @@ public static ColumnType array(ColumnType arrayElementType) { | |
.build(); | ||
} | ||
|
||
public static ColumnType tuple(Map<String, ColumnType> elements) { | ||
return ColumnType.builder() | ||
.typeName(TypeName.TUPLE) | ||
.nullable(false) | ||
.tupleTypes(elements) | ||
.build(); | ||
} | ||
|
||
/** | ||
* Parse string with ClickHouse type to {@link ColumnType}. | ||
* | ||
|
@@ -339,6 +361,8 @@ abstract static class Builder { | |
|
||
public abstract Builder fixedStringSize(Integer size); | ||
|
||
public abstract Builder tupleTypes(Map<String, ColumnType> tupleElements); | ||
|
||
public abstract ColumnType build(); | ||
} | ||
} | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -17,6 +17,10 @@ | |
*/ | ||
options { | ||
IGNORE_CASE=true; | ||
DEBUG_PARSER = false; | ||
DEBUG_LOOKAHEAD = false; | ||
DEBUG_TOKEN_MANAGER = false; | ||
STATIC = false; | ||
} | ||
|
||
PARSER_BEGIN(ColumnTypeParser) | ||
|
@@ -99,6 +103,9 @@ TOKEN : | |
| < EQ : "=" > | ||
| < BOOL : "BOOL" > | ||
| < LOWCARDINALITY : "LOWCARDINALITY" > | ||
| < TUPLE : "TUPLE" > | ||
| < COLON : ":" > | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is COLON and SEMI_COLON for tuple or another feature? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using that since i have some ambiguity in parsing (looking for other options), but for now decided to leave it that way There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. If it's get added to SDK then there is expectation of backward compatibility. So I would prefer to keep consistent with ClickHouse's SQL syntax There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's a temporary solution since there is some ambiguity when using There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Removed |
||
| < SEMI_COLON : ";" > | ||
} | ||
|
||
public ColumnType columnType() : | ||
|
@@ -113,6 +120,7 @@ public ColumnType columnType() : | |
| ct = array() | ||
| ct = nullable() | ||
| ct = lowcardenality() | ||
| ct = tuple() | ||
) | ||
{ | ||
return ct; | ||
|
@@ -263,6 +271,33 @@ private Map<String, Integer> enumElements() : | |
} | ||
} | ||
|
||
private Map.Entry<String, ColumnType> tupleElement() : | ||
{ | ||
String key; | ||
ColumnType value; | ||
Token token; | ||
} | ||
{ | ||
( (key = string() ) <COLON> <SEMI_COLON> ( value = columnType() ) ) { | ||
return Maps.immutableEntry(key, value); | ||
} | ||
} | ||
|
||
private Map<String, ColumnType> tupleElements() : | ||
{ | ||
Map.Entry<String, ColumnType> el; | ||
List<Map.Entry<String, ColumnType>> entries = Lists.newArrayList(); | ||
} | ||
{ | ||
( | ||
( el = tupleElement() { entries.add(el); } ) | ||
( <COMMA> ( el = tupleElement() { entries.add(el); } ) )* | ||
) | ||
{ | ||
return ImmutableMap.copyOf(entries); | ||
} | ||
} | ||
|
||
private ColumnType enum_() : | ||
{ | ||
Map<String, Integer> elements; | ||
|
@@ -289,4 +324,18 @@ private ColumnType lowcardenality() : | |
( | ||
(<LOWCARDINALITY> <LPAREN> (ct = primitive()) <RPAREN>) { return ct; } | ||
) | ||
} | ||
|
||
private ColumnType tuple() : | ||
{ | ||
Map<String, ColumnType> elements; | ||
} | ||
{ | ||
( | ||
(<TUPLE> <LPAREN> ( elements = tupleElements() ) <RPAREN>) | ||
{ | ||
return ColumnType.tuple(elements); | ||
} | ||
) | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -19,6 +19,7 @@ | |
|
||
import static org.junit.Assert.assertEquals; | ||
|
||
import java.util.HashMap; | ||
import java.util.Map; | ||
import org.apache.beam.sdk.io.clickhouse.TableSchema.ColumnType; | ||
import org.apache.beam.sdk.schemas.Schema; | ||
|
@@ -196,4 +197,55 @@ public void testEquivalentSchema() { | |
|
||
assertEquals(expected, TableSchema.getEquivalentSchema(tableSchema)); | ||
} | ||
|
||
@Test | ||
public void testParseTupleSingle() { | ||
Map<String, ColumnType> m1 = new HashMap<>(); | ||
m1.put("s", ColumnType.STRING); | ||
ColumnType columnType01 = ColumnType.parse("Tuple('s':;String)"); | ||
assertEquals(ColumnType.tuple(m1), columnType01); | ||
} | ||
|
||
@Test | ||
public void testParseTupleDouble() { | ||
Map<String, ColumnType> m2 = new HashMap<>(); | ||
m2.put("a1", ColumnType.STRING); | ||
m2.put("b", ColumnType.BOOL); | ||
ColumnType columnType02 = ColumnType.parse("Tuple('a1':;String,'b':;Bool)"); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Is There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Abacn Removed |
||
assertEquals(ColumnType.tuple(m2), columnType02); | ||
} | ||
|
||
@Test | ||
public void testTupleNested() { | ||
Map<String, ColumnType> m1 = new HashMap<>(); | ||
m1.put("a", ColumnType.STRING); | ||
Map<String, ColumnType> m3 = new HashMap<>(); | ||
m3.put("a", ColumnType.STRING); | ||
m3.put("b", ColumnType.BOOL); | ||
m3.put("c", ColumnType.tuple(m1)); | ||
ColumnType columnType03 = | ||
ColumnType.parse("Tuple('a':;String,'b':;Bool,'c':;Tuple('a':;String))"); | ||
assertEquals(ColumnType.tuple(m3), columnType03); | ||
} | ||
|
||
@Test | ||
public void testTupleComplex() { | ||
Map<String, ColumnType> m1 = new HashMap<>(); | ||
m1.put("width", ColumnType.INT64.withNullable(true)); | ||
m1.put("height", ColumnType.INT64.withNullable(true)); | ||
|
||
Map<String, ColumnType> m2 = new HashMap<>(); | ||
m2.put("name", ColumnType.STRING.withNullable(true)); | ||
m2.put("size", ColumnType.tuple(m1)); | ||
m2.put("version", ColumnType.STRING.withNullable(true)); | ||
|
||
Map<String, ColumnType> m3 = new HashMap<>(); | ||
m3.put("browser", ColumnType.tuple(m2)); | ||
m3.put("deviceCategory", ColumnType.STRING.withNullable(true)); | ||
|
||
ColumnType columnType03 = | ||
ColumnType.parse( | ||
"Tuple('browser':;Tuple('name':;Nullable(String),'size':;Tuple('width':;Nullable(Int64),'height':;Nullable(Int64)),'version':;Nullable(String)),'deviceCategory':; Nullable(String))"); | ||
assertEquals(ColumnType.tuple(m3), columnType03); | ||
} | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
We can combine this with LowCardinality DataType above, e.g.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Release cut is today so this may not get into 2.53.0. Could leave it as is and fix when the PR is finalized