From 45e156ed1cada2ad40e8057113420e28c02b0fe6 Mon Sep 17 00:00:00 2001
From: jarvis
Date: Fri, 20 Oct 2023 21:12:47 +0800
Subject: [PATCH] [Improve][connector-file] update e2e config

---
 docs/en/connector-v2/source/CosFile.md        |  10 +-
 docs/en/connector-v2/source/FtpFile.md        |  12 +-
 docs/en/connector-v2/source/HdfsFile.md       |   6 +-
 docs/en/connector-v2/source/LocalFile.md      |  10 +-
 docs/en/connector-v2/source/OssFile.md        |  10 +-
 docs/en/connector-v2/source/OssJindoFile.md   |  10 +-
 docs/en/connector-v2/source/S3File.md         |  10 +-
 docs/en/connector-v2/source/SftpFile.md       |  10 +-
 .../file/config/BaseSourceConfig.java         |   1 +
 .../file/source/reader/ExcelReadStrategy.java |   5 +-
 .../file/source/reader/TextReadStrategy.java  |   9 +-
 .../resources/excel/cos_excel_to_assert.conf  |   2 +-
 .../excel/ftp_excel_projection_to_assert.conf |   2 +-
 .../resources/excel/ftp_excel_to_assert.conf  |   2 +-
 .../excel/ftp_filter_excel_to_assert.conf     |   2 +-
 .../e2e/connector/file/local/LocalFileIT.java |   6 +
 .../local_excel_projection_to_assert.conf     |   2 +-
 .../excel/local_excel_to_assert.conf          |   2 +-
 .../excel/local_filter_excel_to_assert.conf   |   2 +-
 .../src/test/resources/text/e2e_delimiter.txt |   5 +
 .../text/local_file_delimiter_assert.conf     | 104 ++++++++++++++++++
 .../text/local_file_text_lzo_to_assert.conf   |   1 -
 .../sftp_excel_projection_to_assert.conf      |   2 +-
 .../resources/excel/sftp_excel_to_assert.conf |   2 +-
 .../excel/sftp_filter_excel_to_assert.conf    |   2 +-
 25 files changed, 186 insertions(+), 43 deletions(-)
 create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/e2e_delimiter.txt
 create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_delimiter_assert.conf

diff --git a/docs/en/connector-v2/source/CosFile.md b/docs/en/connector-v2/source/CosFile.md
index f1964e46fcd8..406c86fab5bd 100644
--- a/docs/en/connector-v2/source/CosFile.md
+++ b/docs/en/connector-v2/source/CosFile.md
@@ -52,7 +52,7 @@ To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and
 | secret_key                | string  | yes | -          |
 | region                    | string  | yes | -          |
 | read_columns              | list    | yes | -          |
-| field_delimiter           | string  | no  | \001       |
+| delimiter/field_delimiter | string  | no  | \001       |
 | parse_partition_from_path | boolean | no  | true       |
 | skip_header_row_number    | long    | no  | 0          |
 | date_format               | string  | no  | yyyy-MM-dd |
@@ -133,13 +133,13 @@ If you do not assign data schema connector will treat the upstream data as the f
 |-----------------------|
 | tyrantlucifer#26#male |
 
-If you assign data schema, you should also assign the option `delimiter` too except CSV file type
+If you assign the data schema, you should also assign the option `field_delimiter`, except for the CSV file type
 
 you should assign schema and delimiter as the following:
 
 ```hocon
 
-delimiter = "#"
+field_delimiter = "#"
 schema {
     fields {
         name = string
@@ -176,7 +176,9 @@ The region of cos file system.
 
 The read column list of the data source, user can use it to implement field projection.
 
-### field_delimiter [string]
+### delimiter/field_delimiter [string]
+
+The **delimiter** parameter will be deprecated after version 2.3.5; please use **field_delimiter** instead.
 
 Only need to be configured when file_format is text.
diff --git a/docs/en/connector-v2/source/FtpFile.md b/docs/en/connector-v2/source/FtpFile.md
index 9f6319fcbd7d..781d7d40bc2e 100644
--- a/docs/en/connector-v2/source/FtpFile.md
+++ b/docs/en/connector-v2/source/FtpFile.md
@@ -44,7 +44,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
 | password                  | string  | yes | -          |
 | path                      | string  | yes | -          |
 | file_format_type          | string  | yes | -          |
-| field_delimiter           | string  | no  | \001       |
+| delimiter/field_delimiter | string  | no  | \001       |
 | read_columns              | list    | no  | -          |
 | parse_partition_from_path | boolean | no  | true       |
 | date_format               | string  | no  | yyyy-MM-dd |
@@ -131,13 +131,13 @@ If you do not assign data schema connector will treat the upstream data as the f
 |-----------------------|
 | tyrantlucifer#26#male |
 
-If you assign data schema, you should also assign the option `delimiter` too except CSV file type
+If you assign the data schema, you should also assign the option `field_delimiter`, except for the CSV file type
 
 you should assign schema and delimiter as the following:
 
 ```hocon
 
-delimiter = "#"
+field_delimiter = "#"
 schema {
     fields {
         name = string
@@ -154,7 +154,9 @@ connector will generate data as the following:
 |---------------|-----|--------|
 | tyrantlucifer | 26  | male   |
 
-### field_delimiter [string]
+### delimiter/field_delimiter [string]
+
+The **delimiter** parameter will be deprecated after version 2.3.5; please use **field_delimiter** instead.
 
 Only need to be configured when file_format is text.
 
@@ -253,7 +255,7 @@ Source plugin common parameters, please refer to [Source Common Options](common-
       name = string
      age = int
     }
-    delimiter = "#"
+    field_delimiter = "#"
   }
 
 ```

diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md
index 219603a30f92..f1ef0aa87741 100644
--- a/docs/en/connector-v2/source/HdfsFile.md
+++ b/docs/en/connector-v2/source/HdfsFile.md
@@ -46,7 +46,7 @@ Read data from hdfs file system.
 | fs.defaultFS              | string  | yes | -                   | The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster` |
 | read_columns              | list    | yes | -                   | The read column list of the data source, user can use it to implement field projection.The file type supported column projection as the following shown:[text,json,csv,orc,parquet,excel].Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured. |
 | hdfs_site_path            | string  | no  | -                   | The path of `hdfs-site.xml`, used to load ha configuration of namenodes |
-| field_delimiter           | string  | no  | \001                | Field delimiter, used to tell connector how to slice and dice fields when reading text files. default `\001`, the same as hive's default delimiter |
+| delimiter/field_delimiter | string  | no  | \001                | Field delimiter, used to tell connector how to slice and dice fields when reading text files. default `\001`, the same as hive's default delimiter |
 | parse_partition_from_path | boolean | no  | true                | Control whether parse the partition keys and values from file path. For example if you read a file from path `hdfs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`. Every record data from file will be added these two fields:[name:tyrantlucifer,age:26].Tips:Do not define partition fields in schema option. |
| date_format               | string  | no  | yyyy-MM-dd          | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` default `yyyy-MM-dd` |
 | datetime_format           | string  | no  | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` .default `yyyy-MM-dd HH:mm:ss` |
@@ -59,6 +59,10 @@ Read data from hdfs file system.
 | compress_codec            | string  | no  | none                | The compress codec of files |
 | common-options            |         | no  | -                   | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. |
 
+### delimiter/field_delimiter [string]
+
+The **delimiter** parameter will be deprecated after version 2.3.5; please use **field_delimiter** instead.
+
 ### compress_codec [string]
 
 The compress codec of files and the details that supported as the following shown:

diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md
index 7cf7d5116779..f562fd30ae5b 100644
--- a/docs/en/connector-v2/source/LocalFile.md
+++ b/docs/en/connector-v2/source/LocalFile.md
@@ -46,7 +46,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
 | path                      | string  | yes | -                   |
 | file_format_type          | string  | yes | -                   |
 | read_columns              | list    | no  | -                   |
-| field_delimiter           | string  | no  | \001                |
+| delimiter/field_delimiter | string  | no  | \001                |
 | parse_partition_from_path | boolean | no  | true                |
 | date_format               | string  | no  | yyyy-MM-dd          |
 | datetime_format           | string  | no  | yyyy-MM-dd HH:mm:ss |
@@ -127,13 +127,13 @@ If you do not assign data schema connector will treat the upstream data as the f
 |-----------------------|
 | tyrantlucifer#26#male |
 
-If you assign data schema, you should also assign the option `delimiter` too except CSV file type
+If you assign the data schema, you should also assign the option `field_delimiter`, except for the CSV file type
 
 you should assign schema and delimiter as the following:
 
 ```hocon
 
-delimiter = "#"
+field_delimiter = "#"
 schema {
     fields {
         name = string
@@ -154,7 +154,9 @@ connector will generate data as the following:
 
 The read column list of the data source, user can use it to implement field projection.
 
-### field_delimiter [string]
+### delimiter/field_delimiter [string]
+
+The **delimiter** parameter will be deprecated after version 2.3.5; please use **field_delimiter** instead.
 
 Only need to be configured when file_format is text.

diff --git a/docs/en/connector-v2/source/OssFile.md b/docs/en/connector-v2/source/OssFile.md
index a4ef088249fd..2f51024b67e7 100644
--- a/docs/en/connector-v2/source/OssFile.md
+++ b/docs/en/connector-v2/source/OssFile.md
@@ -53,7 +53,7 @@ It only supports hadoop version **2.9.X+**.
 | access_secret             | string  | yes | -          |
 | endpoint                  | string  | yes | -          |
 | read_columns              | list    | yes | -          |
-| field_delimiter           | string  | no  | \001       |
+| delimiter/field_delimiter | string  | no  | \001       |
 | parse_partition_from_path | boolean | no  | true       |
 | skip_header_row_number    | long    | no  | 0          |
 | date_format               | string  | no  | yyyy-MM-dd |
@@ -134,13 +134,13 @@ If you do not assign data schema connector will treat the upstream data as the f
 |-----------------------|
 | tyrantlucifer#26#male |
 
-If you assign data schema, you should also assign the option `delimiter` too except CSV file type
+If you assign the data schema, you should also assign the option `field_delimiter`, except for the CSV file type
 
 you should assign schema and delimiter as the following:
 
 ```hocon
 
-delimiter = "#"
+field_delimiter = "#"
 schema {
     fields {
         name = string
@@ -177,7 +177,9 @@ The endpoint of oss file system.
 
 The read column list of the data source, user can use it to implement field projection.
 
-### field_delimiter [string]
+### delimiter/field_delimiter [string]
+
+The **delimiter** parameter will be deprecated after version 2.3.5; please use **field_delimiter** instead.
 
 Only need to be configured when file_format is text.

diff --git a/docs/en/connector-v2/source/OssJindoFile.md b/docs/en/connector-v2/source/OssJindoFile.md
index e3942886308c..27b710cfb8ad 100644
--- a/docs/en/connector-v2/source/OssJindoFile.md
+++ b/docs/en/connector-v2/source/OssJindoFile.md
@@ -56,7 +56,7 @@ It only supports hadoop version **2.9.X+**.
 | access_secret             | string  | yes | -                   |
 | endpoint                  | string  | yes | -                   |
 | read_columns              | list    | no  | -                   |
-| field_delimiter           | string  | no  | \001                |
+| delimiter/field_delimiter | string  | no  | \001                |
 | parse_partition_from_path | boolean | no  | true                |
 | date_format               | string  | no  | yyyy-MM-dd          |
 | datetime_format           | string  | no  | yyyy-MM-dd HH:mm:ss |
@@ -137,13 +137,13 @@ If you do not assign data schema connector will treat the upstream data as the f
 |-----------------------|
 | tyrantlucifer#26#male |
 
-If you assign data schema, you should also assign the option `delimiter` too except CSV file type
+If you assign the data schema, you should also assign the option `field_delimiter`, except for the CSV file type
 
 you should assign schema and delimiter as the following:
 
 ```hocon
 
-delimiter = "#"
+field_delimiter = "#"
 schema {
     fields {
         name = string
@@ -180,7 +180,9 @@ The endpoint of oss file system.
 
 The read column list of the data source, user can use it to implement field projection.
 
-### field_delimiter [string]
+### delimiter/field_delimiter [string]
+
+The **delimiter** parameter will be deprecated after version 2.3.5; please use **field_delimiter** instead.
 
 Only need to be configured when file_format is text.
diff --git a/docs/en/connector-v2/source/S3File.md b/docs/en/connector-v2/source/S3File.md
index 20b793dbc74f..78ae7422ed2b 100644
--- a/docs/en/connector-v2/source/S3File.md
+++ b/docs/en/connector-v2/source/S3File.md
@@ -111,13 +111,13 @@ If you do not assign data schema connector will treat the upstream data as the f
 |-----------------------|
 | tyrantlucifer#26#male |
 
-If you assign data schema, you should also assign the option `delimiter` too except CSV file type
+If you assign the data schema, you should also assign the option `field_delimiter`, except for the CSV file type
 
 you should assign schema and delimiter as the following:
 
 ```hocon
 
-delimiter = "#"
+field_delimiter = "#"
 schema {
     fields {
         name = string
@@ -205,7 +205,7 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto
 | access_key                | string  | no  | -                   | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` |
 | access_secret             | string  | no  | -                   | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` |
 | hadoop_s3_properties      | map     | no  | -                   | If you need to add other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) |
-| field_delimiter           | string  | no  | \001                | Field delimiter, used to tell connector how to slice and dice fields when reading text files. Default `\001`, the same as hive's default delimiter. |
+| delimiter/field_delimiter | string  | no  | \001                | Field delimiter, used to tell connector how to slice and dice fields when reading text files. Default `\001`, the same as hive's default delimiter. |
 | parse_partition_from_path | boolean | no  | true                | Control whether parse the partition keys and values from file path. For example if you read a file from path `s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`. Every record data from file will be added these two fields: name="tyrantlucifer", age=16 |
 | date_format               | string  | no  | yyyy-MM-dd          | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. default `yyyy-MM-dd` |
 | datetime_format           | string  | no  | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` |
@@ -216,6 +216,10 @@ If you assign file type to `parquet` `orc`, schema option not required, connecto
 | compress_codec            | string  | no  | none                | |
 | common-options            |         | no  | -                   | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. |
 
+### delimiter/field_delimiter [string]
+
+The **delimiter** parameter will be deprecated after version 2.3.5; please use **field_delimiter** instead.
+
 ### compress_codec [string]
 
 The compress codec of files and the details that supported as the following shown:

diff --git a/docs/en/connector-v2/source/SftpFile.md b/docs/en/connector-v2/source/SftpFile.md
index 62db6f30c493..05b3bc4f38f3 100644
--- a/docs/en/connector-v2/source/SftpFile.md
+++ b/docs/en/connector-v2/source/SftpFile.md
@@ -44,7 +44,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you
 | password                  | string  | yes | -          |
 | path                      | string  | yes | -          |
 | file_format_type          | string  | yes | -          |
-| field_delimiter           | string  | no  | \001       |
+| delimiter/field_delimiter | string  | no  | \001       |
 | parse_partition_from_path | boolean | no  | true       |
 | date_format               | string  | no  | yyyy-MM-dd |
 | skip_header_row_number    | long    | no  | 0          |
@@ -132,13 +132,13 @@ If you do not assign data schema connector will treat the upstream data as the f
 |-----------------------|
 | tyrantlucifer#26#male |
 
-If you assign data schema, you should also assign the option `delimiter` too except CSV file type
+If you assign the data schema, you should also assign the option `field_delimiter`, except for the CSV file type
 
 you should assign schema and delimiter as the following:
 
 ```hocon
 
-delimiter = "#"
+field_delimiter = "#"
 schema {
     fields {
         name = string
@@ -155,7 +155,9 @@ connector will generate data as the following:
 |---------------|-----|--------|
 | tyrantlucifer | 26  | male   |
 
-### field_delimiter [string]
+### delimiter/field_delimiter [string]
+
+The **delimiter** parameter will be deprecated after version 2.3.5; please use **field_delimiter** instead.
 
 Only need to be configured when file_format is text.

diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSourceConfig.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSourceConfig.java
index f92e06fa48e5..6e2818d4597f 100644
--- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSourceConfig.java
+++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSourceConfig.java
@@ -44,6 +44,7 @@ public class BaseSourceConfig {
             Options.key("field_delimiter")
                     .stringType()
                     .defaultValue(TextFormatConstant.SEPARATOR[0])
+                    .withFallbackKeys("delimiter")
                     .withDescription(
                             "The separator between columns in a row of data. Only needed by `text` file format");
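The `withFallbackKeys("delimiter")` call above is what keeps existing jobs that still set `delimiter` working while the docs steer users toward `field_delimiter`. A minimal sketch of how such an option resolves, assuming `ReadonlyConfig.fromMap` from the configuration API is available; the option below is a local stand-in for `BaseSourceConfig.FIELD_DELIMITER`, not the real field:

```java
import org.apache.seatunnel.api.configuration.Option;
import org.apache.seatunnel.api.configuration.Options;
import org.apache.seatunnel.api.configuration.ReadonlyConfig;

import java.util.Collections;
import java.util.Map;

public class FallbackKeySketch {
    // Stand-in mirroring BaseSourceConfig.FIELD_DELIMITER: the primary key is
    // "field_delimiter", and the legacy "delimiter" is registered as a fallback.
    static final Option<String> FIELD_DELIMITER =
            Options.key("field_delimiter")
                    .stringType()
                    .defaultValue("\u0001")
                    .withFallbackKeys("delimiter")
                    .withDescription("Separator between columns in a row of text data");

    public static void main(String[] args) {
        // A job config that still uses the legacy key ...
        Map<String, Object> raw = Collections.singletonMap("delimiter", "|");
        // ... should resolve through the fallback key rather than the default.
        System.out.println(ReadonlyConfig.fromMap(raw).get(FIELD_DELIMITER)); // |
    }
}
```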
diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/ExcelReadStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/ExcelReadStrategy.java
index 3f60d5da748e..3371580c17f8 100644
--- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/ExcelReadStrategy.java
+++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/ExcelReadStrategy.java
@@ -19,6 +19,7 @@
 
 import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper;
 
+import org.apache.seatunnel.api.configuration.ReadonlyConfig;
 import org.apache.seatunnel.api.source.Collector;
 import org.apache.seatunnel.api.table.type.SeaTunnelDataType;
 import org.apache.seatunnel.api.table.type.SeaTunnelRow;
@@ -233,7 +234,9 @@ private Object convert(Object field, SeaTunnelDataType<?> fieldType) {
             case BYTES:
                 return field.toString().getBytes(StandardCharsets.UTF_8);
             case ROW:
-                String delimiter = pluginConfig.getString(BaseSourceConfig.FIELD_DELIMITER.key());
+                String delimiter =
+                        ReadonlyConfig.fromConfig(pluginConfig)
+                                .get(BaseSourceConfig.FIELD_DELIMITER);
                 String[] context = field.toString().split(delimiter);
                 SeaTunnelRowType ft = (SeaTunnelRowType) fieldType;
                 int length = context.length;

diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/TextReadStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/TextReadStrategy.java
index 037ed4e007f2..816e50b57b8d 100644
--- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/TextReadStrategy.java
+++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/TextReadStrategy.java
@@ -18,6 +18,7 @@
 package org.apache.seatunnel.connectors.seatunnel.file.source.reader;
 
 import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode;
+import org.apache.seatunnel.api.configuration.ReadonlyConfig;
 import org.apache.seatunnel.api.serialization.DeserializationSchema;
 import org.apache.seatunnel.api.source.Collector;
 import org.apache.seatunnel.api.table.catalog.CatalogTableUtil;
@@ -49,6 +50,7 @@
 import java.io.InputStreamReader;
 import java.nio.charset.StandardCharsets;
 import java.util.Map;
+import java.util.Optional;
 
 @Slf4j
 public class TextReadStrategy extends AbstractReadStrategy {
@@ -162,8 +164,11 @@ public SeaTunnelRowType getSeaTunnelRowTypeInfo(HadoopConf hadoopConf, String pa
     public void setSeaTunnelRowTypeInfo(SeaTunnelRowType seaTunnelRowType) {
         SeaTunnelRowType userDefinedRowTypeWithPartition =
                 mergePartitionTypes(fileNames.get(0), seaTunnelRowType);
-        if (pluginConfig.hasPath(BaseSourceConfig.FIELD_DELIMITER.key())) {
-            fieldDelimiter = pluginConfig.getString(BaseSourceConfig.FIELD_DELIMITER.key());
+        Optional<String> fieldDelimiterOptional =
+                ReadonlyConfig.fromConfig(pluginConfig)
+                        .getOptional(BaseSourceConfig.FIELD_DELIMITER);
+        if (fieldDelimiterOptional.isPresent()) {
+            fieldDelimiter = fieldDelimiterOptional.get();
         } else {
             FileFormat fileFormat =
                     FileFormat.valueOf(
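On the read side, both strategies now resolve the delimiter through `ReadonlyConfig`, which understands fallback keys, instead of `hasPath`/`getString`, which only match the literal key. A hedged sketch of the difference, assuming the module's shaded typesafe-config imports (`ConfigFactory.parseMap` builds the kind of `Config` the readers hold as `pluginConfig`):

```java
import org.apache.seatunnel.shade.com.typesafe.config.Config;
import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory;

import org.apache.seatunnel.api.configuration.ReadonlyConfig;
import org.apache.seatunnel.connectors.seatunnel.file.config.BaseSourceConfig;

import java.util.Collections;
import java.util.Optional;

public class DelimiterLookupSketch {
    public static void main(String[] args) {
        // Legacy-style plugin config: only the old "delimiter" key is set.
        Config pluginConfig =
                ConfigFactory.parseMap(Collections.singletonMap("delimiter", "|"));

        // The old lookup misses the value, so the reader used to fall back to defaults.
        System.out.println(
                pluginConfig.hasPath(BaseSourceConfig.FIELD_DELIMITER.key())); // false

        // The new lookup consults the fallback key and finds "|".
        Optional<String> fieldDelimiter =
                ReadonlyConfig.fromConfig(pluginConfig)
                        .getOptional(BaseSourceConfig.FIELD_DELIMITER);
        System.out.println(fieldDelimiter.orElse("<format default>")); // |
    }
}
```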
diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/cos_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/cos_excel_to_assert.conf
index b71709318ec5..733d393340c3 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/cos_excel_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/cos_excel_to_assert.conf
@@ -34,7 +34,7 @@ source {
     region = "ap-chengdu"
     result_table_name = "fake"
     file_format_type = excel
-    delimiter = ;
+    field_delimiter = ;
     skip_header_row_number = 1
     schema = {
       fields {

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_excel_projection_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_excel_projection_to_assert.conf
index c271a0486a6b..1bb682342231 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_excel_projection_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_excel_projection_to_assert.conf
@@ -37,7 +37,7 @@ source {
     path = "/tmp/seatunnel/read/excel"
     result_table_name = "ftp"
     file_format_type = excel
-    delimiter = ;
+    field_delimiter = ;
     read_columns = [c_string, c_boolean]
     skip_header_row_number = 1
     schema = {

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_excel_to_assert.conf
index b25e8ab1ac0c..80ebf1577f3c 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_excel_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_excel_to_assert.conf
@@ -37,7 +37,7 @@ source {
     path = "/tmp/seatunnel/read/excel"
     result_table_name = "ftp"
     file_format_type = excel
-    delimiter = ;
+    field_delimiter = ;
     skip_header_row_number = 1
     schema = {
       fields {

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_filter_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_filter_excel_to_assert.conf
index 6af42f6f3d62..e881c39af048 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_filter_excel_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_filter_excel_to_assert.conf
@@ -37,7 +37,7 @@ source {
     path = "/tmp/seatunnel/read/excel_filter"
     result_table_name = "ftp"
     file_format_type = excel
-    delimiter = ;
+    field_delimiter = ;
     skip_header_row_number = 1
     file_filter_pattern = "e2e_filter.*"
     schema = {

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java
index 8132a9c7f590..bb80160f14b0 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java
@@ -61,6 +61,11 @@ public class LocalFileIT extends TestSuiteBase {
                 "/seatunnel/read/text/name=tyrantlucifer/hobby=coding/e2e.txt",
                 container);
 
+        ContainerUtil.copyFileIntoContainers(
+                "/text/e2e_delimiter.txt",
+                "/seatunnel/read/text_delimiter/e2e.txt",
+                container);
+
         Path txtLzo = convertToLzoFile(ContainerUtil.getResourcesFile("/text/e2e.txt"));
         ContainerUtil.copyFileIntoContainers(
                 txtLzo, "/seatunnel/read/lzo_text/e2e.txt", container);
@@ -97,6 +102,7 @@ public void testLocalFileReadAndWrite(TestContainer container)
         // test write local text file
         helper.execute("/text/fake_to_local_file_text.conf");
         helper.execute("/text/local_file_text_lzo_to_assert.conf");
+        helper.execute("/text/local_file_delimiter_assert.conf");
         // test read skip header
         helper.execute("/text/local_file_text_skip_headers.conf");
         // test read local text file

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_projection_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_projection_to_assert.conf
index df6749f718cc..65e4424fe043 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_projection_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_projection_to_assert.conf
@@ -30,7 +30,7 @@ source {
     path = "/seatunnel/read/excel"
     result_table_name = "fake"
     file_format_type = excel
-    delimiter = ;
+    field_delimiter = ;
     read_columns = [c_string, c_boolean]
     skip_header_row_number = 1
     schema = {

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_to_assert.conf
index 1160ac5f25b1..87a62367fc6a 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_excel_to_assert.conf
@@ -30,7 +30,7 @@ source {
     path = "/seatunnel/read/excel"
     result_table_name = "fake"
     file_format_type = excel
-    delimiter = ;
+    field_delimiter = ;
     skip_header_row_number = 1
     schema = {
       fields {

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_filter_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_filter_excel_to_assert.conf
index 86039b44dbfe..c47c8c5f0d63 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_filter_excel_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_filter_excel_to_assert.conf
@@ -30,7 +30,7 @@ source {
     path = "/seatunnel/read/excel_filter"
     result_table_name = "fake"
     file_format_type = excel
-    delimiter = ;
+    field_delimiter = ;
     skip_header_row_number = 1
     file_filter_pattern = "e2e_filter.*"
     schema = {
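The fixture below is `|`-separated, and its fields follow the schema order `c_map, c_array, c_string, c_boolean, …` declared in the assert config that comes after it, so the projection `read_columns = [c_string, c_boolean]` should land on the third and fourth fields of every row. A hypothetical spot-check on a shortened stand-in row (the real rows carry 15 fields):

```java
import java.util.regex.Pattern;

public class DelimiterFixtureSpotCheck {
    public static void main(String[] args) {
        // Shortened stand-in for one row of e2e_delimiter.txt.
        String row = "qwerqwer|1972607327|qwer|true|108|22432";

        // String.split expects a regex, so the literal pipe must be quoted.
        String[] fields = row.split(Pattern.quote("|"));

        // Schema order: c_map, c_array, c_string, c_boolean, ...
        System.out.println(fields[2]); // qwer -> matches the Assert rule equals_to "qwer"
        System.out.println(fields[3]); // true -> matches the Assert rule equals_to true
    }
}
```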
diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/e2e_delimiter.txt b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/e2e_delimiter.txt
new file mode 100644
index 000000000000..b87687448ca6
--- /dev/null
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/e2e_delimiter.txt
@@ -0,0 +1,5 @@
+qwerqwer|1972607327106509113020400507301104442513849629249|qwer|true|108|22432|11383204|723560014108175360|3.1407707E38|1.262116635132156E308|zlmzw|2023-05-25|97236477433882034782.803540569732795689|2023-03-25 04:30:13|qwerqwer1458583961104266156763552401211382922561937221393qwertrue930925142792030530244095935039344647.838737E373.3238256808030654E307Zicjq2023-10-1918739344608215707574.2737367351403166822023-10-07 08:24:27
+qwerqwer|20734545375230101131603368534223532992574063143|qwer|true|99|21567|768189694|8504422836686883840|1.3761162E38|5.460153079423635E307|dkCwG|2023-05-19|83044404421834652395.960138696348105704|2023-03-24 10:48:12|qwerqwer2774295104069855819185865051778415509162817756qwerfalse1619571127265647324402356645454202881.8446726E381.7000909191489263E308cXxQV2023-07-2713431695514477025331.5815661990272672962023-12-22 12:26:16
+qwerqwer|11147903451235598576860383707165213199232994316|qwer|true|49|21122|1110303282|2083282743100007424|1.9729736E38|1.0399541425415623E308|muvcN|2023-08-13|68941603382218317993.487441177291093700|2023-04-06 02:40:57|qwerqwer69745783829424948385550024313502468211004949206qwertrue117227855844811138143962162044856324.844609E374.992962483991954E307pPYZS2023-05-1751345924758748590630.6631664051742477762023-12-10 19:23:26
+qwerqwer|12600145717385486047323762331460409881387559257|qwer|true|54|30782|475296705|6520650210788816896|3.253564E38|1.181636072812166E308|RxBAU|2023-03-14|94882795877228509625.376060071805770292|2023-02-25 15:29:26|qwerqwer17078206571395918506189177703116985975671620089209qwerfalse11415353139002758476082670167752366081.4806856E385.82327433457546E307ppTVu2023-10-2784302780955330822761.6237458260160280852023-08-23 09:26:16
+qwerqwer|10811140972103212018816962034437650301336224152|qwer|true|82|27637|1110251085|806786601324796928|7.711023E37|4.398648945575819E307|kGVbL|2023-04-26|80164231813502964946.202647535547152674|2023-04-15 05:22:59|qwerqwer800727634149093075168463891515323059061714847070qwertrue351280654957024134756885372412119043.0538885E384.631561190310559E306leTTG2023-11-1490016690865756655359.8578360402194859042023-08-23 10:30:18
\ No newline at end of file

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_delimiter_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_delimiter_assert.conf
new file mode 100644
index 000000000000..2051010b0fed
--- /dev/null
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_delimiter_assert.conf
@@ -0,0 +1,104 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+env {
+  execution.parallelism = 1
+  spark.app.name = "SeaTunnel"
+  spark.executor.instances = 2
+  spark.executor.cores = 1
+  spark.executor.memory = "1g"
+  spark.master = local
+  job.mode = "BATCH"
+}
+
+source {
+  LocalFile {
+    path = "/seatunnel/read/text_delimiter"
+    schema = {
+      fields {
+        c_map = "map<string, string>"
+        c_array = "array<int>"
+        c_string = string
+        c_boolean = boolean
+        c_tinyint = tinyint
+        c_smallint = smallint
+        c_int = int
+        c_bigint = bigint
+        c_float = float
+        c_double = double
+        c_bytes = bytes
+        c_date = date
+        c_decimal = "decimal(38, 18)"
+        c_timestamp = timestamp
+        c_row = {
+          c_map = "map<string, string>"
+          c_array = "array<int>"
+          c_string = string
+          c_boolean = boolean
+          c_tinyint = tinyint
+          c_smallint = smallint
+          c_int = int
+          c_bigint = bigint
+          c_float = float
+          c_double = double
+          c_bytes = bytes
+          c_date = date
+          c_decimal = "decimal(38, 18)"
+          c_timestamp = timestamp
+        }
+      }
+    }
+    file_format_type = "text"
+    read_columns = [c_string, c_boolean]
+    delimiter = "|"
+    result_table_name = "fake"
+  }
+}
+
+sink {
+  Assert {
+    rules {
+      row_rules = [
+        {
+          rule_type = MAX_ROW
+          rule_value = 5
+        }
+      ],
+      field_rules = [
+        {
+          field_name = c_string
+          field_type = string
+          field_value = [
+            {
+              equals_to = "qwer"
+            }
+          ]
+        },
+        {
+          field_name = c_boolean
+          field_type = boolean
+          field_value = [
+            {
+              equals_to = true
+            }
+          ]
+        }
+      ]
+    }
+  }
+}
+

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_text_lzo_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_text_lzo_to_assert.conf
index 80613ec0fcca..eb92936aadd6 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_text_lzo_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/text/local_file_text_lzo_to_assert.conf
@@ -28,7 +28,6 @@ env {
 source {
   LocalFile {
     path = "/seatunnel/read/lzo_text"
-    row_delimiter = "\n"
     partition_dir_expression = "${k0}=${v0}"
     is_partition_field_write_in_file = true
     file_name_expression = "${transactionId}_${now}"

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_excel_projection_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_excel_projection_to_assert.conf
index 356c0a8114c0..bc55ed12c4d4 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_excel_projection_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_excel_projection_to_assert.conf
@@ -37,7 +37,7 @@ source {
     path = "tmp/seatunnel/read/excel"
     result_table_name = "sftp"
     file_format_type = excel
-    delimiter = ;
+    field_delimiter = ;
     read_columns = [c_string, c_boolean]
     skip_header_row_number = 1
     schema = {

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_excel_to_assert.conf
index 0031b320980a..606f04ecab65 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_excel_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_excel_to_assert.conf
@@ -37,7 +37,7 @@ source {
     port = 22
     user = seatunnel
     password = pass
-    delimiter = ";"
+    field_delimiter = ";"
     skip_header_row_number = 1
     schema = {
       fields {

diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_filter_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_filter_excel_to_assert.conf
index b6cd92f712a5..6125ac9537bd 100644
--- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_filter_excel_to_assert.conf
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_filter_excel_to_assert.conf
@@ -37,7 +37,7 @@ source {
     port = 22
     user = seatunnel
     password = pass
-    delimiter = ";"
+    field_delimiter = ";"
     file_filter_pattern = "e2e_filter.*"
     skip_header_row_number = 1
     schema = {