From e2ef3ad65dd33fbac655977dac3f4097c58c8015 Mon Sep 17 00:00:00 2001 From: Jia Fan Date: Tue, 29 Aug 2023 14:12:57 +0800 Subject: [PATCH] Merge 2.3.3 dev to business-dev (#292) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * [Feature][Connector V2] expose configurable options in Cassandra (#3681) * [Connector-V2][Paimon] Introduce paimon connector (#4178) * [Improve][Zeta] Improve Zeta operation max count and ignore NPE (#4787) * [Improve][Zeta] Improve Zeta operation max count and ignore NPE * [Improve][Zeta] Improve Zeta operation max count and ignore NPE * [Improve][Zeta] Cancel pipeline add retry to avoid cancel failed. (#4792) * [Hotfix][CDC] Fix chunk start/end parameter type error (#4777) Incorrect wrapping as Array types, but only Array type required * [Feature][Zeta] Add OSS support for Imap storage to cluster-mode type (#4683) * Add OSS/S3 to cluster-mode type #4621 * fixed bug & add e2e test * Wait for the node to start before scheduling & Move jar to parent pom & optimize writer * update LICENSE * [Hotfix][CI] Fix error repository name in ci config files (#4795) * [Feature][Json-format] support read format for pulsar (#4111) * [Improve][Connector-V2][Jdbc-Sink][Doc] Add the generate sink sql par… (#4797) * [Improve][Connector-V2][Jdbc-Sink][Doc] Add the generate sink sql parameter for the jdbc sink document * [Docs][Connector-V2][Mysql] fix Mysql sink format doc (#4800) * [Hotfix][Connector][Jdbc] Fix sqlserver system table case sensitivity (#4806) * [Hotfix][Connector][Jdbc] Fix reconnect throw close statement exception (#4801) * [Hotfix][Connector-V2][Jdbc] Fix the error of extracting primary key column in sink (#4815) * [Feature][Connector-v2] Add Snowflake Source&Sink connector (#4470) --------- Co-authored-by: Eric Co-authored-by: hailin0 * [Hotfix][CI] Fix redundant modules run e2e tests when change jdbc module (#4824) * fix pom.xml code style (#4836) * [Chore] Format the .conf file using the same style (#4830) * [Hotfix][Zeta] Fix cpu load problem (#4828) * [Improve][Zeta] Reduce the number of IMAPs used by checkpointIdCounter (#4832) * [Bugfix][connector-v2][rabbitmq] Fix reduplicate ack msg bug and code style (#4842) --------- Co-authored-by: 毕博 * [Improve][Zeta] async execute checkpoint trigger and other block method (#4846) * [Improve][Zeta] async execute checkpoint trigger * [Bug][Zeta] Fix zeta cannot normally recycle thread belong to abnormal tasks * [Improve][Zeta] Move `restoreState` add `addSplitsBack` execute by TaskExecuteService * [Improve][Zeta] Move `receivedReader` execute by TaskExecuteService * [Bug][Zeta] Fix task `notifyTaskStatusToMaster` failed when job not running or failed before run (#4847) * [Bug][Zeta] Fix task repeat notify failed when job not running * [Bug][Zeta] Fix notifyTaskStatusToMaster not release lock and NPE * [Improve][Zeta] Reduce the frequency of fetching data from imap (#4851) * [Improve][Zeta] Add Metaspace size default value to config file (#4848) * [Improve][Zeta] Speed up listAllJob function (#4852) * [Bug][Zeta] Fix TaskGroupContext always hold classloader so classloader can't recycle (#4849) * [Improve][Zeta] Fix engine runtime error (#4850) * [Hotfix][Zeta] Fix completePendingCheckpoint concurrent action (#4854) This operation does not allow concurrent execution * [Hotfix][Zeta] Fix master active bug (#4855) * [Bugfix][DAG] Fix the incorrect setting of transform parallelism (#4814) * [Hotfix][Zeta] fix pipeline state not right bug (#4823) * [BUG][Doris] Add a jobId to the doris
label to distinguish between tasks (#4853) Co-authored-by: zhouyao * [Improve] Add a jobId to the doris label to distinguish between tasks (#4839) Co-authored-by: zhouyao * [Hotfix][Zeta] Fix IMap operation timeout bug (#4859) * [Bug][Zeta] Fix restoreComplete Future can't be completed when cancel task (#4863) * [Feature][SQL Transform]Add catalog support for SQL Transform plugin (#4819) * [improve][SelectDB] Add a jobId to the selectDB label to distinguish between tasks (#4864) Co-authored-by: zhouyao * [Hotfix][Connector-v2][kafka] Fix the short interval of pull data settings and revise the format (#4875) * [Bug][Connector-V2][Doris] update last checkpoint id when doing snapshot (#4881) * [Hotfix][Zeta] Fix deploy operation timeout but task already finished bug (#4867) * [Core][Docs]Remove incubator in README file (#4882) * [Bugfix][CDC Base] Solving the ConcurrentModificationException caused by snapshotState being modified concurrently. (#4877) * [improve][CDC base] Implement Sample-based Sharding Strategy with Configurable Sampling Rate (#4856) * [Improve][Zeta] Reduce the operation count of imap_running_job_metrics (#4861) * [Bug][Zeta] Fix TaskExecutionService will return not active ExecutionContext (#4869) * [Hotfix][Jdbc] Fix XA DataSource crash(Oracle/Dameng/SqlServer) (#4866) * [Bugfix] [Connector-V2] [File] Fix read temp file (#4876) Co-authored-by: wantao * [Bug][Zeta] Fix TaskExecutionService synchronized lock will not release (#4886) * [Improve][Zeta] Move driver into lib directory and change operation count (#4845) * [hotfix][kafka] Fix the problem that the partition information cannot be obtained when kafka is restored (#4764) * [Bugfix][zeta] Fix the deadlock issue with JDBC driver loading (#4878) * [Chore] update 2.3.2 release-note.md (#4892) * [Improve][Connector-V2][Jdbc-Source] Support for Decimal types as split keys (#4634) * [Improve][Connector-V2][Jdbc-Source]Support Compatible Mysql bigint(20) used as a partition_column #4634 Co-authored-by: zhilinli * [Bug][connector-v2][doris] add streamload Content-type for doris URLdecode error (#4880) * [Chore] Change repository name from incubator-seatunnel to seatunnel (#4868) --------- Co-authored-by: Jia Fan * [Improve][connector-V2-Neo4j]Supports neo4j sink batch write and update docs (#4841) * [Hotfix][connector-v2][e2e] Fix maven scope (#4901) * quick-start-seatunnel-engine.md (#4943) * fix error (#4888) * [Hotfix][Connector-V2][ClickhouseFile] Fix ClickhouseFile write file failed when field value is null (#4937) * Update ClickhouseFileSinkWriter.java Bug fix: When ClickhouseFileSinkWriter writes to a temporary file, it does not check whether the field value is empty, so an exception will be thrown. Modified to write an empty string when a null value is encountered * Update ClickhouseFileSinkWriter.java repair code style * Update ClickhouseFileSinkWriter.java code style * [Improve][Zeta] Add an interface for batch retrieval of JobMetrics (#4576) * [Improve] Documentation and partial word optimization. (#4936) * code format * add cdc feature * fix cdc can not get driver error --------- Co-authored-by: gdliu3 * [Doc][Connector-V2] StarRocks `nodeUrls` property name fix (#4951) node_urls -> nodeUrls node_urls doesn't work * [Feature][E2E][FtpFile] add ftp file e2e test case (#4647) * [WIP][Feature][Connector-e2e] add ftp e2e test * Let e2e barely execute by excluding the commons-net jar package.
* Resolve the maven conflict --------- Co-authored-by: hailin0 * [Hotfix][Connector-V2][StarRocks] Fix code style (#4966) * [Hotfix][Connector-v2][HbaseSink]Fix default timestamp (#4958) * [Doc]Change the transform website url (#4954) * [Docs][Connector-V2][Http]Reconstruct the Http connector document (#4962) Co-authored-by: chenzy15 * [Feature][connector-v2][mongodb] mongodb support cdc sink (#4833) * [Bug] [zeta][starter]fix bug (#4983) (#4984) Co-authored-by: wsstony * fix redis nodes format error. (#4981) Co-authored-by: lightzhao * [Improve][CDC]Remove driver for cdc connector (#4952) * [Hotfix][Connector-V2][Mongodb] Fix document error content and remove redundant code (#4982) Co-authored-by: chenzy15 * [Improve][Connector-V2][OSS-Jindo] Optimize jindo oss connector (#4964) * [Improve][Connector-V2][Jindo-Oss] Optimize jindo-oss connector * [Improve][Connector-V2][Jindo-Oss] Update module name * [Hotfix][Connector-V2][StarRocks] Fix code style * [bugfix] Upgrade the key log output level(#4993) * [Feature][Zeta] Configuration files support user variable replacement (#4969) * [Feature][Transform-V2][SQL] Support 'select *' and 'like' clause for SQL Transform plugin (#4991) Co-authored-by: mcy * [Improve][CDC]change driver scope to provider (#5002) * [Hotfix][Connector-V2][Hive] Support user-defined hive-site.xml (#4965) * [Improve][Connector-v2][Mongodb]Optimize reading logic (#5001) Co-authored-by: chenqqq11 * [Feature][Connector-V2][Clickhouse] clickhouse writes with checkpoints (#4999) * [Hotfix][Connector-V2][Mongodb] Compatible with historical parameters (#4997) * Split updated modules integration test for part 4 (#5028) * [Hotfix] Fix the CI Job name error (#5032) * [Feature][CDC] Support disable/enable exactly once for INITIAL (#4921) * [bugfix][zeta] Fixed multi-table job data loss and latency issues (#149) (#5031) * [Hotfix][CDC] Fix jdbc connection leak for mysql (#5037) * [Bugfix][zeta] Fix cdc connection does not close (#4922) * Fix XA Transaction bug (#5020) * Set Up with Kubernetes, dockerfile document error in constructing docker image (#5022) Co-authored-by: yctan <1417983443@qq.com> * [Improve][Connector-v2][Mongodb]sink support transaction update/writing (#5034) * fix:the HdfsStorage can not delete checkpoint file #5046 (#5054) * [BugFix] [Connector-V2] [MySQL-CDC] serverId from int to long (#5033) (#5035) * [bugfix] change MySQL CDC serverId from int to long (#5033) * style: 🎨 optimize code style * [Feature][Connector-V2][cdc] Change the time zone to the default time zone (#5030) * [Bugfix][connector-cdc-mysql] Fix listener not released when BinlogClient reuse (#5011) * [Feature][Connector-V2][Jdbc] Add oceanbase dialect factory (#4989) --------- Co-authored-by: silenceland Co-authored-by: changhuyan <877018069@qq.com> * [HotFix][Zeta] fix after the savepoint job is restored, the checkpoint file cannot be generated #4985 (#5051) * fix after the savepoint job is restored, the checkpoint file cannot be generated * fix class not found exception (#5063) * [Feature] update action config to support run CI on fork repo (#5065) * [Bugfix]fix clickhouse source connector read Nullable() type is not null,example:Nullable(Float64) while value is null the result is 0.0 (#5080) * [Feature][Connector-V2][Clickhouse] Add clickhouse connector time zone key,default system time zone (#5078) * Add clickhouse connector time zone key,default system time zone * Modify the document and add clickhouse server_time_zone configuration * [Chore] Modify repeat des (#5088) Co-authored-by: 80597928 
* [Docs] Add Value types in Java to Schema feature (#5087) * [Feature][Connector-V2] JDBC source support string type as partition key (#4947) * [HotFix] Fix code style (#5092) * [Docs][Zeta] Add savepoint doc (#5081) * [Feature][connector-v2][mongodbcdc]Support source mongodb cdc (#4923) * [Improve] Improve savemode api (#4767) * [Doc] Improve DB2 Source Vertica Source & DB2 Sink Vertica Sink document (#5102) * [Improve][Docs][Clickhouse] Reconstruct the clickhouse connector doc (#5085) --------- Co-authored-by: chenzy15 * [Pom]update version to 2.3.3-SNAPSHOT (#5043) * update version to 2.3.3-SNAPSHOT * update dependency version in known dependencies file * Add logs to find job restore from master active switch error * [Feature][Connector-V2][mysql cdc] Conversion of tinyint(1) to bool is supported (#5105) Co-authored-by: zhouyao * [Improve][Zeta] Add sleep for Task to reduce CPU cost (#5117) * [Feature][JDBC Sink] Add DM upsert support (#5073) --------- Co-authored-by: David Zollo * [Hotfix][Connector][Jdbc] Fix the problem of JdbcOutputFormat database connection leak (#4802) [Hotfix][Connector][Jdbc] Fix the problem of JdbcOutputFormat database connection leak * [Hotfix]Fix mongodb cdc e2e instability (#5128) Co-authored-by: chenzy15 * [Hotfix][Zeta] Fix task state memory leak (#5139) * [Hotfix][Zeta] Fix checkpoint error report without msg (#5137) * [Improve][Zeta] Improve CheckpointCoordinator notify complete when restore (#5136) * [Improve] Improve CheckpointCoordinator notify complete when restore * update * [Improve][Zeta] Improve CheckpointCoordinator log error when report error from task (#178) (#5134) * [Hotfix][Zeta] Fix MultipleTableJobConfigParser ignore env option (#5067) * [Fix][Zeta] Fix MultipleTableJobConfigParser ignore env option * update * [Improve][Connector][File] Optimize files commit order (#5045) Previously a `HashMap` stored the file paths, so files were committed out of order at every checkpoint. Switched to a `LinkedHashMap` to ensure files are committed in the order they were generated * [Hotfix][Mongodb cdc] Solve startup resume token is negative (#5143) --------- Co-authored-by: chenzy15 * [Feature][connector][kafka] Support read debezium format message from kafka (#5066) * [Feature][CDC] Support tables without primary keys (with unique keys) (#163) (#5150) * [Feature][Connector-V2][CDC] Support string type shard fields.
(#5147) * [feature][CDC base] Supports string type shard fields * Delete invalid code * [Feature][Connector-V2][File] Add cos source&sink (#4979) * [Feature][Connector-V2][File] Add cos sink * update doc&e2e and add pom file header * add e2e file header and config * add file-cos module into dist pom.xml * [Feature][Connector-V2][File] Add cos source --------- Co-authored-by: dengd1937 * [Fix][Zeta] Fix SinkFlowLifeCycle without init lastCommitInfo (#5152) * [Hotfix][MongodbCDC]Refine data format to adapt to universal logic (#5162) Co-authored-by: chenzy15 * [Chore] Update bug-report.yml (#5160) * [Improve][CDC] support exactly-once of cdc and fix the BinlogOffset comparing bug (#5057) * [Improve][CDC] support exactly-once of cdc, fix the BinlogOffset comparing bug * [Improve][CDC] adjust code style * [Improve][CDC] fix ci error --------- Co-authored-by: happyboy1024 <296442618@qq.com> * [Docs][Connector-V2][Hudi] Reconstruct the Hudi connector document (#4905) * [Docs][Connector-V2][Hudi] Reconstruct the Hudi connector document --------- Co-authored-by: zhouyao * [Docs][Connector-V2][Doris] Reconstruct the Doris connector document (#4903) * [Docs][Connector-V2][Doris] Reconstruct the Doris connector document --------- Co-authored-by: zhouyao * [improve] [CDC Base] Add some split parameters to the optionRule (#5161) * [bugfix] [File Base] Fix Hadoop Kerberos authentication related issues. (#5171) * [CI] add code style check when docs changed (#5183) * [Bug][Translation][Spark] Fix SeaTunnelRowConvertor fail to convert when schema contains row type. (#5170) * [Improve][Zeta] Move checkpoint notify complete in checkpoint stage (#5185) * [Feature][Catalog] Add JDBC Catalog auto create table (#4917) * [Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. (#5153) * [Feature][Connector V2][File] Add config of 'file_filter_pattern', which used for filtering files. * [Improve][Connector-v2][Jdbc] check url not null throw friendly message (#5097) * check url not null throw friendly message * check jdbc source config * modify jdbc validate method --------- Co-authored-by: 80597928 Co-authored-by: 80597928 <673421862@qq.com> * [bugfix][zeta] Fix the issue of two identical IDs appearing when executing seatunnel.sh -l as the job resumes (#5191) * [Improve][Docs][Kafka]Reconstruct the kafka connector document (#4778) * [Docs][Connector-V2][Kafka]Reconstruct the kafka connector document --------- Co-authored-by: chenzy15 * [Bug][Improve][LocalFileSink]Fix LocalFile Sink file_format_type. (#5118) * [Bug] [connector-v2] PostgreSQL versions below 9.5 are compatible use cdc sync problem (#5120) * [e2e] kafka e2e error (#5200) * [Hotfix][Connector-V2][JindoOssFile] Fix plugin-mapping.properties (#5215) Co-authored-by: tyrantlucifer * [Improve][Zeta] Don't trigger handleSaveMode when restore (#5192) * move imap storage file dependency packages to submodules (#5218) * [Hotfix][CI]Declare files that will always have UNIX line endings on checkout. 
(#5221) * [Hotfix][Connector-V2][Paimon] Bump paimon-bundle version to 0.4.0-incubating (#5219) * [Docs][Connector-V2][PostgreSQL] Refactor connector-v2 docs using unified format PostgreSQL #4590 (#4757) * [Docs][Connector-V2][PostgreSQL] Refactor connector-v2 docs using unified format PostgreSQL * [Docs] Fix Dockerfile and seatunnel-flink.yaml in Set Up with Kubernetes (#4793) * [Docs] update seatunnel-flink.yaml and Dockerfile to help the demo work * [Docs] update release-note #4788 --------- Co-authored-by: flynnxue Co-authored-by: ic4y <83933160+ic4y@users.noreply.github.com> * [feature][doris] Doris factory type (#5061) * [feature][doris] Web need factory and data type convertor * [Fix] Update the Readme (#4968) Use a better description for the SeaTunnel Project * [CI] Split updated modules integration test for part 5 (#5208) * [CI] Split updated modules integration test for part 5 * [CI] Split updated modules integration test for part 5 * Split e2e * update json-smart * fix dm error * fix dm error * fix dm error * fix dm error * fix dm error * fix dm error * fix dm error * revert code * revert code * revert code --------- Co-authored-by: gdliu3 * [Feature][CDC][Zeta] Support schema evolution framework(DDL) (#5125) * Fixed IMap file storage e2e bug (#5237) * [Improve] [Connector-V2] Remove scheduler in JDBC sink #4736 (#5168) --------- Co-authored-by: gdliu3 * [Doc] [JDBC Oracle] Add JDBC Oracle Documentation (#5239) * [Feature][Zeta][REST-API]Add REST API To Submit Job (#5107) * [Fix] Update the project description (#4967) * Update the project description * [Feature][Zeta] Support history service record job execute error (#5114) * fix:hdfs Checkpoint Storage management fails to delete historical files * fix:hdfs Checkpoint Storage management fails to delete historical files * fix after the savepoint job is restored, the checkpoint file cannot be generated * [Feature][Zeta] Support history service record job execute error * Improve Jobstate-related class additions add serialVersionUID * add e2e test * [hotfix]Update .asf.yaml (#5242) * Update .asf.yaml * [Hotfix]Fix array index anomalies caused by #5057 (#5195) * [bugfix] [savepoint test] Turn on the testSavepoint test. (#5199) * [BUG][Connector-V2][Jdbc] support postgresql json type (#5194) * add Postgresql json type Co-authored-by: 80597928 <673421862@qq.com> * [Bugfix][cdc] Fix mysql bit column to java byte (#4817) * [Bugfix][AmazonDynamoDB] Fix the problem that all table data cannot be obtained (#5146) * [Docs][Connector][Source][jdbc]Change the line boundary store value type to BigDecimal (#4900) * [bug][jdbc][oracle]Fix the Oracle number type mapping problem (#5209) * [Bugfix][zeta] Fix the serialization issue of GetMetricsOperation during multi-node operation. (#5206) * [Hotfix][Zeta] Avoid Redundant Job Submissions by Checking Job Status (#5229) * [Bugfix][zeta] Fixed the issue of duplicated metrics caused by job fault tolerance or restore. (#5214) * [Improve] [CDC Base] Add a fast sampling method that supports character types (#5179) * fixed zeta ci error (#5254) * [Doc][README] Remove useless github workflow, and adjust description of 'engineering structure'.
(#4305) * [Feature][Zeta]The expiration time of a historical Job can be config (#5180) * fix:hdfs Checkpoint Storage management fails to delete historical files Co-authored-by: hailin0 * [bugfix] [e2e] Fixed a minor bug (#5274) * [Improve][SQL] Support use catalogTableName as SQL expression (#5273) * [Doc] Improve S3File Source & S3File Sink document (#5101) * Improve S3File Source & S3File Sink document * Fix style error (#5280) * Fix StarRocksJsonSerializer will transform array/map/row to string (#5281) * [Docs][Connector-V2][MyHours]Reconstruct the MyHours connector document (#5129) * [Docs][Connector-V2][MyHours]Reconstruct the MyHours connector document * fix format * fix format * [Improve][API & Zeta] Using connector custom serializer encode/decode states (#5238) * API: Using DefaultSerializer as connector sink default serializer * Zeta: Using connector custom serializer encode/decode states * [Feature][Connector-V2] connector-kafka source support data conversion extracted by kafka connect source (#4516) * Compatible kafka connect json #4137 * [Improve][CI/CD] Remove 'paths-ignore', enable the code style check for markdown files. (#5286) * [Bugfix][zeta] Resolved the issue causing checkpoints to halt on tolerable-failure=0. (#5263) * [Bugfix][zeta] Resolved the issue causing checkpoints to halt on tolerable-failure=0. * remove max-concurrent * [Feature][Connector-v2][RedisSink]Support redis to set expiration time. (#4975) * Support redis to set expiration time. * Set redis expire default value. * add e2e test. * add e2e test. * modify config file name. --------- Co-authored-by: lightzhao * [bugfix] Fix testGetErrorInfo case error (#5282) * [Feature][Zeta] Checkpoint support hdfs ha mode (#4942) * fix browser long type intercept (#5267) Co-authored-by: 80597928 <673421862@qq.com> * [Docs] remove `incubating` keyword in document (#5257) * [feature][web] hive add option because web need (#5154) * [feature][web] hive add option because web need * [feature][web] hive add option read_columns * [feature][web] required update optional * [bugfix] mvn spotless * fix conf * fix conf --------- Co-authored-by: liuli * [Bug][flink-runtime][connectors-v2] Flink register table Environment The running mode is set to`job.mode` (#4826) * [Docs][Connector-V2][StarRocks]Reconstruct the StarRocks connector document (#5132) * [Docs][Connector-V2][StarRocks]Reconstruct the StarRocks connector document * [Improve][Connector-v2][HiveSink]remove drop partition when abort. (#4940) Co-authored-by: lightzhao Co-authored-by: liuli Co-authored-by: ic4y <83933160+ic4y@users.noreply.github.com> * [Docs][Connector-V2][SelectDB-Cloud]Reconstruct the SelectDB-Cloud connector document (#5130) * [Docs][Connector-V2][SelectDB-Cloud]Reconstruct the SelectDB-Cloud connector document * fix codestyle --------- Co-authored-by: liuli * [Docs][Connector-V2][HDFS]Refactor connector-v2 docs using unified format HDFS. (#4871) * Refactor connector-v2 docs using unified format HDFS. * add data type. * update. * add key feature. * add hdfs_site_path * 1.add data type. 2.add hdfs_site_path conf. * add data type. * add hdfs site conf. 
--------- Co-authored-by: lightzhao Co-authored-by: liuli * [Improve] [Connector-V2] Remove scheduler in Tablestore sink (#5272) --------- Co-authored-by: gdliu3 * [BUG][Connector-V2][Mongo-cdc] Incremental data kind error in snapshot phase (#5184) * [BUG][Connector-V2][Mongo-cdc] Incremental data kind error in snapshot phase * [Hotfix] Fix com.google.common.base.Preconditions to seatunnel shade one (#5284) * [Merge] Fix merge conflict and fix jdbc fieldIde with compatibleMode confusion --------- Co-authored-by: Cason-ACE <35160064+cason0126@users.noreply.github.com> Co-authored-by: Tyrantlucifer Co-authored-by: hailin0 Co-authored-by: Xiaojian Sun Co-authored-by: Laglangyue <35491928+laglangyue@users.noreply.github.com> Co-authored-by: ZhilinLi Co-authored-by: ic4y <83933160+ic4y@users.noreply.github.com> Co-authored-by: Hao Xu Co-authored-by: Eric Co-authored-by: Bibo <33744252+531651225@users.noreply.github.com> Co-authored-by: 毕博 Co-authored-by: Carl-Zhou-CN <67902676+Carl-Zhou-CN@users.noreply.github.com> Co-authored-by: zhouyao Co-authored-by: Marvin <29311598@qq.com> Co-authored-by: monster <60029759+MonsterChenzhuo@users.noreply.github.com> Co-authored-by: gnehil Co-authored-by: TaoZex <45089228+TaoZex@users.noreply.github.com> Co-authored-by: xiaofan2012 <41982310+xiaofan2022@users.noreply.github.com> Co-authored-by: wantao Co-authored-by: Guangdong Liu <804167098@qq.com> Co-authored-by: zhilinli Co-authored-by: zhaifengbing Co-authored-by: dalong <60906603+alibabaMapengfei@users.noreply.github.com> Co-authored-by: FuYouJ <1247908487@qq.com> Co-authored-by: davidfans <136911434+davidfans@users.noreply.github.com> Co-authored-by: Fan Donglai Co-authored-by: gdliu3 Co-authored-by: DismalSnail Co-authored-by: lightzhao <40714172+lightzhao@users.noreply.github.com> Co-authored-by: chenzy15 Co-authored-by: wssmao <39487209+wssmao@users.noreply.github.com> Co-authored-by: wsstony Co-authored-by: lightzhao Co-authored-by: XiaoJiang521 <131635688+XiaoJiang521@users.noreply.github.com> Co-authored-by: mcy Co-authored-by: yctanGmail <138592845+yctanGmail@users.noreply.github.com> Co-authored-by: yctan <1417983443@qq.com> Co-authored-by: wu-a-ge Co-authored-by: 司马琦昂 Co-authored-by: happyboy1024 <137260654+happyboy1024@users.noreply.github.com> Co-authored-by: He Wang Co-authored-by: silenceland Co-authored-by: changhuyan <877018069@qq.com> Co-authored-by: Jarvis Co-authored-by: 阿丙 <50567478+gaopeng666@users.noreply.github.com> Co-authored-by: jackyyyyyssss <127465317+jackyyyyyssss@users.noreply.github.com> Co-authored-by: 80597928 Co-authored-by: Chengyu Yan Co-authored-by: zhangchengming601 <86779821+zhangchengming601@users.noreply.github.com> Co-authored-by: lihjChina <237206177@qq.com> Co-authored-by: David Zollo Co-authored-by: EchoLee5 <39044001+EchoLee5@users.noreply.github.com> Co-authored-by: dengdi <114273849+dengd1937@users.noreply.github.com> Co-authored-by: dengd1937 Co-authored-by: happyboy1024 <296442618@qq.com> Co-authored-by: FlechazoW <35768015+FlechazoW@users.noreply.github.com> Co-authored-by: 80597928 <673421862@qq.com> Co-authored-by: kun <66303359+Lifu12@users.noreply.github.com> Co-authored-by: Volodymyr Co-authored-by: javalover123 Co-authored-by: Volodymyr <770925351@qq.com> Co-authored-by: kksxf Co-authored-by: flynnxue Co-authored-by: fang <56808812+zhibinF@users.noreply.github.com> Co-authored-by: gejinxin <844156709@qq.com> Co-authored-by: Wenjun Ruan Co-authored-by: Koyfin <1040080742@qq.com> Co-authored-by: liuli --- .asf.yaml | 12 +- .gitattributes | 1 + 
.github/ISSUE_TEMPLATE/bug-report.yml | 6 +- .github/workflows/backend.yml | 140 ++++- DISCLAIMER | 2 +- README.md | 58 +-- config/hazelcast.yaml | 1 + config/seatunnel.yaml | 3 +- docs/en/concept/schema-feature.md | 38 +- .../kafka-compatible-kafkaconnect-json.md | 47 ++ docs/en/connector-v2/sink/Clickhouse.md | 207 ++++---- docs/en/connector-v2/sink/CosFile.md | 259 ++++++++++ docs/en/connector-v2/sink/DB2.md | 170 ++++++ docs/en/connector-v2/sink/Doris.md | 226 ++++++-- docs/en/connector-v2/sink/FtpFile.md | 18 +- docs/en/connector-v2/sink/HdfsFile.md | 324 +++++------- docs/en/connector-v2/sink/Jdbc.md | 97 ++-- docs/en/connector-v2/sink/Kafka.md | 151 +++--- docs/en/connector-v2/sink/LocalFile.md | 24 +- docs/en/connector-v2/sink/MongoDB.md | 8 +- docs/en/connector-v2/sink/Mysql.md | 3 +- docs/en/connector-v2/sink/OceanBase.md | 185 +++++++ docs/en/connector-v2/sink/Oracle.md | 191 +++++++ docs/en/connector-v2/sink/OssFile.md | 10 +- docs/en/connector-v2/sink/OssJindoFile.md | 10 +- docs/en/connector-v2/sink/PostgreSql.md | 203 ++++++++ docs/en/connector-v2/sink/Redis.md | 5 + docs/en/connector-v2/sink/S3-Redshift.md | 281 ++++++---- docs/en/connector-v2/sink/S3File.md | 243 ++++++--- docs/en/connector-v2/sink/SelectDB-Cloud.md | 228 +++++---- docs/en/connector-v2/sink/SftpFile.md | 10 +- docs/en/connector-v2/sink/Snowflake.md | 3 +- docs/en/connector-v2/sink/StarRocks.md | 216 ++++---- docs/en/connector-v2/sink/Tablestore.md | 1 - docs/en/connector-v2/sink/Vertica.md | 172 +++++++ docs/en/connector-v2/source/Clickhouse.md | 125 ++--- docs/en/connector-v2/source/CosFile.md | 294 +++++++++++ docs/en/connector-v2/source/DB2.md | 155 ++++++ docs/en/connector-v2/source/FtpFile.md | 3 +- docs/en/connector-v2/source/HdfsFile.md | 302 +++-------- docs/en/connector-v2/source/Hive.md | 27 +- docs/en/connector-v2/source/Hudi.md | 82 +-- docs/en/connector-v2/source/Jdbc.md | 14 +- docs/en/connector-v2/source/LocalFile.md | 7 +- docs/en/connector-v2/source/MongoDB-CDC.md | 41 +- docs/en/connector-v2/source/MongoDB.md | 5 + docs/en/connector-v2/source/MyHours.md | 215 ++++---- docs/en/connector-v2/source/MySQL-CDC.md | 2 +- docs/en/connector-v2/source/Mysql.md | 30 +- docs/en/connector-v2/source/OceanBase.md | 168 ++++++ docs/en/connector-v2/source/Oracle.md | 154 ++++++ docs/en/connector-v2/source/OssFile.md | 7 +- docs/en/connector-v2/source/OssJindoFile.md | 7 +- docs/en/connector-v2/source/PostgreSQL.md | 158 ++++++ docs/en/connector-v2/source/S3File.md | 309 +++++------ docs/en/connector-v2/source/SftpFile.md | 7 +- docs/en/connector-v2/source/Snowflake.md | 28 +- docs/en/connector-v2/source/Vertica.md | 157 ++++++ docs/en/connector-v2/source/kafka.md | 172 ++----- .../en/seatunnel-engine/checkpoint-storage.md | 30 +- docs/en/seatunnel-engine/deployment.md | 22 +- docs/en/seatunnel-engine/rest-api.md | 58 +++ docs/en/start-v2/kubernetes/kubernetes.mdx | 16 +- docs/en/start-v2/locally/deployment.md | 4 +- docs/en/start-v2/locally/quick-start-flink.md | 4 +- .../locally/quick-start-seatunnel-engine.md | 2 +- docs/en/start-v2/locally/quick-start-spark.md | 4 +- docs/sidebars.js | 1 + plugin-mapping.properties | 6 +- plugins/README.md | 2 + release-note.md | 7 + .../seatunnel/api/configuration/Options.java | 2 +- .../table/event/AlterTableColumnEvent.java | 17 + .../api/table/type/SeaTunnelRowType.java | 2 +- .../src/test/resources/conf/option-test.conf | 2 +- .../source/AmazonDynamoDBSourceReader.java | 33 +- .../base/config/JdbcSourceConfigFactory.java | 2 +- 
.../cdc/base/option/JdbcSourceOptions.java | 7 +- .../AbstractJdbcSourceChunkSplitter.java | 14 - .../enumerator/splitter/ChunkRange.java | 2 +- .../IncrementalSourceRecordEmitter.java | 6 +- .../IncrementalSourceScanFetcher.java | 9 +- .../IncrementalSourceStreamFetcher.java | 91 +++- ...lRowDebeziumDeserializationConverters.java | 2 + .../mongodb/config/MongodbSourceOptions.java | 7 +- .../source/fetch/MongodbFetchTaskContext.java | 56 +- .../cdc/mongodb/utils/MongodbRecordUtils.java | 47 +- .../cdc/mongodb/utils/ResumeToken.java | 11 +- .../config/MySqlSourceConfigFactory.java | 2 +- .../cdc/mysql/config/ServerIdRange.java | 26 +- .../cdc/mysql/source/offset/BinlogOffset.java | 8 +- .../fetch/MySqlSourceFetchTaskContext.java | 18 +- .../cdc/mysql/utils/MySqlTypeUtils.java | 1 + .../clickhouse/config/ClickhouseConfig.java | 10 + .../sink/client/ClickhouseSink.java | 4 + .../sink/client/ClickhouseSinkWriter.java | 39 +- .../executor/FieldNamedPreparedStatement.java | 4 +- .../sink/file/ClickhouseFileSink.java | 2 + .../clickhouse/source/ClickhouseSource.java | 11 + .../source/ClickhouseSourceReader.java | 12 +- .../clickhouse/util/ClickhouseUtil.java | 8 +- .../source/AbstractSingleSplitSource.java | 2 +- .../datatype/DorisDataTypeConvertor.java | 13 +- .../file/hdfs/source/BaseHdfsFileSource.java | 7 +- .../file/config/BaseSourceConfig.java | 7 + .../seatunnel/file/config/FileSystemType.java | 1 + .../file/sink/config/FileSinkConfig.java | 2 +- .../source/reader/AbstractReadStrategy.java | 30 +- .../connector-file/connector-file-cos/pom.xml | 64 +++ .../seatunnel/file/cos/config/CosConf.java | 59 +++ .../seatunnel/file/cos/config/CosConfig.java | 39 ++ .../seatunnel/file/cos/sink/CosFileSink.java | 63 +++ .../file/cos/sink/CosFileSinkFactory.java | 88 ++++ .../file/cos/source/CosFileSource.java | 119 +++++ .../file/cos/source/CosFileSourceFactory.java | 71 +++ .../services/org.apache.hadoop.fs.FileSystem | 16 + .../file/cos/CosFileFactoryTest.java | 33 ++ .../file/ftp/source/FtpFileSourceFactory.java | 1 + .../hdfs/source/HdfsFileSourceFactory.java | 1 + .../file/oss/source/OssFileSourceFactory.java | 1 + .../local/source/LocalFileSourceFactory.java | 1 + .../file/oss/source/OssFileSourceFactory.java | 1 + .../file/s3/source/S3FileSourceFactory.java | 1 + .../sftp/source/SftpFileSourceFactory.java | 1 + .../connector-file/pom.xml | 1 + .../commit/HiveSinkAggregatedCommitter.java | 37 +- .../seatunnel/hive/config/HiveConfig.java | 7 + .../seatunnel/hive/sink/HiveSinkFactory.java | 3 + .../seatunnel/iceberg/IcebergTableLoader.java | 2 +- .../iceberg/config/CommonConfig.java | 4 +- .../iceberg/source/IcebergSource.java | 2 +- .../scan/IcebergScanSplitPlanner.java | 2 +- .../seatunnel/iotdb/config/SinkConfig.java | 4 +- .../jdbc/catalog/AbstractJdbcCatalog.java | 4 +- .../jdbc/catalog/mysql/MySqlCatalog.java | 12 +- .../mysql/MysqlCreateTableSqlBuilder.java | 4 +- .../jdbc/catalog/oracle/OracleCatalog.java | 5 +- .../oracle/OracleCreateTableSqlBuilder.java | 17 + .../psql/PostgresCreateTableSqlBuilder.java | 17 + .../psql/PostgresDataTypeConvertor.java | 4 + .../SqlServerCreateTableSqlBuilder.java | 4 +- .../jdbc/config/JdbcConnectionConfig.java | 26 +- .../seatunnel/jdbc/config/JdbcOptions.java | 12 +- .../jdbc/config/JdbcSourceConfig.java | 2 + .../jdbc/internal/JdbcOutputFormat.java | 61 +-- .../internal/dialect/JdbcDialectFactory.java | 18 +- .../internal/dialect/JdbcDialectLoader.java | 22 +- .../jdbc/internal/dialect/dm/DmdbDialect.java | 64 ++- 
.../dialect/mysql/MySqlDialectFactory.java | 2 +- .../oceanbase/OceanBaseDialectFactory.java | 49 ++ .../dialect/oracle/OracleDialectFactory.java | 2 +- .../dialect/oracle/OracleTypeMapper.java | 5 +- .../dialect/psql/PostgresDialectFactory.java | 22 +- .../dialect/psql/PostgresTypeMapper.java | 4 + .../dialect/psqllow/PostgresLowDialect.java | 30 ++ .../redshift/RedshiftDialectFactory.java | 2 +- .../sqlserver/SqlServerDialectFactory.java | 2 +- .../executor/FieldNamedPreparedStatement.java | 4 +- .../JdbcNumericBetweenParametersProvider.java | 4 +- .../internal/xa/SemanticXidGenerator.java | 2 +- .../seatunnel/jdbc/internal/xa/XidImpl.java | 2 +- .../jdbc/sink/JdbcExactlyOnceSinkWriter.java | 4 +- .../seatunnel/jdbc/sink/JdbcSink.java | 1 + .../seatunnel/jdbc/sink/JdbcSinkFactory.java | 27 +- .../seatunnel/jdbc/source/JdbcSource.java | 6 +- .../jdbc/source/JdbcSourceFactory.java | 9 +- .../jdbc/catalog/mysql/MySqlCatalogTest.java | 17 + .../catalog/oracle/OracleCatalogTest.java | 17 + .../sql/MysqlCreateTableSqlBuilderTest.java | 15 +- .../sqlserver/SqlServerCatalogTest.java | 17 + .../dialect/PostgresDialectFactoryTest.java | 40 ++ .../connector-kafka/pom.xml | 12 + .../seatunnel/kafka/config/Config.java | 2 - .../seatunnel/kafka/config/MessageFormat.java | 3 +- .../seatunnel/kafka/source/KafkaSource.java | 10 + .../mongodb/config/MongodbConfig.java | 6 +- .../seatunnel/mongodb/sink/MongodbSink.java | 44 +- .../seatunnel/mongodb/sink/MongodbWriter.java | 62 ++- .../mongodb/sink/MongodbWriterOptions.java | 18 +- .../sink/commit/CommittableTransaction.java | 49 ++ .../commit/CommittableUpsertTransaction.java | 68 +++ .../MongodbSinkAggregatedCommitter.java | 167 ++++++ .../mongodb/sink/state/DocumentBulk.java | 61 +++ .../state/MongodbAggregatedCommitInfo.java | 30 ++ .../mongodb/sink/state/MongodbCommitInfo.java | 30 ++ .../mongodb/source/MongodbSource.java | 12 + .../source/config/MongodbReadOptions.java | 2 +- .../connector-paimon/pom.xml | 2 +- .../cursor/start/MessageIdStartCursor.java | 2 +- .../seatunnel/redis/config/RedisConfig.java | 6 + .../seatunnel/redis/config/RedisDataType.java | 23 +- .../redis/config/RedisParameters.java | 4 + .../redis/sink/RedisSinkFactory.java | 3 +- .../seatunnel/redis/sink/RedisSinkWriter.java | 3 +- .../starrocks/catalog/StarRocksCatalog.java | 2 +- .../source/StarRocksRowBatchReader.java | 2 +- .../config/StarRocksSinkOptions.java | 3 +- .../serialize/StarRocksJsonSerializer.java | 20 +- .../starrocks/sink/StarRocksSinkFactory.java | 1 + .../StarRocksJsonSerializerTest.java | 56 ++ .../tablestore/config/TablestoreConfig.java | 5 - .../tablestore/config/TablestoreOptions.java | 5 - .../tablestore/sink/TablestoreSinkClient.java | 31 +- .../sink/TablestoreSinkFactory.java | 3 +- .../tablestore/sink/TablestoreWriter.java | 7 + .../core/starter/utils/ConfigBuilder.java | 6 + .../flink/execution/FlinkExecution.java | 8 + .../execution/SourceExecuteProcessor.java | 6 +- .../execution/TransformExecuteProcessor.java | 1 - seatunnel-dist/pom.xml | 46 ++ .../src/test/java/mongodb/MongodbCDCIT.java | 12 +- .../test/resources/mongodbcdc_to_mysql.conf | 8 +- .../src/test/resources/mysqlcdc_to_mysql.conf | 2 +- .../resources/sqlservercdc_to_console.conf | 4 +- .../connector-file-cos-e2e/pom.xml | 48 ++ .../e2e/connector/file/cos/CosFileIT.java | 76 +++ .../resources/excel/cos_excel_to_assert.conf | 116 +++++ .../resources/excel/fake_to_cos_excel.conf | 82 +++ .../json/cos_file_json_to_assert.conf | 114 +++++ .../resources/json/fake_to_cos_file_json.conf | 83 +++ 
.../resources/orc/cos_file_orc_to_assert.conf | 80 +++ .../resources/orc/fake_to_cos_file_orc.conf | 84 +++ .../parquet/cos_file_parquet_to_assert.conf | 80 +++ .../parquet/fake_to_cos_file_parquet.conf | 84 +++ .../text/cos_file_text_to_assert.conf | 114 +++++ .../resources/text/fake_to_cos_file_text.conf | 84 +++ .../e2e/connector/file/ftp/FtpFileIT.java | 86 ++-- .../excel/ftp_filter_excel_to_assert.conf | 141 +++++ .../e2e/connector/file/local/LocalFileIT.java | 119 ++--- .../excel/local_filter_excel_to_assert.conf | 131 +++++ .../e2e/connector/file/fstp/SftpFileIT.java | 78 ++- .../excel/sftp_filter_excel_to_assert.conf | 132 +++++ .../seatunnel/jdbc/JdbcOceanBaseITBase.java | 147 ++++++ .../seatunnel/jdbc/JdbcOceanBaseMysqlIT.java | 256 ++++++++++ .../seatunnel/jdbc/JdbcOceanBaseOracleIT.java | 161 ++++++ .../jdbc_oceanbase_mysql_source_and_sink.conf | 55 ++ ...jdbc_oceanbase_oracle_source_and_sink.conf | 53 ++ .../resources/selectdb-jdbc-to-selectdb.conf | 2 +- .../connector-jdbc-e2e-part-3/pom.xml | 22 + .../seatunnel/jdbc/JdbcPostgresIT.java | 26 +- .../jdbc/JdbcSinkCDCChangelogIT.java | 10 + .../jdbc_postgres_source_and_sink.conf | 6 +- ...dbc_postgres_source_and_sink_parallel.conf | 8 +- ..._source_and_sink_parallel_upper_lower.conf | 8 +- .../jdbc_postgres_source_and_sink_xa.conf | 8 +- .../resources/jdbc_sink_cdc_changelog.conf | 2 +- .../connector-jdbc-e2e-part-4/pom.xml | 96 ++++ .../jdbc/JdbcMySqlCreateTableIT.java | 471 +++++++++++++++++ .../jdbc/JdbcSqlServerCreateTableIT.java | 482 ++++++++++++++++++ .../connector-jdbc-e2e-part-5/pom.xml | 49 ++ .../connectors/seatunnel/jdbc/JdbcDmIT.java | 0 .../seatunnel/jdbc/JdbcDmUpsetIT.java | 258 ++++++++++ .../seatunnel/jdbc/JdbcDorisIT.java | 0 .../seatunnel/jdbc/JdbcDorisdbIT.java | 0 .../seatunnel/jdbc/JdbcGBase8aIT.java | 0 .../seatunnel/jdbc/JdbcGreenplumIT.java | 0 .../test/resources/doris-jdbc-to-doris.conf | 0 .../jdbc_dm_source_and_dm_upset_sink.conf | 49 ++ .../resources/jdbc_dm_source_and_sink.conf | 0 .../resources/jdbc_doris_source_and_sink.conf | 0 .../jdbc_gbase8a_source_to_assert.conf | 0 .../jdbc_greenplum_source_and_sink.conf | 0 .../connector-jdbc-e2e/pom.xml | 2 + .../connector-kafka-e2e/pom.xml | 5 + .../e2e/connector/kafka/CanalToKafkaIT.java | 11 + .../connector/kafka/DebeziumToKafkaIT.java | 11 + .../kafka/KafkaConnectToKafkaIT.java | 282 ++++++++++ .../kafka_source_canal_cdc_to_pgsql.conf | 5 +- .../kafkasource_debezium_cdc_to_pgsql.conf | 4 +- .../kafkasource_jdbc_record_to_mysql.conf | 63 +++ .../v2/mongodb/AbstractMongodbIT.java | 5 + .../e2e/connector/v2/mongodb/MongodbIT.java | 61 +++ .../fake_source_to_update_mongodb.conf | 103 ++++ .../mongodb_matchQuery_source_to_assert.conf | 93 ++++ ...ke_source_to_transaction_sink_mongodb.conf | 102 ++++ ..._source_to_transaction_upsert_mongodb.conf | 104 ++++ ...odb_source_transaction_sink_to_assert.conf | 115 +++++ ...b_source_transaction_upsert_to_assert.conf | 115 +++++ .../e2e/connector/pulsar/CanalToPulsarIT.java | 11 + .../e2e/connector/pulsar/PulsarBatchIT.java | 2 + .../resources/cdc_canal_pulsar_to_pg.conf | 4 +- .../e2e/connector/redis/RedisIT.java | 11 + .../test/resources/redis-to-redis-expire.conf | 50 ++ .../seatunnel-connector-v2-e2e/pom.xml | 1 + .../e2e/common/container/TestHelper.java | 40 ++ .../e2e/common/util/ContainerUtil.java | 6 + .../connector-seatunnel-e2e-base/pom.xml | 12 + .../engine/e2e/ClusterFaultToleranceIT.java | 2 +- .../ClusterFaultToleranceTwoPipelineIT.java | 2 +- .../seatunnel/engine/e2e/JobExecutionIT.java | 79 ++- 
.../seatunnel/engine/e2e/RestApiIT.java | 72 +++ .../batch_fakesource_to_console_error.conf | 50 ++ ...ke_to_localfile_two_pipeline_template.conf | 2 +- .../src/test/resources/seatunnel.yaml | 3 +- .../src/test/resources/copy_transform.conf | 6 + .../filter_row_kind_exclude_delete.conf | 5 + .../filter_row_kind_exclude_insert.conf | 5 + .../filter_row_kind_include_insert.conf | 5 + .../src/test/resources/filter_transform.conf | 7 +- .../src/test/resources/split_transform.conf | 5 + .../resources/field_mapper_transform.conf | 6 + .../src/test/resources/sql_transform.conf | 7 +- .../engine/client/job/ClientJobProxy.java | 18 +- .../client/job/JobExecutionEnvironment.java | 91 +--- .../resources/batch_fakesource_to_file.conf | 2 +- .../batch_fakesource_to_file_complex.conf | 2 +- .../src/test/resources/client_test.conf | 2 +- .../src/test/resources/seatunnel.yaml | 2 - .../engine/common/config/EngineConfig.java | 10 + .../YamlSeaTunnelDomConfigProcessor.java | 15 +- .../config/server/CheckpointConfig.java | 22 +- .../config/server/ServerConfigOptions.java | 17 +- .../src/main/resources/seatunnel.yaml | 2 - .../config/YamlSeaTunnelConfigParserTest.java | 6 - .../src/test/resources/seatunnel.yaml | 2 - .../dag/actions/ShufflePartitionStrategy.java | 2 +- .../core/job/AbstractJobEnvironment.java | 114 +++++ .../parse/MultipleTableJobConfigParser.java | 23 +- .../engine/server/CoordinatorService.java | 11 +- .../engine/server/NodeExtension.java | 2 + .../checkpoint/CheckpointCoordinator.java | 22 +- .../server/checkpoint/CheckpointManager.java | 4 + .../server/checkpoint/TaskStatistics.java | 4 +- .../CheckpointErrorReportOperation.java | 17 + .../dag/execution/ExecutionPlanGenerator.java | 2 +- .../dag/execution/PipelineGenerator.java | 2 +- .../server/dag/physical/PhysicalVertex.java | 5 - .../server/dag/physical/PipelineLocation.java | 1 + .../engine/server/dag/physical/SubPlan.java | 15 +- .../server/execution/TaskGroupLocation.java | 1 + .../job/JobImmutableInformationEnv.java | 80 +++ .../server/master/JobHistoryService.java | 20 +- .../engine/server/master/JobMaster.java | 15 +- .../resource/ResourceProfile.java | 2 +- .../engine/server/rest/RestConstant.java | 1 + .../rest/RestHttpGetCommandProcessor.java | 16 +- .../rest/RestHttpPostCommandProcessor.java | 135 +++++ .../queue/IntermediateBlockingQueue.java | 2 +- .../engine/server/utils/RestUtil.java | 65 +++ .../server/checkpoint/SavePointTest.java | 1 - .../resources/batch_fakesource_to_file.conf | 2 +- .../batch_fakesource_to_file_complex.conf | 2 +- .../src/test/resources/seatunnel.yaml | 2 - .../resources/stream_fakesource_to_file.conf | 2 +- .../stream_fakesource_to_file_savepoint.conf | 2 +- .../hdfs/common/HdfsConfiguration.java | 12 +- .../storage/hdfs/HDFSFileCheckpointTest.java | 50 ++ .../imap-storage-file/pom.xml | 22 + seatunnel-formats/pom.xml | 1 + .../pom.xml | 62 +++ ...ibleKafkaConnectDeserializationSchema.java | 212 ++++++++ .../json/KafkaConnectJsonFormatOptions.java | 49 ++ .../seatunnel/transform/sql/SQLEngine.java | 6 +- .../seatunnel/transform/sql/SQLTransform.java | 6 +- .../transform/sql/zeta/ZetaSQLEngine.java | 10 +- .../transform/sql/zeta/ZetaSQLEngineTest.java | 54 ++ .../serialization/SeaTunnelRowConverter.java | 29 +- tools/dependencies/known-dependencies.txt | 4 +- .../update_modules_check.py | 4 - 359 files changed, 12790 insertions(+), 2587 deletions(-) create mode 100644 .gitattributes create mode 100644 docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md create mode 100644 
docs/en/connector-v2/sink/CosFile.md create mode 100644 docs/en/connector-v2/sink/DB2.md create mode 100644 docs/en/connector-v2/sink/OceanBase.md create mode 100644 docs/en/connector-v2/sink/Oracle.md create mode 100644 docs/en/connector-v2/sink/PostgreSql.md create mode 100644 docs/en/connector-v2/sink/Vertica.md create mode 100644 docs/en/connector-v2/source/CosFile.md create mode 100644 docs/en/connector-v2/source/DB2.md create mode 100644 docs/en/connector-v2/source/OceanBase.md create mode 100644 docs/en/connector-v2/source/Oracle.md create mode 100644 docs/en/connector-v2/source/PostgreSQL.md create mode 100644 docs/en/connector-v2/source/Vertica.md create mode 100644 seatunnel-connectors-v2/connector-file/connector-file-cos/pom.xml create mode 100644 seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/config/CosConf.java create mode 100644 seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/config/CosConfig.java create mode 100644 seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSink.java create mode 100644 seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSinkFactory.java create mode 100644 seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/source/CosFileSource.java create mode 100644 seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/source/CosFileSourceFactory.java create mode 100644 seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/resources/services/org.apache.hadoop.fs.FileSystem create mode 100644 seatunnel-connectors-v2/connector-file/connector-file-cos/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/cos/CosFileFactoryTest.java create mode 100644 seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oceanbase/OceanBaseDialectFactory.java create mode 100644 seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psqllow/PostgresLowDialect.java create mode 100644 seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/PostgresDialectFactoryTest.java create mode 100644 seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/CommittableTransaction.java create mode 100644 seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/CommittableUpsertTransaction.java create mode 100644 seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/MongodbSinkAggregatedCommitter.java create mode 100644 seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/DocumentBulk.java create mode 100644 seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/MongodbAggregatedCommitInfo.java create mode 100644 
seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/MongodbCommitInfo.java create mode 100644 seatunnel-connectors-v2/connector-starrocks/src/test/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializerTest.java create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/pom.xml create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/cos/CosFileIT.java create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/cos_excel_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/fake_to_cos_excel.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/json/cos_file_json_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/json/fake_to_cos_file_json.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/orc/cos_file_orc_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/orc/fake_to_cos_file_orc.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/parquet/cos_file_parquet_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/parquet/fake_to_cos_file_parquet.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/text/cos_file_text_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/text/fake_to_cos_file_text.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_filter_excel_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_filter_excel_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_filter_excel_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseITBase.java create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseMysqlIT.java create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseOracleIT.java create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_oceanbase_mysql_source_and_sink.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_oceanbase_oracle_source_and_sink.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/pom.xml create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcMySqlCreateTableIT.java 
create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcSqlServerCreateTableIT.java create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/pom.xml rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDmIT.java (100%) create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDmUpsetIT.java rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisIT.java (100%) rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisdbIT.java (100%) rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGBase8aIT.java (100%) rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGreenplumIT.java (100%) rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/resources/doris-jdbc-to-doris.conf (100%) create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_dm_source_and_dm_upset_sink.conf rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/resources/jdbc_dm_source_and_sink.conf (100%) rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/resources/jdbc_doris_source_and_sink.conf (100%) rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/resources/jdbc_gbase8a_source_to_assert.conf (100%) rename seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/{connector-jdbc-e2e-part-2 => connector-jdbc-e2e-part-5}/src/test/resources/jdbc_greenplum_source_and_sink.conf (100%) create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/KafkaConnectToKafkaIT.java create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/kafkasource_jdbc_record_to_mysql.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/compatibleParametersIT/fake_source_to_update_mongodb.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/compatibleParametersIT/mongodb_matchQuery_source_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/fake_source_to_transaction_sink_mongodb.conf create mode 100644 
seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/fake_source_to_transaction_upsert_mongodb.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/mongodb_source_transaction_sink_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/mongodb_source_transaction_upsert_to_assert.conf create mode 100644 seatunnel-e2e/seatunnel-connector-v2-e2e/connector-redis-e2e/src/test/resources/redis-to-redis-expire.conf create mode 100644 seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/common/container/TestHelper.java create mode 100644 seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/batch_fakesource_to_console_error.conf create mode 100644 seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/job/AbstractJobEnvironment.java create mode 100644 seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/job/JobImmutableInformationEnv.java create mode 100644 seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestHttpPostCommandProcessor.java create mode 100644 seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/utils/RestUtil.java create mode 100644 seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/test/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/HDFSFileCheckpointTest.java create mode 100644 seatunnel-formats/seatunnel-format-compatible-connect-json/pom.xml create mode 100644 seatunnel-formats/seatunnel-format-compatible-connect-json/src/main/java/org/apache/seatunnel/format/compatible/kafka/connect/json/CompatibleKafkaConnectDeserializationSchema.java create mode 100644 seatunnel-formats/seatunnel-format-compatible-connect-json/src/main/java/org/apache/seatunnel/format/compatible/kafka/connect/json/KafkaConnectJsonFormatOptions.java create mode 100644 seatunnel-transforms-v2/src/test/java/org/apache/seatunnel/transform/sql/zeta/ZetaSQLEngineTest.java diff --git a/.asf.yaml b/.asf.yaml index c5d24103072..dcab78f6fd9 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -15,18 +15,20 @@ # github: - description: SeaTunnel is a distributed, high-performance data integration platform for the synchronization and transformation of massive data (offline & real-time). + description: SeaTunnel is a next-generation super high-performance, distributed, massive data integration tool. homepage: https://seatunnel.apache.org/ labels: - data-integration + - change-data-capture + - cdc - high-performance - offline - real-time - - data-pipeline - - sql-engine + - batch + - streaming + - data-ingestion - apache - - seatunnel - - etl-framework + - elt enabled_merge_buttons: squash: true merge: false diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 00000000000..dfdb8b771ce --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.sh text eol=lf diff --git a/.github/ISSUE_TEMPLATE/bug-report.yml b/.github/ISSUE_TEMPLATE/bug-report.yml index 39f5b87900e..5892a2677fc 100644 --- a/.github/ISSUE_TEMPLATE/bug-report.yml +++ b/.github/ISSUE_TEMPLATE/bug-report.yml @@ -90,10 +90,10 @@ body: - type: textarea attributes: - label: Flink or Spark Version - description: Provide Flink or Spark Version. 
+ label: Zeta or Flink or Spark Version + description: Provide Zeta or Flink or Spark Version. placeholder: > - Please provide the version of Flink or Spark. + Please provide the version of Zeta or Flink or Spark. validations: required: false diff --git a/.github/workflows/backend.yml b/.github/workflows/backend.yml index 61a85d4e4f9..4608c0086df 100644 --- a/.github/workflows/backend.yml +++ b/.github/workflows/backend.yml @@ -18,6 +18,7 @@ name: Backend on: push: + pull_request: branches: - business-dev - "v[0-9]+.[0-9]+.[0-9]+-release" @@ -26,8 +27,6 @@ on: - business-dev - "v[0-9]+.[0-9]+.[0-9]+-release" paths-ignore: - - 'docs/**' - - '**/*.md' - 'seatunnel-ui/**' concurrency: @@ -270,7 +269,7 @@ jobs: - name: run updated modules integration test (part-1) if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '' run: | - sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 2 0` + sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 5 0` ./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci env: MAVEN_OPTS: -Xmx2048m @@ -295,7 +294,7 @@ jobs: - name: run updated modules integration test (part-2) if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '' run: | - sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 2 1` + sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 5 1` if [ ! -z $sub_modules ]; then ./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci else @@ -304,6 +303,91 @@ jobs: env: MAVEN_OPTS: -Xmx2048m + updated-modules-integration-test-part-3: + needs: [ changes, sanity-check ] + if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '' + runs-on: ${{ matrix.os }} + strategy: + matrix: + java: [ '8' ] + os: [ 'self-hosted' ] + timeout-minutes: 90 + steps: + - uses: actions/checkout@v2 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + cache: 'maven' + - name: run updated modules integration test (part-3) + if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '' + run: | + sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 5 2` + if [ ! 
-z $sub_modules ]; then + ./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci + else + echo "sub modules is empty, skipping" + fi + env: + MAVEN_OPTS: -Xmx2048m + + updated-modules-integration-test-part-4: + needs: [ changes, sanity-check ] + if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '' + runs-on: ${{ matrix.os }} + strategy: + matrix: + java: [ '8' ] + os: [ 'self-hosted' ] + timeout-minutes: 90 + steps: + - uses: actions/checkout@v2 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + cache: 'maven' + - name: run updated modules integration test (part-4) + if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '' + run: | + sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 5 3` + if [ ! -z $sub_modules ]; then + ./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci + else + echo "sub modules is empty, skipping" + fi + env: + MAVEN_OPTS: -Xmx2048m + updated-modules-integration-test-part-5: + needs: [ changes, sanity-check ] + if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '' + runs-on: ${{ matrix.os }} + strategy: + matrix: + java: [ '8' ] + os: [ 'self-hosted' ] + timeout-minutes: 90 + steps: + - uses: actions/checkout@v2 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + cache: 'maven' + - name: run updated modules integration test (part-5) + if: needs.changes.outputs.api == 'false' && needs.changes.outputs.it-modules != '' + run: | + sub_modules=`python tools/update_modules_check/update_modules_check.py sub_update_it_module ${{needs.changes.outputs.it-modules}} 5 4` + if [ ! 
-z $sub_modules ]; then + ./mvnw -T 1C -B verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl $sub_modules -am -Pci + else + echo "sub modules is empty, skipping" + fi + env: + MAVEN_OPTS: -Xmx2048m engine-v2-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' @@ -637,6 +721,54 @@ jobs: env: MAVEN_OPTS: -Xmx4096m + jdbc-connectors-it-part-4: + needs: [ changes, sanity-check ] + if: needs.changes.outputs.api == 'true' + runs-on: ${{ matrix.os }} + strategy: + matrix: + java: [ '8', '11' ] + os: [ 'ubuntu-latest' ] + timeout-minutes: 90 + steps: + - uses: actions/checkout@v2 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + cache: 'maven' + - name: run jdbc connectors integration test (part-4) + if: needs.changes.outputs.api == 'true' + run: | + ./mvnw -B -T 1C verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-4 -am -Pci + env: + MAVEN_OPTS: -Xmx4096m + + jdbc-connectors-it-part-5: + needs: [ changes, sanity-check ] + if: needs.changes.outputs.api == 'true' + runs-on: ${{ matrix.os }} + strategy: + matrix: + java: [ '8', '11' ] + os: [ 'ubuntu-latest' ] + timeout-minutes: 90 + steps: + - uses: actions/checkout@v2 + - name: Set up JDK ${{ matrix.java }} + uses: actions/setup-java@v3 + with: + java-version: ${{ matrix.java }} + distribution: 'temurin' + cache: 'maven' + - name: run jdbc connectors integration test (part-5) + if: needs.changes.outputs.api == 'true' + run: | + ./mvnw -B -T 1C verify -DskipUT=true -DskipIT=false -D"license.skipAddThirdParty"=true --no-snapshot-updates -pl :connector-jdbc-e2e-part-5 -am -Pci + env: + MAVEN_OPTS: -Xmx4096m + kafka-connector-it: needs: [ changes, sanity-check ] if: needs.changes.outputs.api == 'true' diff --git a/DISCLAIMER b/DISCLAIMER index fac720f1f3e..517e33ffafa 100644 --- a/DISCLAIMER +++ b/DISCLAIMER @@ -1,4 +1,4 @@ -Apache SeaTunnel (incubating) is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC. +Apache SeaTunnel is an effort undergoing incubation at The Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC. Incubation is required of all newly accepted projects until a further review indicates that the infrastructure, communications, and decision making process have stabilized in a manner consistent with other successful ASF projects. 
While incubation status is not necessarily a reflection of the completeness or stability of the code, diff --git a/README.md b/README.md index c214ef5c718..e7f898bd659 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@ seatunnel logo [![Backend Workflow](https://github.com/apache/seatunnel/actions/workflows/backend.yml/badge.svg?branch=dev)](https://github.com/apache/seatunnel/actions/workflows/backend.yml) -[![Slack](https://img.shields.io/badge/slack-%23seatunnel-4f8eba?logo=slack)](https://the-asf.slack.com/archives/C053HND1D6X) +[![Slack](https://img.shields.io/badge/slack-%23seatunnel-4f8eba?logo=slack)](https://s.apache.org/seatunnel-slack) [![Twitter Follow](https://img.shields.io/twitter/follow/ASFSeaTunnel.svg?label=Follow&logo=twitter)](https://twitter.com/ASFSeaTunnel) --- @@ -13,9 +13,7 @@ SeaTunnel was formerly named Waterdrop , and renamed SeaTunnel since October 12, --- -SeaTunnel is a very easy-to-use ultra-high-performance distributed data integration platform that supports real-time -synchronization of massive data. It can synchronize tens of billions of data stably and efficiently every day, and has -been used in the production of nearly 100 companies. +SeaTunnel is a next-generation super high-performance, distributed, massive data integration tool. It can synchronize tens of billions of data stably and efficiently every day, and has been used in the production of many companies. ## Why do we need SeaTunnel @@ -25,21 +23,20 @@ SeaTunnel focuses on data integration and data synchronization, and is mainly de - Complex synchronization scenarios: Data synchronization needs to support various synchronization scenarios such as offline-full synchronization, offline-incremental synchronization, CDC, real-time synchronization, and full database synchronization. - High demand in resource: Existing data integration and data synchronization tools often require vast computing resources or JDBC connection resources to complete real-time synchronization of massive small tables. This has increased the burden on enterprises to a certain extent. - Lack of quality and monitoring: Data integration and synchronization processes often experience loss or duplication of data. The synchronization process lacks monitoring, and it is impossible to intuitively understand the real-situation of the data during the task process. -- Complex technology stack: The technology components used by enterprises are different, and users need to develop corresponding synchronization programs for different components to complete data integration. -- Difficulty in management and maintenance: Limited to different underlying technology components (Flink/Spark) , offline synchronization and real-time synchronization often have be developed and managed separately, which increases the difficulty of the management and maintainance. ## Features of SeaTunnel -- Rich and extensible Connector: SeaTunnel provides a Connector API that does not depend on a specific execution engine. Connectors (Source, Transform, Sink) developed based on this API can run on many different engines, such as SeaTunnel Engine, Flink, Spark that are currently supported. -- Connector plugin: The plugin design allows users to easily develop their own Connector and integrate it into the SeaTunnel project. Currently, SeaTunnel has supported more than 70 Connectors, and the number is surging. There is the list of connectors we [supported and plan to support](https://github.com/apache/seatunnel/issues/3018). 
+- Diverse Connectors: SeaTunnel has supported more than 100 Connectors, and the number is surging. Here is the list of connectors we [supported and plan to support](https://github.com/apache/seatunnel/issues/3018). - Batch-stream integration: Connectors developed based on SeaTunnel Connector API are perfectly compatible with offline synchronization, real-time synchronization, full- synchronization, incremental synchronization and other scenarios. It greatly reduces the difficulty of managing data integration tasks. - Support distributed snapshot algorithm to ensure data consistency. -- Multi-engine support: SeaTunnel uses SeaTunnel Engine for data synchronization by default. At the same time, SeaTunnel also supports the use of Flink or Spark as the execution engine of the Connector to adapt to the existing technical components of the enterprise. In addition, SeaTunnel supports multiple versions of Spark and Flink. +- Multi-engine support: SeaTunnel uses SeaTunnel Zeta Engine for data synchronization by default. At the same time, SeaTunnel also supports the use of Flink or Spark as the execution engine of the Connector to adapt to the existing technical components of the enterprise. In addition, SeaTunnel supports multiple versions of Spark and Flink. - JDBC multiplexing, database log multi-table parsing: SeaTunnel supports multi-table or whole database synchronization, which solves the problem of over-JDBC connections; supports multi-table or whole database log reading and parsing, which solves the need for CDC multi-table synchronization scenarios problems with repeated reading and parsing of logs. - High throughput and low latency: SeaTunnel supports parallel reading and writing, providing stable and reliable data synchronization capabilities with high throughput and low latency. - Perfect real-time monitoring: SeaTunnel supports detailed monitoring information of each step in the data synchronization process, allowing users to easily understand the number of data, data size, QPS and other information read and written by the synchronization task. - Two job development methods are supported: coding and canvas design. The SeaTunnel web project https://github.com/apache/seatunnel-web provides visual management of jobs, scheduling, running and monitoring capabilities. +Besides, SeaTunnel provides a Connector API that does not depend on a specific execution engine. Connectors (Source, Transform, Sink) developed based on this API can run on many different engines, such as SeaTunnel Zeta Engine, Flink, Spark that are currently supported. + ## SeaTunnel work flowchart ![SeaTunnel work flowchart](docs/en/images/architecture_diagram.png) @@ -63,29 +60,15 @@ The default engine use by SeaTunnel is [SeaTunnel Engine](seatunnel-engine/READM ### Here's a list of our connectors with their health status.[connector status](docs/en/Connector-v2-release-state.md) -## Environmental dependency - -1. java runtime environment, java >= 8 - -2. If you want to run SeaTunnel in a cluster environment, any of the following Spark cluster environments is usable: - -- Spark on Yarn -- Spark Standalone - -If the data volume is small, or the goal is merely for functional verification, you can also start in local mode without -a cluster environment, because SeaTunnel supports standalone operation. Note: SeaTunnel 2.0 supports running on Spark -and Flink. - -## Compiling project -Follow this [document](docs/en/contribution/setup.md). 
## Downloads Download address for run-directly software package : https://seatunnel.apache.org/download ## Quick start +SeaTunnel uses SeaTunnel Zeta Engine as the runtime execution engine for data synchronization by default. We highly recommend utilizing Zeta engine as the runtime engine, as it offers superior functionality and performance. By the way, SeaTunnel also supports the use of Flink or Spark as the execution engine. -**SeaTunnel Engine** +**SeaTunnel Zeta Engine** https://seatunnel.apache.org/docs/start-v2/locally/quick-start-seatunnel-engine/ **Spark** @@ -101,6 +84,10 @@ https://seatunnel.apache.org/docs/start-v2/locally/quick-start-flink Weibo business uses an internal customized version of SeaTunnel and its sub-project Guardian for SeaTunnel On Yarn task monitoring for hundreds of real-time streaming computing tasks. +- Tencent Cloud + +Collecting various logs from business services into Apache Kafka, some of the data in Apache Kafka is consumed and extracted through SeaTunnel, and then store into Clickhouse. + - Sina, Big Data Operation Analysis Platform Sina Data Operation Analysis Platform uses SeaTunnel to perform real-time and offline analysis of data operation and @@ -110,27 +97,11 @@ maintenance for Sina News, CDN and other services, and write it into Clickhouse. Sogou Qiqian System takes SeaTunnel as an ETL tool to help establish a real-time data warehouse system. -- Qutoutiao, Qutoutiao Data Center - -Qutoutiao Data Center uses SeaTunnel to support mysql to hive offline ETL tasks, real-time hive to clickhouse backfill -technical support, and well covers most offline and real-time tasks needs. - -- Yixia Technology, Yizhibo Data Platform - - Yonghui Superstores Founders' Alliance-Yonghui Yunchuang Technology, Member E-commerce Data Analysis Platform SeaTunnel provides real-time streaming and offline SQL computing of e-commerce user behavior data for Yonghui Life, a new retail brand of Yonghui Yunchuang Technology. -- Shuidichou, Data Platform - -Shuidichou adopts SeaTunnel to do real-time streaming and regular offline batch processing on Yarn, processing 3~4T data -volume average daily, and later writing the data to Clickhouse. - -- Tencent Cloud - -Collecting various logs from business services into Apache Kafka, some of the data in Apache Kafka is consumed and extracted through SeaTunnel, and then store into Clickhouse. - For more use cases, please refer to: https://seatunnel.apache.org/blog ## Code of conduct @@ -140,7 +111,7 @@ By participating, you are expected to uphold this code. Please follow the [REPORTING GUIDELINES](https://www.apache.org/foundation/policies/conduct#reporting-guidelines) to report unacceptable behavior. -## Developer +## Contributors Thanks to [all developers](https://github.com/apache/seatunnel/graphs/contributors)! @@ -148,6 +119,9 @@ Thanks to [all developers](https://github.com/apache/seatunnel/graphs/contributo +## How to compile +Please follow this [document](docs/en/contribution/setup.md). + ## Contact Us * Mail list: **dev@seatunnel.apache.org**. 
Mail to `dev-subscribe@seatunnel.apache.org`, follow the reply to subscribe diff --git a/config/hazelcast.yaml b/config/hazelcast.yaml index fcce72a4148..934712a8241 100644 --- a/config/hazelcast.yaml +++ b/config/hazelcast.yaml @@ -38,3 +38,4 @@ hazelcast: hazelcast.tcp.join.port.try.count: 30 hazelcast.logging.type: log4j2 hazelcast.operation.generic.thread.count: 50 + diff --git a/config/seatunnel.yaml b/config/seatunnel.yaml index 7e496ca39ad..5961c839238 100644 --- a/config/seatunnel.yaml +++ b/config/seatunnel.yaml @@ -17,6 +17,7 @@ seatunnel: engine: + history-job-expire-minutes: 1440 backup-count: 1 queue-type: blockingqueue print-execution-info-interval: 60 @@ -26,8 +27,6 @@ seatunnel: checkpoint: interval: 10000 timeout: 60000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: hdfs max-retained: 3 diff --git a/docs/en/concept/schema-feature.md b/docs/en/concept/schema-feature.md index 88c2efe3d6a..ebc0cf68a46 100644 --- a/docs/en/concept/schema-feature.md +++ b/docs/en/concept/schema-feature.md @@ -6,25 +6,25 @@ Some NoSQL databases or message queue are not strongly limited schema, so the sc ## What type supported at now -| Data type | Description | -|:----------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| string | string | -| boolean | boolean | -| tinyint | -128 to 127 regular. 0 to 255 unsigned*. Specify the maximum number of digits in parentheses. | -| smallint | -32768 to 32767 General. 0 to 65535 unsigned*. Specify the maximum number of digits in parentheses. | -| int | All numbers from -2,147,483,648 to 2,147,483,647 are allowed. | -| bigint | All numbers between -9,223,372,036,854,775,808 and 9,223,372,036,854,775,807 are allowed. | -| float | Float-precision numeric data from -1.79E+308 to 1.79E+308. | -| double | Double precision floating point. Handle most decimals. | -| decimal | DOUBLE type stored as a string, allowing a fixed decimal point. | -| null | null | -| bytes | bytes. | -| date | Only the date is stored. From January 1, 0001 to December 31, 9999. | -| time | Only store time. Accuracy is 100 nanoseconds. | -| timestamp | Stores a unique number that is updated whenever a row is created or modified. timestamp is based on the internal clock and does not correspond to real time. There can only be one timestamp variable per table. | -| row | Row type,can be nested. | -| map | A Map is an object that maps keys to values. The key type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` , and the value type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` `array` `map`. | -| array | A array is a data type that represents a collection of elements. The element type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `array` `map`. 
| +| Data type | Value type in Java | Description | +|:----------|:---------------------------------------------------|:------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| string | `java.lang.String` | string | +| boolean | `java.lang.Boolean` | boolean | +| tinyint | `java.lang.Byte` | -128 to 127 regular. 0 to 255 unsigned*. Specify the maximum number of digits in parentheses. | +| smallint | `java.lang.Short` | -32768 to 32767 General. 0 to 65535 unsigned*. Specify the maximum number of digits in parentheses. | +| int | `java.lang.Integer` | All numbers from -2,147,483,648 to 2,147,483,647 are allowed. | +| bigint | `java.lang.Long` | All numbers between -9,223,372,036,854,775,808 and 9,223,372,036,854,775,807 are allowed. | +| float | `java.lang.Float` | Float-precision numeric data from -1.79E+308 to 1.79E+308. | +| double | `java.lang.Double` | Double precision floating point. Handle most decimals. | +| decimal | `java.math.BigDecimal` | DOUBLE type stored as a string, allowing a fixed decimal point. | +| null | `java.lang.Void` | null | +| bytes | `byte[]` | bytes. | +| date | `java.time.LocalDate` | Only the date is stored. From January 1, 0001 to December 31, 9999. | +| time | `java.time.LocalTime` | Only store time. Accuracy is 100 nanoseconds. | +| timestamp | `java.time.LocalDateTime` | Stores a unique number that is updated whenever a row is created or modified. timestamp is based on the internal clock and does not correspond to real time. There can only be one timestamp variable per table. | +| row | `org.apache.seatunnel.api.table.type.SeaTunnelRow` | Row type,can be nested. | +| map | `java.util.Map` | A Map is an object that maps keys to values. The key type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` , and the value type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `decimal` `date` `time` `timestamp` `null` `array` `map`. | +| array | `ValueType[]` | A array is a data type that represents a collection of elements. The element type includes `int` `string` `boolean` `tinyint` `smallint` `bigint` `float` `double` `array` `map`. 
| ## How to use schema diff --git a/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md b/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md new file mode 100644 index 00000000000..7de8a9e838b --- /dev/null +++ b/docs/en/connector-v2/formats/kafka-compatible-kafkaconnect-json.md @@ -0,0 +1,47 @@ +# Kafka source compatible kafka-connect-json + +Seatunnel connector kafka supports parsing data extracted through kafka connect source, especially data extracted from kafka connect jdbc and kafka connect debezium + +# How to use + +## Kafka output to mysql + +```bash +env { + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + Kafka { + bootstrap.servers = "localhost:9092" + topic = "jdbc_source_record" + result_table_name = "kafka_table" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + }, + format = COMPATIBLE_KAFKA_CONNECT_JSON + } +} + + +sink { + Jdbc { + driver = com.mysql.cj.jdbc.Driver + url = "jdbc:mysql://localhost:3306/seatunnel" + user = st_user + password = seatunnel + generate_sink_sql = true + database = seatunnel + table = jdbc_sink + primary_keys = ["id"] + } +} +``` + diff --git a/docs/en/connector-v2/sink/Clickhouse.md b/docs/en/connector-v2/sink/Clickhouse.md index 05d03330c70..27bf274c77f 100644 --- a/docs/en/connector-v2/sink/Clickhouse.md +++ b/docs/en/connector-v2/sink/Clickhouse.md @@ -2,95 +2,110 @@ > Clickhouse sink connector -## Description +## Support Those Engines -Used to write data to Clickhouse. +> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features +## Key Features - [ ] [exactly-once](../../concept/connector-v2-features.md) - -The Clickhouse sink plug-in can achieve accuracy once by implementing idempotent writing, and needs to cooperate with aggregatingmergetree and other engines that support deduplication. - - [x] [cdc](../../concept/connector-v2-features.md) -## Options - -| name | type | required | default value | -|---------------------------------------|---------|----------|---------------| -| host | string | yes | - | -| database | string | yes | - | -| table | string | yes | - | -| username | string | yes | - | -| password | string | yes | - | -| clickhouse.config | map | no | | -| bulk_size | string | no | 20000 | -| split_mode | string | no | false | -| sharding_key | string | no | - | -| primary_key | string | no | - | -| support_upsert | boolean | no | false | -| allow_experimental_lightweight_delete | boolean | no | false | -| common-options | | no | - | - -### host [string] - -`ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` . - -### database [string] - -The `ClickHouse` database - -### table [string] - -The table name - -### username [string] - -`ClickHouse` user username - -### password [string] - -`ClickHouse` user password - -### clickhouse.config [map] - -In addition to the above mandatory parameters that must be specified by `clickhouse-jdbc` , users can also specify multiple optional parameters, which cover all the [parameters](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration) provided by `clickhouse-jdbc` . - -### bulk_size [number] - -The number of rows written through [Clickhouse-jdbc](https://github.com/ClickHouse/clickhouse-jdbc) each time, the `default is 20000` . - -### split_mode [boolean] - -This mode only support clickhouse table which engine is 'Distributed'.And `internal_replication` option -should be `true`. They will split distributed table data in seatunnel and perform write directly on each shard. The shard weight define is clickhouse will be -counted. - -### sharding_key [string] +> The Clickhouse sink plug-in can achieve accuracy once by implementing idempotent writing, and needs to cooperate with aggregatingmergetree and other engines that support deduplication. -When use split_mode, which node to send data to is a problem, the default is random selection, but the -'sharding_key' parameter can be used to specify the field for the sharding algorithm. This option only -worked when 'split_mode' is true. - -### primary_key [string] - -Mark the primary key column from clickhouse table, and based on primary key execute INSERT/UPDATE/DELETE to clickhouse table - -### support_upsert [boolean] +## Description -Support upsert row by query primary key +Used to write data to Clickhouse. -### allow_experimental_lightweight_delete [boolean] +## Supported DataSource Info + +In order to use the Clickhouse connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. 
+ +| Datasource | Supported Versions | Dependency | +|------------|--------------------|------------------------------------------------------------------------------------------------------------------| +| Clickhouse | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-clickhouse) | + +## Data Type Mapping + +| SeaTunnel Data type | Clickhouse Data type | +|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------| +| STRING | String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | +| INT | Int8 / UInt8 / Int16 / UInt16 / Int32 | +| BIGINT | UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | +| DOUBLE | Float64 | +| DECIMAL | Decimal | +| FLOAT | Float32 | +| DATE | Date | +| TIME | DateTime | +| ARRAY | Array | +| MAP | Map | + +## Sink Options + +| Name | Type | Required | Default | Description | +|---------------------------------------|---------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| host | String | Yes | - | `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"`. | +| database | String | Yes | - | The `ClickHouse` database. | +| table | String | Yes | - | The table name. | +| username | String | Yes | - | `ClickHouse` user username. | +| password | String | Yes | - | `ClickHouse` user password. | +| clickhouse.config | Map | No | | In addition to the above mandatory parameters that must be specified by `clickhouse-jdbc` , users can also specify multiple optional parameters, which cover all the [parameters](https://github.com/ClickHouse/clickhouse-jdbc/tree/master/clickhouse-client#configuration) provided by `clickhouse-jdbc`. | +| bulk_size | String | No | 20000 | The number of rows written through [Clickhouse-jdbc](https://github.com/ClickHouse/clickhouse-jdbc) each time, the `default is 20000`. | +| split_mode | String | No | false | This mode only support clickhouse table which engine is 'Distributed'.And `internal_replication` option-should be `true`.They will split distributed table data in seatunnel and perform write directly on each shard. The shard weight define is clickhouse will counted. | +| sharding_key | String | No | - | When use split_mode, which node to send data to is a problem, the default is random selection, but the 'sharding_key' parameter can be used to specify the field for the sharding algorithm. This option only worked when 'split_mode' is true. | +| primary_key | String | No | - | Mark the primary key column from clickhouse table, and based on primary key execute INSERT/UPDATE/DELETE to clickhouse table. | +| support_upsert | Boolean | No | false | Support upsert row by query primary key. | +| allow_experimental_lightweight_delete | Boolean | No | false | Allow experimental lightweight delete based on `*MergeTree` table engine. | +| common-options | | No | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. 
| + +## How to Create a Clickhouse Data Synchronization Jobs + +The following example demonstrates how to create a data synchronization job that writes randomly generated data to a Clickhouse database: + +```bash +# Set the basic configuration of the task to be performed +env { + execution.parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 1000 +} -Allow experimental lightweight delete based on `*MergeTree` table engine +source { + FakeSource { + row.num = 2 + bigint.min = 0 + bigint.max = 10000000 + split.num = 1 + split.read-interval = 300 + schema { + fields { + c_bigint = bigint + } + } + } +} -### common options +sink { + Clickhouse { + host = "127.0.0.1:9092" + database = "default" + table = "test" + username = "xxxxx" + password = "xxxxx" + } +} +``` -Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details +### Tips -## Examples +> 1.[SeaTunnel Deployment Document](../../start-v2/locally/deployment.md).
+> 2. The table you are writing to must be created in ClickHouse before synchronization starts.<br/>
+> 3. When the sink writes to the ClickHouse table, you don't need to set its schema, because the connector queries ClickHouse for the current table's schema before writing.<br/>
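As a companion to tip 2, below is a minimal, hypothetical sketch of creating the `default.test` table used in the job example above before running it. It assumes `clickhouse-client` is available and the server is reachable on its default native port; the `MergeTree` engine and ordering key are illustrative choices, not requirements of the connector:

```bash
# Hypothetical pre-step (not part of the connector docs): create the target table
# before running the synchronization job. Adjust engine, columns and ORDER BY to your design.
clickhouse-client --host 127.0.0.1 --query "
  CREATE TABLE IF NOT EXISTS default.test
  (
      c_bigint Int64
  )
  ENGINE = MergeTree()
  ORDER BY c_bigint
"
```

Once the table exists, the connector reads its schema at runtime, which is why the sink block above does not declare one.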
-Simple +## Clickhouse Sink Config ```hocon sink { @@ -98,9 +113,9 @@ sink { host = "localhost:8123" database = "default" table = "fake_all" - username = "default" - password = "" - clickhouse.confg = { + username = "xxxxx" + password = "xxxxx" + clickhouse.config = { max_rows_to_read = "100" read_overflow_mode = "throw" } @@ -108,7 +123,7 @@ sink { } ``` -Split mode +## Split Mode ```hocon sink { @@ -116,8 +131,8 @@ sink { host = "localhost:8123" database = "default" table = "fake_all" - username = "default" - password = "" + username = "xxxxx" + password = "xxxxx" # split mode options split_mode = true @@ -126,7 +141,7 @@ sink { } ``` -CDC(Change data capture) +## CDC(Change data capture) Sink ```hocon sink { @@ -134,8 +149,8 @@ sink { host = "localhost:8123" database = "default" table = "fake_all" - username = "default" - password = "" + username = "xxxxx" + password = "xxxxx" # cdc options primary_key = "id" @@ -144,7 +159,7 @@ sink { } ``` -CDC(Change data capture) for *MergeTree engine +## CDC(Change data capture) for *MergeTree engine ```hocon sink { @@ -152,8 +167,8 @@ sink { host = "localhost:8123" database = "default" table = "fake_all" - username = "default" - password = "" + username = "xxxxx" + password = "xxxxx" # cdc options primary_key = "id" @@ -163,21 +178,3 @@ sink { } ``` -## Changelog - -### 2.2.0-beta 2022-09-26 - -- Add ClickHouse Sink Connector - -### 2.3.0-beta 2022-10-20 - -- [Improve] Clickhouse Support Int128,Int256 Type ([3067](https://github.com/apache/seatunnel/pull/3067)) - -### next version - -- [Improve] Clickhouse Sink support nest type and array type([3047](https://github.com/apache/seatunnel/pull/3047)) -- [Improve] Clickhouse Sink support geo type([3141](https://github.com/apache/seatunnel/pull/3141)) -- [Feature] Support CDC write DELETE/UPDATE/INSERT events ([3653](https://github.com/apache/seatunnel/pull/3653)) -- [Improve] Remove Clickhouse Fields Config ([3826](https://github.com/apache/seatunnel/pull/3826)) -- [Improve] Change Connector Custom Config Prefix To Map [3719](https://github.com/apache/seatunnel/pull/3719) - diff --git a/docs/en/connector-v2/sink/CosFile.md b/docs/en/connector-v2/sink/CosFile.md new file mode 100644 index 00000000000..563b174c3c8 --- /dev/null +++ b/docs/en/connector-v2/sink/CosFile.md @@ -0,0 +1,259 @@ +# CosFile + +> Cos file sink connector + +## Description + +Output data to cos file system. + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and cos_api-bundle-{version}.jar in ${SEATUNNEL_HOME}/lib dir, download: [Hadoop-Cos-release](https://github.com/tencentyun/hadoop-cos/releases). It only supports hadoop version 2.6.5+ and version 8.0.2+. 
+ +::: + +## Key features + +- [x] [exactly-once](../../concept/connector-v2-features.md) + +By default, we use 2PC commit to ensure `exactly-once` + +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + +## Options + +| name | type | required | default value | remarks | +|----------------------------------|---------|----------|--------------------------------------------|-----------------------------------------------------------| +| path | string | yes | - | | +| bucket | string | yes | - | | +| secret_id | string | yes | - | | +| secret_key | string | yes | - | | +| region | string | yes | - | | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format is text | +| row_delimiter | string | no | "\n" | Only used when file_format is text | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Only used then have_partition is true | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | +| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | | +| batch_size | int | no | 1000000 | | +| compress_codec | string | no | none | | +| common-options | object | no | - | | +| max_rows_in_memory | int | no | - | Only used when file_format is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | + +### path [string] + +The target dir path is required. + +### bucket [string] + +The bucket address of cos file system, for example: `cosn://seatunnel-test-1259587829` + +### secret_id [string] + +The secret id of cos file system. + +### secret_key [string] + +The secret key of cos file system. + +### region [string] + +The region of cos file system. + +### custom_filename [boolean] + +Whether custom the filename + +### file_name_expression [string] + +Only used when `custom_filename` is `true` + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### filename_time_format [string] + +Only used when `custom_filename` is `true` + +When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . 
The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +### file_format_type [string] + +We supported as the following file types: + +`text` `json` `csv` `orc` `parquet` `excel` + +Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` file format. + +### have_partition [boolean] + +Whether you need processing partitions. + +### partition_by [array] + +Only used when `have_partition` is `true`. + +Partition data based on selected fields. + +### partition_dir_expression [string] + +Only used when `have_partition` is `true`. + +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. + +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. + +### is_partition_field_write_in_file [boolean] + +Only used when `have_partition` is `true`. + +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. + +For example, if you want to write a Hive Data File, Its value should be `false`. + +### sink_columns [array] + +Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. +The order of the fields determines the order in which the file is actually written. + +### is_enable_transaction [boolean] + +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +Only support `true` now. + +### batch_size [int] + +The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. + +### compress_codec [string] + +The compress codec of files and the details that supported as the following shown: + +- txt: `lzo` `none` +- json: `lzo` `none` +- csv: `lzo` `none` +- orc: `lzo` `snappy` `lz4` `zlib` `none` +- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` + +Tips: excel type does not support any compression format + +### common options + +Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. + +### max_rows_in_memory [int] + +When File Format is Excel,The maximum number of data items that can be cached in the memory. 
+ +### sheet_name [string] + +Writer the sheet of the workbook + +## Example + +For text file format with `have_partition` and `custom_filename` and `sink_columns` + +```hocon + + CosFile { + path="/sink" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction = true + } + +``` + +For parquet file format with `have_partition` and `sink_columns` + +```hocon + + CosFile { + path="/sink" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_format_type = "parquet" + sink_columns = ["name","age"] + } + +``` + +For orc file format simple config + +```bash + + CosFile { + path="/sink" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + file_format_type = "orc" + } + +``` + +## Changelog + +### next version + +- Add file cos sink connector ([4979](https://github.com/apache/seatunnel/pull/4979)) + diff --git a/docs/en/connector-v2/sink/DB2.md b/docs/en/connector-v2/sink/DB2.md new file mode 100644 index 00000000000..fc0aaca0943 --- /dev/null +++ b/docs/en/connector-v2/sink/DB2.md @@ -0,0 +1,170 @@ +# DB2 + +> JDBC DB2 Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +> Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------| +| DB2 | Different dependency version has different driver class. | com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [Download](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example DB2 datasource: cp db2-connector-java-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| DB2 Data type | SeaTunnel Data type | |------------------------------------------------------------------------------------------------------|---------------------| +| BOOLEAN | BOOLEAN | +| SMALLINT | SHORT | +| INT<br/>
INTEGER
| INTEGER | +| BIGINT | LONG | +| DECIMAL
DEC
NUMERIC
NUM | DECIMAL(38,18) | +| REAL | FLOAT | +| FLOAT
DOUBLE
DOUBLE PRECISION
DECFLOAT | DOUBLE | +| CHAR
VARCHAR
LONG VARCHAR
CLOB
GRAPHIC
VARGRAPHIC
LONG VARGRAPHIC
DBCLOB | STRING | +| BLOB | BYTES | +| DATE | DATE | +| TIME | TIME | +| TIMESTAMP | TIMESTAMP | +| ROWID
XML | Not supported yet | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use DB2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`; `query` has the higher priority | +| database | String | No | - | Use this `database` and `table-name` to auto-generate SQL and write upstream input data to the database.<br/>
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use the database and this table-name to auto-generate SQL and write upstream input data to the database.<br/>
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when SQL is automatically generated. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT and UPDATE SQL to process update events (INSERT, UPDATE_AFTER) based on whether the queried primary key exists. This configuration is only used when the database does not support upsert syntax. **Note**: this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed batches (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches `batch_size` or the time reaches `checkpoint.interval`<br/>
, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, DB2 is `com.db2.cj.jdbc.Db2XADataSource`, and
please refer to appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your DB2. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + user = "root" + password = "123456" + # Automatically generate sql statements based on database table names + generate_sink_sql = true + database = test + table = test_table + } +} +``` + +### Exactly-once : + +> For accurate write scene we guarantee accurate once + +``` +sink { + jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + + max_retries = 0 + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "com.db2.cj.jdbc.Db2XADataSource" + } +} +``` + diff --git a/docs/en/connector-v2/sink/Doris.md b/docs/en/connector-v2/sink/Doris.md index f586ac3bcca..6bf8dc5369c 100644 --- a/docs/en/connector-v2/sink/Doris.md +++ b/docs/en/connector-v2/sink/Doris.md @@ -2,11 +2,24 @@ > 
Doris sink connector +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + ## Description Used to send data to Doris. Both support streaming and batch mode. The internal implementation of Doris sink connector is cached and imported by stream load in batches. +## Supported DataSource Info + :::tip Version Supported @@ -17,67 +30,186 @@ Version Supported ::: -## Key features - -- [x] [exactly-once](../../concept/connector-v2-features.md) -- [x] [cdc](../../concept/connector-v2-features.md) - -## Options - -| name | type | required | default value | -|--------------------|--------|----------|---------------| -| fenodes | string | yes | - | -| username | string | yes | - | -| password | string | yes | - | -| table.identifier | string | yes | - | -| sink.label-prefix | string | yes | - | -| sink.enable-2pc | bool | no | true | -| sink.enable-delete | bool | no | false | -| doris.config | map | yes | - | - -### fenodes [string] - -`Doris` cluster fenodes address, the format is `"fe_ip:fe_http_port, ..."` - -### username [string] - -`Doris` user username - -### password [string] - -`Doris` user password +## Sink Options + +| Name | Type | Required | Default | Description | +|---------------------|--------|----------|------------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| fenodes | String | Yes | - | `Doris` cluster fenodes address, the format is `"fe_ip:fe_http_port, ..."` | +| username | String | Yes | - | `Doris` user username | +| password | String | Yes | - | `Doris` user password | +| table.identifier | String | Yes | - | The name of `Doris` table | +| sink.label-prefix | String | Yes | - | The label prefix used by stream load imports. In the 2pc scenario, global uniqueness is required to ensure the EOS semantics of SeaTunnel. | +| sink.enable-2pc | bool | No | - | Whether to enable two-phase commit (2pc), the default is true, to ensure Exactly-Once semantics. For two-phase commit, please refer to [here](https://doris.apache.org/docs/dev/sql-manual/sql-reference/Data-Manipulation-Statements/Load/STREAM-LOAD). | +| sink.enable-delete | bool | No | - | Whether to enable deletion. This option requires Doris table to enable batch delete function (0.15+ version is enabled by default), and only supports Unique model. you can get more detail at this [link](https://doris.apache.org/docs/dev/data-operate/update-delete/batch-delete-manual) | +| sink.check-interval | int | No | 10000 | check exception with the interval while loading | +| sink.max-retries | int | No | 3 | the max retry times if writing records to database failed | +| sink.buffer-size | int | No | 256 * 1024 | the buffer size to cache data for stream load. | +| sink.buffer-count | int | No | 3 | the buffer count to cache data for stream load. | +| doris.config | map | yes | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql,and supported formats. | + +## Data Type Mapping + +| Doris Data type | SeaTunnel Data type | +|-----------------|-----------------------------------------| +| BOOLEAN | BOOLEAN | +| TINYINT | TINYINT | +| SMALLINT | SMALLINT
TINYINT | +| INT | INT
SMALLINT
TINYINT | +| BIGINT | BIGINT
INT
SMALLINT
TINYINT | +| LARGEINT | BIGINT
INT
SMALLINT
TINYINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE
FLOAT | +| DECIMAL | DECIMAL
DOUBLE
FLOAT | +| DATE | DATE | +| DATETIME | TIMESTAMP | +| CHAR | STRING | +| VARCHAR | STRING | +| STRING | STRING | +| ARRAY | ARRAY | +| MAP | MAP | +| JSON | STRING | +| HLL | Not supported yet | +| BITMAP | Not supported yet | +| QUANTILE_STATE | Not supported yet | +| STRUCT | Not supported yet | -### table.identifier [string] - -The name of `Doris` table +#### Supported import data formats -### sink.label-prefix [string] +The supported formats include CSV and JSON -The label prefix used by stream load imports. In the 2pc scenario, global uniqueness is required to ensure the EOS semantics of SeaTunnel. +## Task Example -### sink.enable-2pc [bool] +### Simple: -Whether to enable two-phase commit (2pc), the default is true, to ensure Exactly-Once semantics. For two-phase commit, please refer to [here](https://doris.apache.org/docs/dev/sql-manual/sql-reference/Data-Manipulation-Statements/Load/STREAM-LOAD). +> The following example describes writing multiple data types to Doris, and users need to create corresponding tables downstream -### sink.enable-delete [bool] +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} -Whether to enable deletion. This option requires Doris table to enable batch delete function (0.15+ version is enabled by default), and only supports Unique model. you can get more detail at this link: +source { + FakeSource { + row.num = 10 + map.size = 10 + array.size = 10 + bytes.length = 10 + string.length = 10 + schema = { + fields { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} -https://doris.apache.org/docs/dev/data-operate/update-delete/batch-delete-manual +sink { + Doris { + fenodes = "doris_cdc_e2e:8030" + username = root + password = "" + table.identifier = "test.e2e_table_sink" + sink.label-prefix = "test-cdc" + sink.enable-2pc = "true" + sink.enable-delete = "true" + doris.config { + format = "json" + read_json_by_line = "true" + } + } +} +``` -### doris.config [map] +### CDC(Change Data Capture) Event: -The parameter of the stream load `data_desc`, you can get more detail at this link: +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Doris Sink,FakeSource simulates CDC data with schema, score (int type),Doris needs to create a table sink named test.e2e_table_sink and a corresponding table for it. 
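The CDC job below assumes the `test.e2e_table_sink` table already exists. The following is a hedged, illustrative sketch of how such a Unique-model table might be created in Doris beforehand; the FE MySQL-protocol port (9030), the column sizes, the bucket count and the replication property are assumptions to adapt to your cluster:

```bash
# Hypothetical pre-step (not part of the connector docs): create the Unique-model target table
# for the CDC example. Assumes the Doris FE accepts MySQL-protocol connections on port 9030
# with user root and an empty password, matching the sink config in the example.
mysql -h doris_cdc_e2e -P 9030 -uroot -e '
  CREATE DATABASE IF NOT EXISTS test;
  CREATE TABLE IF NOT EXISTS test.e2e_table_sink (
      pk_id       BIGINT,
      name        VARCHAR(64),
      score       INT,
      sex         BOOLEAN,
      number      TINYINT,
      height      FLOAT,
      sight       DOUBLE,
      create_time DATE,
      update_time DATETIME
  )
  UNIQUE KEY(pk_id)
  DISTRIBUTED BY HASH(pk_id) BUCKETS 1
  PROPERTIES ("replication_allocation" = "tag.location.default: 1");
'
```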
-https://doris.apache.org/docs/dev/sql-manual/sql-reference/Data-Manipulation-Statements/Load/STREAM-LOAD +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} -#### Supported import data formats +source { + FakeSource { + schema = { + fields { + pk_id = bigint + name = string + score = int + sex = boolean + number = tinyint + height = float + sight = double + create_time = date + update_time = timestamp + } + } + rows = [ + { + kind = INSERT + fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = INSERT + fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = INSERT + fields = [3, "C", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = UPDATE_BEFORE + fields = [1, "A", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = UPDATE_AFTER + fields = [1, "A_1", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + }, + { + kind = DELETE + fields = [2, "B", 100, true, 1, 170.0, 4.3, "2020-02-02", "2020-02-02T02:02:02"] + } + ] + } +} -The supported formats include CSV and JSON. Default value: CSV +sink { + Doris { + fenodes = "doris_cdc_e2e:8030" + username = root + password = "" + table.identifier = "test.e2e_table_sink" + sink.label-prefix = "test-cdc" + sink.enable-2pc = "true" + sink.enable-delete = "true" + doris.config { + format = "json" + read_json_by_line = "true" + } + } +} -## Example +``` -Use JSON format to import data +### Use JSON format to import data ``` sink { @@ -97,7 +229,7 @@ sink { ``` -Use CSV format to import data +### Use CSV format to import data ``` sink { diff --git a/docs/en/connector-v2/sink/FtpFile.md b/docs/en/connector-v2/sink/FtpFile.md index b92bcd7fcc3..8b3214e44b3 100644 --- a/docs/en/connector-v2/sink/FtpFile.md +++ b/docs/en/connector-v2/sink/FtpFile.md @@ -40,9 +40,9 @@ By default, we use 2PC commit to ensure `exactly-once` | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | -| file_format | string | no | "csv" | | -| field_delimiter | string | no | '\001' | Only used when file_format is text | -| row_delimiter | string | no | "\n" | Only used when file_format is text | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | | have_partition | boolean | no | false | Whether you need processing partitions. | | partition_by | array | no | - | Only used then have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | @@ -52,8 +52,8 @@ By default, we use 2PC commit to ensure `exactly-once` | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | -| max_rows_in_memory | int | no | - | Only used when file_format is excel. | -| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. 
| ### host [string] @@ -103,13 +103,13 @@ When the format in the `file_name_expression` parameter is `xxxx-${now}` , `file | m | Minute in hour | | s | Second in minute | -### file_format [string] +### file_format_type [string] We supported as the following file types: `text` `json` `csv` `orc` `parquet` `excel` -Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] @@ -198,7 +198,7 @@ FtpFile { username = "username" password = "password" path = "/data/ftp" - file_format = "text" + file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" sink_columns = ["name","age"] @@ -216,7 +216,7 @@ FtpFile { username = "username" password = "password" path = "/data/ftp" - file_format = "text" + file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true diff --git a/docs/en/connector-v2/sink/HdfsFile.md b/docs/en/connector-v2/sink/HdfsFile.md index 1e094a5e573..135c5115c2a 100644 --- a/docs/en/connector-v2/sink/HdfsFile.md +++ b/docs/en/connector-v2/sink/HdfsFile.md @@ -1,20 +1,14 @@ # HdfsFile -> HDFS file sink connector +> HDFS File Sink Connector -## Description - -Output data to hdfs file - -:::tip - -If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. - -If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. +## Support Those Engines -::: +> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features +## Key Features - [x] [exactly-once](../../concept/connector-v2-features.md) @@ -30,183 +24,120 @@ By default, we use 2PC commit to ensure `exactly-once` - [x] compress codec - [x] lzo -## Options - -| name | type | required | default value | remarks | -|----------------------------------|---------|----------|--------------------------------------------|-----------------------------------------------------------| -| fs.defaultFS | string | yes | - | | -| path | string | yes | - | | -| hdfs_site_path | string | no | - | | -| custom_filename | boolean | no | false | Whether you need custom the filename | -| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | -| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | -| file_format_type | string | no | "csv" | | -| field_delimiter | string | no | '\001' | Only used when file_format is text | -| row_delimiter | string | no | "\n" | Only used when file_format is text | -| have_partition | boolean | no | false | Whether you need processing partitions. | -| partition_by | array | no | - | Only used then have_partition is true | -| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | -| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true | -| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | -| is_enable_transaction | boolean | no | true | | -| batch_size | int | no | 1000000 | | -| compress_codec | string | no | none | | -| kerberos_principal | string | no | - | -| kerberos_keytab_path | string | no | - | | -| compress_codec | string | no | none | | -| common-options | object | no | - | | -| max_rows_in_memory | int | no | - | Only used when file_format is excel. | -| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | - -### fs.defaultFS [string] - -The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster` - -### path [string] - -The target dir path is required. - -### hdfs_site_path [string] - -The path of `hdfs-site.xml`, used to load ha configuration of namenodes - -### custom_filename [boolean] - -Whether custom the filename - -### file_name_expression [string] - -Only used when `custom_filename` is `true` - -`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, -`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. - -Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. - -### filename_time_format [string] - -Only used when `custom_filename` is `true` - -When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . 
The commonly used time formats are listed as follows: - -| Symbol | Description | -|--------|--------------------| -| y | Year | -| M | Month | -| d | Day of month | -| H | Hour in day (0-23) | -| m | Minute in hour | -| s | Second in minute | - -### file_format_type [string] - -We supported as the following file types: - -`text` `json` `csv` `orc` `parquet` `excel` - -Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. - -### field_delimiter [string] - -The separator between columns in a row of data. Only needed by `text` file format. - -### row_delimiter [string] - -The separator between rows in a file. Only needed by `text` file format. - -### have_partition [boolean] - -Whether you need processing partitions. - -### partition_by [array] - -Only used when `have_partition` is `true`. - -Partition data based on selected fields. - -### partition_dir_expression [string] - -Only used when `have_partition` is `true`. - -If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. - -Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. - -### is_partition_field_write_in_file [boolean] - -Only used when `have_partition` is `true`. - -If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file. - -For example, if you want to write a Hive Data File, Its value should be `false`. - -### sink_columns [array] - -Which columns need be write to file, default value is all of the columns get from `Transform` or `Source`. -The order of the fields determines the order in which the file is actually written. - -### is_enable_transaction [boolean] - -If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. - -Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. - -Only support `true` now. - -### batch_size [int] - -The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. - -### compress_codec [string] - -The compress codec of files and the details that supported as the following shown: - -- txt: `lzo` `none` -- json: `lzo` `none` -- csv: `lzo` `none` -- orc: `lzo` `snappy` `lz4` `zlib` `none` -- parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none` - -Tips: excel type does not support any compression format - -### kerberos_principal [string] - -The principal of kerberos - -### kerberos_keytab_path [string] - -The keytab path of kerberos - -### common options - -Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details +## Description -### max_rows_in_memory [int] +Output data to hdfs file -When File Format is Excel,The maximum number of data items that can be cached in the memory. 
+## Supported DataSource Info + +| Datasource | Supported Versions | +|------------|--------------------| +| HdfsFile | hadoop 2.x and 3.x | + +## Sink Options + +| Name | Type | Required | Default | Description | +|----------------------------------|---------|----------|--------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| fs.defaultFS | string | yes | - | The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster` | +| path | string | yes | - | The target dir path is required. | +| hdfs_site_path | string | no | - | The path of `hdfs-site.xml`, used to load ha configuration of namenodes | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when `custom_filename` is `true`.`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`,`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`.Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when `custom_filename` is `true`.When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows:[y:Year,M:Month,d:Day of month,H:Hour in day (0-23),m:Minute in hour,s:Second in minute] | +| file_format_type | string | no | "csv" | We supported as the following file types:`text` `json` `csv` `orc` `parquet` `excel`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. | +| field_delimiter | string | no | '\001' | Only used when file_format is text,The separator between columns in a row of data. Only needed by `text` file format. | +| row_delimiter | string | no | "\n" | Only used when file_format is text,The separator between rows in a file. Only needed by `text` file format. | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Only used then have_partition is true,Partition data based on selected fields. | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true,If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. | +| is_partition_field_write_in_file | boolean | no | false | Only used when `have_partition` is `true`. 
If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be write into data file.For example, if you want to write a Hive Data File, Its value should be `false`. | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns.Which columns need be write to file, default value is all of the columns get from `Transform` or `Source`. The order of the fields determines the order in which the file is actually written. | +| is_enable_transaction | boolean | no | true | If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory.Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file.Only support `true` now. | +| batch_size | int | no | 1000000 | The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. | +| compress_codec | string | no | none | The compress codec of files and the details that supported as the following shown:[txt: `lzo` `none`,json: `lzo` `none`,csv: `lzo` `none`,orc: `lzo` `snappy` `lz4` `zlib` `none`,parquet: `lzo` `snappy` `lz4` `gzip` `brotli` `zstd` `none`].Tips: excel type does not support any compression format. | +| kerberos_principal | string | no | - | The principal of kerberos | +| kerberos_keytab_path | string | no | - | The keytab path of kerberos | +| compress_codec | string | no | none | compress codec | +| common-options | object | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | +| max_rows_in_memory | int | no | - | Only used when file_format is excel.When File Format is Excel,The maximum number of data items that can be cached in the memory. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel.Writer the sheet of the workbook | + +### Tips + +> If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Hdfs. 
-### sheet_name [string] +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} -Writer the sheet of the workbook +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(30, 8)" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} -## Example +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} -For orc file format simple config +sink { + HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format = "orc" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` -```bash +### For orc file format simple config +``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" file_format = "orc" } - ``` -For text file format with `have_partition` and `custom_filename` and `sink_columns` - -```bash +### For text file format with `have_partition` and `custom_filename` and `sink_columns` +``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" @@ -223,13 +154,11 @@ HdfsFile { sink_columns = ["name","age"] is_enable_transaction = true } - ``` -For parquet file format with `have_partition` and `custom_filename` and `sink_columns` - -```bash +### For parquet file format with `have_partition` and `custom_filename` and `sink_columns` +``` HdfsFile { fs.defaultFS = "hdfs://hadoopcluster" path = "/tmp/hive/warehouse/test2" @@ -244,32 +173,27 @@ HdfsFile { sink_columns = ["name","age"] is_enable_transaction = true } - ``` -## Changelog +### For kerberos simple config -### 2.2.0-beta 2022-09-26 - -- Add HDFS File Sink Connector - -### 2.3.0-beta 2022-10-20 - -- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980)) -- [BugFix] Fix filesystem get error ([3117](https://github.com/apache/seatunnel/pull/3117)) -- [BugFix] Solved the bug of can not parse '\t' as delimiter from config file ([3083](https://github.com/apache/seatunnel/pull/3083)) - -### 2.3.0 2022-12-30 - -- [BugFix] Fixed the following bugs that failed to write data to files ([3258](https://github.com/apache/seatunnel/pull/3258)) - - When field from upstream is null it will throw NullPointerException - - Sink columns mapping failed - - When restore writer from states getting transaction directly failed +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + hdfs_site_path = "/path/to/your/hdfs_site_path" + kerberos_principal = "your_principal@EXAMPLE.COM" + kerberos_keytab_path = "/path/to/your/keytab/file.keytab" +} +``` -### Next version +### For compress simple config -- [Improve] Support setting batch size 
for every file ([3625](https://github.com/apache/seatunnel/pull/3625)) -- [Improve] Support lzo compression for text in file format ([3782](https://github.com/apache/seatunnel/pull/3782)) -- [Improve] Support kerberos authentication ([3840](https://github.com/apache/seatunnel/pull/3840)) -- [Improve] Support file compress ([3899](https://github.com/apache/seatunnel/pull/3899)) +``` +HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + compress_codec = "lzo" +} +``` diff --git a/docs/en/connector-v2/sink/Jdbc.md b/docs/en/connector-v2/sink/Jdbc.md index 1d05714059b..755de8bb9a7 100644 --- a/docs/en/connector-v2/sink/Jdbc.md +++ b/docs/en/connector-v2/sink/Jdbc.md @@ -9,9 +9,9 @@ semantics (using XA transaction guarantee). :::tip -Warn: for license compliance, you have to provide database driver yourself, copy to `$SEATNUNNEL_HOME/plugins/jdbc/lib/` directory in order to make them work. +Warn: for license compliance, you have to provide database driver yourself, copy to `$SEATNUNNEL_HOME/lib/` directory in order to make them work. -e.g. If you use MySQL, should download and copy `mysql-connector-java-xxx.jar` to `$SEATNUNNEL_HOME/plugins/jdbc/lib/` +e.g. If you use MySQL, should download and copy `mysql-connector-java-xxx.jar` to `$SEATNUNNEL_HOME/lib/`. For Spark/Flink, you should also copy it to `$SPARK_HOME/jars/` or `$FLINK_HOME/lib/`. ::: @@ -26,28 +26,28 @@ support `Xa transactions`. You can set `is_exactly_once=true` to enable it. ## Options -| name | type | required | default value | -|------------------------------|---------|----------|---------------| -| url | String | Yes | - | -| driver | String | Yes | - | -| user | String | No | - | -| password | String | No | - | -| query | String | No | - | -| database | String | No | - | -| table | String | No | - | -| primary_keys | Array | No | - | -| enable_upsert | Boolean | No | false | -| isPrimaryKeyUpdated | Boolean | No | true | -| connection_check_timeout_sec | Int | No | 30 | -| max_retries | Int | No | 0 | -| batch_size | Int | No | 1000 | -| batch_interval_ms | Int | No | 1000 | -| is_exactly_once | Boolean | No | false | -| xa_data_source_class_name | String | No | - | -| max_commit_attempts | Int | No | 3 | -| transaction_timeout_sec | Int | No | -1 | -| auto_commit | Boolean | No | true | -| common-options | | no | - | +| name | type | required | default value | +|-------------------------------------------|---------|----------|---------------| +| url | String | Yes | - | +| driver | String | Yes | - | +| user | String | No | - | +| password | String | No | - | +| query | String | No | - | +| compatible_mode | String | No | - | +| database | String | No | - | +| table | String | No | - | +| primary_keys | Array | No | - | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | +| connection_check_timeout_sec | Int | No | 30 | +| max_retries | Int | No | 0 | +| batch_size | Int | No | 1000 | +| is_exactly_once | Boolean | No | false | +| generate_sink_sql | Boolean | No | false | +| xa_data_source_class_name | String | No | - | +| max_commit_attempts | Int | No | 3 | +| transaction_timeout_sec | Int | No | -1 | +| auto_commit | Boolean | No | true | +| common-options | | no | - | ### driver [string] @@ -69,6 +69,12 @@ The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost/tes Use this sql write upstream input datas to database. 
e.g `INSERT ...` +### compatible_mode [string] + +The compatible mode of database, required when the database supports multiple compatible modes. For example, when using OceanBase database, you need to set it to 'mysql' or 'oracle'. + +Postgres 9.5 version or below,please set it to `postgresLow` to support cdc + ### database [string] Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database. @@ -85,15 +91,10 @@ This option is mutually exclusive with `query` and has a higher priority. This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. -### enable_upsert [boolean] - -Choose to use INSERT/UPDATE or UPSERT sql to process update events(INSERT, UPDATE_AFTER) based on `primary_keys` exists. +### support_upsert_by_query_primary_key_exist [boolean] -**Note**: that this method has low performance on database not support UPSERT sql - -### isPrimaryKeyUpdated [boolean] - -When executing the update statement, consider whether the primary key is updated or not. Note that this parameter does not take effect when directly configuring the `query`. +Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupported upsert syntax. +**Note**: that this method has low performance ### connection_check_timeout_sec [int] @@ -105,12 +106,7 @@ The number of retries to submit failed (executeBatch) ### batch_size[int] -For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms` -, the data will be flushed into the database - -### batch_interval_ms[int] - -For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms` +For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval` , the data will be flushed into the database ### is_exactly_once[boolean] @@ -118,6 +114,10 @@ For batch writing, when the number of buffers reaches the number of `batch_size` Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to set `xa_data_source_class_name`. +### generate_sink_sql[boolean] + +Generate sql statements based on the database table you want to write to + ### xa_data_source_class_name[string] The xa data source class name of the database Driver, for example, mysql is `com.mysql.cj.jdbc.MysqlXADataSource`, and @@ -169,6 +169,7 @@ there are some reference value for params above. 
| Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb | com.amazon.redshift.xa.RedshiftXADataSource | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 | | Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | / | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc | | Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | / | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar | +| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | / | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.3/oceanbase-client-2.4.3.jar | ## Example @@ -221,6 +222,26 @@ sink { } ``` +Postgresql 9.5 version below support CDC(Change data capture) event + +``` +sink { + jdbc { + url = "jdbc:postgresql://localhost:5432" + driver = "org.postgresql.Driver" + user = "root" + password = "123456" + compatible_mode="postgresLow" + database = "sink_database" + table = "sink_table" + support_upsert_by_query_primary_key_exist = true + generate_sink_sql = true + primary_keys = ["key1", "key2", ...] + } +} + +``` + ## Changelog ### 2.2.0-beta 2022-09-26 diff --git a/docs/en/connector-v2/sink/Kafka.md b/docs/en/connector-v2/sink/Kafka.md index f971e5390b0..1e258a058ad 100644 --- a/docs/en/connector-v2/sink/Kafka.md +++ b/docs/en/connector-v2/sink/Kafka.md @@ -1,36 +1,52 @@ # Kafka > Kafka sink connector -> - ## Description -Write Rows to a Kafka topic. +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features +## Key Features - [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +> By default, we will use 2pc to guarantee the message is sent to kafka exactly once. + +## Description + +Write Rows to a Kafka topic. -By default, we will use 2pc to guarantee the message is sent to kafka exactly once. +## Supported DataSource Info -## Options +In order to use the Kafka connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. -| name | type | required | default value | -|----------------------|--------|----------|---------------| -| topic | string | yes | - | -| bootstrap.servers | string | yes | - | -| kafka.config | map | no | - | -| semantics | string | no | NON | -| partition_key_fields | array | no | - | -| partition | int | no | - | -| assign_partitions | array | no | - | -| transaction_prefix | string | no | - | -| format | String | no | json | -| field_delimiter | String | no | , | -| common-options | config | no | - | +| Datasource | Supported Versions | Maven | +|------------|--------------------|-------------------------------------------------------------------------------------------------------------| +| Kafka | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-kafka) | -### topic [string] +## Sink Options -Kafka Topic. +| Name | Type | Required | Default | Description | +|----------------------|--------|----------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| topic | String | Yes | - | When the table is used as sink, the topic name is the topic to write data to. | +| bootstrap.servers | String | Yes | - | Comma separated list of Kafka brokers. | +| kafka.config | Map | No | - | In addition to the above parameters that must be specified by the `Kafka producer` client, the user can also specify multiple non-mandatory parameters for the `producer` client, covering [all the producer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#producerconfigs). | +| semantics | String | No | NON | Semantics that can be chosen EXACTLY_ONCE/AT_LEAST_ONCE/NON, default NON. | +| partition_key_fields | Array | No | - | Configure which fields are used as the key of the kafka message. | +| partition | Int | No | - | We can specify the partition, all messages will be sent to this partition. | +| assign_partitions | Array | No | - | We can decide which partition to send based on the content of the message. The function of this parameter is to distribute information. | +| transaction_prefix | String | No | - | If semantic is specified as EXACTLY_ONCE, the producer will write all messages in a Kafka transaction,kafka distinguishes different transactions by different transactionId. This parameter is prefix of kafka transactionId, make sure different job use different prefix. | +| format | String | No | json | Data format. The default format is json. Optional text format, canal-json and debezium-json.If you use json or text format. The default field separator is ", ". 
If you customize the delimiter, add the "field_delimiter" option.If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details.If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. | +| field_delimiter | String | No | , | Customize the field delimiter for data format. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Parameter Interpretation + +### Topic Formats Currently two formats are supported: @@ -47,27 +63,13 @@ Currently two formats are supported: If `${name}` is set as the topic. So the first row is sent to Jack topic, and the second row is sent to Mary topic. -### bootstrap.servers [string] - -Kafka Brokers List. - -### kafka.config [kafka producer config] - -In addition to the above parameters that must be specified by the `Kafka producer` client, the user can also specify multiple non-mandatory parameters for the `producer` client, covering [all the producer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#producerconfigs). - -### semantics [string] - -Semantics that can be chosen EXACTLY_ONCE/AT_LEAST_ONCE/NON, default NON. +### Semantics In EXACTLY_ONCE, producer will write all messages in a Kafka transaction that will be committed to Kafka on a checkpoint. - In AT_LEAST_ONCE, producer will wait for all outstanding messages in the Kafka buffers to be acknowledged by the Kafka producer on a checkpoint. - NON does not provide any guarantees: messages may be lost in case of issues on the Kafka broker and messages may be duplicated. -### partition_key_fields [array] - -Configure which fields are used as the key of the kafka message. +### Partition Key Fields For example, if you want to use value of fields from upstream data as key, you can assign field names to this property. @@ -79,55 +81,48 @@ Upstream data is the following: | Mary | 23 | data-example2 | If name is set as the key, then the hash value of the name column will determine which partition the message is sent to. - If not set partition key fields, the null message key will be sent to. - The format of the message key is json, If name is set as the key, for example '{"name":"Jack"}'. - The selected field must be an existing field in the upstream. -### partition [int] - -We can specify the partition, all messages will be sent to this partition. - -### assign_partitions [array] - -We can decide which partition to send based on the content of the message. The function of this parameter is to distribute information. +### Assign Partitions For example, there are five partitions in total, and the assign_partitions field in config is as follows: assign_partitions = ["shoe", "clothing"] - Then the message containing "shoe" will be sent to partition zero ,because "shoe" is subscribed as zero in assign_partitions, and the message containing "clothing" will be sent to partition one.For other messages, the hash algorithm will be used to divide them into the remaining partitions. - This function by `MessageContentPartitioner` class implements `org.apache.kafka.clients.producer.Partitioner` interface.If we need custom partitions, we need to implement this interface as well. -### transaction_prefix [string] - -If semantic is specified as EXACTLY_ONCE, the producer will write all messages in a Kafka transaction. -Kafka distinguishes different transactions by different transactionId. 
This parameter is prefix of kafka transactionId, make sure different job use different prefix. - -### format +## Task Example -Data format. The default format is json. Optional text format, canal-json and debezium-json. -If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option. -If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details. -If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. +### Simple: -### field_delimiter +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to Kafka Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target topic is test_topic will also be 16 rows of data in the topic. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. -Customize the field delimiter for data format. - -### common options [config] - -Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details. +```hocon +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} -## Examples +source { + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } +} -```hocon sink { - kafka { - topic = "seatunnel" + topic = "test_topic" bootstrap.servers = "localhost:9092" partition = 3 format = json @@ -139,7 +134,6 @@ sink { buffer.memory = 33554432 } } - } ``` @@ -162,7 +156,6 @@ sink { sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};" } } - } ``` @@ -199,22 +192,6 @@ sink { sasl.client.callback.handler.class="software.amazon.msk.auth.iam.IAMClientCallbackHandler" } } - } ``` -## Changelog - -### 2.3.0-beta 2022-10-20 - -- Add Kafka Sink Connector - -### next version - -- [Improve] Support to specify multiple partition keys [3230](https://github.com/apache/incubator-seatunnel/pull/3230) -- [Improve] Add text format for kafka sink connector [3711](https://github.com/apache/incubator-seatunnel/pull/3711) -- [Improve] Support extract topic from SeaTunnelRow fields [3742](https://github.com/apache/incubator-seatunnel/pull/3742) -- [Improve] Change Connector Custom Config Prefix To Map [3719](https://github.com/apache/incubator-seatunnel/pull/3719) -- [Improve] Support read canal format message [3950](https://github.com/apache/incubator-seatunnel/pull/3950) -- [Improve] Support read debezium format message [3981](https://github.com/apache/incubator-seatunnel/pull/3981) - diff --git a/docs/en/connector-v2/sink/LocalFile.md b/docs/en/connector-v2/sink/LocalFile.md index fb008e909a9..8e2c1526e90 100644 --- a/docs/en/connector-v2/sink/LocalFile.md +++ b/docs/en/connector-v2/sink/LocalFile.md @@ -20,7 +20,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you By default, we use 2PC commit to ensure `exactly-once` -- [x] file format +- [x] file format type - [x] text - [x] csv - [x] parquet @@ -36,9 +36,9 @@ By 
default, we use 2PC commit to ensure `exactly-once` | custom_filename | boolean | no | false | Whether you need custom the filename | | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | -| file_format | string | no | "csv" | | -| field_delimiter | string | no | '\001' | Only used when file_format is text | -| row_delimiter | string | no | "\n" | Only used when file_format is text | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | | have_partition | boolean | no | false | Whether you need processing partitions. | | partition_by | array | no | - | Only used then have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | @@ -48,8 +48,8 @@ By default, we use 2PC commit to ensure `exactly-once` | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | -| max_rows_in_memory | int | no | - | Only used when file_format is excel. | -| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | ### path [string] @@ -83,13 +83,13 @@ When the format in the `file_name_expression` parameter is `xxxx-${now}` , `file | m | Minute in hour | | s | Second in minute | -### file_format [string] +### file_format_type [string] We supported as the following file types: `text` `json` `csv` `orc` `parquet` `excel` -Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. 
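+
+As an illustrative sketch (the output path below is hypothetical), switching to the `json` type only requires setting `file_format_type`; the generated files will then end with the `.json` suffix:
+
+```hocon
+LocalFile {
+  # Hypothetical output directory
+  path = "/tmp/seatunnel/json"
+  file_format_type = "json"
+}
+```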
### field_delimiter [string] @@ -174,7 +174,7 @@ For orc file format simple config LocalFile { path = "/tmp/hive/warehouse/test2" - file_format = "orc" + file_format_type = "orc" } ``` @@ -185,7 +185,7 @@ For parquet file format with `sink_columns` LocalFile { path = "/tmp/hive/warehouse/test2" - file_format = "parquet" + file_format_type = "parquet" sink_columns = ["name","age"] } @@ -197,7 +197,7 @@ For text file format with `have_partition` and `custom_filename` and `sink_colum LocalFile { path = "/tmp/hive/warehouse/test2" - file_format = "text" + file_format_type = "text" field_delimiter = "\t" row_delimiter = "\n" have_partition = true @@ -224,7 +224,7 @@ LocalFile { partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" - file_format="excel" + file_format_type="excel" filename_time_format="yyyy.MM.dd" is_enable_transaction=true } diff --git a/docs/en/connector-v2/sink/MongoDB.md b/docs/en/connector-v2/sink/MongoDB.md index 40355583cb1..c4cbad95ef3 100644 --- a/docs/en/connector-v2/sink/MongoDB.md +++ b/docs/en/connector-v2/sink/MongoDB.md @@ -11,7 +11,7 @@ Key Features ------------ -- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) - [x] [cdc](../../concept/connector-v2-features.md) **Tips** @@ -73,10 +73,12 @@ The following table lists the field data type mapping from MongoDB BSON type to | retry.interval | Duration | No | 1000 | Specifies the retry time interval if writing records to database failed, the unit is millisecond. | | upsert-enable | Boolean | No | false | Whether to write documents via upsert mode. | | primary-key | List | No | - | The primary keys for upsert/update. Keys are in `["id","name",...]` format for properties. | +| transaction | Boolean | No | false | Whether to use transactions in MongoSink (requires MongoDB 4.2+). | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | -**Tips** +### Tips -> 1.The data flushing logic of the MongoDB Sink Connector is jointly controlled by three parameters: `buffer-flush.max-rows`, `buffer-flush.interval`, and `checkpoint.interval`. +> 1.The data flushing logic of the MongoDB Sink Connector is jointly controlled by three parameters: `buffer-flush.max-rows`, `buffer-flush.interval`, and `checkpoint.interval`.
> Data flushing will be triggered if any of these conditions are met.
> 2.Compatible with the historical parameter `upsert-key`. If `upsert-key` is set, please do not set `primary-key`.
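+
+For instance, a minimal upsert-style sink that also enables the newly documented `transaction` option could look roughly like the sketch below (connection values, database/collection names and key fields are illustrative only):
+
+```hocon
+sink {
+  MongoDB {
+    # Hypothetical connection settings - replace with your own deployment
+    uri = "mongodb://user:password@127.0.0.1:27017"
+    database = "test_db"
+    collection = "users"
+    # Write via upsert keyed on the listed field(s)
+    upsert-enable = true
+    primary-key = ["user_id"]
+    # Requires MongoDB 4.2 or later, as noted in the options table above
+    transaction = true
+  }
+}
+```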
diff --git a/docs/en/connector-v2/sink/Mysql.md b/docs/en/connector-v2/sink/Mysql.md index 92254c1b54f..55c825ed168 100644 --- a/docs/en/connector-v2/sink/Mysql.md +++ b/docs/en/connector-v2/sink/Mysql.md @@ -67,8 +67,7 @@ semantics (using XA transaction guarantee). | support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | -| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms`
, the data will be flushed into the database | -| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | | generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | | xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, mysql is `com.mysql.cj.jdbc.MysqlXADataSource`, and
please refer to appendix for other data sources | diff --git a/docs/en/connector-v2/sink/OceanBase.md b/docs/en/connector-v2/sink/OceanBase.md new file mode 100644 index 00000000000..3cea0b5e6e6 --- /dev/null +++ b/docs/en/connector-v2/sink/OceanBase.md @@ -0,0 +1,185 @@ +# OceanBase + +> JDBC OceanBase Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once semantics. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|--------------------------------|---------------------------|--------------------------------------|-------------------------------------------------------------------------------| +| OceanBase | All OceanBase server versions. | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2883/test | [Download](https://mvnrepository.com/artifact/com.oceanbase/oceanbase-client) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example: cp oceanbase-client-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +### Mysql Mode + +| Mysql Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| BIT(1)
INT UNSIGNED | BOOLEAN | +| TINYINT
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | +| INT UNSIGNED
INTEGER UNSIGNED
BIGINT | BIGINT | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to right of the decimal point.)) |
+| FLOAT
FLOAT UNSIGNED | FLOAT | +| DOUBLE
DOUBLE UNSIGNED | DOUBLE | +| CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
TIMESTAMP | TIMESTAMP | +| TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
VARBINARY
BIT(n) | BYTES | +| GEOMETRY
UNKNOWN | Not supported yet | + +### Oracle Mode + +| Oracle Data type | SeaTunnel Data type | +|-----------------------------------------------------------|---------------------| +| Number(p), p <= 9 | INT | +| Number(p), p <= 18 | BIGINT | +| Number(p), p > 18 | DECIMAL(38,18) | +| REAL
BINARY_FLOAT | FLOAT | +| BINARY_DOUBLE | DOUBLE | +| CHAR
NCHAR
NVARCHAR2
NCLOB
CLOB
ROWID | STRING | +| DATE | DATE | +| TIMESTAMP
TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | +| BLOB
RAW
LONG RAW
BFILE | BYTES | +| UNKNOWN | Not supported yet | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oceanbase://localhost:2883/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.oceanbase.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | +| compatible_mode | String | Yes | - | The compatible mode of OceanBase, can be 'mysql' or 'oracle'. | +| database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your mysql. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:oceanbase://localhost:2883/test" + driver = "com.oceanbase.jdbc.Driver" + user = "root" + password = "123456" + compatible_mode = "mysql" + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + jdbc { + url = "jdbc:oceanbase://localhost:2883/test" + driver = "com.oceanbase.jdbc.Driver" + user = "root" + password = "123456" + compatible_mode = "mysql" + # Automatically generate sql statements based on database table names + generate_sink_sql = true + database = test + table = test_table + } +} +``` + +### CDC(Change Data Capture) Event + +> CDC change data is also supported by us In this case, you need config database, table and primary_keys. 
+ +``` +sink { + jdbc { + url = "jdbc:oceanbase://localhost:3306/test" + driver = "com.oceanbase.jdbc.Driver" + user = "root" + password = "123456" + compatible_mode = "mysql" + generate_sink_sql = true + # You need to configure both database and table + database = test + table = sink_table + primary_keys = ["id","name"] + } +} +``` + diff --git a/docs/en/connector-v2/sink/Oracle.md b/docs/en/connector-v2/sink/Oracle.md new file mode 100644 index 00000000000..feda00b8159 --- /dev/null +++ b/docs/en/connector-v2/sink/Oracle.md @@ -0,0 +1,191 @@ +# Oracle + +> JDBC Oracle Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [cdc](../../concept/connector-v2-features.md) + +> Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| +| Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example Oracle datasource: cp ojdbc8-xxxxxx.jar $SEATUNNEL_HOME/lib/<br/>
> To support the i18n character set, copy the orai18n.jar to the $SEATUNNEL_HOME/lib/ directory.
+
+## Data Type Mapping
+
+| Oracle Data type | SeaTunnel Data type |
+|--------------------------------------------------------------------------------------|---------------------|
+| INTEGER | INT |
+| FLOAT | DECIMAL(38, 18) |
+| NUMBER(precision <= 9, scale == 0) | INT |
+| NUMBER(9 < precision <= 18, scale == 0) | BIGINT |
+| NUMBER(18 < precision, scale == 0) | DECIMAL(38, 0) |
+| NUMBER(scale != 0) | DECIMAL(38, 18) |
+| BINARY_DOUBLE | DOUBLE |
+| BINARY_FLOAT<br/>
REAL | FLOAT | +| CHAR
NCHAR
NVARCHAR2
VARCHAR2
LONG
ROWID
NCLOB
CLOB
| STRING | +| DATE | DATE | +| TIMESTAMP
TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | +| BLOB
RAW
LONG RAW
BFILE | BYTES |
+
+## Options
+
+| Name | Type | Required | Default | Description |
+|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------|
+| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oracle:thin:@datasource01:1523:xe |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,<br/>
if you use Oracle the value is `oracle.jdbc.OracleDriver`. |
+| user | String | No | - | Connection instance user name |
+| password | String | No | - | Connection instance password |
+| query | String | No | - | Use this SQL to write upstream input data to the database, e.g. `INSERT ...`. `query` has a higher priority |
+| database | String | No | - | Use this `database` and `table-name` to auto-generate SQL and write upstream input data to the database.<br/>
This option is mutually exclusive with `query` and has a higher priority. |
+| table | String | No | - | Use database and this table-name to auto-generate SQL and write upstream input data to the database.<br/>
This option is mutually exclusive with `query` and has a higher priority. |
+| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when SQL is automatically generated. |
+| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT or UPDATE SQL to process update events (INSERT, UPDATE_AFTER) based on whether the queried primary key exists. This configuration is only used when the database does not support upsert syntax. **Note**: this method has low performance |
+| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. |
+| max_retries | Int | No | 0 | The number of retries for a failed submit (executeBatch) |
+| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms`<br/>
, the data will be flushed into the database | +| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to. | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, Oracle is `oracle.jdbc.xa.client.OracleXADataSource`, and
please refer to appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your PostgreSQL. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + user = root + password = 123456 + query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + user = root + password = 123456 + + generate_sink_sql = true + database = XE + table = "TEST.TEST_TABLE" + } +} +``` + +### Exactly-once : + +> For accurate write scene we guarantee accurate once + +``` +sink { + jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + + max_retries = 0 + user = root + password = 123456 + query = "INSERT INTO TEST.TEST_TABLE(NAME,AGE) VALUES(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "oracle.jdbc.xa.client.OracleXADataSource" + } +} +``` + +### CDC(Change Data Capture) Event + +> CDC change data is also supported by us In this case, you need config database, table and primary_keys. 
+ +``` +sink { + jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + user = root + password = 123456 + + generate_sink_sql = true + # You need to configure both database and table + database = XE + table = "TEST.TEST_TABLE" + primary_keys = ["ID"] + } +} +``` + diff --git a/docs/en/connector-v2/sink/OssFile.md b/docs/en/connector-v2/sink/OssFile.md index d40cf4bf958..a3095ecfd1a 100644 --- a/docs/en/connector-v2/sink/OssFile.md +++ b/docs/en/connector-v2/sink/OssFile.md @@ -44,8 +44,8 @@ By default, we use 2PC commit to ensure `exactly-once` | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | -| field_delimiter | string | no | '\001' | Only used when file_format is text | -| row_delimiter | string | no | "\n" | Only used when file_format is text | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | | have_partition | boolean | no | false | Whether you need processing partitions. | | partition_by | array | no | - | Only used then have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | @@ -55,8 +55,8 @@ By default, we use 2PC commit to ensure `exactly-once` | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | -| max_rows_in_memory | int | no | - | Only used when file_format is excel. | -| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | ### path [string] @@ -112,7 +112,7 @@ We supported as the following file types: `text` `json` `csv` `orc` `parquet` `excel` -Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] diff --git a/docs/en/connector-v2/sink/OssJindoFile.md b/docs/en/connector-v2/sink/OssJindoFile.md index 02547f3aa6a..1d098da009c 100644 --- a/docs/en/connector-v2/sink/OssJindoFile.md +++ b/docs/en/connector-v2/sink/OssJindoFile.md @@ -44,8 +44,8 @@ By default, we use 2PC commit to ensure `exactly-once` | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | -| field_delimiter | string | no | '\001' | Only used when file_format is text | -| row_delimiter | string | no | "\n" | Only used when file_format is text | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | | have_partition | boolean | no | false | Whether you need processing partitions. 
| | partition_by | array | no | - | Only used then have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | @@ -55,8 +55,8 @@ By default, we use 2PC commit to ensure `exactly-once` | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | -| max_rows_in_memory | int | no | - | Only used when file_format is excel. | -| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | ### path [string] @@ -112,7 +112,7 @@ We supported as the following file types: `text` `json` `csv` `orc` `parquet` `excel` -Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] diff --git a/docs/en/connector-v2/sink/PostgreSql.md b/docs/en/connector-v2/sink/PostgreSql.md new file mode 100644 index 00000000000..67e2ed64d95 --- /dev/null +++ b/docs/en/connector-v2/sink/PostgreSql.md @@ -0,0 +1,203 @@ +# PostgreSql + +> JDBC PostgreSql Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+
+## Key Features
+
+- [x] [exactly-once](../../concept/connector-v2-features.md)
+- [x] [cdc](../../concept/connector-v2-features.md)
+
+> Use `Xa transactions` to ensure `exactly-once`. So `exactly-once` is only supported for databases that
+> support `Xa transactions`. You can set `is_exactly_once=true` to enable it.
+
+## Description
+
+Write data through JDBC. Supports batch mode and streaming mode, concurrent writing, and exactly-once
+semantics (using XA transaction guarantees).
+
+## Supported DataSource Info
+
+| Datasource | Supported Versions | Driver | Url | Maven |
+|------------|---------------------------------------------------------------|-----------------------|---------------------------------------|--------------------------------------------------------------------------|
+| PostgreSQL | Different dependency versions have different driver classes. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) |
+| PostgreSQL | If you want to manipulate the GEOMETRY type in PostgreSQL. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) |
+
+## Database Dependency
+
+> Please download the support list corresponding to 'Maven' and copy it to the '$SEATUNNEL_HOME/plugins/jdbc/lib/' working directory<br/>
> For example PostgreSQL datasource: cp postgresql-xxx.jar $SEATUNNEL_HOME/plugins/jdbc/lib/<br/>
> If you want to manipulate the GEOMETRY type in PostgreSQL, add postgresql-xxx.jar and postgis-jdbc-xxx.jar to $SEATUNNEL_HOME/plugins/jdbc/lib/
+
+## Data Type Mapping
+
+| PostgreSQL Data type | SeaTunnel Data type |
+|-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------|
+| BOOL<br/>
| BOOLEAN | +| _BOOL
| ARRAY<BOOLEAN> | +| BYTEA
| BYTES | +| _BYTEA
| ARRAY<TINYINT> | +| INT2
SMALLSERIAL
INT4
SERIAL
| INT | +| _INT2
_INT4
| ARRAY<INT> | +| INT8
BIGSERIAL
| BIGINT | +| _INT8
| ARRAY<BIGINT> | +| FLOAT4
| FLOAT | +| _FLOAT4
| ARRAY<FLOAT> | +| FLOAT8
| DOUBLE | +| _FLOAT8
| ARRAY<DOUBLE> | +| NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | +| NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | +| BPCHAR
CHARACTER
VARCHAR
TEXT
GEOMETRY
GEOGRAPHY
JSON
JSONB | STRING | +| _BPCHAR
_CHARACTER
_VARCHAR
_TEXT | ARRAY<STRING> | +| TIMESTAMP
| TIMESTAMP | +| TIME
| TIME | +| DATE
| DATE | +| OTHER DATA TYPES | NOT SUPPORTED YET | + +## Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost:5432/test
If you want to insert json or jsonb types, add the `stringtype=unspecified` option to the JDBC URL |
+| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,<br/>
if you use PostgreSQL the value is `org.postgresql.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | +| database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to. | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, PostgreSQL is `org.postgresql.xa.PGXADataSource`, and
please refer to appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your PostgreSQL. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = root + password = 123456 + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + Jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = org.postgresql.Driver + user = root + password = 123456 + + generate_sink_sql = true + database = test + table = "public.test_table" + } +} +``` + +### Exactly-once : + +> For accurate write scene we guarantee accurate once + +``` +sink { + jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + + max_retries = 0 + user = root + password = 123456 + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "org.postgresql.xa.PGXADataSource" + } +} +``` + +### CDC(Change Data Capture) Event + +> CDC change data is also supported by us In this case, you need config 
database, table and primary_keys. + +``` +sink { + jdbc { + # if you would use json or jsonb type insert please add jdbc url stringtype=unspecified option + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = root + password = 123456 + + generate_sink_sql = true + # You need to configure both database and table + database = test + table = sink_table + primary_keys = ["id","name"] + } +} +``` + diff --git a/docs/en/connector-v2/sink/Redis.md b/docs/en/connector-v2/sink/Redis.md index fcface7da22..7d2ef237e1c 100644 --- a/docs/en/connector-v2/sink/Redis.md +++ b/docs/en/connector-v2/sink/Redis.md @@ -23,6 +23,7 @@ Used to write data to Redis. | mode | string | no | single | | nodes | list | yes when mode=cluster | - | | format | string | no | json | +| expire | long | no | -1 | | common-options | | no | - | ### host [string] @@ -120,6 +121,10 @@ Connector will generate data as the following and write it to redis: ``` +### expire [long] + +Set redis expiration time, the unit is second. The default value is -1, keys do not automatically expire by default. + ### common options Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details diff --git a/docs/en/connector-v2/sink/S3-Redshift.md b/docs/en/connector-v2/sink/S3-Redshift.md index 928e23a072e..2e02e2f446a 100644 --- a/docs/en/connector-v2/sink/S3-Redshift.md +++ b/docs/en/connector-v2/sink/S3-Redshift.md @@ -14,32 +14,41 @@ Output data to AWS Redshift. ## Key features - [x] [exactly-once](../../concept/connector-v2-features.md) -- [x] [cdc](../../concept/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + ## Options -| name | type | required | default value | -|-------------------------------------|--------|----------|--------------------------------| -| jdbc_url | string | yes | - | -| jdbc_user | string | yes | - | -| jdbc_password | string | yes | - | -| path | string | yes | - | -| tmp_path | string | yes | /tmp/seatunnel | -| bucket | string | yes | - | -| access_key | string | no | - | -| access_secret | string | no | - | -| hadoop_s3_properties | map | no | - | -| changelog_mode | enum | no | APPEND_ONLY | -| changelog_buffer_flush_size | int | no | 20000 | -| changelog_buffer_flush_interval_ms | int | no | 20000 | -| redshift_table | string | yes | - | -| redshift_table_primary_keys | array | no | - | -| redshift_temporary_table_name | string | no | st_temporary_${redshift_table} | -| redshift_s3_iam_role | string | no | - | -| redshift_s3_file_commit_worker_size | int | no | 1 | -| common-options | | no | - | +| name | type | required | default value | +|----------------------------------|---------|----------|-----------------------------------------------------------| +| jdbc_url | string | yes | - | +| jdbc_user | string | yes | - | +| jdbc_password | string | yes | - | +| execute_sql | string | yes | - | +| path | string | yes | - | +| bucket | string | yes | - | +| access_key | string | no | - | +| access_secret | string | no | - | +| hadoop_s3_properties | map | no | - | +| file_name_expression | string | no | "${transactionId}" | +| file_format_type | string | no | "text" | +| filename_time_format | string | no | "yyyy.MM.dd" | +| field_delimiter | string | no | '\001' | +| row_delimiter | string | no | "\n" | +| partition_by | array | no | - | +| partition_dir_expression | string | no | 
"${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | +| is_partition_field_write_in_file | boolean | no | false | +| sink_columns | array | no | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | +| batch_size | int | no | 1000000 | +| common-options | | no | - | ### jdbc_url @@ -53,13 +62,32 @@ The JDBC user to connect to the Redshift database. The JDBC password to connect to the Redshift database. -### path [string] +### execute_sql -The target dir path is required. +The SQL to execute after the data is written to S3. + +eg: + +```sql + +COPY target_table FROM 's3://yourbucket${path}' IAM_ROLE 'arn:XXX' REGION 'your region' format as json 'auto'; +``` + +`target_table` is the table name in Redshift. -### tmp_path [string] +`${path}` is the path of the file written to S3. please confirm your sql include this variable. and don't need replace it. we will replace it when execute sql. -The temporary path is required. +IAM_ROLE is the role that has permission to access S3. + +format is the format of the file written to S3. please confirm this format is same as the file format you set in the configuration. + +please refer to [Redshift COPY](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html) for more details. + +please confirm that the role has permission to access S3. + +### path [string] + +The target dir path is required. ### bucket [string] @@ -83,40 +111,76 @@ hadoop_s3_properties { } ``` -### changelog_mode [enum] +### file_name_expression [string] + +`file_name_expression` describes the file expression which will be created into the `path`. We can add the variable `${now}` or `${uuid}` in the `file_name_expression`, like `test_${uuid}_${now}`, +`${now}` represents the current time, and its format can be defined by specifying the option `filename_time_format`. + +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. + +### file_format_type [string] + +We supported as the following file types: + +`text` `csv` `parquet` `orc` `json` + +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. + +### filename_time_format [string] + +When the format in the `file_name_expression` parameter is `xxxx-${now}` , `filename_time_format` can specify the time format of the path, and the default value is `yyyy.MM.dd` . The commonly used time formats are listed as follows: + +| Symbol | Description | +|--------|--------------------| +| y | Year | +| M | Month | +| d | Day of month | +| H | Hour in day (0-23) | +| m | Minute in hour | +| s | Second in minute | + +See [Java SimpleDateFormat](https://docs.oracle.com/javase/tutorial/i18n/format/simpleDateFormat.html) for detailed time format syntax. + +### field_delimiter [string] + +The separator between columns in a row of data. Only needed by `text` and `csv` file format. + +### row_delimiter [string] + +The separator between rows in a file. Only needed by `text` and `csv` file format. + +### partition_by [array] -The changelog mode of the sink writer, support: -`APPEND_ONLY`: Only append data to the target table. -`APPEND_ON_DUPLICATE_UPDATE`: If the primary key exists, update(update/delete) the data, otherwise insert the data. -`APPEND_ON_DUPLICATE_UPDATE_AUTOMATIC`: If the primary key exists, update(update/delete) the data, otherwise insert the data. Automatically switch copy/merge mode between snapshot sync and incremental sync. 
+Partition data based on selected fields -### changelog_buffer_flush_size [int] +### partition_dir_expression [string] -Flush buffer to s3 size of redshift changelog +If the `partition_by` is specified, we will generate the corresponding partition directory based on the partition information, and the final file will be placed in the partition directory. -### changelog_buffer_flush_interval_ms [int] +Default `partition_dir_expression` is `${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/`. `k0` is the first partition field and `v0` is the value of the first partition field. -Flush buffer to s3 interval of redshift changelog +### is_partition_field_write_in_file [boolean] -### redshift_table [string] +If `is_partition_field_write_in_file` is `true`, the partition field and the value of it will be written into data file. -The target table name of redshift changelog. +For example, if you want to write a Hive Data File, Its value should be `false`. -### redshift_table_primary_keys [array] +### sink_columns [array] -The primary keys of the buffer/target-table, only needed by `APPEND_ON_DUPLICATE_UPDATE*` and `APPEND_ON_DUPLICATE_DELETE*` changelog mode. +Which columns need be written to file, default value is all the columns get from `Transform` or `Source`. +The order of the fields determines the order in which the file is actually written. -### redshift_temporary_table_name [string] +### is_enable_transaction [boolean] -The temporary table of redshift changelog. +If `is_enable_transaction` is true, we will ensure that data will not be lost or duplicated when it is written to the target directory. -### redshift_s3_iam_role [string] +Please note that, If `is_enable_transaction` is `true`, we will auto add `${transactionId}_` in the head of the file. -The s3 iam role of redshift changelog. +Only support `true` now. -### redshift_s3_file_commit_worker_size [int] +### batch_size [int] -The worker size of redshift changelog file commit. +The maximum number of rows in a file. For SeaTunnel Engine, the number of lines in the file is determined by `batch_size` and `checkpoint.interval` jointly decide. If the value of `checkpoint.interval` is large enough, sink writer will write rows in a file until the rows in the file larger than `batch_size`. If `checkpoint.interval` is small, the sink writer will create a new file when a new checkpoint trigger. 
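+
+As a rough sketch of how these two settings interact (the values below are placeholders, not recommendations, and the connection options are trimmed; see the full examples further down), `checkpoint.interval` is set in the job-level `env` block while `batch_size` is set on the sink, and a file is rolled by whichever limit is reached first:
+
+```hocon
+env {
+  # A checkpoint fires every 60s; each checkpoint closes the file currently being written.
+  checkpoint.interval = 60000
+}
+
+sink {
+  S3Redshift {
+    jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx"
+    jdbc_user = "xxx"
+    jdbc_password = "xxxx"
+    execute_sql = "COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' format as json 'auto';"
+    bucket = "s3a://seatunnel-test"
+    path = "/seatunnel/json"
+    file_format_type = "json"
+    # If no checkpoint happens first, a new file is started once this many rows are buffered.
+    batch_size = 1000000
+  }
+}
+```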
### common options @@ -124,100 +188,91 @@ Sink plugin common parameters, please refer to [Sink Common Options](common-opti ## Example -For append only +For text file format ```hocon S3Redshift { - source_table_name = "test_xxx_1" - - # file config - tmp_path = "/tmp/seatunnel/s3_redshift_xxx/" - path = "/seatunnel/s3_redshift_xxx/" - - # s3 config - fs.s3a.endpoint = "s3.cn-north-1.amazonaws.com.cn" - bucket = "s3a://seatunnel-test-bucket" - access_key = "xxxxxxxxxxxxxxxxx" - secret_key = "xxxxxxxxxxxxxxxxx" - fs.s3a.aws.credentials.provider = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" - - # redshift config jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" jdbc_user = "xxx" jdbc_password = "xxxx" - - # cdc changelog config - changelog_mode = "APPEND_ONLY" - redshift_table = "your_target_table" + execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' removequotes emptyasnull blanksasnull maxerror 100 delimiter '|' ;" + access_key = "xxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxx" + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/text" + row_delimiter="\n" + partition_dir_expression="${k0}=${v0}" + is_partition_field_write_in_file=true + file_name_expression="${transactionId}_${now}" + file_format_type = "text" + filename_time_format="yyyy.MM.dd" + is_enable_transaction=true + hadoop_s3_properties { + "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + } } ``` -Support write cdc changelog event(APPEND_ON_DUPLICATE_UPDATE/APPEND_ON_DUPLICATE_UPDATE_AUTOMATIC). - -*Using Redshift COPY sql import s3 file into tmp table, and use Redshift DELETE/MERGE sql merge tmp table data into target table.* -- [Redshift TEMPORARY Table](https://docs.aws.amazon.com/redshift/latest/dg/r_CREATE_TABLE_NEW.html) -- [Redshift COPY SQL](https://docs.aws.amazon.com/redshift/latest/dg/r_COPY.html) -- [Redshift DELETE USING SQL](https://docs.aws.amazon.com/redshift/latest/dg/r_DELETE.html) -- [Redshift MERGE SQL](https://docs.aws.amazon.com/redshift/latest/dg/r_MERGE.html) - -Config example: +For parquet file format ```hocon -env { - job.mode = "BATCH" - checkpoint.interval = 20000 -} - -source { - FakeSource { - result_table_name = "test_xxx_1" - row.num = 100000 - split.num = 10 - schema = { - fields { - id = "int" - name = "string" - } - } - } -} -sink { S3Redshift { - source_table_name = "test_xxx_1" - - # file config - tmp_path = "/tmp/seatunnel/s3_redshift_xxx/" - path = "/seatunnel/s3_redshift_xxx/" - - # s3 config - fs.s3a.endpoint = "s3.cn-north-1.amazonaws.com.cn" - bucket = "s3a://seatunnel-test-bucket" + jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" + jdbc_user = "xxx" + jdbc_password = "xxxx" + execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' format as PARQUET;" access_key = "xxxxxxxxxxxxxxxxx" secret_key = "xxxxxxxxxxxxxxxxx" - fs.s3a.aws.credentials.provider = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/parquet" + row_delimiter="\n" + partition_dir_expression="${k0}=${v0}" + is_partition_field_write_in_file=true + file_name_expression="${transactionId}_${now}" + file_format_type = "parquet" + filename_time_format="yyyy.MM.dd" + is_enable_transaction=true + hadoop_s3_properties { + "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + } + } + +``` + +For 
orc file format + +```hocon - # redshift config + S3Redshift { jdbc_url = "jdbc:redshift://xxx.amazonaws.com.cn:5439/xxx" jdbc_user = "xxx" jdbc_password = "xxxx" - - # cdc changelog config - changelog_mode = "APPEND_ON_DUPLICATE_UPDATE" - changelog_buffer_flush_size = 50000 - redshift_table = "your_target_table" - redshift_table_primary_keys = ["id"] + execute_sql="COPY table_name FROM 's3://test${path}' IAM_ROLE 'arn:aws-cn:iam::xxx' REGION 'cn-north-1' format as ORC;" + access_key = "xxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxx" + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/orc" + row_delimiter="\n" + partition_dir_expression="${k0}=${v0}" + is_partition_field_write_in_file=true + file_name_expression="${transactionId}_${now}" + file_format_type = "orc" + filename_time_format="yyyy.MM.dd" + is_enable_transaction=true + hadoop_s3_properties { + "fs.s3a.aws.credentials.provider" = "org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider" + } } -} + ``` ## Changelog ### 2.3.0-beta 2022-10-20 -### next version - -- Support write cdc changelog event - diff --git a/docs/en/connector-v2/sink/S3File.md b/docs/en/connector-v2/sink/S3File.md index dcaee7338fe..4bb670ae38c 100644 --- a/docs/en/connector-v2/sink/S3File.md +++ b/docs/en/connector-v2/sink/S3File.md @@ -1,24 +1,17 @@ # S3File -> S3 file sink connector +> S3 File Sink Connector -## Description - -Output data to aws s3 file system. - -:::tip +## Support Those Engines -If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. +> Spark
+> Flink
+> SeaTunnel Zeta
-If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. - -To use this connector you need put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.11.271.jar in ${SEATUNNEL_HOME}/lib dir. - -::: - -## Key features +## Key Features - [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) By default, we use 2PC commit to ensure `exactly-once` @@ -30,59 +23,100 @@ By default, we use 2PC commit to ensure `exactly-once` - [x] json - [x] excel -## Options - -| name | type | required | default value | remarks | -|----------------------------------|---------|----------|-------------------------------------------------------|--------------------------------------------------------------------------------------------------------| -| path | string | yes | - | | -| bucket | string | yes | - | | -| fs.s3a.endpoint | string | yes | - | | -| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | | -| access_key | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider | -| access_secret | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider | -| custom_filename | boolean | no | false | Whether you need custom the filename | -| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | -| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | -| file_format_type | string | no | "csv" | | -| field_delimiter | string | no | '\001' | Only used when file_format is text | -| row_delimiter | string | no | "\n" | Only used when file_format is text | -| have_partition | boolean | no | false | Whether you need processing partitions. | -| partition_by | array | no | - | Only used then have_partition is true | -| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | -| is_partition_field_write_in_file | boolean | no | false | Only used then have_partition is true | -| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | -| is_enable_transaction | boolean | no | true | | -| batch_size | int | no | 1000000 | | -| compress_codec | string | no | none | | -| common-options | object | no | - | | -| max_rows_in_memory | int | no | - | Only used when file_format is excel. | -| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | - -### path [string] - -The target dir path is required. - -### bucket [string] - -The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`. - -### fs.s3a.endpoint [string] - -fs s3a endpoint - -### fs.s3a.aws.credentials.provider [string] - -The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. - -More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A) - -### access_key [string] - -The access key of s3 file system. 
If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) +## Description -### access_secret [string] +Output data to aws s3 file system. -The access secret of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) +## Supported DataSource Info + +| Datasource | Supported Versions | +|------------|--------------------| +| S3 | current | + +## Database Dependency + +> If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. +> +> If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under `${SEATUNNEL_HOME}/lib` to confirm this. +> To use this connector you need put `hadoop-aws-3.1.4.jar` and `aws-java-sdk-bundle-1.11.271.jar` in `${SEATUNNEL_HOME}/lib` dir. + +## Data Type Mapping + +If write to `csv`, `text` file type, All column will be string. + +### Orc File Type + +| SeaTunnel Data type | Orc Data type | +|----------------------|-----------------------| +| STRING | STRING | +| BOOLEAN | BOOLEAN | +| TINYINT | BYTE | +| SMALLINT | SHORT | +| INT | INT | +| BIGINT | LONG | +| FLOAT | FLOAT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DECIMAL | +| BYTES | BINARY | +| DATE | DATE | +| TIME
TIMESTAMP | TIMESTAMP | +| ROW | STRUCT | +| NULL | UNSUPPORTED DATA TYPE | +| ARRAY | LIST | +| Map | Map | + +### Parquet File Type + +| SeaTunnel Data type | Parquet Data type | +|----------------------|-----------------------| +| STRING | STRING | +| BOOLEAN | BOOLEAN | +| TINYINT | INT_8 | +| SMALLINT | INT_16 | +| INT | INT32 | +| BIGINT | INT64 | +| FLOAT | FLOAT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DECIMAL | +| BYTES | BINARY | +| DATE | DATE | +| TIME
TIMESTAMP | TIMESTAMP_MILLIS | +| ROW | GroupType | +| NULL | UNSUPPORTED DATA TYPE | +| ARRAY | LIST | +| Map | Map | + +## Sink Options + +| name | type | required | default value | Description | +|----------------------------------|---------|----------|-------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | | +| bucket | string | yes | - | | +| fs.s3a.endpoint | string | yes | - | | +| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. | +| access_key | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider | +| access_secret | string | no | - | Only used when fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider | +| custom_filename | boolean | no | false | Whether you need custom the filename | +| file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | +| filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | +| file_format_type | string | no | "csv" | | +| field_delimiter | string | no | '\001' | Only used when file_format is text | +| row_delimiter | string | no | "\n" | Only used when file_format is text | +| have_partition | boolean | no | false | Whether you need processing partitions. | +| partition_by | array | no | - | Only used when have_partition is true | +| partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used when have_partition is true | +| is_partition_field_write_in_file | boolean | no | false | Only used when have_partition is true | +| sink_columns | array | no | | When this parameter is empty, all fields are sink columns | +| is_enable_transaction | boolean | no | true | | +| batch_size | int | no | 1000000 | | +| compress_codec | string | no | none | | +| common-options | object | no | - | | +| max_rows_in_memory | int | no | - | Only used when file_format is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | +| hadoop_s3_properties | map | no | | If you need to add a other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) | +| | ### hadoop_s3_properties [map] @@ -129,7 +163,7 @@ We supported as the following file types: `text` `json` `csv` `orc` `parquet` `excel` -Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] @@ -208,6 +242,83 @@ Writer the sheet of the workbook ## Example +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to S3File Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target s3 dir will also create a file and all of the data in write in it. 
+> Before run this job, you need create s3 path: /seatunnel/text. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + c_map = "map>" + c_array = "array" + name = string + c_boolean = boolean + age = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + S3File { + bucket = "s3a://seatunnel-test" + tmp_path = "/tmp/seatunnel" + path="/seatunnel/text" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + file_format_type = "text" + field_delimiter = "\t" + row_delimiter = "\n" + have_partition = true + partition_by = ["age"] + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + custom_filename = true + file_name_expression = "${transactionId}_${now}" + filename_time_format = "yyyy.MM.dd" + sink_columns = ["name","age"] + is_enable_transaction=true + hadoop_s3_properties { + "fs.s3a.buffer.dir" = "/data/st_test/s3a" + "fs.s3a.fast.upload.buffer" = "disk" + } + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + For text file format with `have_partition` and `custom_filename` and `sink_columns` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` ```hocon diff --git a/docs/en/connector-v2/sink/SelectDB-Cloud.md b/docs/en/connector-v2/sink/SelectDB-Cloud.md index 24d22d5a2d0..6ad2997903b 100644 --- a/docs/en/connector-v2/sink/SelectDB-Cloud.md +++ b/docs/en/connector-v2/sink/SelectDB-Cloud.md @@ -2,139 +2,169 @@ > SelectDB Cloud sink connector -## Description +## Support Those Engines -Used to send data to SelectDB Cloud. Both support streaming and batch mode. -The internal implementation of SelectDB Cloud sink connector upload after batch caching and commit the CopyInto sql to load data into the table. +> Spark
+> Flink
+> SeaTunnel Zeta
-:::tip - -Version Supported - -* supported `SelectDB Cloud version is >= 2.2.x` - -::: - -## Key features +## Key Features - [x] [exactly-once](../../concept/connector-v2-features.md) - [x] [cdc](../../concept/connector-v2-features.md) -## Options - -| name | type | required | default value | -|--------------------|--------|----------|------------------------| -| load-url | string | yes | - | -| jdbc-url | string | yes | - | -| cluster-name | string | yes | - | -| username | string | yes | - | -| password | string | yes | - | -| table.identifier | string | yes | - | -| sink.enable-delete | bool | no | false | -| selectdb.config | map | yes | - | -| sink.buffer-size | int | no | 10 * 1024 * 1024 (1MB) | -| sink.buffer-count | int | no | 10000 | -| sink.max-retries | int | no | 3 | - -### load-url [string] - -`SelectDB Cloud` warehouse http address, the format is `warehouse_ip:http_port` - -### jdbc-url [string] - -`SelectDB Cloud` warehouse jdbc address, the format is `warehouse_ip:mysql_port` - -### cluster-name [string] - -`SelectDB Cloud` cluster name - -### username [string] - -`SelectDB Cloud` user username - -### password [string] - -`SelectDB Cloud` user password - -### table.identifier [string] - -The name of `SelectDB Cloud` table, the format is `database.table` +## Description -### sink.enable-delete [string] +Used to send data to SelectDB Cloud. Both support streaming and batch mode. +The internal implementation of SelectDB Cloud sink connector upload after batch caching and commit the CopyInto sql to load data into the table. -Whether to enable deletion. This option requires SelectDB Cloud table to enable batch delete function, and only supports Unique model. +## Supported DataSource Info -`ALTER TABLE example_db.my_table ENABLE FEATURE "BATCH_DELETE";` +:::tip -### selectdb.config [map] +Version Supported -Write property configuration +* supported `SelectDB Cloud version is >= 2.2.x` -CSV Write: +::: -``` -selectdb.config { - file.type="csv" - file.column_separator="," - file.line_delimiter="\n" +## Sink Options + +| Name | Type | Required | Default | Description | +|--------------------|--------|----------|------------------------|-------------------------------------------------------------------------------------------------------------------------------------------| +| load-url | String | Yes | - | `SelectDB Cloud` warehouse http address, the format is `warehouse_ip:http_port` | +| jdbc-url | String | Yes | - | `SelectDB Cloud` warehouse jdbc address, the format is `warehouse_ip:mysql_port` | +| cluster-name | String | Yes | - | `SelectDB Cloud` cluster name | +| username | String | Yes | - | `SelectDB Cloud` user username | +| password | String | Yes | - | `SelectDB Cloud` user password | +| table.identifier | String | Yes | - | The name of `SelectDB Cloud` table, the format is `database.table` | +| sink.enable-delete | bool | No | false | Whether to enable deletion. This option requires SelectDB Cloud table to enable batch delete function, and only supports Unique model. | +| sink.max-retries | int | No | 3 | the max retry times if writing records to database failed | +| sink.buffer-size | int | No | 10 * 1024 * 1024 (1MB) | the buffer size to cache data for stream load. | +| sink.buffer-count | int | No | 10000 | the buffer count to cache data for stream load. | +| selectdb.config | map | yes | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql,and supported formats. 
| + +## Data Type Mapping + +| SelectDB Cloud Data type | SeaTunnel Data type | +|--------------------------|-----------------------------------------| +| BOOLEAN | BOOLEAN | +| TINYINT | TINYINT | +| SMALLINT | SMALLINT
TINYINT | +| INT | INT
SMALLINT
TINYINT | +| BIGINT | BIGINT
INT
SMALLINT
TINYINT | +| LARGEINT | BIGINT
INT
SMALLINT
TINYINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE
FLOAT | +| DECIMAL | DECIMAL
DOUBLE
FLOAT | +| DATE | DATE | +| DATETIME | TIMESTAMP | +| CHAR | STRING | +| VARCHAR | STRING | +| STRING | STRING | +| ARRAY | ARRAY | +| MAP | MAP | +| JSON | STRING | +| HLL | Not supported yet | +| BITMAP | Not supported yet | +| QUANTILE_STATE | Not supported yet | +| STRUCT | Not supported yet | + +#### Supported import data formats + +The supported formats include CSV and JSON + +## Task Example + +### Simple: + +> The following example describes writing multiple data types to SelectDBCloud, and users need to create corresponding tables downstream + +```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 } -``` -JSON Write: +source { + FakeSource { + row.num = 10 + map.size = 10 + array.size = 10 + bytes.length = 10 + string.length = 10 + schema = { + fields { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} -``` -selectdb.config { - file.type="json" +sink { + SelectDBCloud { + load-url = "warehouse_ip:http_port" + jdbc-url = "warehouse_ip:mysql_port" + cluster-name = "Cluster" + table.identifier = "test.test" + username = "admin" + password = "******" + selectdb.config { + file.type = "json" + } + } } ``` -### sink.buffer-size [string] - -The maximum capacity of the cache, in bytes, that is flushed to the object storage. The default is 10MB. it is not recommended to modify it. - -### sink.buffer-count [string] - -Maximum number of entries flushed to the object store. The default value is 10000. it is not recommended to modify. - -### sink.max-retries [string] - -The maximum number of retries in the Commit phase, the default is 3. 
- -## Example - -Use JSON format to import data +### Use JSON format to import data ``` sink { SelectDBCloud { - load-url="warehouse_ip:http_port" - jdbc-url="warehouse_ip:mysql_port" - cluster-name="Cluster" - table.identifier="test.test" - username="admin" - password="******" + load-url = "warehouse_ip:http_port" + jdbc-url = "warehouse_ip:mysql_port" + cluster-name = "Cluster" + table.identifier = "test.test" + username = "admin" + password = "******" selectdb.config { - file.type="json" + file.type = "json" } } } + ``` -Use CSV format to import data +### Use CSV format to import data ``` sink { SelectDBCloud { - load-url="warehouse_ip:http_port" - jdbc-url="warehouse_ip:mysql_port" - cluster-name="Cluster" - table.identifier="test.test" - username="admin" - password="******" + load-url = "warehouse_ip:http_port" + jdbc-url = "warehouse_ip:mysql_port" + cluster-name = "Cluster" + table.identifier = "test.test" + username = "admin" + password = "******" selectdb.config { - file.type="csv" - file.column_separator="," - file.line_delimiter="\n" + file.type = "csv" + file.column_separator = "," + file.line_delimiter = "\n" } } } diff --git a/docs/en/connector-v2/sink/SftpFile.md b/docs/en/connector-v2/sink/SftpFile.md index 79643b8c8aa..b6460f39e39 100644 --- a/docs/en/connector-v2/sink/SftpFile.md +++ b/docs/en/connector-v2/sink/SftpFile.md @@ -41,8 +41,8 @@ By default, we use 2PC commit to ensure `exactly-once` | file_name_expression | string | no | "${transactionId}" | Only used when custom_filename is true | | filename_time_format | string | no | "yyyy.MM.dd" | Only used when custom_filename is true | | file_format_type | string | no | "csv" | | -| field_delimiter | string | no | '\001' | Only used when file_format is text | -| row_delimiter | string | no | "\n" | Only used when file_format is text | +| field_delimiter | string | no | '\001' | Only used when file_format_type is text | +| row_delimiter | string | no | "\n" | Only used when file_format_type is text | | have_partition | boolean | no | false | Whether you need processing partitions. | | partition_by | array | no | - | Only used then have_partition is true | | partition_dir_expression | string | no | "${k0}=${v0}/${k1}=${v1}/.../${kn}=${vn}/" | Only used then have_partition is true | @@ -52,8 +52,8 @@ By default, we use 2PC commit to ensure `exactly-once` | batch_size | int | no | 1000000 | | | compress_codec | string | no | none | | | common-options | object | no | - | | -| max_rows_in_memory | int | no | - | Only used when file_format is excel. | -| sheet_name | string | no | Sheet${Random number} | Only used when file_format is excel. | +| max_rows_in_memory | int | no | - | Only used when file_format_type is excel. | +| sheet_name | string | no | Sheet${Random number} | Only used when file_format_type is excel. | ### host [string] @@ -109,7 +109,7 @@ We supported as the following file types: `text` `json` `csv` `orc` `parquet` `excel` -Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. +Please note that, The final file name will end with the file_format_type's suffix, the suffix of the text file is `txt`. ### field_delimiter [string] diff --git a/docs/en/connector-v2/sink/Snowflake.md b/docs/en/connector-v2/sink/Snowflake.md index 21bfb175ef7..1dfff5e09c7 100644 --- a/docs/en/connector-v2/sink/Snowflake.md +++ b/docs/en/connector-v2/sink/Snowflake.md @@ -61,8 +61,7 @@ Write data through jdbc. 
Support Batch mode and Streaming mode, support concurre | support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | | connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | | max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | -| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `batch_interval_ms`
, the data will be flushed into the database | -| batch_interval_ms | Int | No | 1000 | For batch writing, when the number of buffers reaches the number of `batch_size` or the time reaches `batch_interval_ms`, the data will be flushed into the database | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | | max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | | transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | | auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | diff --git a/docs/en/connector-v2/sink/StarRocks.md b/docs/en/connector-v2/sink/StarRocks.md index 7c6491fb591..763743ce967 100644 --- a/docs/en/connector-v2/sink/StarRocks.md +++ b/docs/en/connector-v2/sink/StarRocks.md @@ -2,94 +2,44 @@ > StarRocks sink connector -## Description +## Support These Engines -Used to send data to StarRocks. Both support streaming and batch mode. -The internal implementation of StarRocks sink connector is cached and imported by stream load in batches. +> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features +## Key Features - [ ] [exactly-once](../../concept/connector-v2-features.md) - [x] [cdc](../../concept/connector-v2-features.md) -## Options - -| name | type | required | default value | -|-----------------------------|---------|----------|-----------------| -| nodeUrls | list | yes | - | -| base-url | string | yes | - | -| username | string | yes | - | -| password | string | yes | - | -| database | string | yes | - | -| table | string | no | - | -| labelPrefix | string | no | - | -| batch_max_rows | long | no | 1024 | -| batch_max_bytes | int | no | 5 * 1024 * 1024 | -| batch_interval_ms | int | no | - | -| max_retries | int | no | - | -| retry_backoff_multiplier_ms | int | no | - | -| max_retry_backoff_ms | int | no | - | -| enable_upsert_delete | boolean | no | false | -| save_mode_create_template | string | no | see below | -| starrocks.config | map | no | - | - -### nodeUrls [list] - -`StarRocks` cluster address, the format is `["fe_ip:fe_http_port", ...]` - -### base-url [string] - -The JDBC URL like `jdbc:mysql://localhost:9030/` or `jdbc:mysql://localhost:9030` or `jdbc:mysql://localhost:9030/db` - -### username [string] - -`StarRocks` user username - -### password [string] - -`StarRocks` user password - -### database [string] - -The name of StarRocks database - -### table [string] - -The name of StarRocks table, If not set, the table name will be the name of the upstream table - -### labelPrefix [string] - -The prefix of StarRocks stream load label - -### batch_max_rows [long] - -For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `batch_interval_ms`, the data will be flushed into the StarRocks - -### batch_max_bytes [int] - -For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `batch_interval_ms`, the data will be flushed into the StarRocks - -### batch_interval_ms [int] - -For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `batch_interval_ms`, the data will be flushed into the StarRocks - -### max_retries [int] - -The number of retries to flush failed - -### retry_backoff_multiplier_ms [int] - -Using as a multiplier for generating the next delay for backoff - -### max_retry_backoff_ms [int] - -The amount of time to wait before attempting to retry a request to `StarRocks` - -### enable_upsert_delete [boolean] +## Description -Whether to enable upsert/delete, only supports PrimaryKey model. +Used to send data to StarRocks. Both support streaming and batch mode. +The internal implementation of StarRocks sink connector is cached and imported by stream load in batches. 
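+As a rough illustration of the batching behavior described above, a minimal sink sketch might look like this (the endpoint, credentials and flush thresholds are placeholder values; each option is documented in the table below):
+
+```hocon
+sink {
+  StarRocks {
+    nodeUrls = ["fe_host:8030"]
+    base-url = "jdbc:mysql://fe_host:9030/"
+    username = "root"
+    password = ""
+    database = "test"
+    table = "e2e_table_sink"
+    # Buffered rows are flushed via stream load when any of these thresholds is reached
+    batch_max_rows = 1024
+    batch_max_bytes = 5242880
+    batch_interval_ms = 5000
+  }
+}
+```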
-### save_mode_create_template [string] +## Sink Options + +| Name | Type | Required | Default | Description | +|-----------------------------|---------|----------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| nodeUrls | list | yes | - | `StarRocks` cluster address, the format is `["fe_ip:fe_http_port", ...]` | +| base-url | string | yes | - | The JDBC URL like `jdbc:mysql://localhost:9030/` or `jdbc:mysql://localhost:9030` or `jdbc:mysql://localhost:9030/db` | +| username | string | yes | - | `StarRocks` user username | +| password | string | yes | - | `StarRocks` user password | +| database | string | yes | - | The name of StarRocks database | +| table | string | no | - | The name of StarRocks table, If not set, the table name will be the name of the upstream table | +| labelPrefix | string | no | - | The prefix of StarRocks stream load label | +| batch_max_rows | long | no | 1024 | For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `batch_interval_ms`, the data will be flushed into the StarRocks | +| batch_max_bytes | int | no | 5 * 1024 * 1024 | For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `batch_interval_ms`, the data will be flushed into the StarRocks | +| batch_interval_ms | int | no | - | For batch writing, when the number of buffers reaches the number of `batch_max_rows` or the byte size of `batch_max_bytes` or the time reaches `batch_interval_ms`, the data will be flushed into the StarRocks | +| max_retries | int | no | - | The number of retries to flush failed | +| retry_backoff_multiplier_ms | int | no | - | Using as a multiplier for generating the next delay for backoff | +| max_retry_backoff_ms | int | no | - | The amount of time to wait before attempting to retry a request to `StarRocks` | +| enable_upsert_delete | boolean | no | false | Whether to enable upsert/delete, only supports PrimaryKey model. | +| save_mode_create_template | string | no | see below | see below | +| starrocks.config | map | no | - | The parameter of the stream load `data_desc` | + +### save_mode_create_template We use templates to automatically create starrocks tables, which will create corresponding table creation statements based on the type of upstream data and schema type, @@ -131,19 +81,72 @@ You can use the following placeholders description of StarRocks - rowtype_primary_key: Used to get the primary key in the upstream schema (maybe a list) -### starrocks.config [map] - -The parameter of the stream load `data_desc` +## Data Type Mapping + +| StarRocks Data type | SeaTunnel Data type | +|---------------------|---------------------| +| BOOLEAN | BOOLEAN | +| TINYINT | TINYINT | +| SMALLINT | SMALLINT | +| INT | INT | +| BIGINT | BIGINT | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| DECIMAL | DECIMAL | +| DATE | STRING | +| TIME | STRING | +| DATETIME | STRING | +| STRING | STRING | +| ARRAY | STRING | +| MAP | STRING | +| BYTES | STRING | #### Supported import data formats -The supported formats include CSV and JSON. 
Default value: JSON +The supported formats include CSV and JSON -## Example +## Task Example -Use JSON format to import data +### Simple: + +> The following example describes writing multiple data types to StarRocks, and users need to create corresponding tables downstream ```hocon +env { + parallelism = 1 + job.mode = "BATCH" + checkpoint.interval = 10000 +} + +source { + FakeSource { + row.num = 10 + map.size = 10 + array.size = 10 + bytes.length = 10 + string.length = 10 + schema = { + fields { + c_map = "map>" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_decimal = "decimal(16, 1)" + c_null = "null" + c_bytes = bytes + c_date = date + c_timestamp = timestamp + } + } + } +} + sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] @@ -158,12 +161,29 @@ sink { } } } - ``` -Use CSV format to import data +### Support write cdc changelog event(INSERT/UPDATE/DELETE) ```hocon +sink { + StarRocks { + nodeUrls = ["e2e_starRocksdb:8030"] + username = root + password = "" + database = "test" + table = "e2e_table_sink" + ... + + // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model. + enable_upsert_delete = true + } +} +``` + +### Use JSON format to import data + +``` sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] @@ -173,17 +193,17 @@ sink { table = "e2e_table_sink" batch_max_rows = 10 starrocks.config = { - format = "CSV" - column_separator = "\\x01" - row_delimiter = "\\x02" + format = "JSON" + strip_outer_array = true } } } + ``` -Support write cdc changelog event(INSERT/UPDATE/DELETE) +### Use CSV format to import data -```hocon +``` sink { StarRocks { nodeUrls = ["e2e_starRocksdb:8030"] @@ -191,10 +211,12 @@ sink { password = "" database = "test" table = "e2e_table_sink" - ... - - // Support upsert/delete event synchronization (enable_upsert_delete=true), only supports PrimaryKey model. - enable_upsert_delete = true + batch_max_rows = 10 + starrocks.config = { + format = "CSV" + column_separator = "\\x01" + row_delimiter = "\\x02" + } } } ``` diff --git a/docs/en/connector-v2/sink/Tablestore.md b/docs/en/connector-v2/sink/Tablestore.md index ed59895c65f..8f161ad25f6 100644 --- a/docs/en/connector-v2/sink/Tablestore.md +++ b/docs/en/connector-v2/sink/Tablestore.md @@ -21,7 +21,6 @@ Write data to `Tablestore` | table | string | yes | - | | primary_keys | array | yes | - | | batch_size | string | no | 25 | -| batch_interval_ms | string | no | 1000 | | common-options | config | no | - | ### end_point [string] diff --git a/docs/en/connector-v2/sink/Vertica.md b/docs/en/connector-v2/sink/Vertica.md new file mode 100644 index 00000000000..9a624407682 --- /dev/null +++ b/docs/en/connector-v2/sink/Vertica.md @@ -0,0 +1,172 @@ +# Vertica + +> JDBC Vertica Sink Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [cdc](../../concept/connector-v2-features.md) + +> Use `Xa transactions` to ensure `exactly-once`. So only support `exactly-once` for the database which is +> support `Xa transactions`. You can set `is_exactly_once=true` to enable it. + +## Description + +Write data through jdbc. Support Batch mode and Streaming mode, support concurrent writing, support exactly-once +semantics (using XA transaction guarantee). + +## Supported DataSource Info + +| Datasource | Supported Versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|-------------------------|---------------------------------------|----------------------------------------------------------------------| +| Vertica | Different dependency version has different driver class. | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [Download](https://www.vertica.com/download/vertica/client-drivers/) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example Vertica datasource: cp vertica-jdbc-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| Vertica Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| BIT(1)
INT UNSIGNED | BOOLEAN | +| TINYINT
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | +| INT UNSIGNED
INTEGER UNSIGNED
BIGINT | BIGINT | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to right of the decimal point.)) |
+| FLOAT
FLOAT UNSIGNED | FLOAT | +| DOUBLE
DOUBLE UNSIGNED | DOUBLE | +| CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
TIMESTAMP | TIMESTAMP | +| TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
VARBINARY
BIT(n) | BYTES | +| GEOMETRY
UNKNOWN | Not supported yet | + +## Sink Options + +| Name | Type | Required | Default | Description | +|-------------------------------------------|---------|----------|---------|----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Vertical the value is `com.vertica.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | No | - | Use this sql write upstream input datas to database. e.g `INSERT ...`,`query` have the higher priority | +| database | String | No | - | Use this `database` and `table-name` auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| table | String | No | - | Use database and this table-name auto-generate sql and receive upstream input datas write to database.
This option is mutually exclusive with `query` and has a higher priority. | +| primary_keys | Array | No | - | This option is used to support operations such as `insert`, `delete`, and `update` when automatically generate sql. | +| support_upsert_by_query_primary_key_exist | Boolean | No | false | Choose to use INSERT sql, UPDATE sql to process update events(INSERT, UPDATE_AFTER) based on query primary key exists. This configuration is only used when database unsupport upsert syntax. **Note**: that this method has low performance | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete. | +| max_retries | Int | No | 0 | The number of retries to submit failed (executeBatch) | +| batch_size | Int | No | 1000 | For batch writing, when the number of buffered records reaches the number of `batch_size` or the time reaches `checkpoint.interval`
, the data will be flushed into the database | +| is_exactly_once | Boolean | No | false | Whether to enable exactly-once semantics, which will use Xa transactions. If on, you need to
set `xa_data_source_class_name`. | +| generate_sink_sql | Boolean | No | false | Generate sql statements based on the database table you want to write to | +| xa_data_source_class_name | String | No | - | The xa data source class name of the database Driver, for example, vertical is `com.vertical.cj.jdbc.VerticalXADataSource`, and
please refer to appendix for other data sources | +| max_commit_attempts | Int | No | 3 | The number of retries for transaction commit failures | +| transaction_timeout_sec | Int | No | -1 | The timeout after the transaction is opened, the default is -1 (never timeout). Note that setting the timeout may affect
exactly-once semantics | +| auto_commit | Boolean | No | true | Automatic transaction commit is enabled by default | +| common-options | | no | - | Sink plugin common parameters, please refer to [Sink Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example defines a SeaTunnel synchronization task that automatically generates data through FakeSource and sends it to JDBC Sink. FakeSource generates a total of 16 rows of data (row.num=16), with each row having two fields, name (string type) and age (int type). The final target table is test_table will also be 16 rows of data in the table. Before run this job, you need create database test and table test_table in your vertical. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + parallelism = 1 + result_table_name = "fake" + row.num = 16 + schema = { + fields { + name = "string" + age = "int" + } + } + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` + +### Generate Sink SQL + +> This example not need to write complex sql statements, you can configure the database name table name to automatically generate add statements for you + +``` +sink { + jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + user = "root" + password = "123456" + # Automatically generate sql statements based on database table names + generate_sink_sql = true + database = test + table = test_table + } +} +``` + +### Exactly-once : + +> For accurate write scene we guarantee accurate once + +``` +sink { + jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + + max_retries = 0 + user = "root" + password = "123456" + query = "insert into test_table(name,age) values(?,?)" + + is_exactly_once = "true" + + xa_data_source_class_name = "com.vertical.cj.jdbc.VerticalXADataSource" + } +} +``` + diff --git a/docs/en/connector-v2/source/Clickhouse.md b/docs/en/connector-v2/source/Clickhouse.md index ef5d99c05ea..7596bf72a8f 100644 --- a/docs/en/connector-v2/source/Clickhouse.md +++ 
b/docs/en/connector-v2/source/Clickhouse.md @@ -2,87 +2,96 @@ > Clickhouse source connector -## Description +## Support Those Engines -Used to read data from Clickhouse. +> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features +## Key Features - [x] [batch](../../concept/connector-v2-features.md) - [ ] [stream](../../concept/connector-v2-features.md) - [ ] [exactly-once](../../concept/connector-v2-features.md) - [x] [column projection](../../concept/connector-v2-features.md) - -supports query SQL and can achieve projection effect. - - [ ] [parallelism](../../concept/connector-v2-features.md) - [ ] [support user-defined split](../../concept/connector-v2-features.md) -## Options - -| name | type | required | default value | -|----------------|--------|----------|---------------| -| host | string | yes | - | -| database | string | yes | - | -| sql | string | yes | - | -| username | string | yes | - | -| password | string | yes | - | -| common-options | | no | - | - -### host [string] - -`ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` . - -### database [string] - -The `ClickHouse` database - -### sql [string] - -The query sql used to search data though Clickhouse server - -### username [string] - -`ClickHouse` user username - -### password [string] +> supports query SQL and can achieve projection effect. -`ClickHouse` user password - -### common options +## Description -Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details +Used to read data from Clickhouse. -## Examples +## Supported DataSource Info + +In order to use the Clickhouse connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|------------------------------------------------------------------------------------------------------------------| +| Clickhouse | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-clickhouse) | + +## Data Type Mapping + +| Clickhouse Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------------------------------------------------------------------------|---------------------| +| String / Int128 / UInt128 / Int256 / UInt256 / Point / Ring / Polygon MultiPolygon | STRING | +| Int8 / UInt8 / Int16 / UInt16 / Int32 | INT | +| UInt64 / Int64 / IntervalYear / IntervalQuarter / IntervalMonth / IntervalWeek / IntervalDay / IntervalHour / IntervalMinute / IntervalSecond | BIGINT | +| Float64 | DOUBLE | +| Decimal | DECIMAL | +| Float32 | FLOAT | +| Date | DATE | +| DateTime | TIME | +| Array | ARRAY | +| Map | MAP | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------|--------|----------|------------------------|------------------------------------------------------------------------------------------------------------------------------------------| +| host | String | Yes | - | `ClickHouse` cluster address, the format is `host:port` , allowing multiple `hosts` to be specified. Such as `"host1:8123,host2:8123"` . | +| database | String | Yes | - | The `ClickHouse` database. | +| sql | String | Yes | - | The query sql used to search data though Clickhouse server. | +| username | String | Yes | - | `ClickHouse` user username. | +| password | String | Yes | - | `ClickHouse` user password. | +| server_time_zone | String | No | ZoneId.systemDefault() | The session time zone in database server. 
If not set, then ZoneId.systemDefault() is used to determine the server time zone. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | + +## How to Create a Clickhouse Data Synchronization Jobs + +The following example demonstrates how to create a data synchronization job that reads data from Clickhouse and prints it on the local client: + +```bash +# Set the basic configuration of the task to be performed +env { + execution.parallelism = 1 + job.mode = "BATCH" +} -```hocon +# Create a source to connect to Clickhouse source { - Clickhouse { host = "localhost:8123" database = "default" sql = "select * from test where age = 20 limit 100" - username = "default" - password = "" + username = "xxxxx" + password = "xxxxx" + server_time_zone = "UTC" result_table_name = "test" } - } -``` - -## Changelog - -### 2.2.0-beta 2022-09-26 -- Add ClickHouse Source Connector - -### 2.3.0-beta 2022-10-20 - -- [Improve] Clickhouse Source random use host when config multi-host ([3108](https://github.com/apache/seatunnel/pull/3108)) - -### next version +# Console printing of the read Clickhouse data +sink { + Console { + parallelism = 1 + } +} +``` -- [Improve] Clickhouse Source support nest type and array type([3047](https://github.com/apache/seatunnel/pull/3047)) +### Tips -- [Improve] Clickhouse Source support geo type([3141](https://github.com/apache/seatunnel/pull/3141)) +> 1.[SeaTunnel Deployment Document](../../start-v2/locally/deployment.md). diff --git a/docs/en/connector-v2/source/CosFile.md b/docs/en/connector-v2/source/CosFile.md new file mode 100644 index 00000000000..dd1e77ebcfd --- /dev/null +++ b/docs/en/connector-v2/source/CosFile.md @@ -0,0 +1,294 @@ +# CosFile + +> Cos file source connector + +## Description + +Read data from aliyun Cos file system. + +:::tip + +If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. + +If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. + +To use this connector you need put hadoop-cos-{hadoop.version}-{version}.jar and cos_api-bundle-{version}.jar in ${SEATUNNEL_HOME}/lib dir, download: [Hadoop-Cos-release](https://github.com/tencentyun/hadoop-cos/releases). It only supports hadoop version 2.6.5+ and version 8.0.2+. + +::: + +## Key features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) + +Read all the data in a split in a pollNext call. What splits are read will be saved in snapshot. 
+ +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) +- [x] file format type + - [x] text + - [x] csv + - [x] parquet + - [x] orc + - [x] json + - [x] excel + +## Options + +| name | type | required | default value | +|---------------------------|---------|----------|---------------------| +| path | string | yes | - | +| file_format_type | string | yes | - | +| bucket | string | yes | - | +| secret_id | string | yes | - | +| secret_key | string | yes | - | +| region | string | yes | - | +| read_columns | list | yes | - | +| delimiter | string | no | \001 | +| parse_partition_from_path | boolean | no | true | +| skip_header_row_number | long | no | 0 | +| date_format | string | no | yyyy-MM-dd | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | +| time_format | string | no | HH:mm:ss | +| schema | config | no | - | +| common-options | | no | - | +| sheet_name | string | no | - | +| file_filter_pattern | string | no | - | + +### path [string] + +The source file path. + +### delimiter [string] + +Field delimiter, used to tell connector how to slice and dice fields when reading text files + +default `\001`, the same as hive's default delimiter + +### parse_partition_from_path [boolean] + +Control whether parse the partition keys and values from file path + +For example if you read a file from path `cosn://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` + +Every record data from file will be added these two fields: + +| name | age | +|---------------|-----| +| tyrantlucifer | 26 | + +Tips: **Do not define partition fields in schema option** + +### date_format [string] + +Date type format, used to tell connector how to convert string to date, supported as the following formats: + +`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` + +default `yyyy-MM-dd` + +### datetime_format [string] + +Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: + +`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` + +default `yyyy-MM-dd HH:mm:ss` + +### time_format [string] + +Time type format, used to tell connector how to convert string to time, supported as the following formats: + +`HH:mm:ss` `HH:mm:ss.SSS` + +default `HH:mm:ss` + +### skip_header_row_number [long] + +Skip the first few lines, but only for the txt and csv. + +For example, set like following: + +`skip_header_row_number = 2` + +then SeaTunnel will skip the first 2 lines from source files + +### file_format_type [string] + +File type, supported as the following file types: + +`text` `csv` `parquet` `orc` `json` `excel` + +If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. 
+ +For example: + +upstream data is the following: + +```json + +{"code": 200, "data": "get success", "success": true} + +``` + +You can also save multiple pieces of data in one file and split them by newline: + +```json lines + +{"code": 200, "data": "get success", "success": true} +{"code": 300, "data": "get failed", "success": false} + +``` + +you should assign schema as the following: + +```hocon + +schema { + fields { + code = int + data = string + success = boolean + } +} + +``` + +connector will generate data as the following: + +| code | data | success | +|------|-------------|---------| +| 200 | get success | true | + +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. + +If you assign file type to `text` `csv`, you can choose to specify the schema information or not. + +For example, upstream data is the following: + +```text + +tyrantlucifer#26#male + +``` + +If you do not assign data schema connector will treat the upstream data as the following: + +| content | +|-----------------------| +| tyrantlucifer#26#male | + +If you assign data schema, you should also assign the option `delimiter` too except CSV file type + +you should assign schema and delimiter as the following: + +```hocon + +delimiter = "#" +schema { + fields { + name = string + age = int + gender = string + } +} + +``` + +connector will generate data as the following: + +| name | age | gender | +|---------------|-----|--------| +| tyrantlucifer | 26 | male | + +### bucket [string] + +The bucket address of Cos file system, for example: `Cos://tyrantlucifer-image-bed` + +### secret_id [string] + +The secret id of Cos file system. + +### secret_key [string] + +The secret key of Cos file system. + +### region [string] + +The region of cos file system. + +### schema [config] + +#### fields [Config] + +The schema of upstream data. + +### read_columns [list] + +The read column list of the data source, user can use it to implement field projection. + +The file type supported column projection as the following shown: + +- text +- json +- csv +- orc +- parquet +- excel + +**Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured** + +### common options + +Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. + +### sheet_name [string] + +Reader the sheet of the workbook,Only used when file_format is excel. + +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. + +## Example + +```hocon + + CosFile { + path = "/seatunnel/orc" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + file_format_type = "orc" + } + +``` + +```hocon + + CosFile { + path = "/seatunnel/json" + bucket = "cosn://seatunnel-test-1259587829" + secret_id = "xxxxxxxxxxxxxxxxxxx" + secret_key = "xxxxxxxxxxxxxxxxxxx" + region = "ap-chengdu" + file_format_type = "json" + schema { + fields { + id = int + name = string + } + } + } + +``` + +## Changelog + +### next version + +- Add file cos source connector ([4979](https://github.com/apache/seatunnel/pull/4979)) + diff --git a/docs/en/connector-v2/source/DB2.md b/docs/en/connector-v2/source/DB2.md new file mode 100644 index 00000000000..c9eb6a578b6 --- /dev/null +++ b/docs/en/connector-v2/source/DB2.md @@ -0,0 +1,155 @@ +# DB2 + +> JDBC DB2 Source Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------------|-----------------------------------|-----------------------------------------------------------------------| +| DB2 | Different dependency version has different driver class. | com.ibm.db2.jdbc.app.DB2Driver | jdbc:db2://127.0.0.1:50000/dbname | [Download](https://mvnrepository.com/artifact/com.ibm.db2.jcc/db2jcc) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example DB2 datasource: cp db2-connector-java-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| DB2 Data type | SeaTunnel Data type | +|------------------------------------------------------------------------------------------------------|---------------------|---| +| BOOLEAN | BOOLEAN | +| SMALLINT | SHORT | +| INT
INTEGER
| INTEGER | +| BIGINT | LONG | +| DECIMAL
DEC
NUMERIC
NUM | DECIMAL(38,18) | +| REAL | FLOAT | +| FLOAT
DOUBLE
DOUBLE PRECISION
DECFLOAT | DOUBLE | +| CHAR
VARCHAR
LONG VARCHAR
CLOB
GRAPHIC
VARGRAPHIC
LONG VARGRAPHIC
DBCLOB | STRING | +| BLOB | BYTES | +| DATE | DATE | +| TIME | TIME | +| TIMESTAMP | TIMESTAMP | +| ROWID
XML | Not supported yet | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:db2://127.0.0.1:50000/dbname | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use db2 the value is `com.ibm.db2.jdbc.app.DB2Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.
Zero means use jdbc default value. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 2 + job.mode = "BATCH" +} +source{ + Jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + query = "select * from table_xxx" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table + +``` +source { + Jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + } +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + url = "jdbc:db2://127.0.0.1:50000/dbname" + driver = "com.ibm.db2.jdbc.app.DB2Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + partition_column = "id" + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + partition_num = 10 + } +} +``` + diff --git a/docs/en/connector-v2/source/FtpFile.md b/docs/en/connector-v2/source/FtpFile.md index b550bde8baa..c692a7483a6 100644 --- a/docs/en/connector-v2/source/FtpFile.md +++ b/docs/en/connector-v2/source/FtpFile.md @@ -48,6 +48,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you | schema | config | no | - | | common-options | | no | - | | sheet_name | string | no | - | +| file_filter_pattern | string | no | - | ### host [string] @@ -225,7 +226,7 @@ Source plugin common parameters, please refer to [Source Common Options](common- ### sheet_name [string] -Reader the sheet of the workbook,Only used when file_format is excel. +Reader the sheet of the workbook,Only used when file_format_type is excel. ## Example diff --git a/docs/en/connector-v2/source/HdfsFile.md b/docs/en/connector-v2/source/HdfsFile.md index d255f4fd3a7..88c1e35f87e 100644 --- a/docs/en/connector-v2/source/HdfsFile.md +++ b/docs/en/connector-v2/source/HdfsFile.md @@ -1,20 +1,14 @@ # HdfsFile -> Hdfs file source connector +> Hdfs File Source Connector -## Description - -Read data from hdfs file system. 
- -:::tip +## Support Those Engines -If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. +> Spark
+> Flink
+> SeaTunnel Zeta
-If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. - -::: - -## Key features +## Key Features - [x] [batch](../../concept/connector-v2-features.md) - [ ] [stream](../../concept/connector-v2-features.md) @@ -33,233 +27,57 @@ Read all the data in a split in a pollNext call. What splits are read will be sa - [x] json - [x] excel -## Options - -| name | type | required | default value | -|---------------------------|---------|----------|---------------------| -| path | string | yes | - | -| file_format_type | string | yes | - | -| fs.defaultFS | string | yes | - | -| read_columns | list | yes | - | -| hdfs_site_path | string | no | - | -| delimiter | string | no | \001 | -| parse_partition_from_path | boolean | no | true | -| date_format | string | no | yyyy-MM-dd | -| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | -| time_format | string | no | HH:mm:ss | -| kerberos_principal | string | no | - | -| kerberos_keytab_path | string | no | - | -| skip_header_row_number | long | no | 0 | -| schema | config | no | - | -| common-options | | no | - | -| sheet_name | string | no | - | - -### path [string] - -The source file path. - -### delimiter [string] - -Field delimiter, used to tell connector how to slice and dice fields when reading text files - -default `\001`, the same as hive's default delimiter - -### parse_partition_from_path [boolean] - -Control whether parse the partition keys and values from file path - -For example if you read a file from path `hdfs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` - -Every record data from file will be added these two fields: - -| name | age | -|---------------|-----| -| tyrantlucifer | 26 | - -Tips: **Do not define partition fields in schema option** - -### date_format [string] - -Date type format, used to tell connector how to convert string to date, supported as the following formats: - -`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` - -default `yyyy-MM-dd` - -### datetime_format [string] - -Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: - -`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` - -default `yyyy-MM-dd HH:mm:ss` - -### time_format [string] - -Time type format, used to tell connector how to convert string to time, supported as the following formats: - -`HH:mm:ss` `HH:mm:ss.SSS` - -default `HH:mm:ss` - -### skip_header_row_number [long] - -Skip the first few lines, but only for the txt and csv. - -For example, set like following: - -`skip_header_row_number = 2` - -then SeaTunnel will skip the first 2 lines from source files - -### file_format_type [string] - -File type, supported as the following file types: - -`text` `csv` `parquet` `orc` `json` `excel` - -If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. 
- -For example: - -upstream data is the following: - -```json - -{"code": 200, "data": "get success", "success": true} - -``` - -You can also save multiple pieces of data in one file and split them by newline: - -```json lines - -{"code": 200, "data": "get success", "success": true} -{"code": 300, "data": "get failed", "success": false} - -``` - -you should assign schema as the following: - -```hocon - -schema { - fields { - code = int - data = string - success = boolean - } -} - -``` - -connector will generate data as the following: - -| code | data | success | -|------|-------------|---------| -| 200 | get success | true | - -If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. - -If you assign file type to `text` `csv`, you can choose to specify the schema information or not. +## Description -For example, upstream data is the following: +Read data from hdfs file system. -```text +## Supported DataSource Info -tyrantlucifer#26#male +| Datasource | Supported Versions | +|------------|--------------------| +| HdfsFile | hadoop 2.x and 3.x | -``` +## Source Options -If you do not assign data schema connector will treat the upstream data as the following: +| Name | Type | Required | Default | Description | +|---------------------------|---------|----------|---------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | The source file path. | +| file_format_type | string | yes | - | We supported as the following file types:`text` `json` `csv` `orc` `parquet` `excel`.Please note that, The final file name will end with the file_format's suffix, the suffix of the text file is `txt`. | +| fs.defaultFS | string | yes | - | The hadoop cluster address that start with `hdfs://`, for example: `hdfs://hadoopcluster` | +| read_columns | list | yes | - | The read column list of the data source, user can use it to implement field projection.The file type supported column projection as the following shown:[text,json,csv,orc,parquet,excel].Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured. | +| hdfs_site_path | string | no | - | The path of `hdfs-site.xml`, used to load ha configuration of namenodes | +| delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files. default `\001`, the same as hive's default delimiter | +| parse_partition_from_path | boolean | no | true | Control whether parse the partition keys and values from file path. For example if you read a file from path `hdfs://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`. Every record data from file will be added these two fields:[name:tyrantlucifer,age:26].Tips:Do not define partition fields in schema option. 
| +| date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` default `yyyy-MM-dd`.Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` default `yyyy-MM-dd` | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` .default `yyyy-MM-dd HH:mm:ss` | +| time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS`.default `HH:mm:ss` | +| kerberos_principal | string | no | - | The principal of kerberos | +| kerberos_keytab_path | string | no | - | The keytab path of kerberos | +| skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv.For example, set like following:`skip_header_row_number = 2`.then Seatunnel will skip the first 2 lines from source files | +| schema | config | no | - | the schema fields of upstream data | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | +| sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. | -| content | -|-----------------------| -| tyrantlucifer#26#male | +### Tips -If you assign data schema, you should also assign the option `delimiter` too except CSV file type +> If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. -you should assign schema and delimiter as the following: +## Task Example -```hocon +### Simple: -delimiter = "#" -schema { - fields { - name = string - age = int - gender = string - } -} +> This example defines a SeaTunnel synchronization task that read data from Hdfs and sends it to Hdfs. ``` - -connector will generate data as the following: - -| name | age | gender | -|---------------|-----|--------| -| tyrantlucifer | 26 | male | - -### fs.defaultFS [string] - -Hdfs cluster address. - -### hdfs_site_path [string] - -The path of `hdfs-site.xml`, used to load ha configuration of namenodes - -### kerberos_principal [string] - -The principal of kerberos - -### kerberos_keytab_path [string] - -The keytab path of kerberos - -### schema [Config] - -#### fields [Config] - -the schema fields of upstream data - -### read_columns [list] - -The read column list of the data source, user can use it to implement field projection. - -The file type supported column projection as the following shown: - -- text -- json -- csv -- orc -- parquet -- excel - -**Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured** - -### common options - -Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. - -### sheet_name [string] - -Reader the sheet of the workbook,Only used when file_format is excel. 
- -## Example - -```hocon - -HdfsFile { - path = "/apps/hive/demo/student" - file_format_type = "parquet" - fs.defaultFS = "hdfs://namenode001" +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" } -``` - -```hocon - -HdfsFile { +source { + HdfsFile { schema { fields { name = string @@ -269,24 +87,24 @@ HdfsFile { path = "/apps/hive/demo/student" type = "json" fs.defaultFS = "hdfs://namenode001" + } + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/category/source-v2 } -``` - -## Changelog - -### 2.2.0-beta 2022-09-26 - -- Add HDFS File Source Connector - -### 2.3.0-beta 2022-10-20 - -- [BugFix] Fix the bug of incorrect path in windows environment ([2980](https://github.com/apache/seatunnel/pull/2980)) -- [Improve] Support extract partition from SeaTunnelRow fields ([3085](https://github.com/apache/seatunnel/pull/3085)) -- [Improve] Support parse field from file path ([2985](https://github.com/apache/seatunnel/pull/2985)) - -### next version +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} -- [Improve] Support skip header for csv and txt files ([3900](https://github.com/apache/seatunnel/pull/3840)) -- [Improve] Support kerberos authentication ([3840](https://github.com/apache/seatunnel/pull/3840)) +sink { + HdfsFile { + fs.defaultFS = "hdfs://hadoopcluster" + path = "/tmp/hive/warehouse/test2" + file_format = "orc" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/category/sink-v2 +} +``` diff --git a/docs/en/connector-v2/source/Hive.md b/docs/en/connector-v2/source/Hive.md index f9f35aaf733..afa9893d5b2 100644 --- a/docs/en/connector-v2/source/Hive.md +++ b/docs/en/connector-v2/source/Hive.md @@ -33,17 +33,18 @@ Read all the data in a split in a pollNext call. What splits are read will be sa ## Options -| name | type | required | default value | -|----------------------|--------|----------|---------------| -| table_name | string | yes | - | -| metastore_uri | string | yes | - | -| kerberos_principal | string | no | - | -| kerberos_keytab_path | string | no | - | -| hdfs_site_path | string | no | - | -| hive_site_path | string | no | - | -| read_partitions | list | no | - | -| read_columns | list | no | - | -| common-options | | no | - | +| name | type | required | default value | +|-------------------------------|---------|----------|---------------| +| table_name | string | yes | - | +| metastore_uri | string | yes | - | +| kerberos_principal | string | no | - | +| kerberos_keytab_path | string | no | - | +| hdfs_site_path | string | no | - | +| hive_site_path | string | no | - | +| read_partitions | list | no | - | +| read_columns | list | no | - | +| abort_drop_partition_metadata | boolean | no | true | +| common-options | | no | - | ### table_name [string] @@ -80,6 +81,10 @@ The keytab file path of kerberos authentication The read column list of the data source, user can use it to implement field projection. +### abort_drop_partition_metadata [list] + +Flag to decide whether to drop partition metadata from Hive Metastore during an abort operation. 
Note: this only affects the metadata in the metastore, the data in the partition will always be deleted(data generated during the synchronization process). + ### common options Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details diff --git a/docs/en/connector-v2/source/Hudi.md b/docs/en/connector-v2/source/Hudi.md index cb3b154d58b..ffe17f7de71 100644 --- a/docs/en/connector-v2/source/Hudi.md +++ b/docs/en/connector-v2/source/Hudi.md @@ -2,69 +2,67 @@ > Hudi source connector -## Description +## Support Those Engines -Used to read data from Hudi. Currently, only supports hudi cow table and Snapshot Query with Batch Mode. +> Spark
+> Flink
+> SeaTunnel Zeta
-In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9. - -## Key features +## Key Features - [x] [batch](../../concept/connector-v2-features.md) - -Currently, only supports hudi cow table and Snapshot Query with Batch Mode - - [ ] [stream](../../concept/connector-v2-features.md) - [x] [exactly-once](../../concept/connector-v2-features.md) - [ ] [column projection](../../concept/connector-v2-features.md) - [x] [parallelism](../../concept/connector-v2-features.md) - [ ] [support user-defined split](../../concept/connector-v2-features.md) -## Options - -| name | type | required | default value | -|-------------------------|---------|------------------------------|---------------| -| table.path | string | yes | - | -| table.type | string | yes | - | -| conf.files | string | yes | - | -| use.kerberos | boolean | no | false | -| kerberos.principal | string | yes when use.kerberos = true | - | -| kerberos.principal.file | string | yes when use.kerberos = true | - | -| common-options | config | no | - | - -### table.path [string] - -`table.path` The hdfs root path of hudi table,such as 'hdfs://nameserivce/data/hudi/hudi_table/'. +## Description -### table.type [string] +Used to read data from Hudi. Currently, only supports hudi cow table and Snapshot Query with Batch Mode. -`table.type` The type of hudi table. Now we only support 'cow', 'mor' is not support yet. +In order to use this connector, You must ensure your spark/flink cluster already integrated hive. The tested hive version is 2.3.9. -### conf.files [string] +## Supported DataSource Info -`conf.files` The environment conf file path list(local path), which used to init hdfs client to read hudi table file. The example is '/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml'. +:::tip -### use.kerberos [boolean] +* Currently, only supports Hudi cow table and Snapshot Query with Batch Mode -`use.kerberos` Whether to enable Kerberos, default is false. +::: -### kerberos.principal [string] +## Data Type Mapping -`kerberos.principal` When use kerberos, we should set kerberos princal such as 'test_user@xxx'. +| Hudi Data type | Seatunnel Data type | +|----------------|---------------------| +| ALL TYPE | STRING | -### kerberos.principal.file [string] +## Source Options -`kerberos.principal.file` When use kerberos, we should set kerberos princal file such as '/home/test/test_user.keytab'. +| Name | Type | Required | Default | Description | +|-------------------------|--------|------------------------------|---------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| table.path | String | Yes | - | The hdfs root path of hudi table,such as 'hdfs://nameserivce/data/hudi/hudi_table/'. | +| table.type | String | Yes | - | The type of hudi table. Now we only support 'cow', 'mor' is not support yet. | +| conf.files | String | Yes | - | The environment conf file path list(local path), which used to init hdfs client to read hudi table file. The example is '/home/test/hdfs-site.xml;/home/test/core-site.xml;/home/test/yarn-site.xml'. | +| use.kerberos | bool | No | false | Whether to enable Kerberos, default is false. | +| kerberos.principal | String | yes when use.kerberos = true | - | When use kerberos, we should set kerberos principal such as 'test_user@xxx'. 
| +| kerberos.principal.file | string | yes when use.kerberos = true | - | When use kerberos, we should set kerberos principal file such as '/home/test/test_user.keytab'. | +| common-options | config | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | -### common options +## Task Example -Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. +### Simple: -## Examples +> This example reads from a Hudi COW table and configures Kerberos for the environment, printing to the console. ```hocon -source { - +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 2 + job.mode = "BATCH" +} +source{ Hudi { table.path = "hdfs://nameserivce/data/hudi/hudi_table/" table.type = "cow" @@ -73,7 +71,15 @@ source { kerberos.principal = "test_user@xxx" kerberos.principal.file = "/home/test/test_user.keytab" } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql/ +} +sink { + Console {} } ``` diff --git a/docs/en/connector-v2/source/Jdbc.md b/docs/en/connector-v2/source/Jdbc.md index 3cd76e9cd4a..d82df87a02e 100644 --- a/docs/en/connector-v2/source/Jdbc.md +++ b/docs/en/connector-v2/source/Jdbc.md @@ -8,9 +8,9 @@ Read external data source data through JDBC. :::tip -Warn: for license compliance, you have to provide database driver yourself, copy to `$SEATNUNNEL_HOME/plugins/jdbc/lib/` directory in order to make them work. +Warn: for license compliance, you have to provide database driver yourself, copy to `$SEATNUNNEL_HOME/lib/` directory in order to make them work. -e.g. If you use MySQL, should download and copy `mysql-connector-java-xxx.jar` to `$SEATNUNNEL_HOME/plugins/jdbc/lib/` +e.g. If you use MySQL, should download and copy `mysql-connector-java-xxx.jar` to `$SEATNUNNEL_HOME/lib/`. For Spark/Flink, you should also copy it to `$SPARK_HOME/jars/` or `$FLINK_HOME/lib/`. ::: @@ -35,6 +35,7 @@ supports query SQL and can achieve projection effect. | user | String | No | - | | password | String | No | - | | query | String | Yes | - | +| compatible_mode | String | No | - | | connection_check_timeout_sec | Int | No | 30 | | partition_column | String | No | - | | partition_upper_bound | Long | No | - | @@ -63,6 +64,10 @@ The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost/tes Query statement +### compatible_mode [string] + +The compatible mode of database, required when the database supports multiple compatible modes. For example, when using OceanBase database, you need to set it to 'mysql' or 'oracle'. + ### connection_check_timeout_sec [int] The time in seconds to wait for the database operation used to validate the connection to complete. @@ -71,11 +76,11 @@ The time in seconds to wait for the database operation used to validate the conn The column name for parallelism's partition, only support numeric type. -### partition_upper_bound [long] +### partition_upper_bound [BigDecimal] The partition_column max value for scan, if not set SeaTunnel will query database get max value. -### partition_lower_bound [long] +### partition_lower_bound [BigDecimal] The partition_column min value for scan, if not set SeaTunnel will query database get min value. @@ -120,6 +125,7 @@ there are some reference value for params above. 
| Snowflake | net.snowflake.client.jdbc.SnowflakeDriver | jdbc:snowflake://.snowflakecomputing.com | https://mvnrepository.com/artifact/net.snowflake/snowflake-jdbc | | Redshift | com.amazon.redshift.jdbc42.Driver | jdbc:redshift://localhost:5439/testdb?defaultRowFetchSize=1000 | https://mvnrepository.com/artifact/com.amazon.redshift/redshift-jdbc42 | | Vertica | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433 | https://repo1.maven.org/maven2/com/vertica/jdbc/vertica-jdbc/12.0.3-0/vertica-jdbc-12.0.3-0.jar | +| OceanBase | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2881 | https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.3/oceanbase-client-2.4.3.jar | ## Example diff --git a/docs/en/connector-v2/source/LocalFile.md b/docs/en/connector-v2/source/LocalFile.md index d33288b7a57..80adfa6d9ad 100644 --- a/docs/en/connector-v2/source/LocalFile.md +++ b/docs/en/connector-v2/source/LocalFile.md @@ -49,6 +49,7 @@ Read all the data in a split in a pollNext call. What splits are read will be sa | schema | config | no | - | | common-options | | no | - | | sheet_name | string | no | - | +| file_filter_pattern | string | no | - | ### path [string] @@ -223,7 +224,11 @@ Source plugin common parameters, please refer to [Source Common Options](common- ### sheet_name [string] -Reader the sheet of the workbook,Only used when file_format is excel. +Reader the sheet of the workbook,Only used when file_format_type is excel. + +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. ## Example diff --git a/docs/en/connector-v2/source/MongoDB-CDC.md b/docs/en/connector-v2/source/MongoDB-CDC.md index cb7c2f32acf..d78f70110fc 100644 --- a/docs/en/connector-v2/source/MongoDB-CDC.md +++ b/docs/en/connector-v2/source/MongoDB-CDC.md @@ -84,8 +84,8 @@ The following table lists the field data type mapping from MongoDB BSON type to | Int64 | BIGINT | | Double | DOUBLE | | Decimal128 | DECIMAL | -| Date | Date | -| Timestamp | Timestamp | +| Date | DATE | +| Timestamp | TIMESTAMP | | Object | ROW | | Array | ARRAY | @@ -274,9 +274,38 @@ sink { } ``` -## Changelog +## Format of real-time streaming data -- [Feature]Add MongoDB CDC Source Connector([4923](https://github.com/apache/seatunnel/pull/4923)) - -### next version +```shell +{ + _id : { }, // Identifier of the open change stream, can be assigned to the 'resumeAfter' parameter for subsequent resumption of this change stream + "operationType" : "", // The type of change operation that occurred, such as: insert, delete, update, etc. + "fullDocument" : { }, // The full document data involved in the change operation. 
This field does not exist in delete operations + "ns" : { + "db" : "", // The database where the change operation occurred + "coll" : "" // The collection where the change operation occurred + }, + "to" : { // These fields are displayed only when the operation type is 'rename' + "db" : "", // The new database name after the change + "coll" : "" // The new collection name after the change + }, + "source":{ + "ts_ms":"", // The timestamp when the change operation occurred + "table":"" // The collection where the change operation occurred + "db":"", // The database where the change operation occurred + "snapshot":"false" // Identify the current stage of data synchronization + }, + "documentKey" : { "_id" : }, // The _id field value of the document involved in the change operation + "updateDescription" : { // Description of the update operation + "updatedFields" : { }, // The fields and values that the update operation modified + "removedFields" : [ "", ... ] // The fields and values that the update operation removed + } + "clusterTime" : , // The timestamp of the Oplog log entry corresponding to the change operation + "txnNumber" : , // If the change operation is executed in a multi-document transaction, this field and value are displayed, representing the transaction number + "lsid" : { // Represents information related to the Session in which the transaction is located + "id" : , + "uid" : + } +} +``` diff --git a/docs/en/connector-v2/source/MongoDB.md b/docs/en/connector-v2/source/MongoDB.md index 14f283afb43..137fb205b8c 100644 --- a/docs/en/connector-v2/source/MongoDB.md +++ b/docs/en/connector-v2/source/MongoDB.md @@ -79,6 +79,11 @@ For specific types in MongoDB, we use Extended JSON format to map them to SeaTun | fetch.size | Int | No | 2048 | Set the number of documents obtained from the server for each batch. Setting the appropriate batch size can improve query performance and avoid the memory pressure caused by obtaining a large amount of data at one time. | | max.time-min | Long | No | 600 | This parameter is a MongoDB query option that limits the maximum execution time for query operations. The value of maxTimeMin is in Minute. If the execution time of the query exceeds the specified time limit, MongoDB will terminate the operation and return an error. | | flat.sync-string | Boolean | No | true | By utilizing flatSyncString, only one field attribute value can be set, and the field type must be a String. This operation will perform a string mapping on a single MongoDB data entry. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> 1.The parameter `match.query` is compatible with the historical old version parameter `matchQuery`, and they are equivalent replacements.
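+
+For example, a source block that pushes a query filter down with `match.query` might look like the following minimal sketch. The connection values and schema fields are placeholders rather than values from a real job, and the legacy spelling `matchQuery` would behave the same way:
+
+```hocon
+source {
+  MongoDB {
+    uri = "mongodb://user:password@127.0.0.1:27017"
+    database = "test_db"
+    collection = "orders"
+    # Equivalent to the legacy `matchQuery` option
+    match.query = "{status: \"A\"}"
+    schema = {
+      fields {
+        id = bigint
+        status = string
+      }
+    }
+  }
+}
+```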
## How to Create a MongoDB Data Synchronization Jobs diff --git a/docs/en/connector-v2/source/MyHours.md b/docs/en/connector-v2/source/MyHours.md index ec3a9355336..f90d42ab1cb 100644 --- a/docs/en/connector-v2/source/MyHours.md +++ b/docs/en/connector-v2/source/MyHours.md @@ -2,11 +2,13 @@ > My Hours source connector -## Description +## Support Those Engines -Used to read data from My Hours. +> Spark
+> Flink
+> SeaTunnel Zeta
-## Key features +## Key Features - [x] [batch](../../concept/connector-v2-features.md) - [ ] [stream](../../concept/connector-v2-features.md) @@ -15,71 +17,103 @@ Used to read data from My Hours. - [ ] [parallelism](../../concept/connector-v2-features.md) - [ ] [support user-defined split](../../concept/connector-v2-features.md) -## Options - -| name | type | required | default value | -|-----------------------------|---------|----------|---------------| -| url | String | Yes | - | -| email | String | Yes | - | -| password | String | Yes | - | -| method | String | No | get | -| schema | Config | No | - | -| schema.fields | Config | No | - | -| format | String | No | json | -| params | Map | No | - | -| body | String | No | - | -| json_field | Config | No | - | -| content_json | String | No | - | -| poll_interval_ms | int | No | - | -| retry | int | No | - | -| retry_backoff_multiplier_ms | int | No | 100 | -| retry_backoff_max_ms | int | No | 10000 | -| enable_multi_lines | boolean | No | false | -| common-options | config | No | - | - -### url [String] - -http request url - -### email [String] - -email for login - -### password [String] - -password for login - -### method [String] - -http request method, only supports GET, POST method - -### params [Map] - -http params - -### body [String] - -http body - -### poll_interval_ms [int] +## Description -request http api interval(millis) in stream mode +Used to read data from My Hours. -### retry [int] +## Key features -The max retry times if request http return to `IOException` +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [ ] [exactly-once](../../concept/connector-v2-features.md) +- [ ] [column projection](../../concept/connector-v2-features.md) +- [ ] [parallelism](../../concept/connector-v2-features.md) +- [ ] [support user-defined split](../../concept/connector-v2-features.md) -### retry_backoff_multiplier_ms [int] +## Supported DataSource Info + +In order to use the My Hours connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Dependency | +|------------|--------------------|---------------------------------------------------------------------------------------------| +| My Hours | universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2) | + +## Source Options + +| Name | Type | Required | Default | Description | +|-----------------------------|---------|----------|---------|--------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | Http request url. | +| email | String | Yes | - | My hours login email address. | +| password | String | Yes | - | My hours login password. | +| schema | Config | No | - | Http and seatunnel data structure mapping | +| schema.fields | Config | No | - | The schema fields of upstream data | +| json_field | Config | No | - | This parameter helps you configure the schema,so this parameter must be used with schema. | +| content_json | String | No | - | This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. | +| format | String | No | json | The format of upstream data, now only support `json` `text`, default `json`. | +| method | String | No | get | Http request method, only supports GET, POST method. 
| +| headers | Map | No | - | Http headers. | +| params | Map | No | - | Http params. | +| body | String | No | - | Http body. | +| poll_interval_ms | Int | No | - | Request http api interval(millis) in stream mode. | +| retry | Int | No | - | The max retry times if request http return to `IOException`. | +| retry_backoff_multiplier_ms | Int | No | 100 | The retry-backoff times(millis) multiplier if request http failed. | +| retry_backoff_max_ms | Int | No | 10000 | The maximum retry-backoff times(millis) if request http failed | +| enable_multi_lines | Boolean | No | false | | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## How to Create a My Hours Data Synchronization Jobs -The retry-backoff times(millis) multiplier if request http failed +```hocon +env { + execution.parallelism = 1 + job.mode = "BATCH" +} -### retry_backoff_max_ms [int] +MyHours{ + url = "https://api2.myhours.com/api/Projects/getAll" + email = "seatunnel@test.com" + password = "seatunnel" + schema { + fields { + name = string + archived = boolean + dateArchived = string + dateCreated = string + clientName = string + budgetAlertPercent = string + budgetType = int + totalTimeLogged = double + budgetValue = double + totalAmount = double + totalExpense = double + laborCost = double + totalCost = double + billableTimeLogged = double + totalBillableAmount = double + billable = boolean + roundType = int + roundInterval = int + budgetSpentPercentage = double + budgetTarget = int + budgetPeriodType = string + budgetSpent = string + id = string + } + } +} -The maximum retry-backoff times(millis) if request http failed +# Console printing of the read data +sink { + Console { + parallelism = 1 + } +} +``` -### format [String] +## Parameter Interpretation -the format of upstream data, now only support `json` `text`, default `json`. +### format when you assign format is `json`, you should also assign schema option, for example: @@ -98,11 +132,11 @@ you should assign schema as the following: ```hocon schema { - fields { - code = int - data = string - success = boolean - } + fields { + code = int + data = string + success = boolean + } } ``` @@ -131,13 +165,7 @@ connector will generate data as the following: |----------------------------------------------------------| | {"code": 200, "data": "get success", "success": true} | -### schema [Config] - -#### fields [Config] - -the schema fields of upstream data - -### content_json [String] +### content_json This parameter can get some json data.If you only need the data in the 'book' section, configure `content_field = "$.store.book.*"`. @@ -212,14 +240,14 @@ Here is an example: - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_contentjson_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_contentjson_to_assert.conf). -### json_field [Config] +### json_field This parameter helps you configure the schema,so this parameter must be used with schema. 
If your data looks something like this: ```json -{ +{ "store": { "book": [ { @@ -273,47 +301,6 @@ source { - Test data can be found at this link [mockserver-config.json](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/mockserver-config.json) - See this link for task configuration [http_jsonpath_to_assert.conf](../../../../seatunnel-e2e/seatunnel-connector-v2-e2e/connector-http-e2e/src/test/resources/http_jsonpath_to_assert.conf). -### common options - -Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details - -## Example - -```hocon -MyHours{ - url = "https://api2.myhours.com/api/Projects/getAll" - email = "seatunnel@test.com" - password = "seatunnel" - schema { - fields { - name = string - archived = boolean - dateArchived = string - dateCreated = string - clientName = string - budgetAlertPercent = string - budgetType = int - totalTimeLogged = double - budgetValue = double - totalAmount = double - totalExpense = double - laborCost = double - totalCost = double - billableTimeLogged = double - totalBillableAmount = double - billable = boolean - roundType = int - roundInterval = int - budgetSpentPercentage = double - budgetTarget = int - budgetPeriodType = string - budgetSpent = string - id = string - } - } -} -``` - ## Changelog ### next version diff --git a/docs/en/connector-v2/source/MySQL-CDC.md b/docs/en/connector-v2/source/MySQL-CDC.md index f26c1e60a01..caeeca06283 100644 --- a/docs/en/connector-v2/source/MySQL-CDC.md +++ b/docs/en/connector-v2/source/MySQL-CDC.md @@ -155,7 +155,7 @@ By default, a random number is generated between 5400 and 6400, though we recomm ### server-time-zone [String] -The session time zone in database server. +The session time zone in database server. If not set, then ZoneId.systemDefault() is used to determine the server time zone. ### connect.timeout.ms [long] diff --git a/docs/en/connector-v2/source/Mysql.md b/docs/en/connector-v2/source/Mysql.md index d04c7eec302..32933f8c9a8 100644 --- a/docs/en/connector-v2/source/Mysql.md +++ b/docs/en/connector-v2/source/Mysql.md @@ -56,24 +56,24 @@ Read external data source data through JDBC. ## Source Options -| Name | Type | Required | Default | Description | -|------------------------------|--------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:mysql://localhost:3306:3306/test | -| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. | -| user | String | No | - | Connection instance user name | -| password | String | No | - | Connection instance password | -| query | String | Yes | - | Query statement | -| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | -| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | -| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | -| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | -| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | -| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query toimprove performance by
reducing the number database hits required to satisfy the selection criteria.
Zero means use jdbc default value. | -| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:mysql://localhost:3306:3306/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use MySQL the value is `com.mysql.cj.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query to improve performance by<br/>
reducing the number of database hits required to satisfy the selection criteria.<br/>
Zero means use jdbc default value. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | ### Tips -> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks , When your shard read field is a large number type such as bigint(30) and above and the data is not evenly distributed, it is recommended to set the parallelism level to 1 to ensure that the data skew problem is resolved ## Task Example diff --git a/docs/en/connector-v2/source/OceanBase.md b/docs/en/connector-v2/source/OceanBase.md new file mode 100644 index 00000000000..bd035793eee --- /dev/null +++ b/docs/en/connector-v2/source/OceanBase.md @@ -0,0 +1,168 @@ +# OceanBase + +> JDBC OceanBase Source Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|--------------------------------|---------------------------|--------------------------------------|-------------------------------------------------------------------------------| +| OceanBase | All OceanBase server versions. | com.oceanbase.jdbc.Driver | jdbc:oceanbase://localhost:2883/test | [Download](https://mvnrepository.com/artifact/com.oceanbase/oceanbase-client) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example: cp oceanbase-client-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +### Mysql Mode + +| Mysql Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------| +| BIT(1)
INT UNSIGNED | BOOLEAN | +| TINYINT
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | +| INT UNSIGNED
INTEGER UNSIGNED
BIGINT | BIGINT | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to right of the decimal point.))) | +| FLOAT
FLOAT UNSIGNED | FLOAT | +| DOUBLE
DOUBLE UNSIGNED | DOUBLE | +| CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
TIMESTAMP | TIMESTAMP | +| TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
VARBINARY<br/>
BIT(n) | BYTES | +| GEOMETRY
UNKNOWN | Not supported yet | + +### Oracle Mode + +| Oracle Data type | SeaTunnel Data type | +|-----------------------------------------------------------|---------------------| +| Number(p), p <= 9 | INT | +| Number(p), p <= 18 | BIGINT | +| Number(p), p > 18 | DECIMAL(38,18) | +| REAL
BINARY_FLOAT | FLOAT | +| BINARY_DOUBLE | DOUBLE | +| CHAR
NCHAR
NVARCHAR2
NCLOB
CLOB
ROWID | STRING | +| DATE | DATE | +| TIMESTAMP
TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | +| BLOB
RAW
LONG RAW
BFILE | BYTES | +| UNKNOWN | Not supported yet | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oceanbase://localhost:2883/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source, should be `com.oceanbase.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| compatible_mode | String | Yes | - | The compatible mode of OceanBase, can be 'mysql' or 'oracle'. | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type column and string type column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. Default value is job parallelism. | +| fetch_size | Int | No | 0 | For queries that return a large number of objects, you can configure
the row fetch size used in the query to improve performance by
reducing the number of database hits required to satisfy the selection criteria.<br/>
Zero means use jdbc default value. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +``` +env { + execution.parallelism = 2 + job.mode = "BATCH" +} + +source { + Jdbc { + driver = "com.oceanbase.jdbc.Driver" + url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + user = "root" + password = "" + compatible_mode = "mysql" + query = "select * from source" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data. You can do this if you want to read the whole table + +``` +source { + Jdbc { + driver = "com.oceanbase.jdbc.Driver" + url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + user = "root" + password = "" + compatible_mode = "mysql" + query = "select * from source" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + } +} +``` + +### Parallel Boundary: + +> It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + driver = "com.oceanbase.jdbc.Driver" + url = "jdbc:oceanbase://localhost:2883/test?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true" + user = "root" + password = "" + compatible_mode = "mysql" + query = "select * from source" + partition_column = "id" + partition_num = 10 + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + } +} +``` + diff --git a/docs/en/connector-v2/source/Oracle.md b/docs/en/connector-v2/source/Oracle.md new file mode 100644 index 00000000000..c1cedbded7a --- /dev/null +++ b/docs/en/connector-v2/source/Oracle.md @@ -0,0 +1,154 @@ +# Oracle + +> JDBC Oracle Source Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|--------------------------|----------------------------------------|--------------------------------------------------------------------| +| Oracle | Different dependency version has different driver class. | oracle.jdbc.OracleDriver | jdbc:oracle:thin:@datasource01:1523:xe | https://mvnrepository.com/artifact/com.oracle.database.jdbc/ojdbc8 | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example Oracle datasource: cp ojdbc8-xxxxxx.jar $SEATNUNNEL_HOME/lib/
+> To support the i18n character set, copy the orai18n.jar to the $SEATNUNNEL_HOME/lib/ directory.
+
+## Data Type Mapping
+
+| Oracle Data type                                                         | SeaTunnel Data type |
+|--------------------------------------------------------------------------|---------------------|
+| INTEGER                                                                  | INT                 |
+| FLOAT                                                                    | DECIMAL(38, 18)     |
+| NUMBER(precision <= 9, scale == 0)                                       | INT                 |
+| NUMBER(9 < precision <= 18, scale == 0)                                  | BIGINT              |
+| NUMBER(18 < precision, scale == 0)                                       | DECIMAL(38, 0)      |
+| NUMBER(scale != 0)                                                       | DECIMAL(38, 18)     |
+| BINARY_DOUBLE                                                            | DOUBLE              |
+| BINARY_FLOAT<br/>
REAL | FLOAT | +| CHAR
NCHAR
NVARCHAR2
VARCHAR2
LONG
ROWID
NCLOB
CLOB
| STRING | +| DATE | DATE | +| TIMESTAMP
TIMESTAMP WITH LOCAL TIME ZONE | TIMESTAMP | +| BLOB
RAW
LONG RAW
BFILE | BYTES | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:oracle:thin:@datasource01:1523:xe | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Oracle the value is `oracle.jdbc.OracleDriver`. |
+| user                         | String     | No       | -               | Connection instance user name |
+| password                     | String     | No       | -               | Connection instance password |
+| query                        | String     | Yes      | -               | Query statement |
+| connection_check_timeout_sec | Int        | No       | 30              | The time in seconds to wait for the database operation used to validate the connection to complete |
+| partition_column             | String     | No       | -               | The column name for parallelism's partition, only support numeric type. Only support numeric type primary key, and only can config one column. |
+| partition_lower_bound        | BigDecimal | No       | -               | The partition_column min value for scan, if not set SeaTunnel will query database get min value. |
+| partition_upper_bound        | BigDecimal | No       | -               | The partition_column max value for scan, if not set SeaTunnel will query database get max value. |
+| partition_num                | Int        | No       | job parallelism | The number of partition count, only support positive integer. default value is job parallelism |
+| fetch_size                   | Int        | No       | 0               | For queries that return a large number of objects, you can configure<br/>
the row fetch size used in the query to improve performance by<br/>
reducing the number of database hits required to satisfy the selection criteria.<br/>
Zero means use jdbc default value. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 2 + job.mode = "BATCH" +} +source{ + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + user = "root" + password = "123456" + query = "SELECT * FROM TEST_TABLE" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table + +``` +source { + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "SELECT * FROM TEST_TABLE" + # Parallel sharding reads fields + partition_column = "ID" + # Number of fragments + partition_num = 10 + } +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + url = "jdbc:oracle:thin:@datasource01:1523:xe" + driver = "oracle.jdbc.OracleDriver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "SELECT * FROM TEST_TABLE" + partition_column = "ID" + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + partition_num = 10 + } +} +``` + diff --git a/docs/en/connector-v2/source/OssFile.md b/docs/en/connector-v2/source/OssFile.md index 532b4d03aa7..7c992581f5a 100644 --- a/docs/en/connector-v2/source/OssFile.md +++ b/docs/en/connector-v2/source/OssFile.md @@ -56,6 +56,7 @@ Read all the data in a split in a pollNext call. What splits are read will be sa | schema | config | no | - | | common-options | | no | - | | sheet_name | string | no | - | +| file_filter_pattern | string | no | - | ### path [string] @@ -246,7 +247,7 @@ Source plugin common parameters, please refer to [Source Common Options](common- ### sheet_name [string] -Reader the sheet of the workbook,Only used when file_format is excel. +Reader the sheet of the workbook,Only used when file_format_type is excel. ## Example @@ -282,6 +283,10 @@ Reader the sheet of the workbook,Only used when file_format is excel. ``` +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. 
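+
+The sketch below shows how `file_filter_pattern` might be combined with the other OssFile options; the bucket, endpoint and credential values are placeholders, and the pattern itself is only an illustration of a file-name filter:
+
+```hocon
+source {
+  OssFile {
+    path = "/seatunnel/orc"
+    bucket = "oss://seatunnel-test"
+    access_key = "xxxxxxxxxxxxxxxxx"
+    access_secret = "xxxxxxxxxxxxxxxxxxxxxx"
+    endpoint = "oss-cn-beijing.aliyuncs.com"
+    file_format_type = "orc"
+    # Only read files whose names match this pattern (illustrative value)
+    file_filter_pattern = "seatunnel_.*\\.orc"
+  }
+}
+```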
+ ## Changelog ### 2.2.0-beta 2022-09-26 diff --git a/docs/en/connector-v2/source/OssJindoFile.md b/docs/en/connector-v2/source/OssJindoFile.md index 3e3649e19b9..f77c4a4543a 100644 --- a/docs/en/connector-v2/source/OssJindoFile.md +++ b/docs/en/connector-v2/source/OssJindoFile.md @@ -56,6 +56,7 @@ Read all the data in a split in a pollNext call. What splits are read will be sa | schema | config | no | - | | common-options | | no | - | | sheet_name | string | no | - | +| file_filter_pattern | string | no | - | ### path [string] @@ -246,7 +247,11 @@ Source plugin common parameters, please refer to [Source Common Options](common- ### sheet_name [string] -Reader the sheet of the workbook,Only used when file_format is excel. +Reader the sheet of the workbook,Only used when file_format_type is excel. + +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. ## Example diff --git a/docs/en/connector-v2/source/PostgreSQL.md b/docs/en/connector-v2/source/PostgreSQL.md new file mode 100644 index 00000000000..50839780726 --- /dev/null +++ b/docs/en/connector-v2/source/PostgreSQL.md @@ -0,0 +1,158 @@ +# PostgreSQL + +> JDBC PostgreSQL Source Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|------------------------------------------------------------|-----------------------|---------------------------------------|--------------------------------------------------------------------------| +| PostgreSQL | Different dependency version has different driver class. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/org.postgresql/postgresql) | +| PostgreSQL | If you want to manipulate the GEOMETRY type in PostgreSQL. | org.postgresql.Driver | jdbc:postgresql://localhost:5432/test | [Download](https://mvnrepository.com/artifact/net.postgis/postgis-jdbc) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
+> For example PostgreSQL datasource: cp postgresql-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/
+> If you want to manipulate the GEOMETRY type in PostgreSQL, add postgresql-xxx.jar and postgis-jdbc-xxx.jar to $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| PostgreSQL Data type | SeaTunnel Data type | +|-----------------------------------------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------| +| BOOL
| BOOLEAN | +| _BOOL
| ARRAY<BOOLEAN> | +| BYTEA
| BYTES | +| _BYTEA
| ARRAY<TINYINT> | +| INT2
SMALLSERIAL
INT4
SERIAL
| INT | +| _INT2
_INT4
| ARRAY<INT> | +| INT8
BIGSERIAL
| BIGINT | +| _INT8
| ARRAY<BIGINT> | +| FLOAT4
| FLOAT | +| _FLOAT4
| ARRAY<FLOAT> | +| FLOAT8
| DOUBLE | +| _FLOAT8
| ARRAY<DOUBLE> | +| NUMERIC(Get the designated column's specified column size>0) | DECIMAL(Get the designated column's specified column size,Gets the number of digits in the specified column to the right of the decimal point) | +| NUMERIC(Get the designated column's specified column size<0) | DECIMAL(38, 18) | +| BPCHAR
CHARACTER
VARCHAR
TEXT
GEOMETRY
GEOGRAPHY
JSON
JSONB | STRING | +| _BPCHAR
_CHARACTER
_VARCHAR
_TEXT | ARRAY<STRING> | +| TIMESTAMP
| TIMESTAMP | +| TIME
| TIME | +| DATE
| DATE | +| OTHER DATA TYPES | NOT SUPPORTED YET | + +## Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:postgresql://localhost:5432/test | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use PostgreSQL the value is `org.postgresql.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query to improve performance by<br/>
reducing the number of database hits required to satisfy the selection criteria.<br/>
Zero means use jdbc default value. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 2 + job.mode = "BATCH" +} + +source{ + Jdbc { + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = "root" + password = "test" + query = "select * from source limit 16" + } +} + +transform { + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table + +``` +source{ + jdbc{ + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = "root" + password = "test" + query = "select * from source" + partition_column= "id" + partition_num = 5 + } +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source{ + jdbc{ + url = "jdbc:postgresql://localhost:5432/test" + driver = "org.postgresql.Driver" + user = "root" + password = "test" + query = "select * from source" + partition_column= "id" + + # The name of the table returned + result_table_name = "jdbc" + partition_lower_bound = 1 + partition_upper_bound = 50 + partition_num = 5 + } +} +``` + diff --git a/docs/en/connector-v2/source/S3File.md b/docs/en/connector-v2/source/S3File.md index f58a1a6bc36..54124a37038 100644 --- a/docs/en/connector-v2/source/S3File.md +++ b/docs/en/connector-v2/source/S3File.md @@ -1,22 +1,14 @@ # S3File -> S3 file source connector +> S3 File Source Connector -## Description - -Read data from aws s3 file system. - -:::tip - -If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. The tested hadoop version is 2.x. - -If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you download and install SeaTunnel Engine. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this. +## Support Those Engines -To use this connector you need put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.11.271.jar in ${SEATUNNEL_HOME}/lib dir. +> Spark
+> Flink
+> SeaTunnel Zeta
-::: - -## Key features +## Key Features - [x] [batch](../../concept/connector-v2-features.md) - [ ] [stream](../../concept/connector-v2-features.md) @@ -35,103 +27,31 @@ Read all the data in a split in a pollNext call. What splits are read will be sa - [x] json - [x] excel -## Options - -| name | type | required | default value | -|---------------------------------|---------|----------|-------------------------------------------------------| -| path | string | yes | - | -| file_format_type | string | yes | - | -| bucket | string | yes | - | -| fs.s3a.endpoint | string | yes | - | -| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | -| read_columns | list | no | - | -| access_key | string | no | - | -| access_secret | string | no | - | -| hadoop_s3_properties | map | no | - | -| delimiter | string | no | \001 | -| parse_partition_from_path | boolean | no | true | -| date_format | string | no | yyyy-MM-dd | -| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | -| time_format | string | no | HH:mm:ss | -| skip_header_row_number | long | no | 0 | -| schema | config | no | - | -| common-options | | no | - | -| sheet_name | string | no | - | - -### path [string] - -The source file path. - -### fs.s3a.endpoint [string] - -fs s3a endpoint - -### fs.s3a.aws.credentials.provider [string] - -The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. - -More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A) - -### delimiter [string] - -Field delimiter, used to tell connector how to slice and dice fields when reading text files - -default `\001`, the same as hive's default delimiter - -### parse_partition_from_path [boolean] - -Control whether parse the partition keys and values from file path - -For example if you read a file from path `s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26` - -Every record data from file will be added these two fields: - -| name | age | -|---------------|-----| -| tyrantlucifer | 26 | - -Tips: **Do not define partition fields in schema option** - -### date_format [string] - -Date type format, used to tell connector how to convert string to date, supported as the following formats: - -`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd` - -default `yyyy-MM-dd` - -### datetime_format [string] - -Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats: - -`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` - -default `yyyy-MM-dd HH:mm:ss` - -### time_format [string] - -Time type format, used to tell connector how to convert string to time, supported as the following formats: - -`HH:mm:ss` `HH:mm:ss.SSS` - -default `HH:mm:ss` +## Description -### skip_header_row_number [long] +Read data from aws s3 file system. -Skip the first few lines, but only for the txt and csv. +## Supported DataSource Info -For example, set like following: +| Datasource | Supported versions | +|------------|--------------------| +| S3 | current | -`skip_header_row_number = 2` +## Dependency -then SeaTunnel will skip the first 2 lines from source files +> If you use spark/flink, In order to use this connector, You must ensure your spark/flink cluster already integrated hadoop. 
The tested hadoop version is 2.x.
+> +> If you use SeaTunnel Zeta, It automatically integrated the hadoop jar when you download and install SeaTunnel Zeta. You can check the jar package under ${SEATUNNEL_HOME}/lib to confirm this.
+> To use this connector you need put hadoop-aws-3.1.4.jar and aws-java-sdk-bundle-1.11.271.jar in ${SEATUNNEL_HOME}/lib dir. -### file_format_type [string] +## Data Type Mapping -File type, supported as the following file types: +Data type mapping is related to the type of file being read, We supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` +### JSON File Type + If you assign file type to `json`, you should also assign schema option to tell connector how to parse data to the row you want. For example: @@ -173,7 +93,7 @@ connector will generate data as the following: |------|-------------|---------| | 200 | get success | true | -If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. +### Text Or CSV File Type If you assign file type to `text` `csv`, you can choose to specify the schema information or not. @@ -214,61 +134,102 @@ connector will generate data as the following: |---------------|-----|--------| | tyrantlucifer | 26 | male | -### bucket [string] - -The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`. - -### access_key [string] - -The access key of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) - -### access_secret [string] - -The access secret of s3 file system. If this parameter is not set, please confirm that the credential provider chain can be authenticated correctly, you could check this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) - -### hadoop_s3_properties [map] +### Orc File Type -If you need to add a other option, you could add it here and refer to this [hadoop-aws](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) - -``` -hadoop_s3_properties { - "xxx" = "xxx" - } -``` - -### schema [config] - -#### fields [Config] - -The schema of upstream data. - -### read_columns [list] - -The read column list of the data source, user can use it to implement field projection. - -The file type supported column projection as the following shown: - -- text -- json -- csv -- orc -- parquet -- excel +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. -**Tips: If the user wants to use this feature when reading `text` `json` `csv` files, the schema option must be configured** +| Orc Data type | SeaTunnel Data type | +|----------------------------------|----------------------------------------------------------------| +| BOOLEAN | BOOLEAN | +| INT | INT | +| BYTE | BYTE | +| SHORT | SHORT | +| LONG | LONG | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| BINARY | BINARY | +| STRING
VARCHAR
CHAR
| STRING | +| DATE | LOCAL_DATE_TYPE | +| TIMESTAMP | LOCAL_DATE_TIME_TYPE | +| DECIMAL | DECIMAL | +| LIST(STRING) | STRING_ARRAY_TYPE | +| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | +| LIST(TINYINT) | BYTE_ARRAY_TYPE | +| LIST(SMALLINT) | SHORT_ARRAY_TYPE | +| LIST(INT) | INT_ARRAY_TYPE | +| LIST(BIGINT) | LONG_ARRAY_TYPE | +| LIST(FLOAT) | FLOAT_ARRAY_TYPE | +| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | +| Map | MapType, This type of K and V will transform to SeaTunnel type | +| STRUCT | SeaTunnelRowType | + +### Parquet File Type -### common options +If you assign file type to `parquet` `orc`, schema option not required, connector can find the schema of upstream data automatically. -Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. +| Orc Data type | SeaTunnel Data type | +|----------------------|----------------------------------------------------------------| +| INT_8 | BYTE | +| INT_16 | SHORT | +| DATE | DATE | +| TIMESTAMP_MILLIS | TIMESTAMP | +| INT64 | LONG | +| INT96 | TIMESTAMP | +| BINARY | BYTES | +| FLOAT | FLOAT | +| DOUBLE | DOUBLE | +| BOOLEAN | BOOLEAN | +| FIXED_LEN_BYTE_ARRAY | TIMESTAMP
DECIMAL | +| DECIMAL | DECIMAL | +| LIST(STRING) | STRING_ARRAY_TYPE | +| LIST(BOOLEAN) | BOOLEAN_ARRAY_TYPE | +| LIST(TINYINT) | BYTE_ARRAY_TYPE | +| LIST(SMALLINT) | SHORT_ARRAY_TYPE | +| LIST(INT) | INT_ARRAY_TYPE | +| LIST(BIGINT) | LONG_ARRAY_TYPE | +| LIST(FLOAT) | FLOAT_ARRAY_TYPE | +| LIST(DOUBLE) | DOUBLE_ARRAY_TYPE | +| Map | MapType, This type of K and V will transform to SeaTunnel type | +| STRUCT | SeaTunnelRowType | -### sheet_name [string] +## Options -Reader the sheet of the workbook,Only used when file_format is excel. +| name | type | required | default value | Description | +|---------------------------------|---------|----------|-------------------------------------------------------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| path | string | yes | - | The s3 path that needs to be read can have sub paths, but the sub paths need to meet certain format requirements. Specific requirements can be referred to "parse_partition_from_path" option | +| file_format_type | string | yes | - | File type, supported as the following file types: `text` `csv` `parquet` `orc` `json` `excel` | +| bucket | string | yes | - | The bucket address of s3 file system, for example: `s3n://seatunnel-test`, if you use `s3a` protocol, this parameter should be `s3a://seatunnel-test`. | +| fs.s3a.endpoint | string | yes | - | fs s3a endpoint | +| fs.s3a.aws.credentials.provider | string | yes | com.amazonaws.auth.InstanceProfileCredentialsProvider | The way to authenticate s3a. We only support `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` and `com.amazonaws.auth.InstanceProfileCredentialsProvider` now. More information about the credential provider you can see [Hadoop AWS Document](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html#Simple_name.2Fsecret_credentials_with_SimpleAWSCredentialsProvider.2A) | +| read_columns | list | no | - | The read column list of the data source, user can use it to implement field projection. The file type supported column projection as the following shown: `text` `csv` `parquet` `orc` `json` `excel` . If the user wants to use this feature when reading `text` `json` `csv` files, the "schema" option must be configured. | +| access_key | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` | +| access_secret | string | no | - | Only used when `fs.s3a.aws.credentials.provider = org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider ` | +| hadoop_s3_properties | map | no | - | If you need to add other option, you could add it here and refer to this [link](https://hadoop.apache.org/docs/stable/hadoop-aws/tools/hadoop-aws/index.html) | +| delimiter | string | no | \001 | Field delimiter, used to tell connector how to slice and dice fields when reading text files. Default `\001`, the same as hive's default delimiter. | +| parse_partition_from_path | boolean | no | true | Control whether parse the partition keys and values from file path. For example if you read a file from path `s3n://hadoop-cluster/tmp/seatunnel/parquet/name=tyrantlucifer/age=26`. 
Every record data from file will be added these two fields: name="tyrantlucifer", age=16 | +| date_format | string | no | yyyy-MM-dd | Date type format, used to tell connector how to convert string to date, supported as the following formats:`yyyy-MM-dd` `yyyy.MM.dd` `yyyy/MM/dd`. default `yyyy-MM-dd` | +| datetime_format | string | no | yyyy-MM-dd HH:mm:ss | Datetime type format, used to tell connector how to convert string to datetime, supported as the following formats:`yyyy-MM-dd HH:mm:ss` `yyyy.MM.dd HH:mm:ss` `yyyy/MM/dd HH:mm:ss` `yyyyMMddHHmmss` | +| time_format | string | no | HH:mm:ss | Time type format, used to tell connector how to convert string to time, supported as the following formats:`HH:mm:ss` `HH:mm:ss.SSS` | +| skip_header_row_number | long | no | 0 | Skip the first few lines, but only for the txt and csv. For example, set like following:`skip_header_row_number = 2`. Then SeaTunnel will skip the first 2 lines from source files | +| schema | config | no | - | The schema of upstream data. | +| common-options | | no | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. | +| sheet_name | string | no | - | Reader the sheet of the workbook,Only used when file_format is excel. | ## Example -```hocon +1. In this example, We read data from s3 path `s3a://seatunnel-test/seatunnel/text` and the file type is orc in this path. + We use `org.apache.hadoop.fs.s3a.SimpleAWSCredentialsProvider` to authentication so `access_key` and `secret_key` is required. + All columns in the file will be read and send to sink. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} +source { S3File { path = "/seatunnel/text" fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" @@ -278,9 +239,21 @@ Reader the sheet of the workbook,Only used when file_format is excel. bucket = "s3a://seatunnel-test" file_format_type = "orc" } +} +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + Console {} +} ``` +2. Use `InstanceProfileCredentialsProvider` to authentication + The file type in S3 is json, so need config schema option. + ```hocon S3File { @@ -299,6 +272,48 @@ Reader the sheet of the workbook,Only used when file_format is excel. ``` +3. Use `InstanceProfileCredentialsProvider` to authentication + The file type in S3 is json and has five fields (`id`, `name`, `age`, `sex`, `type`), so need config schema option. + In this job, we only need send `id` and `name` column to mysql. 
+ +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + S3File { + path = "/seatunnel/json" + bucket = "s3a://seatunnel-test" + fs.s3a.endpoint="s3.cn-north-1.amazonaws.com.cn" + fs.s3a.aws.credentials.provider="com.amazonaws.auth.InstanceProfileCredentialsProvider" + file_format_type = "json" + read_columns = ["id", "name"] + schema { + fields { + id = int + name = string + age = int + sex = int + type = string + } + } + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/category/transform-v2 +} + +sink { + Console {} +} +``` + ## Changelog ### 2.3.0-beta 2022-10-20 diff --git a/docs/en/connector-v2/source/SftpFile.md b/docs/en/connector-v2/source/SftpFile.md index 500ec2af5b5..184a587a928 100644 --- a/docs/en/connector-v2/source/SftpFile.md +++ b/docs/en/connector-v2/source/SftpFile.md @@ -47,6 +47,7 @@ If you use SeaTunnel Engine, It automatically integrated the hadoop jar when you | schema | config | no | - | | common-options | | no | - | | sheet_name | string | no | - | +| file_filter_pattern | string | no | - | ### host [string] @@ -224,7 +225,11 @@ Source plugin common parameters, please refer to [Source Common Options](common- ### sheet_name [string] -Reader the sheet of the workbook,Only used when file_format is excel. +Reader the sheet of the workbook,Only used when file_format_type is excel. + +### file_filter_pattern [string] + +Filter pattern, which used for filtering files. ## Example diff --git a/docs/en/connector-v2/source/Snowflake.md b/docs/en/connector-v2/source/Snowflake.md index cd824eab463..a7835013d58 100644 --- a/docs/en/connector-v2/source/Snowflake.md +++ b/docs/en/connector-v2/source/Snowflake.md @@ -56,20 +56,20 @@ Read external data source data through JDBC. ## Options -| name | type | required | default | description | -|------------------------------|--------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| -| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:snowflake://.snowflakecomputing.com | -| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Snowflake the value is `net.snowflake.client.jdbc.SnowflakeDriver`. | -| user | String | No | - | Connection instance user name | -| password | String | No | - | Connection instance password | -| query | String | Yes | - | Query statement | -| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | -| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | -| partition_lower_bound | Long | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | -| partition_upper_bound | Long | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | -| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | -| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query toimprove performance by
reducing the number database hits required to satisfy the selection criteria.
Zero means use jdbc default value. | -| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | +| name | type | required | default | description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:snowflake://.snowflakecomputing.com | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Snowflake the value is `net.snowflake.client.jdbc.SnowflakeDriver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query to improve performance by<br/>
reducing the number of database hits required to satisfy the selection criteria.<br/>
Zero means use jdbc default value. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | ## tips diff --git a/docs/en/connector-v2/source/Vertica.md b/docs/en/connector-v2/source/Vertica.md new file mode 100644 index 00000000000..df387ac30bf --- /dev/null +++ b/docs/en/connector-v2/source/Vertica.md @@ -0,0 +1,157 @@ +# Vertica + +> JDBC Vertica Source Connector + +## Support Those Engines + +> Spark
+> Flink
+> SeaTunnel Zeta
+ +## Key Features + +- [x] [batch](../../concept/connector-v2-features.md) +- [ ] [stream](../../concept/connector-v2-features.md) +- [x] [exactly-once](../../concept/connector-v2-features.md) +- [x] [column projection](../../concept/connector-v2-features.md) +- [x] [parallelism](../../concept/connector-v2-features.md) +- [x] [support user-defined split](../../concept/connector-v2-features.md) + +> supports query SQL and can achieve projection effect. + +## Description + +Read external data source data through JDBC. + +## Supported DataSource Info + +| Datasource | Supported versions | Driver | Url | Maven | +|------------|----------------------------------------------------------|-------------------------|---------------------------------------|----------------------------------------------------------------------| +| Vertica | Different dependency version has different driver class. | com.vertica.jdbc.Driver | jdbc:vertica://localhost:5433/vertica | [Download](https://www.vertica.com/download/vertica/client-drivers/) | + +## Database Dependency + +> Please download the support list corresponding to 'Maven' and copy it to the '$SEATNUNNEL_HOME/plugins/jdbc/lib/' working directory
> For example Vertica datasource: cp vertica-jdbc-xxx.jar $SEATNUNNEL_HOME/plugins/jdbc/lib/ + +## Data Type Mapping + +| Vertica Data type | SeaTunnel Data type | +|--------------------|---------------------| +| BIT | BOOLEAN | +| TINYINT<br/>
TINYINT UNSIGNED
SMALLINT
SMALLINT UNSIGNED
MEDIUMINT
MEDIUMINT UNSIGNED
INT
INTEGER
YEAR | INT | +| INT UNSIGNED
INTEGER UNSIGNED
BIGINT | LONG | +| BIGINT UNSIGNED | DECIMAL(20,0) | +| DECIMAL(x,y)(Get the designated column's specified column size.<38) | DECIMAL(x,y) | +| DECIMAL(x,y)(Get the designated column's specified column size.>38) | DECIMAL(38,18) | +| DECIMAL UNSIGNED | DECIMAL((Get the designated column's specified column size)+1,
(Gets the designated column's number of digits to the right of the decimal point.)) | +| FLOAT<br/>
FLOAT UNSIGNED | FLOAT | +| DOUBLE
DOUBLE UNSIGNED | DOUBLE | +| CHAR
VARCHAR
TINYTEXT
MEDIUMTEXT
TEXT
LONGTEXT
JSON | STRING | +| DATE | DATE | +| TIME | TIME | +| DATETIME
TIMESTAMP | TIMESTAMP | +| TINYBLOB
MEDIUMBLOB
BLOB
LONGBLOB
BINARY
VARBINARY<br/>
BIT(n) | BYTES | +| GEOMETRY
UNKNOWN | Not supported yet | + +## Source Options + +| Name | Type | Required | Default | Description | +|------------------------------|------------|----------|-----------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| url | String | Yes | - | The URL of the JDBC connection. Refer to a case: jdbc:vertica://localhost:5433/vertica | +| driver | String | Yes | - | The jdbc class name used to connect to the remote data source,
if you use Vertica the value is `com.vertica.jdbc.Driver`. | +| user | String | No | - | Connection instance user name | +| password | String | No | - | Connection instance password | +| query | String | Yes | - | Query statement | +| connection_check_timeout_sec | Int | No | 30 | The time in seconds to wait for the database operation used to validate the connection to complete | +| partition_column | String | No | - | The column name for parallelism's partition, only support numeric type,Only support numeric type primary key, and only can config one column. | +| partition_lower_bound | BigDecimal | No | - | The partition_column min value for scan, if not set SeaTunnel will query database get min value. | +| partition_upper_bound | BigDecimal | No | - | The partition_column max value for scan, if not set SeaTunnel will query database get max value. | +| partition_num | Int | No | job parallelism | The number of partition count, only support positive integer. default value is job parallelism | +| fetch_size | Int | No | 0 | For queries that return a large number of objects,you can configure
the row fetch size used in the query to improve performance by<br/>
reducing the number of database hits required to satisfy the selection criteria.<br/>
Zero means use jdbc default value. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +### Tips + +> If partition_column is not set, it will run in single concurrency, and if partition_column is set, it will be executed in parallel according to the concurrency of tasks. + +## Task Example + +### Simple: + +> This example queries type_bin 'table' 16 data in your test "database" in single parallel and queries all of its fields. You can also specify which fields to query for final output to the console. + +``` +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 2 + job.mode = "BATCH" +} +source{ + Jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + query = "select * from type_bin limit 16" + } +} + +transform { + # If you would like to get more information about how to configure seatunnel and see full list of transform plugins, + # please go to https://seatunnel.apache.org/docs/transform-v2/sql +} + +sink { + Console {} +} +``` + +### Parallel: + +> Read your query table in parallel with the shard field you configured and the shard data You can do this if you want to read the whole table + +``` +source { + Jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + # Parallel sharding reads fields + partition_column = "id" + # Number of fragments + partition_num = 10 + } +} +``` + +### Parallel Boundary: + +> It is more efficient to specify the data within the upper and lower bounds of the query It is more efficient to read your data source according to the upper and lower boundaries you configured + +``` +source { + Jdbc { + url = "jdbc:vertica://localhost:5433/vertica" + driver = "com.vertica.jdbc.Driver" + connection_check_timeout_sec = 100 + user = "root" + password = "123456" + # Define query logic as required + query = "select * from type_bin" + partition_column = "id" + # Read start boundary + partition_lower_bound = 1 + # Read end boundary + partition_upper_bound = 500 + partition_num = 10 + } +} +``` + diff --git a/docs/en/connector-v2/source/kafka.md b/docs/en/connector-v2/source/kafka.md index 2ed6ec6f12e..16b9c5420b3 100644 --- a/docs/en/connector-v2/source/kafka.md +++ b/docs/en/connector-v2/source/kafka.md @@ -2,11 +2,13 @@ > Kafka source connector -## Description +## Support Those Engines -Source connector for Apache Kafka. +> Spark
+> Flink
+> SeaTunnel Zeta<br/>
-## Key features +## Key Features - [x] [batch](../../concept/connector-v2-features.md) - [x] [stream](../../concept/connector-v2-features.md) @@ -15,111 +17,54 @@ Source connector for Apache Kafka. - [x] [parallelism](../../concept/connector-v2-features.md) - [ ] [support user-defined split](../../concept/connector-v2-features.md) -## Options - -| name | type | required | default value | -|-------------------------------------|---------|----------|--------------------------| -| topic | String | yes | - | -| bootstrap.servers | String | yes | - | -| pattern | Boolean | no | false | -| consumer.group | String | no | SeaTunnel-Consumer-Group | -| commit_on_checkpoint | Boolean | no | true | -| kafka.config | Map | no | - | -| common-options | config | no | - | -| schema | | no | - | -| format | String | no | json | -| format_error_handle_way | String | no | fail | -| field_delimiter | String | no | , | -| start_mode | String | no | group_offsets | -| start_mode.offsets | | no | | -| start_mode.timestamp | Long | no | | -| partition-discovery.interval-millis | long | no | -1 | - -### topic [string] - -`Kafka topic` name. If there are multiple `topics`, use `,` to split, for example: `"tpc1,tpc2"`. - -### bootstrap.servers [string] - -`Kafka` cluster address, separated by `","`. - -### pattern [boolean] - -If `pattern` is set to `true`,the regular expression for a pattern of topic names to read from. All topics in clients with names that match the specified regular expression will be subscribed by the consumer. - -### consumer.group [string] - -`Kafka consumer group id`, used to distinguish different consumer groups. - -### commit_on_checkpoint [boolean] - -If true the consumer's offset will be periodically committed in the background. - -## partition-discovery.interval-millis [long] - -The interval for dynamically discovering topics and partitions. - -### kafka.config [map] - -In addition to the above necessary parameters that must be specified by the `Kafka consumer` client, users can also specify multiple `consumer` client non-mandatory parameters, covering [all consumer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#consumerconfigs). - -### common-options [config] - -Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details. - -### schema - -The structure of the data, including field names and field types. - -## format - -Data format. The default format is json. Optional text format, canal-json and debezium-json. -If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option. -If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details. -If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. - -## format_error_handle_way - -The processing method of data format error. The default value is fail, and the optional value is (fail, skip). -When fail is selected, data format error will block and an exception will be thrown. -When skip is selected, data format error will skip this line data. - -## field_delimiter - -Customize the field delimiter for data format. - -## start_mode - -The initial consumption pattern of consumers,there are several types: -[earliest],[group_offsets],[latest],[specific_offsets],[timestamp] - -## start_mode.timestamp - -The time required for consumption mode to be "timestamp". 
- -## start_mode.offsets - -The offset required for consumption mode to be specific_offsets. - -for example: +## Description -```hocon -start_mode.offsets = { - info-0 = 70 - info-1 = 10 - info-2 = 10 - } -``` +Source connector for Apache Kafka. -## Example +## Supported DataSource Info + +In order to use the Kafka connector, the following dependencies are required. +They can be downloaded via install-plugin.sh or from the Maven central repository. + +| Datasource | Supported Versions | Maven | +|------------|--------------------|-------------------------------------------------------------------------------------------------------------| +| Kafka | Universal | [Download](https://mvnrepository.com/artifact/org.apache.seatunnel/seatunnel-connectors-v2/connector-kafka) | + +## Source Options + +| Name | Type | Required | Default | Description | +|-------------------------------------|-----------------------------------------------------------------------------|----------|--------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| topic | String | Yes | - | Topic name(s) to read data from when the table is used as source. It also supports topic list for source by separating topic by comma like 'topic-1,topic-2'. | +| bootstrap.servers | String | Yes | - | Comma separated list of Kafka brokers. | +| pattern | Boolean | No | false | If `pattern` is set to `true`,the regular expression for a pattern of topic names to read from. All topics in clients with names that match the specified regular expression will be subscribed by the consumer. | +| consumer.group | String | No | SeaTunnel-Consumer-Group | `Kafka consumer group id`, used to distinguish different consumer groups. | +| commit_on_checkpoint | Boolean | No | true | If true the consumer's offset will be periodically committed in the background. | +| kafka.config | Map | No | - | In addition to the above necessary parameters that must be specified by the `Kafka consumer` client, users can also specify multiple `consumer` client non-mandatory parameters, covering [all consumer parameters specified in the official Kafka document](https://kafka.apache.org/documentation.html#consumerconfigs). | +| schema | Config | No | - | The structure of the data, including field names and field types. | +| format | String | No | json | Data format. The default format is json. Optional text format, canal-json and debezium-json.If you use json or text format. The default field separator is ", ". If you customize the delimiter, add the "field_delimiter" option.If you use canal format, please refer to [canal-json](../formats/canal-json.md) for details.If you use debezium format, please refer to [debezium-json](../formats/debezium-json.md) for details. | +| format_error_handle_way | String | No | fail | The processing method of data format error. The default value is fail, and the optional value is (fail, skip). When fail is selected, data format error will block and an exception will be thrown. When skip is selected, data format error will skip this line data. | +| field_delimiter | String | No | , | Customize the field delimiter for data format. 
| +| start_mode | StartMode[earliest],[group_offsets],[latest],[specific_offsets],[timestamp] | No | group_offsets | The initial consumption pattern of consumers. | +| start_mode.offsets | Config | No | - | The offset required for consumption mode to be specific_offsets. | +| start_mode.timestamp | Long | No | - | The time required for consumption mode to be "timestamp". | +| partition-discovery.interval-millis | Long | No | -1 | The interval for dynamically discovering topics and partitions. | +| common-options | | No | - | Source plugin common parameters, please refer to [Source Common Options](common-options.md) for details | + +## Task Example ### Simple +> This example reads the data of kafka's topic_1, topic_2, topic_3 and prints it to the client.And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in Install SeaTunnel to install and deploy SeaTunnel. And if you have not yet installed and deployed SeaTunnel, you need to follow the instructions in [Install SeaTunnel](../../start-v2/locally/deployment.md) to install and deploy SeaTunnel. And then follow the instructions in [Quick Start With SeaTunnel Engine](../../start-v2/locally/quick-start-seatunnel-engine.md) to run this job. + ```hocon +# Defining the runtime environment +env { + # You can set flink configuration here + execution.parallelism = 2 + job.mode = "BATCH" +} source { - Kafka { - result_table_name = "kafka_name" schema = { fields { name = "string" @@ -136,8 +81,10 @@ source { auto.offset.reset = "earliest" enable.auto.commit = "false" } - } - + } +} +sink { + Console {} } ``` @@ -145,14 +92,12 @@ source { ```hocon source { - Kafka { topic = ".*seatunnel*." pattern = "true" bootstrap.servers = "localhost:9092" consumer.group = "seatunnel_group" } - } ``` @@ -169,7 +114,7 @@ source { kafka.config = { security.protocol=SASL_SSL sasl.mechanism=SCRAM-SHA-512 - sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};" + sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required username=\"username\" password=\"password\";" #security.protocol=SASL_SSL #sasl.mechanism=AWS_MSK_IAM #sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" @@ -205,7 +150,7 @@ source { kafka.config = { #security.protocol=SASL_SSL #sasl.mechanism=SCRAM-SHA-512 - #sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required \nusername=${username}\npassword=${password};" + #sasl.jaas.config="org.apache.kafka.common.security.scram.ScramLoginModule required username=\"username\" password=\"password\";" security.protocol=SASL_SSL sasl.mechanism=AWS_MSK_IAM sasl.jaas.config="software.amazon.msk.auth.iam.IAMLoginModule required;" @@ -215,18 +160,3 @@ source { } ``` -## Changelog - -### 2.3.0-beta 2022-10-20 - -- Add Kafka Source Connector - -### Next Version - -- [Improve] Support setting read starting offset or time at startup config ([3157](https://github.com/apache/incubator-seatunnel/pull/3157)) -- [Improve] Support for dynamic discover topic & partition in streaming mode ([3125](https://github.com/apache/incubator-seatunnel/pull/3125)) -- [Improve] Change Connector Custom Config Prefix To Map [3719](https://github.com/apache/incubator-seatunnel/pull/3719) -- [Bug] Fixed the problem that parsing the offset format failed when the startup mode was offset([3810](https://github.com/apache/incubator-seatunnel/pull/3810)) -- [Improve] Support read canal format message 
[3950](https://github.com/apache/incubator-seatunnel/pull/3950) -- [Improve] Support read debezium format message [3981](https://github.com/apache/incubator-seatunnel/pull/3981) - diff --git a/docs/en/seatunnel-engine/checkpoint-storage.md b/docs/en/seatunnel-engine/checkpoint-storage.md index a88f301439e..f2a6487f28d 100644 --- a/docs/en/seatunnel-engine/checkpoint-storage.md +++ b/docs/en/seatunnel-engine/checkpoint-storage.md @@ -59,8 +59,6 @@ seatunnel: checkpoint: interval: 6000 timeout: 7000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: hdfs max-retained: 3 @@ -94,8 +92,6 @@ seatunnel: checkpoint: interval: 6000 timeout: 7000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: hdfs max-retained: 3 @@ -119,8 +115,6 @@ seatunnel: checkpoint: interval: 6000 timeout: 7000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: hdfs max-retained: 3 @@ -152,6 +146,28 @@ seatunnel: kerberosKeytab: your-kerberos-keytab ``` +if HDFS is in HA mode , you can config like this: + +```yaml +seatunnel: + engine: + checkpoint: + storage: + type: hdfs + max-retained: 3 + plugin-config: + storage.type: hdfs + fs.defaultFS: hdfs://usdp-bing + seatunnel.hadoop.dfs.nameservices: usdp-bing + seatunnel.hadoop.dfs.ha.namenodes.usdp-bing: nn1,nn2 + seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn1: usdp-bing-nn1:8020 + seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn2: usdp-bing-nn2:8020 + seatunnel.hadoop.dfs.client.failover.proxy.provider.usdp-bing: org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider" + +``` + +if HDFS has some other configs in `hdfs-site.xml` or `core-site.xml` , just set HDFS config by using `seatunnel.hadoop.` prefix. + #### LocalFile ```yaml @@ -160,8 +176,6 @@ seatunnel: checkpoint: interval: 6000 timeout: 7000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: hdfs max-retained: 3 diff --git a/docs/en/seatunnel-engine/deployment.md b/docs/en/seatunnel-engine/deployment.md index c07cd45d6b1..18c1a587a2a 100644 --- a/docs/en/seatunnel-engine/deployment.md +++ b/docs/en/seatunnel-engine/deployment.md @@ -75,14 +75,6 @@ The interval between two checkpoints, unit is milliseconds. If the `checkpoint.i The timeout of a checkpoint. If a checkpoint cannot be completed within the timeout period, a checkpoint failure will be triggered. Therefore, Job will be restored. -**max-concurrent** - -How many checkpoints can be performed simultaneously at most. - -**tolerable-failure** - -Maximum number of retries after checkpoint failure. - Example ``` @@ -95,14 +87,24 @@ seatunnel: checkpoint: interval: 300000 timeout: 10000 - max-concurrent: 1 - tolerable-failure: 2 ``` **checkpoint storage** About the checkpoint storage, you can see [checkpoint storage](checkpoint-storage.md) +### 4.4 Historical Job expiration Config + +The information about each completed Job, such as status, counters, and error logs, is stored in the IMap object. As the number of running jobs increases, the memory increases and eventually the memory will overflow. Therefore, you can adjust the history-job-expire-minutes parameter to solve this problem. The time unit of this parameter is minute. The default value is 1440 minutes, that is, one day. + +Example + +``` +seatunnel: + engine: + history-job-expire-minutes: 1440 +``` + ## 5. Config SeaTunnel Engine Server All SeaTunnel Engine Server config in `hazelcast.yaml` file. 
diff --git a/docs/en/seatunnel-engine/rest-api.md b/docs/en/seatunnel-engine/rest-api.md index 2edec3496ad..2f44421a3d6 100644 --- a/docs/en/seatunnel-engine/rest-api.md +++ b/docs/en/seatunnel-engine/rest-api.md @@ -180,3 +180,61 @@ network: ------------------------------------------------------------------------------------------ +### Submit Job. + +
+POST /hazelcast/rest/maps/submit-job (Returns jobId and jobName if job submitted successfully.) + +#### Parameters + +> | name | type | data type | description | +> |----------------------|----------|-----------|-----------------------------------| +> | jobId | optional | string | job id | +> | jobName | optional | string | job name | +> | isStartWithSavePoint | optional | string | if job is started with save point | + +#### Body + +```json +{ + "env": { + "job.mode": "batch" + }, + "source": [ + { + "plugin_name": "FakeSource", + "result_table_name": "fake", + "row.num": 100, + "schema": { + "fields": { + "name": "string", + "age": "int", + "card": "int" + } + } + } + ], + "transform": [ + ], + "sink": [ + { + "plugin_name": "Console", + "source_table_name": ["fake"] + } + ] +} +``` + +#### Responses + +```json +{ + "jobId": 733584788375666689, + "jobName": "rest_api_test" +} +``` + +
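For reference, a minimal Java 11 sketch that submits the job definition above over HTTP might look like the following. The cluster address (`127.0.0.1:5801`) and the `jobName` query parameter are illustrative assumptions, not values defined by this document; point them at the host and port your REST service actually listens on.

```java
import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class SubmitJobExample {
    public static void main(String[] args) throws Exception {
        // Assumed cluster address; replace with the host/port of your SeaTunnel Engine REST service.
        String endpoint = "http://127.0.0.1:5801/hazelcast/rest/maps/submit-job?jobName=rest_api_test";

        // Job definition in JSON, mirroring the request body shown above.
        String body = "{"
                + "\"env\":{\"job.mode\":\"batch\"},"
                + "\"source\":[{\"plugin_name\":\"FakeSource\",\"result_table_name\":\"fake\",\"row.num\":100,"
                + "\"schema\":{\"fields\":{\"name\":\"string\",\"age\":\"int\",\"card\":\"int\"}}}],"
                + "\"transform\":[],"
                + "\"sink\":[{\"plugin_name\":\"Console\",\"source_table_name\":[\"fake\"]}]"
                + "}";

        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create(endpoint))
                .header("Content-Type", "application/json")
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();

        // On success the response body should contain the jobId and jobName fields,
        // as in the response example above.
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        System.out.println(response.statusCode() + " " + response.body());
    }
}
```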
+ +------------------------------------------------------------------------------------------ + diff --git a/docs/en/start-v2/kubernetes/kubernetes.mdx b/docs/en/start-v2/kubernetes/kubernetes.mdx index 1c4e013752e..6ba479aa4ff 100644 --- a/docs/en/start-v2/kubernetes/kubernetes.mdx +++ b/docs/en/start-v2/kubernetes/kubernetes.mdx @@ -42,16 +42,14 @@ To run the image with SeaTunnel, first create a `Dockerfile`: ```Dockerfile FROM flink:1.13 -ENV SEATUNNEL_VERSION="2.3.0" -ENV SEATUNNEL_HOME = "/opt/seatunnel" +ENV SEATUNNEL_VERSION="2.3.2" +ENV SEATUNNEL_HOME="/opt/seatunnel" -RUN mkdir -p $SEATUNNEL_HOME +RUN wget https://dlcdn.apache.org/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz +RUN tar -xzvf apache-seatunnel-${SEATUNNEL_VERSION}-bin.tar.gz +RUN mv apache-seatunnel-${SEATUNNEL_VERSION} ${SEATUNNEL_HOME} -RUN wget https://archive.apache.org/dist/seatunnel/${SEATUNNEL_VERSION}/apache-seatunnel-incubating-${SEATUNNEL_VERSION}-bin.tar.gz -RUN tar -xzvf apache-seatunnel-incubating-${SEATUNNEL_VERSION}-bin.tar.gz - -RUN cp -r apache-seatunnel-incubating-${SEATUNNEL_VERSION}/* $SEATUNNEL_HOME/ -RUN rm -rf apache-seatunnel-incubating-${SEATUNNEL_VERSION}* +RUN cd ${SEATUNNEL_HOME}||sh bin/install-plugin.sh ${SEATUNNEL_VERSION} ``` Then run the following commands to build the image: @@ -205,7 +203,7 @@ spec: - key: seatunnel.streaming.conf path: seatunnel.streaming.conf job: - jarURI: local:///opt/seatunnel/starter/seatunnel-flink-starter.jar + jarURI: local:///opt/seatunnel/starter/seatunnel-flink-13-starter.jar entryClass: org.apache.seatunnel.core.starter.flink.SeaTunnelFlink args: ["--config", "/data/seatunnel.streaming.conf"] parallelism: 2 diff --git a/docs/en/start-v2/locally/deployment.md b/docs/en/start-v2/locally/deployment.md index 6f82a9d8489..1e5c0f9ed62 100644 --- a/docs/en/start-v2/locally/deployment.md +++ b/docs/en/start-v2/locally/deployment.md @@ -23,8 +23,8 @@ Or you can download it by terminal ```shell export version="2.3.2" -wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-incubating-${version}-bin.tar.gz" -tar -xzvf "apache-seatunnel-incubating-${version}-bin.tar.gz" +wget "https://archive.apache.org/dist/seatunnel/${version}/apache-seatunnel-${version}-bin.tar.gz" +tar -xzvf "apache-seatunnel-${version}-bin.tar.gz" ``` diff --git a/docs/en/start-v2/locally/quick-start-flink.md b/docs/en/start-v2/locally/quick-start-flink.md index cf01a0fccdb..9fd9f1eb43d 100644 --- a/docs/en/start-v2/locally/quick-start-flink.md +++ b/docs/en/start-v2/locally/quick-start-flink.md @@ -68,14 +68,14 @@ You could start the application by the following commands flink version between `1.12.x` and `1.14.x` ```shell -cd "apache-seatunnel-incubating-${version}" +cd "apache-seatunnel-${version}" ./bin/start-seatunnel-flink-13-connector-v2.sh --config ./config/v2.streaming.conf.template ``` flink version between `1.15.x` and `1.16.x` ```shell -cd "apache-seatunnel-incubating-${version}" +cd "apache-seatunnel-${version}" ./bin/start-seatunnel-flink-15-connector-v2.sh --config ./config/v2.streaming.conf.template ``` diff --git a/docs/en/start-v2/locally/quick-start-seatunnel-engine.md b/docs/en/start-v2/locally/quick-start-seatunnel-engine.md index db998897027..f469c570e3a 100644 --- a/docs/en/start-v2/locally/quick-start-seatunnel-engine.md +++ b/docs/en/start-v2/locally/quick-start-seatunnel-engine.md @@ -59,7 +59,7 @@ More information about config please check [config concept](../../concept/config You could start the application by 
the following commands ```shell -cd "apache-seatunnel-incubating-${version}" +cd "apache-seatunnel-${version}" ./bin/seatunnel.sh --config ./config/v2.batch.config.template -e local ``` diff --git a/docs/en/start-v2/locally/quick-start-spark.md b/docs/en/start-v2/locally/quick-start-spark.md index 88aebd5aa43..903217c8ec1 100644 --- a/docs/en/start-v2/locally/quick-start-spark.md +++ b/docs/en/start-v2/locally/quick-start-spark.md @@ -69,7 +69,7 @@ You could start the application by the following commands spark 2.4.x ```bash -cd "apache-seatunnel-incubating-${version}" +cd "apache-seatunnel-${version}" ./bin/start-seatunnel-spark-2-connector-v2.sh \ --master local[4] \ --deploy-mode client \ @@ -79,7 +79,7 @@ cd "apache-seatunnel-incubating-${version}" spark3.x.x ```shell -cd "apache-seatunnel-incubating-${version}" +cd "apache-seatunnel-${version}" ./bin/start-seatunnel-spark-3-connector-v2.sh \ --master local[4] \ --deploy-mode client \ diff --git a/docs/sidebars.js b/docs/sidebars.js index 0ff7206de7c..a8f2527413b 100644 --- a/docs/sidebars.js +++ b/docs/sidebars.js @@ -170,6 +170,7 @@ const sidebars = { "seatunnel-engine/deployment", "seatunnel-engine/local-mode", "seatunnel-engine/cluster-mode", + "seatunnel-engine/savepoint", "seatunnel-engine/checkpoint-storage", "seatunnel-engine/rest-api", "seatunnel-engine/tcp" diff --git a/plugin-mapping.properties b/plugin-mapping.properties index 96d8d9adfc5..1d7292fef7c 100644 --- a/plugin-mapping.properties +++ b/plugin-mapping.properties @@ -47,8 +47,10 @@ seatunnel.source.LocalFile = connector-file-local seatunnel.sink.LocalFile = connector-file-local seatunnel.source.OssFile = connector-file-oss seatunnel.sink.OssFile = connector-file-oss -seatunnel.source.OssJindoFile = connector-file-oss-jindo -seatunnel.sink.OssJindoFile = connector-file-oss-jindo +seatunnel.source.OssJindoFile = connector-file-jindo-oss +seatunnel.sink.OssJindoFile = connector-file-jindo-oss +seatunnel.source.CosFile = connector-file-cos +seatunnel.sink.CosFile = connector-file-cos seatunnel.source.Pulsar = connector-pulsar seatunnel.source.Hudi = connector-hudi seatunnel.sink.DingTalk = connector-dingtalk diff --git a/plugins/README.md b/plugins/README.md index 2c27ae7732a..0d58dfba490 100644 --- a/plugins/README.md +++ b/plugins/README.md @@ -2,6 +2,8 @@ This directory used to store some third party jar package dependency by connector running, such as jdbc drivers. +!!!Attention: If you use Zeta Engine, please add jar to `$SEATUNNEL_HOME/lib/` directory on each node. + ## directory structure The jar dependency by connector need put in `plugins/${connector name}/lib/` dir. 
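As a rough illustration of how the plugin mapping above is shaped, the sketch below reads `plugin-mapping.properties` with `java.util.Properties` and resolves a plugin name to its connector module. The file path and the chosen lookup key are assumptions for demonstration only; the key format `seatunnel.<source|sink>.<PluginName>` follows the entries shown in the diff.

```java
import java.io.IOException;
import java.io.InputStream;
import java.nio.file.Files;
import java.nio.file.Paths;
import java.util.Properties;

public class PluginMappingLookup {
    public static void main(String[] args) throws IOException {
        // Assumed location of the mapping file; in a real deployment it sits in the SeaTunnel home directory.
        try (InputStream in = Files.newInputStream(Paths.get("plugin-mapping.properties"))) {
            Properties mapping = new Properties();
            mapping.load(in);

            // e.g. seatunnel.source.CosFile = connector-file-cos (as added in the diff above)
            String module = mapping.getProperty("seatunnel.source.CosFile");
            System.out.println("CosFile source is provided by module: " + module);
        }
    }
}
```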
diff --git a/release-note.md b/release-note.md index 31a911626bc..d647bdbad93 100644 --- a/release-note.md +++ b/release-note.md @@ -7,6 +7,8 @@ - [Core] [API] Fix parse nested row data type key changed upper (#4459) - [Starter][Flink]Support transform-v2 for flink #3396 - [Flink] Support flink 1.14.x #3963 +- [Core][Translation][Spark] Fix SeaTunnelRowConvertor fail to convert when schema contains row type (#5170) + ### Transformer - [Spark] Support transform-v2 for spark (#3409) - [ALL]Add FieldMapper Transform #3781 @@ -45,6 +47,7 @@ - [Connector-v2] [File] Fix configuration file format and error comments (#4762) - [Connector-v2] [Jdbc] Fix oracle sql table identifier (#4754) - [Connector-v2] [Clickhouse] fix get clickhouse local table name with closing bracket from distributed table engineFull (#4710) +- [Connector-v2] [CDC] Fix jdbc connection leak for mysql (#5037) ### Zeta(ST-Engine) @@ -81,6 +84,8 @@ ## Improve +- [Improve][Connector-V2][Jdbc-Source] Support for Decimal types as splict keys (#4634) + ### Core - [Core] [Spark] Push transform operation from Spark Driver to Executors (#4503) @@ -180,8 +185,10 @@ - [Docs] Fix markdown syntax (#4426) - [Docs] Fix Kafka Doc Error Config Key "kafka." (#4427) - [Docs] Add Transform to Quick Start v2 (#4436) +- [Docs] Fix Dockerfile and seatunnel-flink.yaml in Set Up with Kubernetes (#4788) - [Docs] Fix Mysql sink format doc (#4800) - [Docs] Add the generate sink sql parameter for the jdbc sink document (#4797) - [Docs] Add the generate sink sql parameter And example (#4769) - [Docs] Redshift add defaultRowFetchSize (#4616) - [Docs] Refactor connector-v2 docs using unified format Mysql (#4590) +- [Docs] Add Value types in Java to Schema features (#5087) diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java index a4ce408d73b..72057aef5f5 100644 --- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java +++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/configuration/Options.java @@ -30,7 +30,7 @@ import java.util.List; import java.util.Map; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; public class Options { diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnEvent.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnEvent.java index a9f4f1e1e06..a61dccc08d1 100644 --- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnEvent.java +++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/event/AlterTableColumnEvent.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.seatunnel.api.table.event; import org.apache.seatunnel.api.table.catalog.TablePath; diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowType.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowType.java index 6bf0a2a865d..4eedb2255ad 100644 --- a/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowType.java +++ b/seatunnel-api/src/main/java/org/apache/seatunnel/api/table/type/SeaTunnelRowType.java @@ -20,7 +20,7 @@ import java.util.Arrays; import java.util.List; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; public class SeaTunnelRowType implements CompositeType { private static final long serialVersionUID = 2L; diff --git a/seatunnel-api/src/test/resources/conf/option-test.conf b/seatunnel-api/src/test/resources/conf/option-test.conf index 4f20d493d4c..9461e5298b9 100644 --- a/seatunnel-api/src/test/resources/conf/option-test.conf +++ b/seatunnel-api/src/test/resources/conf/option-test.conf @@ -101,7 +101,7 @@ sink { partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_name_expression = "${transactionId}_${now}" - file_format = "text" + file_format_type = "text" sink_columns = ["name","age"] } } \ No newline at end of file diff --git a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/source/AmazonDynamoDBSourceReader.java b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/source/AmazonDynamoDBSourceReader.java index afaafa3f8a9..c25f8b0e0b1 100644 --- a/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/source/AmazonDynamoDBSourceReader.java +++ b/seatunnel-connectors-v2/connector-amazondynamodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/amazondynamodb/source/AmazonDynamoDBSourceReader.java @@ -31,11 +31,13 @@ import software.amazon.awssdk.auth.credentials.StaticCredentialsProvider; import software.amazon.awssdk.regions.Region; import software.amazon.awssdk.services.dynamodb.DynamoDbClient; +import software.amazon.awssdk.services.dynamodb.model.AttributeValue; import software.amazon.awssdk.services.dynamodb.model.ScanRequest; import software.amazon.awssdk.services.dynamodb.model.ScanResponse; import java.io.IOException; import java.net.URI; +import java.util.Map; @Slf4j public class AmazonDynamoDBSourceReader extends AbstractSingleSplitReader { @@ -78,18 +80,25 @@ public void close() throws IOException { @Override @SuppressWarnings("magicnumber") public void pollNext(Collector output) throws Exception { - ScanResponse scan = - dynamoDbClient.scan( - ScanRequest.builder() - .tableName(amazondynamodbSourceOptions.getTable()) - .build()); - if (scan.hasItems()) { - scan.items() - .forEach( - item -> { - output.collect(seaTunnelRowDeserializer.deserialize(item)); - }); - } + Map lastKeyEvaluated = null; + + ScanResponse scan; + do { + scan = + dynamoDbClient.scan( + ScanRequest.builder() + .tableName(amazondynamodbSourceOptions.getTable()) + .exclusiveStartKey(lastKeyEvaluated) + .build()); + if (scan.hasItems()) { + scan.items() + .forEach( + item -> { + 
output.collect(seaTunnelRowDeserializer.deserialize(item)); + }); + } + lastKeyEvaluated = scan.lastEvaluatedKey(); + } while (lastKeyEvaluated != null && !lastKeyEvaluated.isEmpty()); context.signalNoMoreElement(); } } diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/config/JdbcSourceConfigFactory.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/config/JdbcSourceConfigFactory.java index ec1a6976d85..068ee4be116 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/config/JdbcSourceConfigFactory.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/config/JdbcSourceConfigFactory.java @@ -60,7 +60,7 @@ public abstract class JdbcSourceConfigFactory implements SourceConfig.Factory SERVER_TIME_ZONE = Options.key("server-time-zone") .stringType() - .defaultValue("UTC") - .withDescription("The session time zone in database server."); + .defaultValue(ZoneId.systemDefault().getId()) + .withDescription( + "The session time zone in database server." + + "If not set, then ZoneId.systemDefault() is used to determine the server time zone"); public static final Option SERVER_ID = Options.key("server-id") diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java index ea7149e1c84..e99e7dab4b1 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/AbstractJdbcSourceChunkSplitter.java @@ -398,20 +398,6 @@ protected int ObjectCompare(Object obj1, Object obj2) { return ObjectUtils.compare(obj1, obj2); } - private static Column getSplitColumn(Table table) { - List primaryKeys = table.primaryKeyColumns(); - if (primaryKeys.isEmpty()) { - throw new UnsupportedOperationException( - String.format( - "Incremental snapshot for tables requires primary key," - + " but table %s doesn't have primary key.", - table.id())); - } - - // use first field in primary key as the split key - return primaryKeys.get(0); - } - @SuppressWarnings("MagicNumber") private static void maySleep(int count, TableId tableId) { // every 100 queries to sleep 1s diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/ChunkRange.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/ChunkRange.java index 1cf62f3448b..c543bad18cd 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/ChunkRange.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/enumerator/splitter/ChunkRange.java @@ -22,7 +22,7 @@ import java.util.Objects; -import 
static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; /** * An internal structure describes a chunk range with a chunk start (inclusive) and chunk end diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceRecordEmitter.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceRecordEmitter.java index ba7ffb35f0a..3fbbd744b9c 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceRecordEmitter.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/IncrementalSourceRecordEmitter.java @@ -117,10 +117,8 @@ protected void processElement( splitState.asSnapshotSplitState().setLowWatermark(watermark); } else if (isHighWatermarkEvent(element) && splitState.isSnapshotSplitState()) { splitState.asSnapshotSplitState().setHighWatermark(watermark); - } else if (isSchemaChangeBeforeWatermarkEvent(element) - && splitState.isIncrementalSplitState()) { - emitElement(element, output); - } else if (isSchemaChangeAfterWatermarkEvent(element) + } else if ((isSchemaChangeBeforeWatermarkEvent(element) + || isSchemaChangeAfterWatermarkEvent(element)) && splitState.isIncrementalSplitState()) { emitElement(element, output); } diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceScanFetcher.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceScanFetcher.java index 7a09ac6bc4e..97c0c523e63 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceScanFetcher.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceScanFetcher.java @@ -223,14 +223,11 @@ public void close() { private boolean isChangeRecordInChunkRange(SourceRecord record) { if (taskContext.isDataChangeRecord(record)) { + // fix the between condition return taskContext.isRecordBetween( record, - null == currentSnapshotSplit.getSplitStart() - ? null - : new Object[] {currentSnapshotSplit.getSplitStart()}, - null == currentSnapshotSplit.getSplitEnd() - ? 
null - : new Object[] {currentSnapshotSplit.getSplitEnd()}); + currentSnapshotSplit.getSplitStart(), + currentSnapshotSplit.getSplitEnd()); } return false; } diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java index 6a15688443c..31fdaaf2e50 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/base/source/reader/external/IncrementalSourceStreamFetcher.java @@ -20,6 +20,7 @@ import org.apache.seatunnel.common.utils.SeaTunnelException; import org.apache.seatunnel.connectors.cdc.base.schema.SchemaChangeResolver; import org.apache.seatunnel.connectors.cdc.base.source.offset.Offset; +import org.apache.seatunnel.connectors.cdc.base.source.split.CompletedSnapshotSplitInfo; import org.apache.seatunnel.connectors.cdc.base.source.split.IncrementalSplit; import org.apache.seatunnel.connectors.cdc.base.source.split.SourceRecords; import org.apache.seatunnel.connectors.cdc.base.source.split.SourceSplitBase; @@ -31,17 +32,24 @@ import com.google.common.util.concurrent.ThreadFactoryBuilder; import io.debezium.connector.base.ChangeEventQueue; import io.debezium.pipeline.DataChangeEvent; +import io.debezium.relational.TableId; import lombok.extern.slf4j.Slf4j; import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.List; +import java.util.Map; +import java.util.Set; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; import java.util.concurrent.ThreadFactory; import java.util.concurrent.TimeUnit; +import static org.apache.seatunnel.connectors.cdc.base.utils.SourceRecordUtils.getTableId; + /** * Fetcher to fetch data from table split, the split is the incremental split {@link * IncrementalSplit}. @@ -51,6 +59,8 @@ public class IncrementalSourceStreamFetcher implements Fetcher pureBinlogPhaseTables; private volatile ChangeEventQueue queue; private volatile Throwable readException; @@ -60,6 +70,11 @@ public class IncrementalSourceStreamFetcher implements Fetcher maxSplitHighWatermarkMap; + // finished spilt info + private Map> finishedSplitsInfo; + private static final long READER_CLOSE_TIMEOUT_SECONDS = 30L; public IncrementalSourceStreamFetcher( @@ -71,6 +86,7 @@ public IncrementalSourceStreamFetcher( ThreadFactory threadFactory = new ThreadFactoryBuilder().setNameFormat("debezium-reader-" + subTaskId).build(); this.executorService = Executors.newSingleThreadExecutor(threadFactory); + this.pureBinlogPhaseTables = new HashSet<>(); } @Override @@ -257,24 +273,79 @@ public void close() { private boolean shouldEmit(SourceRecord sourceRecord) { if (taskContext.isDataChangeRecord(sourceRecord)) { Offset position = taskContext.getStreamOffset(sourceRecord); - // TODO: The sourceRecord from MongoDB CDC and MySQL CDC are inconsistent. For - // compatibility, the getTableId method is commented out for now. 
- // TableId tableId = getTableId(sourceRecord); + TableId tableId = getTableId(sourceRecord); if (!taskContext.isExactlyOnce()) { - // log.trace( - // "The table {} is not support exactly-once, so ignore the - // watermark check", - // tableId); + log.trace( + "The table {} is not support exactly-once, so ignore the watermark check", + tableId); return position.isAfter(splitStartWatermark); } - // TODO only the table who captured snapshot splits need to filter( Used to support - // Exactly-Once ) - return position.isAfter(splitStartWatermark); + // check whether the pure binlog mode has been entered + if (hasEnterPureBinlogPhase(tableId, position)) { + return true; + } + // not enter pure binlog mode and need to check whether the current record meets the + // emitting conditions. + if (finishedSplitsInfo.containsKey(tableId)) { + for (CompletedSnapshotSplitInfo splitInfo : finishedSplitsInfo.get(tableId)) { + if (taskContext.isRecordBetween( + sourceRecord, + splitInfo.getSplitStart(), + splitInfo.getSplitEnd()) + && position.isAfter(splitInfo.getWatermark().getHighWatermark())) { + return true; + } + } + } + return false; } return true; } + private boolean hasEnterPureBinlogPhase(TableId tableId, Offset position) { + // only the table who captured snapshot splits need to filter + if (pureBinlogPhaseTables.contains(tableId)) { + return true; + } + // the existed tables those have finished snapshot reading + if (maxSplitHighWatermarkMap.containsKey(tableId) + && position.isAtOrAfter(maxSplitHighWatermarkMap.get(tableId))) { + pureBinlogPhaseTables.add(tableId); + return true; + } + return false; + } + private void configureFilter() { splitStartWatermark = currentIncrementalSplit.getStartupOffset(); + Map> splitsInfoMap = new HashMap<>(); + Map tableIdBinlogPositionMap = new HashMap<>(); + List completedSnapshotSplitInfos = + currentIncrementalSplit.getCompletedSnapshotSplitInfos(); + + // latest-offset mode + if (completedSnapshotSplitInfos.isEmpty()) { + for (TableId tableId : currentIncrementalSplit.getTableIds()) { + tableIdBinlogPositionMap.put(tableId, currentIncrementalSplit.getStartupOffset()); + } + } + + // calculate the max high watermark of every table + for (CompletedSnapshotSplitInfo finishedSplitInfo : completedSnapshotSplitInfos) { + TableId tableId = finishedSplitInfo.getTableId(); + List list = + splitsInfoMap.getOrDefault(tableId, new ArrayList<>()); + list.add(finishedSplitInfo); + splitsInfoMap.put(tableId, list); + + Offset highWatermark = finishedSplitInfo.getWatermark().getHighWatermark(); + Offset maxHighWatermark = tableIdBinlogPositionMap.get(tableId); + if (maxHighWatermark == null || highWatermark.isAfter(maxHighWatermark)) { + tableIdBinlogPositionMap.put(tableId, highWatermark); + } + } + this.finishedSplitsInfo = splitsInfoMap; + this.maxSplitHighWatermarkMap = tableIdBinlogPositionMap; + this.pureBinlogPhaseTables.clear(); } } diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConverters.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConverters.java index bffd3ee4380..bec86250dda 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConverters.java +++ 
b/seatunnel-connectors-v2/connector-cdc/connector-cdc-base/src/main/java/org/apache/seatunnel/connectors/cdc/debezium/row/SeaTunnelRowDebeziumDeserializationConverters.java @@ -210,6 +210,8 @@ public Object convert(Object dbzObj, Schema schema) { return dbzObj; } else if (dbzObj instanceof BigDecimal) { return ((BigDecimal) dbzObj).byteValue(); + } else if (dbzObj instanceof Boolean) { + return Boolean.TRUE.equals(dbzObj) ? Byte.valueOf("1") : Byte.valueOf("0"); } else { return Byte.parseByte(dbzObj.toString()); } diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceOptions.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceOptions.java index df73772e071..170bef34e96 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceOptions.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/config/MongodbSourceOptions.java @@ -19,6 +19,7 @@ import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; +import org.apache.seatunnel.api.configuration.SingleChoiceOption; import org.apache.seatunnel.connectors.cdc.base.option.SourceOptions; import org.apache.seatunnel.connectors.cdc.base.option.StartupMode; import org.apache.seatunnel.connectors.cdc.base.option.StopMode; @@ -107,6 +108,8 @@ public class MongodbSourceOptions extends SourceOptions { + " { \"name\": \"source\"," + " \"type\": [{\"name\": \"source\", \"type\": \"record\", \"fields\": [" + " {\"name\": \"ts_ms\", \"type\": \"long\"}," + + " {\"name\": \"table\", \"type\": [\"string\", \"null\"]}," + + " {\"name\": \"db\", \"type\": [\"string\", \"null\"]}," + " {\"name\": \"snapshot\", \"type\": [\"string\", \"null\"] } ]" + " }, \"null\" ] }," + " { \"name\": \"ts_ms\", \"type\": [\"long\", \"null\"]}," @@ -234,7 +237,7 @@ public class MongodbSourceOptions extends SourceOptions { .withDescription( "Decides if the table options contains Debezium client properties that start with prefix 'debezium'."); - public static final Option STARTUP_MODE = + public static final SingleChoiceOption STARTUP_MODE = Options.key(SourceOptions.STARTUP_MODE_KEY) .singleChoice( StartupMode.class, @@ -245,7 +248,7 @@ public class MongodbSourceOptions extends SourceOptions { "Optional startup mode for CDC source, valid enumerations are " + "\"initial\", \"earliest\", \"latest\", \"timestamp\"\n or \"specific\""); - public static final Option STOP_MODE = + public static final SingleChoiceOption STOP_MODE = Options.key(SourceOptions.STOP_MODE_KEY) .singleChoice(StopMode.class, Collections.singletonList(StopMode.NEVER)) .defaultValue(StopMode.NEVER) diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/source/fetch/MongodbFetchTaskContext.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/source/fetch/MongodbFetchTaskContext.java index 534baa72abd..75fb5c2698d 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/source/fetch/MongodbFetchTaskContext.java +++ 
b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/source/fetch/MongodbFetchTaskContext.java @@ -27,10 +27,13 @@ import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.source.offset.ChangeStreamOffset; import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbRecordUtils; +import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.source.SourceRecord; import org.bson.BsonDocument; +import org.bson.BsonInt64; +import org.bson.BsonString; import org.bson.BsonType; import org.bson.BsonValue; @@ -50,12 +53,21 @@ import java.util.stream.Collectors; import static org.apache.seatunnel.common.exception.CommonErrorCode.ILLEGAL_ARGUMENT; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.COLL_FIELD; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.DB_FIELD; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.DOCUMENT_KEY; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.FULL_DOCUMENT; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.ID_FIELD; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.NS_FIELD; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.OPERATION_TYPE; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.OPERATION_TYPE_INSERT; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.SNAPSHOT_FIELD; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.SNAPSHOT_TRUE; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.SOURCE_FIELD; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.TS_MS_FIELD; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.BsonUtils.compareBsonValue; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbRecordUtils.buildSourceRecord; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbRecordUtils.extractBsonDocument; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbRecordUtils.getDocumentKey; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbRecordUtils.getResumeToken; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils.MongodbUtils.createMongoClient; @@ -172,9 +184,27 @@ public void rewriteOutputBuffer( switch (OperationType.fromString(operationType)) { case INSERT: + outputBuffer.put(key, changeRecord); + break; case UPDATE: case REPLACE: - outputBuffer.put(key, changeRecord); + Schema valueSchema = changeRecord.valueSchema(); + BsonDocument fullDocument = + extractBsonDocument(value, valueSchema, FULL_DOCUMENT); + if (fullDocument == null) { + break; + } + BsonDocument valueDocument = normalizeSnapshotDocument(fullDocument, value); + SourceRecord record = + buildSourceRecord( + changeRecord.sourcePartition(), + changeRecord.sourceOffset(), + changeRecord.topic(), + changeRecord.kafkaPartition(), + changeRecord.keySchema(), + changeRecord.key(), + valueDocument); + outputBuffer.put(key, record); break; case 
DELETE: outputBuffer.remove(key); @@ -202,6 +232,30 @@ record -> { .collect(Collectors.toList()); } + private BsonDocument normalizeSnapshotDocument( + @Nonnull final BsonDocument fullDocument, Struct value) { + return new BsonDocument() + .append(ID_FIELD, new BsonString(value.getString(DOCUMENT_KEY))) + .append(OPERATION_TYPE, new BsonString(OPERATION_TYPE_INSERT)) + .append( + NS_FIELD, + new BsonDocument( + DB_FIELD, + new BsonString( + value.getStruct(NS_FIELD).getString(DB_FIELD))) + .append( + COLL_FIELD, + new BsonString( + value.getStruct(NS_FIELD).getString(COLL_FIELD)))) + .append(DOCUMENT_KEY, new BsonString(value.getString(DOCUMENT_KEY))) + .append(FULL_DOCUMENT, fullDocument) + .append(TS_MS_FIELD, new BsonInt64(value.getInt64(TS_MS_FIELD))) + .append( + SOURCE_FIELD, + new BsonDocument(SNAPSHOT_FIELD, new BsonString(SNAPSHOT_TRUE)) + .append(TS_MS_FIELD, new BsonInt64(0L))); + } + @Override public void close() { Runtime.getRuntime() diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/utils/MongodbRecordUtils.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/utils/MongodbRecordUtils.java index 84af2f7fda6..1e9ab577229 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/utils/MongodbRecordUtils.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/utils/MongodbRecordUtils.java @@ -18,12 +18,14 @@ package org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils; import org.apache.commons.lang3.StringUtils; +import org.apache.kafka.connect.data.Schema; import org.apache.kafka.connect.data.SchemaAndValue; import org.apache.kafka.connect.data.Struct; import org.apache.kafka.connect.source.SourceRecord; import org.bson.BsonDocument; import org.bson.BsonTimestamp; +import org.bson.BsonValue; import org.bson.json.JsonWriterSettings; import com.mongodb.kafka.connect.source.json.formatter.DefaultJson; @@ -39,6 +41,7 @@ import java.util.Map; import static com.mongodb.kafka.connect.source.schema.AvroSchema.fromJson; +import static io.debezium.connector.AbstractSourceInfo.TABLE_NAME_KEY; import static org.apache.seatunnel.connectors.cdc.base.source.split.wartermark.WatermarkEvent.isWatermarkEvent; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.COLL_FIELD; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.DB_FIELD; @@ -46,6 +49,7 @@ import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.ID_FIELD; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.NS_FIELD; import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.OUTPUT_SCHEMA; +import static org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.config.MongodbSourceOptions.SOURCE_FIELD; public class MongodbRecordUtils { @@ -63,7 +67,18 @@ public static BsonDocument getResumeToken(SourceRecord sourceRecord) { public static BsonDocument getDocumentKey(@Nonnull SourceRecord sourceRecord) { Struct value = (Struct) sourceRecord.value(); - return BsonDocument.parse(value.getString(DOCUMENT_KEY)); + return extractBsonDocument(value, sourceRecord.valueSchema(), DOCUMENT_KEY); + } + + 
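For orientation, a small sketch of the document shape that the snapshot normalization above aims for, namely a snapshot record re-expressed as an insert-style change-stream event. This is not part of the patch; the field names follow the change-stream convention used by the connector, while the database, collection and values are invented for the example.

import org.bson.BsonDocument;
import org.bson.BsonInt64;
import org.bson.BsonString;

// Illustration only: an insert-like change-stream document carrying the full document
// and a source marked as a snapshot read.
public class NormalizedSnapshotDocumentSketch {
    public static void main(String[] args) {
        BsonDocument normalized =
                new BsonDocument()
                        .append("_id", new BsonString("{\"_id\": 1}"))
                        .append("operationType", new BsonString("insert"))
                        .append(
                                "ns",
                                new BsonDocument("db", new BsonString("inventory"))
                                        .append("coll", new BsonString("products")))
                        .append("documentKey", new BsonString("{\"_id\": 1}"))
                        .append(
                                "fullDocument",
                                BsonDocument.parse("{\"_id\": 1, \"name\": \"pen\"}"))
                        .append("ts_ms", new BsonInt64(1690000000000L))
                        .append(
                                "source",
                                new BsonDocument("snapshot", new BsonString("true"))
                                        .append("ts_ms", new BsonInt64(0L)));
        System.out.println(normalized.toJson());
    }
}

Rewriting update/replace records in the snapshot output buffer this way lets the later deserialization path treat them uniformly as inserts that carry the full document.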
public static BsonDocument extractBsonDocument( + Struct value, @Nonnull Schema valueSchema, String fieldName) { + if (valueSchema.field(fieldName) != null) { + String docString = value.getString(fieldName); + if (docString != null) { + return BsonDocument.parse(docString); + } + } + return null; } public static String getOffsetValue(@Nonnull SourceRecord sourceRecord, String key) { @@ -117,6 +132,12 @@ public static String getOffsetValue(@Nonnull SourceRecord sourceRecord, String k SchemaAndValue keySchemaAndValue = schemaAndValue.toSchemaAndValue( fromJson(AvroSchemaDefaults.DEFAULT_AVRO_KEY_SCHEMA), keyDocument); + BsonDocument source = valueDocument.get(SOURCE_FIELD).asDocument(); + BsonValue table = valueDocument.get(NS_FIELD).asDocument().get(COLL_FIELD); + BsonValue db = valueDocument.get(NS_FIELD).asDocument().get(DB_FIELD); + source.append(TABLE_NAME_KEY, table); + source.append(DB_FIELD, db); + valueDocument.replace(SOURCE_FIELD, source); SchemaAndValue valueSchemaAndValue = schemaAndValue.toSchemaAndValue(fromJson(OUTPUT_SCHEMA), valueDocument); @@ -130,6 +151,30 @@ public static String getOffsetValue(@Nonnull SourceRecord sourceRecord, String k valueSchemaAndValue.value()); } + public static @Nonnull SourceRecord buildSourceRecord( + Map sourcePartition, + Map sourceOffset, + String topicName, + Integer partition, + Schema keySchema, + Object key, + BsonDocument valueDocument) { + BsonValueToSchemaAndValue schemaAndValue = + new BsonValueToSchemaAndValue(new DefaultJson().getJsonWriterSettings()); + SchemaAndValue valueSchemaAndValue = + schemaAndValue.toSchemaAndValue(fromJson(OUTPUT_SCHEMA), valueDocument); + + return new SourceRecord( + sourcePartition, + sourceOffset, + topicName, + partition, + keySchema, + key, + valueSchemaAndValue.schema(), + valueSchemaAndValue.value()); + } + public static @Nonnull Map createSourceOffsetMap( @Nonnull BsonDocument idDocument, boolean isSnapshotRecord) { Map sourceOffset = new HashMap<>(); diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/utils/ResumeToken.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/utils/ResumeToken.java index 5ee8962bc53..1ef6870c85e 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/utils/ResumeToken.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mongodb/utils/ResumeToken.java @@ -17,6 +17,8 @@ package org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.utils; +import org.apache.seatunnel.connectors.seatunnel.cdc.mongodb.exception.MongodbConnectorException; + import org.bson.BsonDocument; import org.bson.BsonTimestamp; import org.bson.BsonValue; @@ -27,6 +29,8 @@ import java.nio.ByteOrder; import java.util.Objects; +import static org.apache.seatunnel.common.exception.CommonErrorCode.ILLEGAL_ARGUMENT; + public class ResumeToken { private static final int K_TIMESTAMP = 130; @@ -41,14 +45,15 @@ public static BsonTimestamp decodeTimestamp(BsonDocument resumeToken) { } else if (bsonValue.isString()) { // Hex-encoded string (v0 or v1) keyStringBytes = hexToUint8Array(bsonValue.asString().getValue()); } else { - throw new IllegalArgumentException( - "Unknown resume token format: " + resumeToken.toJson()); + throw new MongodbConnectorException( + ILLEGAL_ARGUMENT, "Unknown 
resume token format: " + bsonValue); } ByteBuffer buffer = ByteBuffer.wrap(keyStringBytes).order(ByteOrder.BIG_ENDIAN); int kType = buffer.get() & 0xff; if (kType != K_TIMESTAMP) { - throw new IllegalArgumentException("Unknown keyType of timestamp: " + kType); + throw new MongodbConnectorException( + ILLEGAL_ARGUMENT, "Unknown keyType of timestamp: " + kType); } int t = buffer.getInt(); diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/config/MySqlSourceConfigFactory.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/config/MySqlSourceConfigFactory.java index ef697f2e19c..7317b040f3b 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/config/MySqlSourceConfigFactory.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/config/MySqlSourceConfigFactory.java @@ -89,7 +89,7 @@ public MySqlSourceConfig create(int subtaskId) { if (serverIdRange != null) { props.setProperty("database.server.id.range", String.valueOf(serverIdRange)); - int serverId = serverIdRange.getServerId(subtaskId); + long serverId = serverIdRange.getServerId(subtaskId); props.setProperty("database.server.id", String.valueOf(serverId)); } if (databaseList != null) { diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/config/ServerIdRange.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/config/ServerIdRange.java index 55ac9c14b08..a012cf4c4b4 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/config/ServerIdRange.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/config/ServerIdRange.java @@ -21,7 +21,7 @@ import java.io.Serializable; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; /** * This class defines a range of server id. The boundaries of the range are inclusive. @@ -32,27 +32,27 @@ public class ServerIdRange implements Serializable { private static final long serialVersionUID = 1L; /** Start of the range (inclusive). */ - private final int startServerId; + private final long startServerId; /** End of the range (inclusive). 
*/ - private final int endServerId; + private final long endServerId; - public ServerIdRange(int startServerId, int endServerId) { + public ServerIdRange(long startServerId, long endServerId) { this.startServerId = startServerId; this.endServerId = endServerId; } - public int getStartServerId() { + public long getStartServerId() { return startServerId; } - public int getEndServerId() { + public long getEndServerId() { return endServerId; } - public int getServerId(int subTaskId) { + public long getServerId(int subTaskId) { checkArgument(subTaskId >= 0, "Subtask ID %s shouldn't be a negative number.", subTaskId); - if (subTaskId > getNumberOfServerIds()) { + if ((long) subTaskId > getNumberOfServerIds()) { throw new IllegalArgumentException( String.format( "Subtask ID %s is out of server id range %s, " @@ -64,8 +64,8 @@ public int getServerId(int subTaskId) { return startServerId + subTaskId; } - public int getNumberOfServerIds() { - return endServerId - startServerId + 1; + public long getNumberOfServerIds() { + return endServerId - startServerId + 1L; } @Override @@ -96,14 +96,14 @@ public static ServerIdRange from(String range) { return new ServerIdRange( parseServerId(idArray[0].trim()), parseServerId(idArray[1].trim())); } else { - int serverId = parseServerId(range); + long serverId = parseServerId(range); return new ServerIdRange(serverId, serverId); } } - private static int parseServerId(String serverIdValue) { + private static long parseServerId(String serverIdValue) { try { - return Integer.parseInt(serverIdValue); + return Long.parseLong(serverIdValue); } catch (NumberFormatException e) { throw new IllegalStateException( String.format("The server id %s is not a valid numeric.", serverIdValue), e); diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/source/offset/BinlogOffset.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/source/offset/BinlogOffset.java index 195b1a5a7c6..0d91c02fee7 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/source/offset/BinlogOffset.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/source/offset/BinlogOffset.java @@ -176,7 +176,13 @@ public int compareTo(Offset offset) { // compared ... long timestamp = this.getTimestamp(); long targetTimestamp = that.getTimestamp(); - return Long.compare(timestamp, targetTimestamp); + // Timestamps are presupposes that they exist, + // because timestamps do not exist for low watermark and high watermark. + // If not judging here results in the really binlog offset comparison to watermark + // always being true. 
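Before the binlog-offset comparison continues below, a brief aside on the server-id widening above: a minimal sketch, not part of the patch, of the same range arithmetic done with long so that ids above Integer.MAX_VALUE (MySQL allows server ids up to 2^32 - 1) stay representable. The helper name and the sample range are made up.

// Illustrative only: parse a server-id range such as "5400-6400" with long arithmetic
// and assign one id per subtask, mirroring the int-to-long widening above.
public class ServerIdRangeSketch {
    static long serverIdFor(String range, int subTaskId) {
        String[] parts = range.split("-");
        long start = Long.parseLong(parts[0].trim());
        long end = parts.length > 1 ? Long.parseLong(parts[1].trim()) : start;
        long count = end - start + 1L;
        if (subTaskId < 0 || subTaskId >= count) {
            throw new IllegalArgumentException(
                    "Subtask " + subTaskId + " is out of server id range " + range);
        }
        return start + subTaskId;
    }

    public static void main(String[] args) {
        System.out.println(serverIdFor("4000000000-4000000003", 2)); // 4000000002
    }
}

With the range "4000000000-4000000003", subtask 2 is assigned 4000000002, a value that could not even be parsed by Integer.parseInt before the change.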
+ if (timestamp != 0 && targetTimestamp != 0) { + return Long.compare(timestamp, targetTimestamp); + } } // First compare the MySQL binlog filenames diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/source/reader/fetch/MySqlSourceFetchTaskContext.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/source/reader/fetch/MySqlSourceFetchTaskContext.java index 7fc61921681..ab8d01f24a6 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/source/reader/fetch/MySqlSourceFetchTaskContext.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/source/reader/fetch/MySqlSourceFetchTaskContext.java @@ -18,6 +18,7 @@ package org.apache.seatunnel.connectors.seatunnel.cdc.mysql.source.reader.fetch; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.common.utils.ReflectionUtils; import org.apache.seatunnel.connectors.cdc.base.config.JdbcSourceConfig; import org.apache.seatunnel.connectors.cdc.base.dialect.JdbcDataSourceDialect; import org.apache.seatunnel.connectors.cdc.base.relational.JdbcSourceEventDispatcher; @@ -65,6 +66,7 @@ import java.time.Instant; import java.util.List; import java.util.Map; +import java.util.Optional; import static org.apache.seatunnel.connectors.seatunnel.cdc.mysql.source.offset.BinlogOffset.BINLOG_FILENAME_OFFSET_KEY; import static org.apache.seatunnel.connectors.seatunnel.cdc.mysql.utils.MySqlConnectionUtils.createBinaryClient; @@ -300,13 +302,27 @@ public MySqlTaskContextImpl( MySqlDatabaseSchema schema, BinaryLogClient reusedBinaryLogClient) { super(config, schema); - this.reusedBinaryLogClient = reusedBinaryLogClient; + this.reusedBinaryLogClient = resetBinaryLogClient(reusedBinaryLogClient); } @Override public BinaryLogClient getBinaryLogClient() { return reusedBinaryLogClient; } + + /** reset the listener of binaryLogClient before fetch task start. */ + private BinaryLogClient resetBinaryLogClient(BinaryLogClient binaryLogClient) { + Optional eventListenersField = + ReflectionUtils.getField( + binaryLogClient, BinaryLogClient.class, "eventListeners"); + eventListenersField.ifPresent(o -> ((List) o).clear()); + Optional lifecycleListeners = + ReflectionUtils.getField( + binaryLogClient, BinaryLogClient.class, "lifecycleListeners"); + lifecycleListeners.ifPresent( + o -> ((List) o).clear()); + return binaryLogClient; + } } /** Copied from debezium for accessing here. */ diff --git a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/utils/MySqlTypeUtils.java b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/utils/MySqlTypeUtils.java index 01760bfc9f6..267476b3ffe 100644 --- a/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/utils/MySqlTypeUtils.java +++ b/seatunnel-connectors-v2/connector-cdc/connector-cdc-mysql/src/main/java/org/apache/seatunnel/connectors/seatunnel/cdc/mysql/utils/MySqlTypeUtils.java @@ -87,6 +87,7 @@ public static SeaTunnelDataType convertFromColumn(Column column) { case MYSQL_BIT: return BasicType.BOOLEAN_TYPE; case MYSQL_TINYINT: + return column.length() == 1 ? 
BasicType.BOOLEAN_TYPE : BasicType.INT_TYPE; case MYSQL_TINYINT_UNSIGNED: case MYSQL_SMALLINT: case MYSQL_SMALLINT_UNSIGNED: diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/config/ClickhouseConfig.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/config/ClickhouseConfig.java index 3d4b19edb24..f7c8e032ccf 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/config/ClickhouseConfig.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/config/ClickhouseConfig.java @@ -20,6 +20,7 @@ import org.apache.seatunnel.api.configuration.Option; import org.apache.seatunnel.api.configuration.Options; +import java.time.ZoneId; import java.util.Collections; import java.util.List; import java.util.Map; @@ -75,6 +76,15 @@ public class ClickhouseConfig { .noDefaultValue() .withDescription("Clickhouse server password"); + /** Clickhouse server timezone */ + public static final Option SERVER_TIME_ZONE = + Options.key("server_time_zone") + .stringType() + .defaultValue(ZoneId.systemDefault().getId()) + .withDescription( + "The session time zone in database server." + + "If not set, then ZoneId.systemDefault() is used to determine the server time zone"); + /** Split mode when table is distributed engine */ public static final Option SPLIT_MODE = Options.key("split_mode") diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/ClickhouseSink.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/ClickhouseSink.java index 42a92733552..360c5925988 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/ClickhouseSink.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/ClickhouseSink.java @@ -61,6 +61,7 @@ import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.HOST; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.PASSWORD; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.PRIMARY_KEY; +import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.SERVER_TIME_ZONE; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.SHARDING_KEY; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.SPLIT_MODE; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.SUPPORT_UPSERT; @@ -101,6 +102,7 @@ public void prepare(Config config) throws PrepareFailException { ImmutableMap.builder() .put(BULK_SIZE.key(), BULK_SIZE.defaultValue()) .put(SPLIT_MODE.key(), SPLIT_MODE.defaultValue()) + .put(SERVER_TIME_ZONE.key(), SERVER_TIME_ZONE.defaultValue()) .build(); config = config.withFallback(ConfigFactory.parseMap(defaultConfig)); @@ -111,6 +113,7 @@ public void prepare(Config config) throws PrepareFailException { ClickhouseUtil.createNodes( config.getString(HOST.key()), config.getString(DATABASE.key()), + config.getString(SERVER_TIME_ZONE.key()), null, null); } else { @@ -118,6 +121,7 @@ 
public void prepare(Config config) throws PrepareFailException { ClickhouseUtil.createNodes( config.getString(HOST.key()), config.getString(DATABASE.key()), + config.getString(SERVER_TIME_ZONE.key()), config.getString(USERNAME.key()), config.getString(PASSWORD.key())); } diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/ClickhouseSinkWriter.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/ClickhouseSinkWriter.java index 443eec921aa..235279b4d5a 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/ClickhouseSinkWriter.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/ClickhouseSinkWriter.java @@ -90,6 +90,7 @@ public void write(SeaTunnelRow element) throws IOException { @Override public Optional prepareCommit() throws IOException { + flush(); return Optional.empty(); } @@ -99,23 +100,7 @@ public void abortPrepare() {} @Override public void close() throws IOException { this.proxy.close(); - for (ClickhouseBatchStatement batchStatement : statementMap.values()) { - try (ClickHouseConnectionImpl needClosedConnection = - batchStatement.getClickHouseConnection(); - JdbcBatchStatementExecutor needClosedStatement = - batchStatement.getJdbcBatchStatementExecutor()) { - IntHolder intHolder = batchStatement.getIntHolder(); - if (intHolder.getValue() > 0) { - flush(needClosedStatement); - intHolder.setValue(0); - } - } catch (SQLException e) { - throw new ClickhouseConnectorException( - CommonErrorCode.SQL_OPERATION_FAILED, - "Failed to close prepared statement.", - e); - } - } + flush(); } private void addIntoBatch(SeaTunnelRow row, JdbcBatchStatementExecutor clickHouseStatement) { @@ -138,6 +123,26 @@ private void flush(JdbcBatchStatementExecutor clickHouseStatement) { } } + private void flush() { + for (ClickhouseBatchStatement batchStatement : statementMap.values()) { + try (ClickHouseConnectionImpl needClosedConnection = + batchStatement.getClickHouseConnection(); + JdbcBatchStatementExecutor needClosedStatement = + batchStatement.getJdbcBatchStatementExecutor()) { + IntHolder intHolder = batchStatement.getIntHolder(); + if (intHolder.getValue() > 0) { + flush(needClosedStatement); + intHolder.setValue(0); + } + } catch (SQLException e) { + throw new ClickhouseConnectorException( + CommonErrorCode.SQL_OPERATION_FAILED, + "Failed to close prepared statement.", + e); + } + } + } + private Map initStatementMap() { Map result = new HashMap<>(Common.COLLECTION_SIZE); shardRouter diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/executor/FieldNamedPreparedStatement.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/executor/FieldNamedPreparedStatement.java index 58c7ce650b5..5fa82e8c59a 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/executor/FieldNamedPreparedStatement.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/client/executor/FieldNamedPreparedStatement.java @@ -46,8 +46,8 @@ import java.util.List; import java.util.Map; -import static 
com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; @RequiredArgsConstructor public class FieldNamedPreparedStatement implements PreparedStatement { diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/file/ClickhouseFileSink.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/file/ClickhouseFileSink.java index 26846b5850a..762815ee043 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/file/ClickhouseFileSink.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/sink/file/ClickhouseFileSink.java @@ -66,6 +66,7 @@ import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.NODE_FREE_PASSWORD; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.NODE_PASS; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.PASSWORD; +import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.SERVER_TIME_ZONE; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.SHARDING_KEY; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.TABLE; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.USERNAME; @@ -114,6 +115,7 @@ public void prepare(Config config) throws PrepareFailException { ClickhouseUtil.createNodes( config.getString(HOST.key()), config.getString(DATABASE.key()), + config.getString(SERVER_TIME_ZONE.key()), config.getString(USERNAME.key()), config.getString(PASSWORD.key())); diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSource.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSource.java index 5b902f06078..e49b03091a2 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSource.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSource.java @@ -18,6 +18,7 @@ package org.apache.seatunnel.connectors.seatunnel.clickhouse.source; import org.apache.seatunnel.shade.com.typesafe.config.Config; +import org.apache.seatunnel.shade.com.typesafe.config.ConfigFactory; import org.apache.seatunnel.api.common.PrepareFailException; import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode; @@ -44,13 +45,16 @@ import com.clickhouse.client.ClickHouseNode; import com.clickhouse.client.ClickHouseResponse; import com.google.auto.service.AutoService; +import com.google.common.collect.ImmutableMap; import java.util.List; +import java.util.Map; import java.util.concurrent.ThreadLocalRandom; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.DATABASE; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.HOST; 
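Alongside the new server_time_zone option being wired through the ClickHouse connector, a short sketch, not part of the patch, of how a session time zone rides on the node produced by the client builder; the host, port, database and credentials below are placeholders.

import com.clickhouse.client.ClickHouseCredentials;
import com.clickhouse.client.ClickHouseNode;
import com.clickhouse.client.ClickHouseProtocol;

// Illustration of node construction once server_time_zone is threaded through; all
// concrete values here are placeholders, not values taken from the patch.
public class ClickhouseNodeTimeZoneSketch {
    public static void main(String[] args) {
        ClickHouseNode node =
                ClickHouseNode.builder()
                        .host("localhost")
                        .port(ClickHouseProtocol.HTTP, 8123)
                        .database("default")
                        .timeZone("Asia/Shanghai") // session time zone for this node
                        .credentials(ClickHouseCredentials.fromUserAndPassword("default", ""))
                        .build();
        System.out.println(node);
    }
}

When the option is left unset, the connector falls back to ZoneId.systemDefault(), matching the option's default value defined above.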
import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.PASSWORD; +import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.SERVER_TIME_ZONE; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.SQL; import static org.apache.seatunnel.connectors.seatunnel.clickhouse.config.ClickhouseConfig.USERNAME; @@ -86,10 +90,17 @@ public void prepare(Config config) throws PrepareFailException { "PluginName: %s, PluginType: %s, Message: %s", getPluginName(), PluginType.SOURCE, result.getMsg())); } + Map defaultConfig = + ImmutableMap.builder() + .put(SERVER_TIME_ZONE.key(), SERVER_TIME_ZONE.defaultValue()) + .build(); + + config = config.withFallback(ConfigFactory.parseMap(defaultConfig)); servers = ClickhouseUtil.createNodes( config.getString(HOST.key()), config.getString(DATABASE.key()), + config.getString(SERVER_TIME_ZONE.key()), config.getString(USERNAME.key()), config.getString(PASSWORD.key())); diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java index 8d3de88abb8..bb4d3905ee8 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/source/ClickhouseSourceReader.java @@ -83,10 +83,14 @@ record -> { Object[] values = new Object[this.rowTypeInfo.getFieldNames().length]; for (int i = 0; i < record.size(); i++) { - values[i] = - TypeConvertUtil.valueUnwrap( - this.rowTypeInfo.getFieldType(i), - record.getValue(i)); + if (record.getValue(i).isNullOrEmpty()) { + values[i] = null; + } else { + values[i] = + TypeConvertUtil.valueUnwrap( + this.rowTypeInfo.getFieldType(i), + record.getValue(i)); + } } output.collect(new SeaTunnelRow(values)); }); diff --git a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/util/ClickhouseUtil.java b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/util/ClickhouseUtil.java index e70567a11e8..e8e491635fc 100644 --- a/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/util/ClickhouseUtil.java +++ b/seatunnel-connectors-v2/connector-clickhouse/src/main/java/org/apache/seatunnel/connectors/seatunnel/clickhouse/util/ClickhouseUtil.java @@ -30,7 +30,11 @@ public class ClickhouseUtil { public static List createNodes( - String nodeAddress, String database, String username, String password) { + String nodeAddress, + String database, + String serverTimeZone, + String username, + String password) { return Arrays.stream(nodeAddress.split(",")) .map( address -> { @@ -42,12 +46,14 @@ public static List createNodes( ClickHouseProtocol.HTTP, Integer.parseInt(nodeAndPort[1])) .database(database) + .timeZone(serverTimeZone) .build(); } return ClickHouseNode.builder() .host(nodeAndPort[0]) .port(ClickHouseProtocol.HTTP, Integer.parseInt(nodeAndPort[1])) .database(database) + .timeZone(serverTimeZone) .credentials( ClickHouseCredentials.fromUserAndPassword( username, password)) diff --git 
a/seatunnel-connectors-v2/connector-common/src/main/java/org/apache/seatunnel/connectors/seatunnel/common/source/AbstractSingleSplitSource.java b/seatunnel-connectors-v2/connector-common/src/main/java/org/apache/seatunnel/connectors/seatunnel/common/source/AbstractSingleSplitSource.java index fb4c5c85277..360e8601a22 100644 --- a/seatunnel-connectors-v2/connector-common/src/main/java/org/apache/seatunnel/connectors/seatunnel/common/source/AbstractSingleSplitSource.java +++ b/seatunnel-connectors-v2/connector-common/src/main/java/org/apache/seatunnel/connectors/seatunnel/common/source/AbstractSingleSplitSource.java @@ -23,7 +23,7 @@ import org.apache.seatunnel.api.source.SourceReader; import org.apache.seatunnel.api.source.SourceSplitEnumerator; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; public abstract class AbstractSingleSplitSource implements SeaTunnelSource { diff --git a/seatunnel-connectors-v2/connector-doris/src/main/java/org/apache/seatunnel/connectors/doris/datatype/DorisDataTypeConvertor.java b/seatunnel-connectors-v2/connector-doris/src/main/java/org/apache/seatunnel/connectors/doris/datatype/DorisDataTypeConvertor.java index 9cdc1ef12f6..7c9f08dfb71 100644 --- a/seatunnel-connectors-v2/connector-doris/src/main/java/org/apache/seatunnel/connectors/doris/datatype/DorisDataTypeConvertor.java +++ b/seatunnel-connectors-v2/connector-doris/src/main/java/org/apache/seatunnel/connectors/doris/datatype/DorisDataTypeConvertor.java @@ -114,14 +114,6 @@ public SeaTunnelDataType toSeaTunnelType( return BasicType.VOID_TYPE; case BOOLEAN: return BasicType.BOOLEAN_TYPE; - // case BIT: - // precision = (Integer) - // dataTypeProperties.get(MysqlDataTypeConvertor.PRECISION); - // if (precision == 1) { - // return BasicType.BOOLEAN_TYPE; - // } else { - // return PrimitiveByteArrayType.INSTANCE; - // } case TINYINT: return BasicType.BYTE_TYPE; case SMALLINT: @@ -135,8 +127,6 @@ public SeaTunnelDataType toSeaTunnelType( return BasicType.FLOAT_TYPE; case DOUBLE: return BasicType.DOUBLE_TYPE; - // case TIME: - // return LocalTimeType.LOCAL_TIME_TYPE; case DATE: return LocalTimeType.LOCAL_DATE_TYPE; case TIMESTAMP: @@ -176,7 +166,6 @@ public String toConnectorType( return VARCHAR; case BOOLEAN: return BOOLEAN; - case TINYINT: return TINYINT; case SMALLINT: @@ -200,7 +189,7 @@ public String toConnectorType( return TIMESTAMP; default: throw new UnsupportedOperationException( - String.format("Doesn't support HIVE type '%s'' yet.", sqlType)); + String.format("Doesn't support Doris type '%s'' yet.", sqlType)); } } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/source/BaseHdfsFileSource.java b/seatunnel-connectors-v2/connector-file/connector-file-base-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/source/BaseHdfsFileSource.java index 9864fc3750b..57d2ceca6eb 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/source/BaseHdfsFileSource.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/source/BaseHdfsFileSource.java @@ -79,12 +79,7 @@ public void prepare(Config pluginConfig) throws PrepareFailException { throw new FileConnectorException( FileConnectorErrorCode.FILE_LIST_GET_FAILED, 
errorMsg, e); } - if (filePaths.isEmpty()) { - throw new FileConnectorException( - FileConnectorErrorCode.FILE_LIST_EMPTY, - "The target file list is empty," - + "SeaTunnel will not be able to sync empty table"); - } + // support user-defined schema FileFormat fileFormat = FileFormat.valueOf( diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSourceConfig.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSourceConfig.java index 18187fc6b1e..7571a973e06 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSourceConfig.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/BaseSourceConfig.java @@ -118,4 +118,11 @@ public class BaseSourceConfig { .stringType() .noDefaultValue() .withDescription("To be read sheet name,only valid for excel files"); + + public static final Option FILE_FILTER_PATTERN = + Options.key("file_filter_pattern") + .stringType() + .noDefaultValue() + .withDescription( + "File pattern. The connector will filter some files base on the pattern."); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/FileSystemType.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/FileSystemType.java index 8d50cee4697..3d3965b7c3f 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/FileSystemType.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/config/FileSystemType.java @@ -24,6 +24,7 @@ public enum FileSystemType implements Serializable { LOCAL("LocalFile"), OSS("OssFile"), OSS_JINDO("OssJindoFile"), + COS("CosFile"), FTP("FtpFile"), SFTP("SftpFile"), S3("S3File"); diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/config/FileSinkConfig.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/config/FileSinkConfig.java index 021fb59ba8b..87b005fec6f 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/config/FileSinkConfig.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sink/config/FileSinkConfig.java @@ -40,7 +40,7 @@ import java.util.Map; import java.util.stream.Collectors; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; @Data public class FileSinkConfig extends BaseFileSinkConfig implements PartitionConfig { diff --git a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java index ea6c902c05a..e4e1694f30d 
100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-base/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/source/reader/AbstractReadStrategy.java @@ -24,6 +24,8 @@ import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import org.apache.seatunnel.connectors.seatunnel.file.config.BaseSourceConfig; import org.apache.seatunnel.connectors.seatunnel.file.config.HadoopConf; +import org.apache.seatunnel.connectors.seatunnel.file.exception.FileConnectorErrorCode; +import org.apache.seatunnel.connectors.seatunnel.file.exception.FileConnectorException; import org.apache.seatunnel.connectors.seatunnel.file.sink.util.FileSystemUtils; import org.apache.hadoop.conf.Configuration; @@ -43,6 +45,9 @@ import java.util.LinkedHashMap; import java.util.List; import java.util.Map; +import java.util.Objects; +import java.util.regex.Matcher; +import java.util.regex.Pattern; import static org.apache.parquet.avro.AvroReadSupport.READ_INT96_AS_FIXED; import static org.apache.parquet.avro.AvroSchemaConverter.ADD_LIST_ELEMENT_RECORDS; @@ -74,6 +79,8 @@ public abstract class AbstractReadStrategy implements ReadStrategy { protected long skipHeaderNumber = BaseSourceConfig.SKIP_HEADER_ROW_NUMBER.defaultValue(); protected transient boolean isKerberosAuthorization = false; + protected Pattern pattern; + @Override public void init(HadoopConf conf) { this.hadoopConf = conf; @@ -126,7 +133,7 @@ public List getFileNamesByPath(HadoopConf hadoopConf, String path) throw fileNames.addAll(getFileNamesByPath(hadoopConf, fileStatus.getPath().toString())); continue; } - if (fileStatus.isFile()) { + if (fileStatus.isFile() && filterFileByPattern(fileStatus)) { // filter '_SUCCESS' file if (!fileStatus.getPath().getName().equals("_SUCCESS") && !fileStatus.getPath().getName().startsWith(".")) { @@ -146,6 +153,15 @@ public List getFileNamesByPath(HadoopConf hadoopConf, String path) throw } } } + + if (fileNames.isEmpty()) { + throw new FileConnectorException( + FileConnectorErrorCode.FILE_LIST_EMPTY, + "The target file list is empty," + + "SeaTunnel will not be able to sync empty table, " + + "please check the configuration parameters such as: [file_filter_pattern]"); + } + return fileNames; } @@ -166,6 +182,11 @@ public void setPluginConfig(Config pluginConfig) { if (pluginConfig.hasPath(BaseSourceConfig.READ_COLUMNS.key())) { readColumns.addAll(pluginConfig.getStringList(BaseSourceConfig.READ_COLUMNS.key())); } + if (pluginConfig.hasPath(BaseSourceConfig.FILE_FILTER_PATTERN.key())) { + String filterPattern = + pluginConfig.getString(BaseSourceConfig.FILE_FILTER_PATTERN.key()); + this.pattern = Pattern.compile(Matcher.quoteReplacement(filterPattern)); + } } @Override @@ -214,4 +235,11 @@ protected SeaTunnelRowType mergePartitionTypes(String path, SeaTunnelRowType sea // return merge row type return new SeaTunnelRowType(newFieldNames, newFieldTypes); } + + protected boolean filterFileByPattern(FileStatus fileStatus) { + if (Objects.nonNull(pattern)) { + return pattern.matcher(fileStatus.getPath().getName()).matches(); + } + return true; + } } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/pom.xml b/seatunnel-connectors-v2/connector-file/connector-file-cos/pom.xml new file mode 100644 index 00000000000..457357ad81f --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/pom.xml @@ -0,0 +1,64 @@ + + 
+ + 4.0.0 + + org.apache.seatunnel + connector-file + ${revision} + + + connector-file-cos + SeaTunnel : Connectors V2 : File : Cos + + + 2.6.5-8.0.2 + + + + + + org.apache.seatunnel + connector-file-base + ${project.version} + + + + org.apache.flink + flink-shaded-hadoop-2 + provided + + + org.apache.avro + avro + + + + + + com.qcloud.cos + hadoop-cos + ${hadoop-cos.version} + provided + + + + diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/config/CosConf.java b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/config/CosConf.java new file mode 100644 index 00000000000..211c2453687 --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/config/CosConf.java @@ -0,0 +1,59 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.file.cos.config; + +import org.apache.seatunnel.shade.com.typesafe.config.Config; + +import org.apache.seatunnel.connectors.seatunnel.file.config.HadoopConf; + +import org.apache.hadoop.fs.CosNConfigKeys; + +import java.util.HashMap; + +public class CosConf extends HadoopConf { + private static final String HDFS_IMPL = "org.apache.hadoop.fs.CosFileSystem"; + private static final String SCHEMA = "cosn"; + + @Override + public String getFsHdfsImpl() { + return HDFS_IMPL; + } + + @Override + public String getSchema() { + return SCHEMA; + } + + public CosConf(String hdfsNameKey) { + super(hdfsNameKey); + } + + public static HadoopConf buildWithConfig(Config config) { + HadoopConf hadoopConf = new CosConf(config.getString(CosConfig.BUCKET.key())); + HashMap cosOptions = new HashMap<>(); + cosOptions.put( + CosNConfigKeys.COSN_USERINFO_SECRET_ID_KEY, + config.getString(CosConfig.SECRET_ID.key())); + cosOptions.put( + CosNConfigKeys.COSN_USERINFO_SECRET_KEY_KEY, + config.getString(CosConfig.SECRET_KEY.key())); + cosOptions.put(CosNConfigKeys.COSN_REGION_KEY, config.getString(CosConfig.REGION.key())); + hadoopConf.setExtraOptions(cosOptions); + return hadoopConf; + } +} diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/config/CosConfig.java b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/config/CosConfig.java new file mode 100644 index 00000000000..cbbd68ef7dc --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/config/CosConfig.java @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache 
Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.file.cos.config; + +import org.apache.seatunnel.api.configuration.Option; +import org.apache.seatunnel.api.configuration.Options; +import org.apache.seatunnel.connectors.seatunnel.file.config.BaseSourceConfig; + +public class CosConfig extends BaseSourceConfig { + public static final Option SECRET_ID = + Options.key("secret_id") + .stringType() + .noDefaultValue() + .withDescription("COS bucket secret id"); + public static final Option SECRET_KEY = + Options.key("secret_key") + .stringType() + .noDefaultValue() + .withDescription("COS bucket secret key"); + public static final Option REGION = + Options.key("region").stringType().noDefaultValue().withDescription("COS region"); + public static final Option BUCKET = + Options.key("bucket").stringType().noDefaultValue().withDescription("COS bucket"); +} diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSink.java b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSink.java new file mode 100644 index 00000000000..bfc6fa4ff10 --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSink.java @@ -0,0 +1,63 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.file.cos.sink; + +import org.apache.seatunnel.shade.com.typesafe.config.Config; + +import org.apache.seatunnel.api.common.PrepareFailException; +import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode; +import org.apache.seatunnel.api.sink.SeaTunnelSink; +import org.apache.seatunnel.common.config.CheckConfigUtil; +import org.apache.seatunnel.common.config.CheckResult; +import org.apache.seatunnel.common.constants.PluginType; +import org.apache.seatunnel.connectors.seatunnel.file.config.FileSystemType; +import org.apache.seatunnel.connectors.seatunnel.file.cos.config.CosConf; +import org.apache.seatunnel.connectors.seatunnel.file.cos.config.CosConfig; +import org.apache.seatunnel.connectors.seatunnel.file.exception.FileConnectorException; +import org.apache.seatunnel.connectors.seatunnel.file.sink.BaseFileSink; + +import com.google.auto.service.AutoService; + +@AutoService(SeaTunnelSink.class) +public class CosFileSink extends BaseFileSink { + @Override + public String getPluginName() { + return FileSystemType.COS.getFileSystemPluginName(); + } + + @Override + public void prepare(Config pluginConfig) throws PrepareFailException { + super.prepare(pluginConfig); + CheckResult result = + CheckConfigUtil.checkAllExists( + pluginConfig, + CosConfig.FILE_PATH.key(), + CosConfig.REGION.key(), + CosConfig.SECRET_ID.key(), + CosConfig.SECRET_KEY.key(), + CosConfig.BUCKET.key()); + if (!result.isSuccess()) { + throw new FileConnectorException( + SeaTunnelAPIErrorCode.CONFIG_VALIDATION_FAILED, + String.format( + "PluginName: %s, PluginType: %s, Message: %s", + getPluginName(), PluginType.SINK, result.getMsg())); + } + hadoopConf = CosConf.buildWithConfig(pluginConfig); + } +} diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSinkFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSinkFactory.java new file mode 100644 index 00000000000..9de5386bc6b --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/sink/CosFileSinkFactory.java @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.file.cos.sink; + +import org.apache.seatunnel.api.configuration.util.OptionRule; +import org.apache.seatunnel.api.table.factory.Factory; +import org.apache.seatunnel.api.table.factory.TableSinkFactory; +import org.apache.seatunnel.connectors.seatunnel.file.config.BaseSinkConfig; +import org.apache.seatunnel.connectors.seatunnel.file.config.FileFormat; +import org.apache.seatunnel.connectors.seatunnel.file.config.FileSystemType; +import org.apache.seatunnel.connectors.seatunnel.file.cos.config.CosConfig; + +import com.google.auto.service.AutoService; + +@AutoService(Factory.class) +public class CosFileSinkFactory implements TableSinkFactory { + @Override + public String factoryIdentifier() { + return FileSystemType.COS.getFileSystemPluginName(); + } + + @Override + public OptionRule optionRule() { + return OptionRule.builder() + .required(CosConfig.FILE_PATH) + .required(CosConfig.BUCKET) + .required(CosConfig.SECRET_ID) + .required(CosConfig.SECRET_KEY) + .required(CosConfig.REGION) + .optional(BaseSinkConfig.FILE_FORMAT_TYPE) + .conditional( + BaseSinkConfig.FILE_FORMAT_TYPE, + FileFormat.TEXT, + BaseSinkConfig.ROW_DELIMITER, + BaseSinkConfig.FIELD_DELIMITER, + BaseSinkConfig.TXT_COMPRESS) + .conditional( + BaseSinkConfig.FILE_FORMAT_TYPE, + FileFormat.CSV, + BaseSinkConfig.TXT_COMPRESS) + .conditional( + BaseSinkConfig.FILE_FORMAT_TYPE, + FileFormat.JSON, + BaseSinkConfig.TXT_COMPRESS) + .conditional( + BaseSinkConfig.FILE_FORMAT_TYPE, + FileFormat.ORC, + BaseSinkConfig.ORC_COMPRESS) + .conditional( + BaseSinkConfig.FILE_FORMAT_TYPE, + FileFormat.PARQUET, + BaseSinkConfig.PARQUET_COMPRESS) + .optional(BaseSinkConfig.CUSTOM_FILENAME) + .conditional( + BaseSinkConfig.CUSTOM_FILENAME, + true, + BaseSinkConfig.FILE_NAME_EXPRESSION, + BaseSinkConfig.FILENAME_TIME_FORMAT) + .optional(BaseSinkConfig.HAVE_PARTITION) + .conditional( + BaseSinkConfig.HAVE_PARTITION, + true, + BaseSinkConfig.PARTITION_BY, + BaseSinkConfig.PARTITION_DIR_EXPRESSION, + BaseSinkConfig.IS_PARTITION_FIELD_WRITE_IN_FILE) + .optional(BaseSinkConfig.SINK_COLUMNS) + .optional(BaseSinkConfig.IS_ENABLE_TRANSACTION) + .optional(BaseSinkConfig.DATE_FORMAT) + .optional(BaseSinkConfig.DATETIME_FORMAT) + .optional(BaseSinkConfig.TIME_FORMAT) + .build(); + } +} diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/source/CosFileSource.java b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/source/CosFileSource.java new file mode 100644 index 00000000000..aefc339121e --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/source/CosFileSource.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.file.cos.source; + +import org.apache.seatunnel.shade.com.typesafe.config.Config; + +import org.apache.seatunnel.api.common.PrepareFailException; +import org.apache.seatunnel.api.common.SeaTunnelAPIErrorCode; +import org.apache.seatunnel.api.source.SeaTunnelSource; +import org.apache.seatunnel.api.table.catalog.CatalogTableUtil; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.common.config.CheckConfigUtil; +import org.apache.seatunnel.common.config.CheckResult; +import org.apache.seatunnel.common.constants.PluginType; +import org.apache.seatunnel.common.exception.CommonErrorCode; +import org.apache.seatunnel.connectors.seatunnel.file.config.FileFormat; +import org.apache.seatunnel.connectors.seatunnel.file.config.FileSystemType; +import org.apache.seatunnel.connectors.seatunnel.file.cos.config.CosConf; +import org.apache.seatunnel.connectors.seatunnel.file.cos.config.CosConfig; +import org.apache.seatunnel.connectors.seatunnel.file.exception.FileConnectorErrorCode; +import org.apache.seatunnel.connectors.seatunnel.file.exception.FileConnectorException; +import org.apache.seatunnel.connectors.seatunnel.file.source.BaseFileSource; +import org.apache.seatunnel.connectors.seatunnel.file.source.reader.ReadStrategyFactory; + +import com.google.auto.service.AutoService; + +import java.io.IOException; + +@AutoService(SeaTunnelSource.class) +public class CosFileSource extends BaseFileSource { + @Override + public String getPluginName() { + return FileSystemType.COS.getFileSystemPluginName(); + } + + @Override + public void prepare(Config pluginConfig) throws PrepareFailException { + CheckResult result = + CheckConfigUtil.checkAllExists( + pluginConfig, + CosConfig.FILE_PATH.key(), + CosConfig.FILE_FORMAT_TYPE.key(), + CosConfig.SECRET_ID.key(), + CosConfig.SECRET_KEY.key(), + CosConfig.REGION.key(), + CosConfig.BUCKET.key()); + if (!result.isSuccess()) { + throw new FileConnectorException( + SeaTunnelAPIErrorCode.CONFIG_VALIDATION_FAILED, + String.format( + "PluginName: %s, PluginType: %s, Message: %s", + getPluginName(), PluginType.SOURCE, result.getMsg())); + } + readStrategy = + ReadStrategyFactory.of(pluginConfig.getString(CosConfig.FILE_FORMAT_TYPE.key())); + readStrategy.setPluginConfig(pluginConfig); + String path = pluginConfig.getString(CosConfig.FILE_PATH.key()); + hadoopConf = CosConf.buildWithConfig(pluginConfig); + try { + filePaths = readStrategy.getFileNamesByPath(hadoopConf, path); + } catch (IOException e) { + String errorMsg = String.format("Get file list from this path [%s] failed", path); + throw new FileConnectorException( + FileConnectorErrorCode.FILE_LIST_GET_FAILED, errorMsg, e); + } + // support user-defined schema + FileFormat fileFormat = + FileFormat.valueOf( + pluginConfig.getString(CosConfig.FILE_FORMAT_TYPE.key()).toUpperCase()); + // only json text csv type support user-defined schema now + if (pluginConfig.hasPath(CatalogTableUtil.SCHEMA.key())) { + switch (fileFormat) { + case CSV: + case TEXT: + case JSON: + case EXCEL: + SeaTunnelRowType 
userDefinedSchema = + CatalogTableUtil.buildWithConfig(pluginConfig).getSeaTunnelRowType(); + readStrategy.setSeaTunnelRowTypeInfo(userDefinedSchema); + rowType = readStrategy.getActualSeaTunnelRowTypeInfo(); + break; + case ORC: + case PARQUET: + throw new FileConnectorException( + CommonErrorCode.UNSUPPORTED_OPERATION, + "SeaTunnel does not support user-defined schema for [parquet, orc] files"); + default: + // should never reach here + throw new FileConnectorException( + CommonErrorCode.ILLEGAL_ARGUMENT, + "SeaTunnel does not support this file format"); + } + } else { + try { + rowType = readStrategy.getSeaTunnelRowTypeInfo(hadoopConf, filePaths.get(0)); + } catch (FileConnectorException e) { + String errorMsg = + String.format("Get table schema from file [%s] failed", filePaths.get(0)); + throw new FileConnectorException( + CommonErrorCode.TABLE_SCHEMA_GET_FAILED, errorMsg, e); + } + } + } +} diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/source/CosFileSourceFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/source/CosFileSourceFactory.java new file mode 100644 index 00000000000..496e9277f4e --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/cos/source/CosFileSourceFactory.java @@ -0,0 +1,71 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.seatunnel.connectors.seatunnel.file.cos.source; + +import org.apache.seatunnel.api.configuration.util.OptionRule; +import org.apache.seatunnel.api.source.SeaTunnelSource; +import org.apache.seatunnel.api.table.catalog.CatalogTableUtil; +import org.apache.seatunnel.api.table.factory.Factory; +import org.apache.seatunnel.api.table.factory.TableSourceFactory; +import org.apache.seatunnel.connectors.seatunnel.file.config.BaseSourceConfig; +import org.apache.seatunnel.connectors.seatunnel.file.config.FileFormat; +import org.apache.seatunnel.connectors.seatunnel.file.config.FileSystemType; +import org.apache.seatunnel.connectors.seatunnel.file.cos.config.CosConfig; + +import com.google.auto.service.AutoService; + +import java.util.Arrays; + +@AutoService(Factory.class) +public class CosFileSourceFactory implements TableSourceFactory { + @Override + public String factoryIdentifier() { + return FileSystemType.COS.getFileSystemPluginName(); + } + + @Override + public OptionRule optionRule() { + return OptionRule.builder() + .required(CosConfig.FILE_PATH) + .required(CosConfig.BUCKET) + .required(CosConfig.SECRET_ID) + .required(CosConfig.SECRET_KEY) + .required(CosConfig.REGION) + .required(BaseSourceConfig.FILE_FORMAT_TYPE) + .conditional( + BaseSourceConfig.FILE_FORMAT_TYPE, + FileFormat.TEXT, + BaseSourceConfig.DELIMITER) + .conditional( + BaseSourceConfig.FILE_FORMAT_TYPE, + Arrays.asList( + FileFormat.TEXT, FileFormat.JSON, FileFormat.EXCEL, FileFormat.CSV), + CatalogTableUtil.SCHEMA) + .optional(BaseSourceConfig.PARSE_PARTITION_FROM_PATH) + .optional(BaseSourceConfig.DATE_FORMAT) + .optional(BaseSourceConfig.DATETIME_FORMAT) + .optional(BaseSourceConfig.TIME_FORMAT) + .optional(BaseSourceConfig.FILE_FILTER_PATTERN) + .build(); + } + + @Override + public Class<? extends SeaTunnelSource> getSourceClass() { + return CosFileSource.class; + } +} diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/resources/services/org.apache.hadoop.fs.FileSystem b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/resources/services/org.apache.hadoop.fs.FileSystem new file mode 100644 index 00000000000..b4ecb7e0c7e --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/main/resources/services/org.apache.hadoop.fs.FileSystem @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License.
+ +org.apache.hadoop.fs.CosFileSystem \ No newline at end of file diff --git a/seatunnel-connectors-v2/connector-file/connector-file-cos/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/cos/CosFileFactoryTest.java b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/cos/CosFileFactoryTest.java new file mode 100644 index 00000000000..6691f5b1f2a --- /dev/null +++ b/seatunnel-connectors-v2/connector-file/connector-file-cos/src/test/java/org/apache/seatunnel/connectors/seatunnel/file/cos/CosFileFactoryTest.java @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.file.cos; + +import org.apache.seatunnel.connectors.seatunnel.file.cos.sink.CosFileSinkFactory; +import org.apache.seatunnel.connectors.seatunnel.file.cos.source.CosFileSourceFactory; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class CosFileFactoryTest { + + @Test + void optionRule() { + Assertions.assertNotNull((new CosFileSourceFactory()).optionRule()); + Assertions.assertNotNull((new CosFileSinkFactory()).optionRule()); + } +} diff --git a/seatunnel-connectors-v2/connector-file/connector-file-ftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/ftp/source/FtpFileSourceFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-ftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/ftp/source/FtpFileSourceFactory.java index d2d11da5b46..4ab637c4348 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-ftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/ftp/source/FtpFileSourceFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-ftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/ftp/source/FtpFileSourceFactory.java @@ -60,6 +60,7 @@ public OptionRule optionRule() { .optional(BaseSourceConfig.DATE_FORMAT) .optional(BaseSourceConfig.DATETIME_FORMAT) .optional(BaseSourceConfig.TIME_FORMAT) + .optional(BaseSourceConfig.FILE_FILTER_PATTERN) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/source/HdfsFileSourceFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/source/HdfsFileSourceFactory.java index d4c17384904..c3d406d62c7 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/source/HdfsFileSourceFactory.java +++ 
b/seatunnel-connectors-v2/connector-file/connector-file-hadoop/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/hdfs/source/HdfsFileSourceFactory.java @@ -57,6 +57,7 @@ public OptionRule optionRule() { .optional(BaseSourceConfig.DATE_FORMAT) .optional(BaseSourceConfig.DATETIME_FORMAT) .optional(BaseSourceConfig.TIME_FORMAT) + .optional(BaseSourceConfig.FILE_FILTER_PATTERN) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-jindo-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSourceFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-jindo-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSourceFactory.java index 50256767625..eaea7bccb61 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-jindo-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSourceFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-jindo-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSourceFactory.java @@ -60,6 +60,7 @@ public OptionRule optionRule() { .optional(BaseSourceConfig.DATE_FORMAT) .optional(BaseSourceConfig.DATETIME_FORMAT) .optional(BaseSourceConfig.TIME_FORMAT) + .optional(BaseSourceConfig.FILE_FILTER_PATTERN) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-local/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/local/source/LocalFileSourceFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-local/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/local/source/LocalFileSourceFactory.java index 4ae2ae3a9b6..03ec8660ce2 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-local/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/local/source/LocalFileSourceFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-local/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/local/source/LocalFileSourceFactory.java @@ -56,6 +56,7 @@ public OptionRule optionRule() { .optional(BaseSourceConfig.DATE_FORMAT) .optional(BaseSourceConfig.DATETIME_FORMAT) .optional(BaseSourceConfig.TIME_FORMAT) + .optional(BaseSourceConfig.FILE_FILTER_PATTERN) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSourceFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSourceFactory.java index c6a2d704092..e7d862bd44a 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSourceFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-oss/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/oss/source/OssFileSourceFactory.java @@ -60,6 +60,7 @@ public OptionRule optionRule() { .optional(BaseSourceConfig.DATE_FORMAT) .optional(BaseSourceConfig.DATETIME_FORMAT) .optional(BaseSourceConfig.TIME_FORMAT) + .optional(BaseSourceConfig.FILE_FILTER_PATTERN) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/source/S3FileSourceFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/source/S3FileSourceFactory.java index 
71156a21b66..a3b48088650 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/source/S3FileSourceFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-s3/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/s3/source/S3FileSourceFactory.java @@ -65,6 +65,7 @@ public OptionRule optionRule() { .optional(BaseSourceConfig.DATE_FORMAT) .optional(BaseSourceConfig.DATETIME_FORMAT) .optional(BaseSourceConfig.TIME_FORMAT) + .optional(BaseSourceConfig.FILE_FILTER_PATTERN) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/source/SftpFileSourceFactory.java b/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/source/SftpFileSourceFactory.java index 18cda2fbe5e..e9efe1cdf9b 100644 --- a/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/source/SftpFileSourceFactory.java +++ b/seatunnel-connectors-v2/connector-file/connector-file-sftp/src/main/java/org/apache/seatunnel/connectors/seatunnel/file/sftp/source/SftpFileSourceFactory.java @@ -60,6 +60,7 @@ public OptionRule optionRule() { .optional(BaseSourceConfig.DATE_FORMAT) .optional(BaseSourceConfig.DATETIME_FORMAT) .optional(BaseSourceConfig.TIME_FORMAT) + .optional(BaseSourceConfig.FILE_FILTER_PATTERN) .build(); } diff --git a/seatunnel-connectors-v2/connector-file/pom.xml b/seatunnel-connectors-v2/connector-file/pom.xml index d20e6296cbf..4bdfa981cee 100644 --- a/seatunnel-connectors-v2/connector-file/pom.xml +++ b/seatunnel-connectors-v2/connector-file/pom.xml @@ -39,6 +39,7 @@ connector-file-sftp connector-file-s3 connector-file-jindo-oss + connector-file-cos diff --git a/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/commit/HiveSinkAggregatedCommitter.java b/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/commit/HiveSinkAggregatedCommitter.java index 1d1e70b3104..ed33689a29d 100644 --- a/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/commit/HiveSinkAggregatedCommitter.java +++ b/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/commit/HiveSinkAggregatedCommitter.java @@ -36,11 +36,14 @@ import java.util.Map; import java.util.stream.Collectors; +import static org.apache.seatunnel.connectors.seatunnel.hive.config.HiveConfig.ABORT_DROP_PARTITION_METADATA; + @Slf4j public class HiveSinkAggregatedCommitter extends FileSinkAggregatedCommitter { private final Config pluginConfig; private final String dbName; private final String tableName; + private final boolean abortDropPartitionMetadata; public HiveSinkAggregatedCommitter( Config pluginConfig, String dbName, String tableName, FileSystemUtils fileSystemUtils) { @@ -48,6 +51,10 @@ public HiveSinkAggregatedCommitter( this.pluginConfig = pluginConfig; this.dbName = dbName; this.tableName = tableName; + this.abortDropPartitionMetadata = + pluginConfig.hasPath(ABORT_DROP_PARTITION_METADATA.key()) + ? 
pluginConfig.getBoolean(ABORT_DROP_PARTITION_METADATA.key()) + : ABORT_DROP_PARTITION_METADATA.defaultValue(); } @Override @@ -87,21 +94,23 @@ public List<FileAggregatedCommitInfo> commit( @Override public void abort(List<FileAggregatedCommitInfo> aggregatedCommitInfos) throws Exception { super.abort(aggregatedCommitInfos); - HiveMetaStoreProxy hiveMetaStore = HiveMetaStoreProxy.getInstance(pluginConfig); - for (FileAggregatedCommitInfo aggregatedCommitInfo : aggregatedCommitInfos) { - Map<String, List<String>> partitionDirAndValuesMap = - aggregatedCommitInfo.getPartitionDirAndValuesMap(); - List<String> partitions = - partitionDirAndValuesMap.keySet().stream() - .map(partition -> partition.replaceAll("\\\\", "/")) - .collect(Collectors.toList()); - try { - hiveMetaStore.dropPartitions(dbName, tableName, partitions); - log.info("Remove these partitions {}", partitions); - } catch (TException e) { - log.error("Failed to remove these partitions {}", partitions, e); + if (abortDropPartitionMetadata) { + HiveMetaStoreProxy hiveMetaStore = HiveMetaStoreProxy.getInstance(pluginConfig); + for (FileAggregatedCommitInfo aggregatedCommitInfo : aggregatedCommitInfos) { + Map<String, List<String>> partitionDirAndValuesMap = + aggregatedCommitInfo.getPartitionDirAndValuesMap(); + List<String> partitions = + partitionDirAndValuesMap.keySet().stream() + .map(partition -> partition.replaceAll("\\\\", "/")) + .collect(Collectors.toList()); + try { + hiveMetaStore.dropPartitions(dbName, tableName, partitions); + log.info("Remove these partitions {}", partitions); + } catch (TException e) { + log.error("Failed to remove these partitions {}", partitions, e); + } } + hiveMetaStore.close(); } - hiveMetaStore.close(); } } diff --git a/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/config/HiveConfig.java b/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/config/HiveConfig.java index 627b561cc94..565f49bc776 100644 --- a/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/config/HiveConfig.java +++ b/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/config/HiveConfig.java @@ -40,6 +40,13 @@ public class HiveConfig { .noDefaultValue() .withDescription("Hive metastore uri"); + public static final Option<Boolean> ABORT_DROP_PARTITION_METADATA = + Options.key("abort_drop_partition_metadata") + .booleanType() + .defaultValue(false) + .withDescription( + "Flag to decide whether to drop partition metadata from Hive Metastore during an abort operation.
Note: this only affects the metadata in the metastore, the data in the partition will always be deleted(data generated during the synchronization process)."); + public static final Option HIVE_SITE_PATH = Options.key("hive_site_path") .stringType() diff --git a/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/sink/HiveSinkFactory.java b/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/sink/HiveSinkFactory.java index 6674b778c4a..b98f6cffa50 100644 --- a/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/sink/HiveSinkFactory.java +++ b/seatunnel-connectors-v2/connector-hive/src/main/java/org/apache/seatunnel/connectors/seatunnel/hive/sink/HiveSinkFactory.java @@ -24,6 +24,8 @@ import com.google.auto.service.AutoService; +import static org.apache.seatunnel.connectors.seatunnel.hive.config.HiveConfig.ABORT_DROP_PARTITION_METADATA; + @AutoService(Factory.class) public class HiveSinkFactory implements TableSinkFactory { @Override @@ -36,6 +38,7 @@ public OptionRule optionRule() { return OptionRule.builder() .required(HiveConfig.TABLE_NAME) .required(HiveConfig.METASTORE_URI) + .optional(ABORT_DROP_PARTITION_METADATA) .build(); } } diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergTableLoader.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergTableLoader.java index 21fe3d0fb3f..9fe93027629 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergTableLoader.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/IcebergTableLoader.java @@ -34,7 +34,7 @@ import java.io.IOException; import java.io.Serializable; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; @Slf4j public class IcebergTableLoader implements Closeable, Serializable { diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/CommonConfig.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/CommonConfig.java index 634749a8324..1adce9a3995 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/CommonConfig.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/config/CommonConfig.java @@ -27,10 +27,10 @@ import java.io.Serializable; import java.util.List; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HADOOP; import static org.apache.seatunnel.connectors.seatunnel.iceberg.config.IcebergCatalogType.HIVE; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; @Getter @ToString diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java 
b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java index c26554f5547..a1c6933e4ae 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/IcebergSource.java @@ -54,7 +54,7 @@ import java.util.ArrayList; import java.util.List; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; @AutoService(SeaTunnelSource.class) public class IcebergSource diff --git a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanSplitPlanner.java b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanSplitPlanner.java index b986702b76b..4a084d73a53 100644 --- a/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanSplitPlanner.java +++ b/seatunnel-connectors-v2/connector-iceberg/src/main/java/org/apache/seatunnel/connectors/seatunnel/iceberg/source/enumerator/scan/IcebergScanSplitPlanner.java @@ -43,7 +43,7 @@ import java.util.List; import java.util.Optional; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; @Slf4j public class IcebergScanSplitPlanner { diff --git a/seatunnel-connectors-v2/connector-iotdb/src/main/java/org/apache/seatunnel/connectors/seatunnel/iotdb/config/SinkConfig.java b/seatunnel-connectors-v2/connector-iotdb/src/main/java/org/apache/seatunnel/connectors/seatunnel/iotdb/config/SinkConfig.java index bc9810e7f17..9a7b7a112de 100644 --- a/seatunnel-connectors-v2/connector-iotdb/src/main/java/org/apache/seatunnel/connectors/seatunnel/iotdb/config/SinkConfig.java +++ b/seatunnel-connectors-v2/connector-iotdb/src/main/java/org/apache/seatunnel/connectors/seatunnel/iotdb/config/SinkConfig.java @@ -30,8 +30,8 @@ import java.time.ZoneId; import java.util.List; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; @Setter @Getter diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/AbstractJdbcCatalog.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/AbstractJdbcCatalog.java index b3f1138dedb..afc382c8000 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/AbstractJdbcCatalog.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/AbstractJdbcCatalog.java @@ -51,8 +51,8 @@ import java.util.Optional; import java.util.stream.Collectors; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; +import static 
org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; public abstract class AbstractJdbcCatalog implements Catalog { private static final Logger LOG = LoggerFactory.getLogger(AbstractJdbcCatalog.class); diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MySqlCatalog.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MySqlCatalog.java index ca96c6d243b..83d30852061 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MySqlCatalog.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MySqlCatalog.java @@ -50,6 +50,7 @@ import java.util.HashMap; import java.util.HashSet; import java.util.List; +import java.util.Locale; import java.util.Map; import java.util.Optional; import java.util.Set; @@ -217,13 +218,16 @@ private void buildTable(ResultSet resultSet, TableSchema.Builder builder) throws int scale = resultSet.getInt("NUMERIC_SCALE"); long columnLength = resultSet.getLong("CHARACTER_MAXIMUM_LENGTH"); long octetLength = resultSet.getLong("CHARACTER_OCTET_LENGTH"); - SeaTunnelDataType type = fromJdbcType(sourceType, precision, scale); + if (sourceType.toLowerCase(Locale.ROOT).contains("unsigned")) { + typeName += "_UNSIGNED"; + } + SeaTunnelDataType type = fromJdbcType(typeName, precision, scale); String comment = resultSet.getString("COLUMN_COMMENT"); Object defaultValue = resultSet.getObject("COLUMN_DEFAULT"); String isNullableStr = resultSet.getString("IS_NULLABLE"); boolean isNullable = isNullableStr.equals("YES"); long bitLen = 0; - MysqlType mysqlType = MysqlType.getByName(sourceType); + MysqlType mysqlType = MysqlType.valueOf(typeName); switch (mysqlType) { case BIT: bitLen = precision; @@ -317,9 +321,7 @@ protected boolean dropTableInternal(TablePath tablePath) throws CatalogException Connection connection = getConnection(dbUrl); try (PreparedStatement ps = connection.prepareStatement( - String.format( - "DROP TABLE IF EXISTS %s.%s;", - tablePath.getDatabaseName(), tablePath.getTableName()))) { + String.format("DROP TABLE IF EXISTS %s;", tablePath.getFullName()))) { // Will there exist concurrent drop for one table? 
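For reference, the unsigned-column handling added to MySqlCatalog above reduces to a small name-normalization step: when the full source type (e.g. "int(10) unsigned") mentions "unsigned", the bare type name gets an "_UNSIGNED" suffix before it is resolved to a MysqlType. A minimal, self-contained sketch of that step follows; the helper name toMysqlTypeName and the sample values are illustrative only and are not part of this patch.

import java.util.Locale;

public class UnsignedTypeNameSketch {
    // Mirrors the added hunk: append "_UNSIGNED" to the bare type name
    // whenever the full column type string mentions "unsigned".
    static String toMysqlTypeName(String typeName, String sourceType) {
        if (sourceType.toLowerCase(Locale.ROOT).contains("unsigned")) {
            return typeName + "_UNSIGNED";
        }
        return typeName;
    }

    public static void main(String[] args) {
        System.out.println(toMysqlTypeName("INT", "int(10) unsigned")); // INT_UNSIGNED
        System.out.println(toMysqlTypeName("BIGINT", "bigint(20)"));    // BIGINT
    }
}
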
return ps.execute(); } catch (SQLException e) { diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MysqlCreateTableSqlBuilder.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MysqlCreateTableSqlBuilder.java index ccf9c8cd0e9..862f0bb6643 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MysqlCreateTableSqlBuilder.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MysqlCreateTableSqlBuilder.java @@ -36,8 +36,8 @@ import java.util.List; import java.util.stream.Collectors; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; public class MysqlCreateTableSqlBuilder { diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCatalog.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCatalog.java index 5172cc6461c..84ffff44123 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCatalog.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCatalog.java @@ -160,10 +160,7 @@ protected boolean createTableInternal(TablePath tablePath, CatalogTable table) ps.execute(); } catch (Exception e) { throw new CatalogException( - String.format( - "Failed creating table %s.%s", - tablePath.getSchemaName(), tablePath.getTableName()), - e); + String.format("Failed creating table %s", tablePath.getFullName()), e); } } return true; diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCreateTableSqlBuilder.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCreateTableSqlBuilder.java index ada628a13f0..61dee202965 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCreateTableSqlBuilder.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCreateTableSqlBuilder.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.oracle; import org.apache.seatunnel.api.table.catalog.CatalogTable; diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilder.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilder.java index c6a38d5cf59..d646ed053d7 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilder.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresCreateTableSqlBuilder.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.psql; import org.apache.seatunnel.api.table.catalog.CatalogTable; diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresDataTypeConvertor.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresDataTypeConvertor.java index c87a2fc1188..d1f8a5691df 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresDataTypeConvertor.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/psql/PostgresDataTypeConvertor.java @@ -105,6 +105,8 @@ public class PostgresDataTypeConvertor implements DataTypeConvertor { public static final String PG_INTERVAL = "interval"; public static final String PG_GEOMETRY = "geometry"; public static final String PG_GEOGRAPHY = "geography"; + public static final String PG_JSON = "json"; + public static final String PG_JSONB = "jsonb"; @Override public SeaTunnelDataType toSeaTunnelType(String connectorDataType) { @@ -160,6 +162,8 @@ public SeaTunnelDataType toSeaTunnelType( case PG_INTERVAL: case PG_GEOMETRY: case PG_GEOGRAPHY: + case PG_JSON: + case PG_JSONB: return BasicType.STRING_TYPE; case PG_CHAR_ARRAY: case PG_CHARACTER_ARRAY: diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sqlserver/SqlServerCreateTableSqlBuilder.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sqlserver/SqlServerCreateTableSqlBuilder.java index 5c2f05e2db5..1774518dcdd 100644 --- 
a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sqlserver/SqlServerCreateTableSqlBuilder.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sqlserver/SqlServerCreateTableSqlBuilder.java @@ -36,8 +36,8 @@ import java.util.Map; import java.util.stream.Collectors; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; public class SqlServerCreateTableSqlBuilder { diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcConnectionConfig.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcConnectionConfig.java index afceddc59a0..555963af2cf 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcConnectionConfig.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcConnectionConfig.java @@ -27,6 +27,7 @@ public class JdbcConnectionConfig implements Serializable { public String url; public String driverName; + public String compatibleMode; public int connectionCheckTimeoutSeconds = JdbcOptions.CONNECTION_CHECK_TIMEOUT_SEC.defaultValue(); public int maxRetries = JdbcOptions.MAX_RETRIES.defaultValue(); @@ -37,7 +38,6 @@ public class JdbcConnectionConfig implements Serializable { public boolean autoCommit = JdbcOptions.AUTO_COMMIT.defaultValue(); public int batchSize = JdbcOptions.BATCH_SIZE.defaultValue(); - public int batchIntervalMs = JdbcOptions.BATCH_INTERVAL_MS.defaultValue(); public String xaDataSourceClassName; @@ -48,12 +48,12 @@ public class JdbcConnectionConfig implements Serializable { public static JdbcConnectionConfig of(ReadonlyConfig config) { JdbcConnectionConfig.Builder builder = JdbcConnectionConfig.builder(); builder.url(config.get(JdbcOptions.URL)); + builder.compatibleMode(config.get(JdbcOptions.COMPATIBLE_MODE)); builder.driverName(config.get(JdbcOptions.DRIVER)); builder.autoCommit(config.get(JdbcOptions.AUTO_COMMIT)); builder.maxRetries(config.get(JdbcOptions.MAX_RETRIES)); builder.connectionCheckTimeoutSeconds(config.get(JdbcOptions.CONNECTION_CHECK_TIMEOUT_SEC)); builder.batchSize(config.get(JdbcOptions.BATCH_SIZE)); - builder.batchIntervalMs(config.get(JdbcOptions.BATCH_INTERVAL_MS)); if (config.get(JdbcOptions.IS_EXACTLY_ONCE)) { builder.xaDataSourceClassName(config.get(JdbcOptions.XA_DATA_SOURCE_CLASS_NAME)); builder.maxCommitAttempts(config.get(JdbcOptions.MAX_COMMIT_ATTEMPTS)); @@ -74,6 +74,10 @@ public String getDriverName() { return driverName; } + public String getCompatibleMode() { + return compatibleMode; + } + public boolean isAutoCommit() { return autoCommit; } @@ -98,10 +102,6 @@ public int getBatchSize() { return batchSize; } - public int getBatchIntervalMs() { - return batchIntervalMs; - } - public String getXaDataSourceClassName() { return xaDataSourceClassName; } @@ -121,6 +121,7 @@ public static JdbcConnectionConfig.Builder builder() { public static final class Builder { private String url; private String driverName; + private String compatibleMode; private int connectionCheckTimeoutSeconds = 
JdbcOptions.CONNECTION_CHECK_TIMEOUT_SEC.defaultValue(); private int maxRetries = JdbcOptions.MAX_RETRIES.defaultValue(); @@ -129,7 +130,6 @@ public static final class Builder { private String query; private boolean autoCommit = JdbcOptions.AUTO_COMMIT.defaultValue(); private int batchSize = JdbcOptions.BATCH_SIZE.defaultValue(); - private int batchIntervalMs = JdbcOptions.BATCH_INTERVAL_MS.defaultValue(); private String xaDataSourceClassName; private int maxCommitAttempts = JdbcOptions.MAX_COMMIT_ATTEMPTS.defaultValue(); private int transactionTimeoutSec = JdbcOptions.TRANSACTION_TIMEOUT_SEC.defaultValue(); @@ -146,6 +146,11 @@ public Builder driverName(String driverName) { return this; } + public Builder compatibleMode(String compatibleMode) { + this.compatibleMode = compatibleMode; + return this; + } + public Builder connectionCheckTimeoutSeconds(int connectionCheckTimeoutSeconds) { this.connectionCheckTimeoutSeconds = connectionCheckTimeoutSeconds; return this; @@ -181,11 +186,6 @@ public Builder batchSize(int batchSize) { return this; } - public Builder batchIntervalMs(int batchIntervalMs) { - this.batchIntervalMs = batchIntervalMs; - return this; - } - public Builder xaDataSourceClassName(String xaDataSourceClassName) { this.xaDataSourceClassName = xaDataSourceClassName; return this; @@ -204,8 +204,8 @@ public Builder transactionTimeoutSec(int transactionTimeoutSec) { public JdbcConnectionConfig build() { JdbcConnectionConfig jdbcConnectionConfig = new JdbcConnectionConfig(); jdbcConnectionConfig.batchSize = this.batchSize; - jdbcConnectionConfig.batchIntervalMs = this.batchIntervalMs; jdbcConnectionConfig.driverName = this.driverName; + jdbcConnectionConfig.compatibleMode = this.compatibleMode; jdbcConnectionConfig.maxRetries = this.maxRetries; jdbcConnectionConfig.password = this.password; jdbcConnectionConfig.connectionCheckTimeoutSeconds = this.connectionCheckTimeoutSeconds; diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcOptions.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcOptions.java index c241dddb578..76759a583e1 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcOptions.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcOptions.java @@ -38,6 +38,12 @@ public interface JdbcOptions { .intType() .defaultValue(30) .withDescription("connection check time second"); + Option COMPATIBLE_MODE = + Options.key("compatible_mode") + .stringType() + .noDefaultValue() + .withDescription( + "The compatible mode of database, required when the database supports multiple compatible modes. For example, when using OceanBase database, you need to set it to 'mysql' or 'oracle'."); Option MAX_RETRIES = Options.key("max_retries").intType().defaultValue(0).withDescription("max_retired"); @@ -76,12 +82,6 @@ public interface JdbcOptions { "For queries that return a large number of objects, " + "you can configure the row fetch size used in the query to improve performance by reducing the number database hits required to satisfy the selection criteria. 
Zero means use jdbc default value."); - Option BATCH_INTERVAL_MS = - Options.key("batch_interval_ms") - .intType() - .defaultValue(0) - .withDescription("batch interval milliSecond"); - Option IS_EXACTLY_ONCE = Options.key("is_exactly_once") .booleanType() diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcSourceConfig.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcSourceConfig.java index 4c6221549bf..00130b32acc 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcSourceConfig.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/config/JdbcSourceConfig.java @@ -33,6 +33,7 @@ public class JdbcSourceConfig implements Serializable { private JdbcConnectionConfig jdbcConnectionConfig; public String query; + public String compatibleMode; private String partitionColumn; private BigDecimal partitionUpperBound; private BigDecimal partitionLowerBound; @@ -44,6 +45,7 @@ public static JdbcSourceConfig of(ReadonlyConfig config) { builder.jdbcConnectionConfig(JdbcConnectionConfig.of(config)); builder.query(config.get(JdbcOptions.QUERY)); builder.fetchSize(config.get(JdbcOptions.FETCH_SIZE)); + config.getOptional(JdbcOptions.COMPATIBLE_MODE).ifPresent(builder::compatibleMode); config.getOptional(JdbcOptions.PARTITION_COLUMN).ifPresent(builder::partitionColumn); config.getOptional(JdbcOptions.PARTITION_UPPER_BOUND) .ifPresent(builder::partitionUpperBound); diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/JdbcOutputFormat.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/JdbcOutputFormat.java index 45fe77f201a..6c0ad676a92 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/JdbcOutputFormat.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/JdbcOutputFormat.java @@ -33,11 +33,6 @@ import java.io.IOException; import java.io.Serializable; import java.sql.SQLException; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; import java.util.function.Supplier; import static com.google.common.base.Preconditions.checkNotNull; @@ -57,9 +52,6 @@ public class JdbcOutputFormat> implem private transient E jdbcStatementExecutor; private transient int batchCount = 0; private transient volatile boolean closed = false; - - private transient ScheduledExecutorService scheduler; - private transient ScheduledFuture scheduledFuture; private transient volatile Exception flushException; public JdbcOutputFormat( @@ -82,37 +74,6 @@ public void open() throws IOException { e); } jdbcStatementExecutor = createAndOpenStatementExecutor(statementExecutorFactory); - - if (jdbcConnectionConfig.getBatchIntervalMs() != 0 - && jdbcConnectionConfig.getBatchSize() != 1) { - this.scheduler = - Executors.newScheduledThreadPool( - 1, - runnable -> { - AtomicInteger cnt = new AtomicInteger(0); - Thread thread = new Thread(runnable); - thread.setDaemon(true); - thread.setName( - "jdbc-upsert-output-format" + "-" + 
cnt.incrementAndGet()); - return thread; - }); - this.scheduledFuture = - this.scheduler.scheduleWithFixedDelay( - () -> { - synchronized (JdbcOutputFormat.this) { - if (!closed) { - try { - flush(); - } catch (Exception e) { - flushException = e; - } - } - } - }, - jdbcConnectionConfig.getBatchIntervalMs(), - jdbcConnectionConfig.getBatchIntervalMs(), - TimeUnit.MILLISECONDS); - } } private E createAndOpenStatementExecutor(StatementExecutorFactory statementExecutorFactory) { @@ -228,15 +189,19 @@ public synchronized void close() { if (!closed) { closed = true; - if (this.scheduledFuture != null) { - scheduledFuture.cancel(false); - this.scheduler.shutdown(); - } - - try { - if (batchCount > 0) { + if (batchCount > 0) { + try { flush(); + } catch (Exception e) { + LOG.warn("Writing records to JDBC failed.", e); + flushException = + new JdbcConnectorException( + CommonErrorCode.FLUSH_DATA_FAILED, + "Writing records to JDBC failed.", + e); } + } + try { if (jdbcStatementExecutor != null) { jdbcStatementExecutor.closeStatements(); } @@ -254,11 +219,11 @@ public synchronized void close() { public void updateExecutor(boolean reconnect) throws SQLException, ClassNotFoundException { try { jdbcStatementExecutor.closeStatements(); - } catch (Exception e) { + } catch (SQLException e) { if (!reconnect) { throw e; } - LOG.info("Close JDBC statement failed on reconnect.", e); + LOG.error("Close JDBC statement failed on reconnect.", e); } jdbcStatementExecutor.prepareStatements( reconnect diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/JdbcDialectFactory.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/JdbcDialectFactory.java index d5a365bfbd5..a8d4b960473 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/JdbcDialectFactory.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/JdbcDialectFactory.java @@ -38,7 +38,21 @@ public interface JdbcDialectFactory { /** @return Creates a new instance of the {@link JdbcDialect}. */ JdbcDialect create(); - default JdbcDialect create(String fieldIde) { - return null; + default JdbcDialect createWithFieldIde(String fieldIde) { + return create(); + } + + default JdbcDialect createWithCompatible(String compatibleMode) { + return create(); + } + + /** + * Create a {@link JdbcDialect} instance based on the driver type and compatible mode. 
+ * + * @param compatibleMode The compatible mode + * @return a new instance of {@link JdbcDialect} + */ + default JdbcDialect create(String compatibleMode, String fieldIde) { + return create(); } } diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/JdbcDialectLoader.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/JdbcDialectLoader.java index bbd28995b40..b7fbbfeb53a 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/JdbcDialectLoader.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/JdbcDialectLoader.java @@ -39,18 +39,23 @@ public final class JdbcDialectLoader { private JdbcDialectLoader() {} public static JdbcDialect load(String url) { - return load(url, ""); + return load(url, "", ""); + } + + public static JdbcDialect load(String url, String compatibleMode) { + return load(url, compatibleMode, ""); } /** * Loads the unique JDBC Dialect that can handle the given database url. * * @param url A database URL. + * @param compatibleMode The compatible mode. + * @return The loaded dialect. * @throws IllegalStateException if the loader cannot find exactly one dialect that can * unambiguously process the given database URL. - * @return The loaded dialect. */ - public static JdbcDialect load(String url, String fieldIde) { + public static JdbcDialect load(String url, String compatibleMode, String fieldIde) { ClassLoader cl = Thread.currentThread().getContextClassLoader(); List foundFactories = discoverFactories(cl); @@ -94,10 +99,15 @@ public static JdbcDialect load(String url, String fieldIde) { .sorted() .collect(Collectors.joining("\n")))); } - if (StringUtils.isNotEmpty(fieldIde)) { - return matchingFactories.get(0).create(fieldIde); + if (StringUtils.isNotEmpty(fieldIde) && StringUtils.isNotEmpty(compatibleMode)) { + return matchingFactories.get(0).create(compatibleMode, fieldIde); + } else if (StringUtils.isNotEmpty(fieldIde)) { + return matchingFactories.get(0).createWithFieldIde(fieldIde); + } else if (StringUtils.isNotEmpty(compatibleMode)) { + return matchingFactories.get(0).createWithCompatible(compatibleMode); + } else { + return matchingFactories.get(0).create(); } - return matchingFactories.get(0).create(); } private static List discoverFactories(ClassLoader classLoader) { diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/dm/DmdbDialect.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/dm/DmdbDialect.java index c3a929be29c..67db4fb7c60 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/dm/DmdbDialect.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/dm/DmdbDialect.java @@ -17,11 +17,15 @@ package org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.dm; +import org.apache.seatunnel.api.table.catalog.TablePath; import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.converter.JdbcRowConverter; import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.JdbcDialect; import 
org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.JdbcDialectTypeMapper; +import java.util.Arrays; +import java.util.List; import java.util.Optional; +import java.util.stream.Collectors; public class DmdbDialect implements JdbcDialect { @@ -43,6 +47,64 @@ public JdbcDialectTypeMapper getJdbcDialectTypeMapper() { @Override public Optional getUpsertStatement( String database, String tableName, String[] fieldNames, String[] uniqueKeyFields) { - return Optional.empty(); + List nonUniqueKeyFields = + Arrays.stream(fieldNames) + .filter(fieldName -> !Arrays.asList(uniqueKeyFields).contains(fieldName)) + .collect(Collectors.toList()); + String valuesBinding = + Arrays.stream(fieldNames) + .map(fieldName -> ":" + fieldName + " " + quoteIdentifier(fieldName)) + .collect(Collectors.joining(", ")); + String usingClause = String.format("SELECT %s", valuesBinding); + String onConditions = + Arrays.stream(uniqueKeyFields) + .map( + fieldName -> + String.format( + "TARGET.%s=SOURCE.%s", + quoteIdentifier(fieldName), + quoteIdentifier(fieldName))) + .collect(Collectors.joining(" AND ")); + + String updateSetClause = + nonUniqueKeyFields.stream() + .map( + fieldName -> + String.format( + "TARGET.%s=SOURCE.%s", + quoteIdentifier(fieldName), + quoteIdentifier(fieldName))) + .collect(Collectors.joining(", ")); + + String insertFields = + Arrays.stream(fieldNames) + .map(this::quoteIdentifier) + .collect(Collectors.joining(", ")); + String insertValues = + Arrays.stream(fieldNames) + .map(fieldName -> "SOURCE." + quoteIdentifier(fieldName)) + .collect(Collectors.joining(", ")); + String upsertSQL = + String.format( + " MERGE INTO %s TARGET" + + " USING (%s) SOURCE" + + " ON (%s) " + + " WHEN MATCHED THEN" + + " UPDATE SET %s" + + " WHEN NOT MATCHED THEN" + + " INSERT (%s) VALUES (%s)", + tableIdentifier(database, tableName), + usingClause, + onConditions, + updateSetClause, + insertFields, + insertValues); + + return Optional.of(upsertSQL); + } + + @Override + public String extractTableName(TablePath tablePath) { + return tablePath.getTableName(); } } diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/mysql/MySqlDialectFactory.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/mysql/MySqlDialectFactory.java index 605c42945dd..24000bcd3f1 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/mysql/MySqlDialectFactory.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/mysql/MySqlDialectFactory.java @@ -36,7 +36,7 @@ public JdbcDialect create() { } @Override - public JdbcDialect create(String fieldIde) { + public JdbcDialect createWithFieldIde(String fieldIde) { return new MysqlDialect(fieldIde); } } diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oceanbase/OceanBaseDialectFactory.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oceanbase/OceanBaseDialectFactory.java new file mode 100644 index 00000000000..5c9ff369847 --- /dev/null +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oceanbase/OceanBaseDialectFactory.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software 
Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.oceanbase; + +import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.JdbcDialect; +import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.JdbcDialectFactory; +import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.mysql.MysqlDialect; +import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.oracle.OracleDialect; + +import com.google.auto.service.AutoService; + +import javax.annotation.Nonnull; + +@AutoService(JdbcDialectFactory.class) +public class OceanBaseDialectFactory implements JdbcDialectFactory { + @Override + public boolean acceptsURL(String url) { + return url.startsWith("jdbc:oceanbase:"); + } + + @Override + public JdbcDialect create() { + throw new UnsupportedOperationException( + "Can't create JdbcDialect without compatible mode for OceanBase"); + } + + @Override + public JdbcDialect createWithCompatible(@Nonnull String compatibleMode) { + if ("oracle".equalsIgnoreCase(compatibleMode)) { + return new OracleDialect(); + } + return new MysqlDialect(); + } +} diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oracle/OracleDialectFactory.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oracle/OracleDialectFactory.java index 8da513782c8..3801fb2c987 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oracle/OracleDialectFactory.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oracle/OracleDialectFactory.java @@ -36,7 +36,7 @@ public JdbcDialect create() { } @Override - public JdbcDialect create(String fieldIde) { + public JdbcDialect createWithFieldIde(String fieldIde) { return new OracleDialect(fieldIde); } } diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oracle/OracleTypeMapper.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oracle/OracleTypeMapper.java index 1ce9c3a72b9..1822484b309 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oracle/OracleTypeMapper.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/oracle/OracleTypeMapper.java @@ -87,9 +87,10 @@ public SeaTunnelDataType mapping(ResultSetMetaData metadata, int colIndex) if (scale == 0) { if (precision <= 9) { return 
BasicType.INT_TYPE; - } - if (precision <= 18) { + } else if (precision <= 18) { return BasicType.LONG_TYPE; + } else if (precision <= 38) { + return new DecimalType(38, 0); } } return new DecimalType(38, 18); diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psql/PostgresDialectFactory.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psql/PostgresDialectFactory.java index 0d41a9c91fd..b0544d03863 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psql/PostgresDialectFactory.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psql/PostgresDialectFactory.java @@ -19,9 +19,12 @@ import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.JdbcDialect; import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.JdbcDialectFactory; +import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.psqllow.PostgresLowDialect; import com.google.auto.service.AutoService; +import javax.annotation.Nonnull; + @AutoService(JdbcDialectFactory.class) public class PostgresDialectFactory implements JdbcDialectFactory { @Override @@ -31,11 +34,28 @@ public boolean acceptsURL(String url) { @Override public JdbcDialect create() { + throw new UnsupportedOperationException( + "Can't create JdbcDialect without compatible mode for Postgres"); + } + + @Override + public JdbcDialect createWithFieldIde(String fieldIde) { + return new PostgresDialect(fieldIde); + } + + @Override + public JdbcDialect createWithCompatible(String compatibleMode) { + if ("postgresLow".equalsIgnoreCase(compatibleMode)) { + return new PostgresLowDialect(); + } return new PostgresDialect(); } @Override - public JdbcDialect create(String fieldIde) { + public JdbcDialect create(@Nonnull String compatibleMode, String fieldIde) { + if ("postgresLow".equalsIgnoreCase(compatibleMode)) { + return new PostgresLowDialect(); + } return new PostgresDialect(fieldIde); } } diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psql/PostgresTypeMapper.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psql/PostgresTypeMapper.java index fe40f80a468..0260c780fb6 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psql/PostgresTypeMapper.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psql/PostgresTypeMapper.java @@ -87,6 +87,8 @@ public class PostgresTypeMapper implements JdbcDialectTypeMapper { private static final String PG_INTERVAL = "interval"; private static final String PG_GEOMETRY = "geometry"; private static final String PG_GEOGRAPHY = "geography"; + private static final String PG_JSON = "json"; + private static final String PG_JSONB = "jsonb"; @SuppressWarnings("checkstyle:MagicNumber") @Override @@ -141,6 +143,8 @@ public SeaTunnelDataType mapping(ResultSetMetaData metadata, int colIndex) case PG_INTERVAL: case PG_GEOMETRY: case PG_GEOGRAPHY: + case PG_JSON: + case PG_JSONB: return BasicType.STRING_TYPE; case PG_CHAR_ARRAY: case PG_CHARACTER_ARRAY: diff --git 
a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psqllow/PostgresLowDialect.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psqllow/PostgresLowDialect.java new file mode 100644 index 00000000000..e367207ffa2 --- /dev/null +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/psqllow/PostgresLowDialect.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.psqllow; + +import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.psql.PostgresDialect; + +import java.util.Optional; + +public class PostgresLowDialect extends PostgresDialect { + @Override + public Optional getUpsertStatement( + String database, String tableName, String[] fieldNames, String[] uniqueKeyFields) { + return Optional.empty(); + } +} diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/redshift/RedshiftDialectFactory.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/redshift/RedshiftDialectFactory.java index 7c6aa8148ca..b288adef864 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/redshift/RedshiftDialectFactory.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/redshift/RedshiftDialectFactory.java @@ -35,7 +35,7 @@ public JdbcDialect create() { } @Override - public JdbcDialect create(String fieldIde) { + public JdbcDialect createWithFieldIde(String fieldIde) { return new RedshiftDialect(fieldIde); } } diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/sqlserver/SqlServerDialectFactory.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/sqlserver/SqlServerDialectFactory.java index 8889ab93ebe..d563b887a3b 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/sqlserver/SqlServerDialectFactory.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/sqlserver/SqlServerDialectFactory.java @@ -36,7 +36,7 @@ public JdbcDialect create() { } @Override - public JdbcDialect create(String fieldIde) { + public JdbcDialect createWithFieldIde(String fieldIde) { return new 
SqlServerDialect(fieldIde); } } diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/executor/FieldNamedPreparedStatement.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/executor/FieldNamedPreparedStatement.java index b8ba7e2fe10..29c98c79387 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/executor/FieldNamedPreparedStatement.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/executor/FieldNamedPreparedStatement.java @@ -47,8 +47,8 @@ import java.util.List; import java.util.Map; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; @RequiredArgsConstructor @Slf4j diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/split/JdbcNumericBetweenParametersProvider.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/split/JdbcNumericBetweenParametersProvider.java index ced1d2831f1..4bf8834f6d5 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/split/JdbcNumericBetweenParametersProvider.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/split/JdbcNumericBetweenParametersProvider.java @@ -21,8 +21,8 @@ import java.math.BigDecimal; import java.math.RoundingMode; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkState; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkState; /** * This query parameters generator is an helper class to parameterize from/to queries on a numeric diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/xa/SemanticXidGenerator.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/xa/SemanticXidGenerator.java index 39c72267c68..578b5de8082 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/xa/SemanticXidGenerator.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/xa/SemanticXidGenerator.java @@ -25,7 +25,7 @@ import java.security.SecureRandom; import java.util.Arrays; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; /** * Generates {@link Xid} from: diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/xa/XidImpl.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/xa/XidImpl.java index 299ba28cdbf..05dbe6d3c94 100644 --- 
a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/xa/XidImpl.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/xa/XidImpl.java @@ -26,7 +26,7 @@ import java.util.Arrays; import java.util.Objects; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; /** * A simple {@link Xid} implementation that stores branch and global transaction identifiers as byte diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcExactlyOnceSinkWriter.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcExactlyOnceSinkWriter.java index d573180e14e..94152557ac0 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcExactlyOnceSinkWriter.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcExactlyOnceSinkWriter.java @@ -50,8 +50,8 @@ import java.util.List; import java.util.Optional; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkState; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkState; public class JdbcExactlyOnceSinkWriter implements SinkWriter { private static final Logger LOG = LoggerFactory.getLogger(JdbcExactlyOnceSinkWriter.class); diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcSink.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcSink.java index fd808cd62aa..8429fa4bcfa 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcSink.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcSink.java @@ -110,6 +110,7 @@ public void prepare(Config pluginConfig) throws PrepareFailException { this.dialect = JdbcDialectLoader.load( jdbcSinkConfig.getJdbcConnectionConfig().getUrl(), + jdbcSinkConfig.getJdbcConnectionConfig().getCompatibleMode(), config.get(JdbcOptions.FIELD_IDE) == null ? 
FieldIdeEnum.ORIGINAL.getValue() : config.get(JdbcOptions.FIELD_IDE).getValue()); diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcSinkFactory.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcSinkFactory.java index 22cdd24ae0b..117d3ff1a62 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcSinkFactory.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/sink/JdbcSinkFactory.java @@ -22,6 +22,7 @@ import org.apache.seatunnel.api.sink.DataSaveMode; import org.apache.seatunnel.api.table.catalog.CatalogOptions; import org.apache.seatunnel.api.table.catalog.CatalogTable; +import org.apache.seatunnel.api.table.catalog.ConstraintKey; import org.apache.seatunnel.api.table.catalog.PrimaryKey; import org.apache.seatunnel.api.table.catalog.TableIdentifier; import org.apache.seatunnel.api.table.connector.TableSink; @@ -43,11 +44,12 @@ import java.util.HashMap; import java.util.Map; import java.util.Optional; +import java.util.stream.Collectors; import static org.apache.seatunnel.api.sink.SinkCommonOptions.MULTI_TABLE_SINK_REPLICA; import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.AUTO_COMMIT; -import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.BATCH_INTERVAL_MS; import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.BATCH_SIZE; +import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.COMPATIBLE_MODE; import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.CONNECTION_CHECK_TIMEOUT_SEC; import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.CUSTOM_SQL; import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.DATABASE; @@ -83,6 +85,7 @@ public TableSink createSink(TableFactoryContext context) { Map catalogOptions = config.get(CatalogOptions.CATALOG_OPTIONS); Optional optionalTable = config.getOptional(TABLE); if (!optionalTable.isPresent()) { + catalogOptions = catalogOptions == null ? new HashMap<>() : catalogOptions; String prefix = catalogOptions.get(JdbcCatalogOptions.TABLE_PREFIX.key()); String suffix = catalogOptions.get(JdbcCatalogOptions.TABLE_SUFFIX.key()); if (StringUtils.isNotEmpty(prefix) || StringUtils.isNotEmpty(suffix)) { @@ -126,6 +129,21 @@ public TableSink createSink(TableFactoryContext context) { PrimaryKey primaryKey = catalogTable.getTableSchema().getPrimaryKey(); if (primaryKey != null && !CollectionUtils.isEmpty(primaryKey.getColumnNames())) { map.put(PRIMARY_KEYS.key(), String.join(",", primaryKey.getColumnNames())); + } else { + Optional keyOptional = + catalogTable.getTableSchema().getConstraintKeys().stream() + .filter( + key -> + ConstraintKey.ConstraintType.UNIQUE_KEY.equals( + key.getConstraintType())) + .findFirst(); + if (keyOptional.isPresent()) { + map.put( + PRIMARY_KEYS.key(), + keyOptional.get().getColumnNames().stream() + .map(key -> key.getColumnName()) + .collect(Collectors.joining(","))); + } } config = ReadonlyConfig.fromMap(new HashMap<>(map)); } @@ -136,7 +154,10 @@ public TableSink createSink(TableFactoryContext context) { ? 
FieldIdeEnum.ORIGINAL.getValue() : config.get(JdbcOptions.FIELD_IDE).getValue(); JdbcDialect dialect = - JdbcDialectLoader.load(sinkConfig.getJdbcConnectionConfig().getUrl(), fieldIde); + JdbcDialectLoader.load( + sinkConfig.getJdbcConnectionConfig().getUrl(), + sinkConfig.getJdbcConnectionConfig().getCompatibleMode(), + fieldIde); CatalogTable finalCatalogTable = catalogTable; // get saveMode DataSaveMode dataSaveMode = DataSaveMode.ERROR_WHEN_EXISTS; @@ -157,12 +178,12 @@ public OptionRule optionRule() { PASSWORD, CONNECTION_CHECK_TIMEOUT_SEC, BATCH_SIZE, - BATCH_INTERVAL_MS, IS_EXACTLY_ONCE, GENERATE_SINK_SQL, AUTO_COMMIT, ENABLE_UPSERT, PRIMARY_KEYS, + COMPATIBLE_MODE, SUPPORT_UPSERT_BY_INSERT_ONLY, IS_PRIMARY_KEY_UPDATED, MULTI_TABLE_SINK_REPLICA) diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/source/JdbcSource.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/source/JdbcSource.java index d4a50d15c36..1bf1b332fa9 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/source/JdbcSource.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/source/JdbcSource.java @@ -21,6 +21,7 @@ import org.apache.seatunnel.api.common.PrepareFailException; import org.apache.seatunnel.api.configuration.ReadonlyConfig; +import org.apache.seatunnel.api.configuration.util.ConfigValidator; import org.apache.seatunnel.api.serialization.Serializer; import org.apache.seatunnel.api.source.Boundedness; import org.apache.seatunnel.api.source.SeaTunnelSource; @@ -94,12 +95,15 @@ public String getPluginName() { @Override public void prepare(Config pluginConfig) throws PrepareFailException { ReadonlyConfig config = ReadonlyConfig.fromConfig(pluginConfig); + ConfigValidator.of(config).validate(new JdbcSourceFactory().optionRule()); this.jdbcSourceConfig = JdbcSourceConfig.of(config); this.jdbcConnectionProvider = new SimpleJdbcConnectionProvider(jdbcSourceConfig.getJdbcConnectionConfig()); this.query = jdbcSourceConfig.getQuery(); this.jdbcDialect = - JdbcDialectLoader.load(jdbcSourceConfig.getJdbcConnectionConfig().getUrl()); + JdbcDialectLoader.load( + jdbcSourceConfig.getJdbcConnectionConfig().getUrl(), + jdbcSourceConfig.getJdbcConnectionConfig().getCompatibleMode()); try (Connection connection = jdbcConnectionProvider.getOrEstablishConnection()) { this.typeInfo = initTableField(connection); this.partitionParameter = diff --git a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/source/JdbcSourceFactory.java b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/source/JdbcSourceFactory.java index 691e24ff822..8c21a842339 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/source/JdbcSourceFactory.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/main/java/org/apache/seatunnel/connectors/seatunnel/jdbc/source/JdbcSourceFactory.java @@ -54,6 +54,7 @@ import java.util.Map; import java.util.Optional; +import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.COMPATIBLE_MODE; import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.CONNECTION_CHECK_TIMEOUT_SEC; import static org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.DRIVER; import static 
org.apache.seatunnel.connectors.seatunnel.jdbc.config.JdbcOptions.FETCH_SIZE; @@ -83,7 +84,10 @@ TableSource createSource(TableFactoryContext context) { JdbcConnectionProvider connectionProvider = new SimpleJdbcConnectionProvider(config.getJdbcConnectionConfig()); final String querySql = config.getQuery(); - JdbcDialect dialect = JdbcDialectLoader.load(config.getJdbcConnectionConfig().getUrl()); + JdbcDialect dialect = + JdbcDialectLoader.load( + config.getJdbcConnectionConfig().getUrl(), + config.getJdbcConnectionConfig().getCompatibleMode()); TableSchema tableSchema = catalogTable.getTableSchema(); SeaTunnelRowType rowType = tableSchema.toPhysicalRowDataType(); Optional partitionParameter = @@ -255,7 +259,8 @@ public OptionRule optionRule() { PARTITION_COLUMN, PARTITION_UPPER_BOUND, PARTITION_LOWER_BOUND, - PARTITION_NUM) + PARTITION_NUM, + COMPATIBLE_MODE) .build(); } diff --git a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MySqlCatalogTest.java b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MySqlCatalogTest.java index 511907ce980..daf87b3693a 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MySqlCatalogTest.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/mysql/MySqlCatalogTest.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.mysql; import org.apache.seatunnel.api.table.catalog.CatalogTable; diff --git a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCatalogTest.java b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCatalogTest.java index 86160308dad..6b8c49bc0ab 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCatalogTest.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/oracle/OracleCatalogTest.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.oracle; import org.apache.seatunnel.api.table.catalog.CatalogTable; diff --git a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sql/MysqlCreateTableSqlBuilderTest.java b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sql/MysqlCreateTableSqlBuilderTest.java index e3c95c5c250..f5f5fa7bcef 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sql/MysqlCreateTableSqlBuilderTest.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sql/MysqlCreateTableSqlBuilderTest.java @@ -92,16 +92,15 @@ public void testBuild() { String createTableSql = MysqlCreateTableSqlBuilder.builder(tablePath, catalogTable) - .build("", catalogTable.getOptions().get("fieldIde")); + .build("mysql", catalogTable.getOptions().get("fieldIde")); String expect = "CREATE TABLE IF NOT EXISTS test_table (\n" - + "\tid BIGINT (22) NOT NULL COMMENT 'id', \n" - + "\tname VARCHAR (128) NOT NULL COMMENT 'name', \n" - + "\tage INT NULL COMMENT 'age', \n" - + "\tcreateTime TIMESTAMP (3) NULL COMMENT 'createTime', \n" - + "\tlastUpdateTime TIMESTAMP (3) NULL COMMENT 'lastUpdateTime', \n" - + "\tPRIMARY KEY (`id`), \n" - + "\tKEY `name` (`name`)\n" + + "\tid null NOT NULL COMMENT 'id', \n" + + "\tname null NOT NULL COMMENT 'name', \n" + + "\tage null NULL COMMENT 'age', \n" + + "\tcreateTime null NULL COMMENT 'createTime', \n" + + "\tlastUpdateTime null NULL COMMENT 'lastUpdateTime', \n" + + "\tPRIMARY KEY (`id`)\n" + ") COMMENT = 'User table';"; CONSOLE.println(expect); System.out.println(createTableSql); diff --git a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sqlserver/SqlServerCatalogTest.java b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sqlserver/SqlServerCatalogTest.java index 8fcba328932..5e457910f03 100644 --- a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sqlserver/SqlServerCatalogTest.java +++ b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/catalog/sqlserver/SqlServerCatalogTest.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.sqlserver; import org.apache.seatunnel.api.table.catalog.CatalogTable; diff --git a/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/PostgresDialectFactoryTest.java b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/PostgresDialectFactoryTest.java new file mode 100644 index 00000000000..be598978915 --- /dev/null +++ b/seatunnel-connectors-v2/connector-jdbc/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/internal/dialect/PostgresDialectFactoryTest.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect; + +import org.apache.seatunnel.connectors.seatunnel.jdbc.internal.dialect.psql.PostgresDialectFactory; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Optional; + +public class PostgresDialectFactoryTest { + + @Test + public void testPostgresDialectCreate() { + PostgresDialectFactory postgresDialectFactory = new PostgresDialectFactory(); + JdbcDialect postgresLow = postgresDialectFactory.createWithCompatible("postgresLow"); + String[] fields = {"id", "name", "age"}; + String[] uniqueKeyField = {"id"}; + Optional upsertStatement = + postgresLow.getUpsertStatement("test", "test_a", fields, uniqueKeyField); + Assertions.assertFalse(upsertStatement.isPresent()); + } +} diff --git a/seatunnel-connectors-v2/connector-kafka/pom.xml b/seatunnel-connectors-v2/connector-kafka/pom.xml index 0ce4bba6b17..7955ab3f546 100644 --- a/seatunnel-connectors-v2/connector-kafka/pom.xml +++ b/seatunnel-connectors-v2/connector-kafka/pom.xml @@ -31,6 +31,7 @@ 3.2.0 + 1.6.4.Final @@ -61,6 +62,17 @@ seatunnel-format-compatible-debezium-json ${project.version} + + org.apache.seatunnel + seatunnel-format-compatible-connect-json + ${project.version} + + + org.apache.kafka + connect-json + ${kafka.client.version} + + diff --git a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/Config.java b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/Config.java index edc83220292..c5cfa7248c1 100644 --- a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/Config.java +++ b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/Config.java @@ -28,8 +28,6 @@ public class Config { public static final String CONNECTOR_IDENTITY = "Kafka"; public static final String DEFAULT_TABLE_ID = "default"; - public static final String REPLICATION_FACTOR = "replication.factor"; - /** The default field delimiter is “,” */ public static final String DEFAULT_FIELD_DELIMITER = ","; diff --git a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/MessageFormat.java b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/MessageFormat.java index a694695a6bb..d0b8ba4ba58 100644 --- a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/MessageFormat.java +++ b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/config/MessageFormat.java @@ -22,6 +22,7 @@ public enum MessageFormat { TEXT, CANAL_JSON, COMPATIBLE_DEBEZIUM_JSON, + COMPATIBLE_KAFKA_CONNECT_JSON, KINGBASE_JSON, - DEBEZIUM_JSON, + DEBEZIUM_JSON } diff --git a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSource.java b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSource.java index 51c09e70c38..8e0db1be974 100644 --- a/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSource.java +++ b/seatunnel-connectors-v2/connector-kafka/src/main/java/org/apache/seatunnel/connectors/seatunnel/kafka/source/KafkaSource.java @@ -42,6 +42,7 @@ 
import org.apache.seatunnel.connectors.seatunnel.kafka.exception.KafkaConnectorException; import org.apache.seatunnel.connectors.seatunnel.kafka.kingbase.KingbaseJsonDeserializationSchema; import org.apache.seatunnel.connectors.seatunnel.kafka.state.KafkaSourceState; +import org.apache.seatunnel.format.compatible.kafka.connect.json.CompatibleKafkaConnectDeserializationSchema; import org.apache.seatunnel.format.json.JsonDeserializationSchema; import org.apache.seatunnel.format.json.canal.CanalJsonDeserializationSchema; import org.apache.seatunnel.format.json.debezium.DebeziumJsonDeserializationSchema; @@ -254,6 +255,15 @@ private DeserializationSchema getDeserializationSchema( } else { return new KingbaseJsonDeserializationSchema((MultipleRowType) typeInfo); } + case COMPATIBLE_KAFKA_CONNECT_JSON: + if (typeInfo instanceof MultipleRowType) { + throw new KafkaConnectorException( + CommonErrorCode.UNSUPPORTED_DATA_TYPE, + "Unsupported table format: " + format); + } else { + return new CompatibleKafkaConnectDeserializationSchema( + (SeaTunnelRowType) typeInfo, option, false, false); + } case DEBEZIUM_JSON: if (typeInfo instanceof SeaTunnelRowType) { boolean includeSchema = option.get(DEBEZIUM_RECORD_INCLUDE_SCHEMA); diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/config/MongodbConfig.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/config/MongodbConfig.java index 1ba9ad70cc4..848a120e270 100644 --- a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/config/MongodbConfig.java +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/config/MongodbConfig.java @@ -56,7 +56,8 @@ public class MongodbConfig { Options.key("match.query") .stringType() .noDefaultValue() - .withDescription("Mongodb's query syntax."); + .withDescription("Mongodb's query syntax.") + .withFallbackKeys("matchQuery"); public static final Option PROJECTION = Options.key("match.projection") @@ -149,4 +150,7 @@ public class MongodbConfig { .withDescription( "The primary keys for upsert/update. 
Keys are in csv format for properties.") .withFallbackKeys("upsert-key"); + + public static final Option TRANSACTION = + Options.key("transaction").booleanType().defaultValue(false).withDescription("."); } diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbSink.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbSink.java index fa2c212c3d0..160aa966a04 100644 --- a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbSink.java +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbSink.java @@ -20,25 +20,33 @@ import org.apache.seatunnel.shade.com.typesafe.config.Config; import org.apache.seatunnel.api.common.PrepareFailException; +import org.apache.seatunnel.api.serialization.DefaultSerializer; +import org.apache.seatunnel.api.serialization.Serializer; import org.apache.seatunnel.api.sink.SeaTunnelSink; +import org.apache.seatunnel.api.sink.SinkAggregatedCommitter; import org.apache.seatunnel.api.sink.SinkWriter; import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; -import org.apache.seatunnel.connectors.seatunnel.common.sink.AbstractSimpleSink; -import org.apache.seatunnel.connectors.seatunnel.common.sink.AbstractSinkWriter; import org.apache.seatunnel.connectors.seatunnel.mongodb.config.MongodbConfig; import org.apache.seatunnel.connectors.seatunnel.mongodb.serde.RowDataDocumentSerializer; import org.apache.seatunnel.connectors.seatunnel.mongodb.serde.RowDataToBsonConverters; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.commit.MongodbSinkAggregatedCommitter; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state.DocumentBulk; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state.MongodbAggregatedCommitInfo; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state.MongodbCommitInfo; import com.google.auto.service.AutoService; import java.util.List; +import java.util.Optional; import static org.apache.seatunnel.connectors.seatunnel.mongodb.config.MongodbConfig.CONNECTOR_IDENTITY; @AutoService(SeaTunnelSink.class) -public class MongodbSink extends AbstractSimpleSink { +public class MongodbSink + implements SeaTunnelSink< + SeaTunnelRow, DocumentBulk, MongodbCommitInfo, MongodbAggregatedCommitInfo> { private MongodbWriterOptions options; @@ -89,6 +97,10 @@ public void prepare(Config pluginConfig) throws PrepareFailException { if (pluginConfig.hasPath(MongodbConfig.RETRY_INTERVAL.key())) { builder.withRetryInterval(pluginConfig.getLong(MongodbConfig.RETRY_INTERVAL.key())); } + + if (pluginConfig.hasPath(MongodbConfig.TRANSACTION.key())) { + builder.withTransaction(pluginConfig.getBoolean(MongodbConfig.TRANSACTION.key())); + } this.options = builder.build(); } } @@ -109,7 +121,8 @@ public SeaTunnelDataType getConsumedType() { } @Override - public AbstractSinkWriter createWriter(SinkWriter.Context context) { + public SinkWriter createWriter( + SinkWriter.Context context) { return new MongodbWriter( new RowDataDocumentSerializer( RowDataToBsonConverters.createConverter(seaTunnelRowType), @@ -118,4 +131,27 @@ public AbstractSinkWriter createWriter(SinkWriter.Context co options, context); } + + @Override + public Optional> 
getWriterStateSerializer() { + return options.transaction ? Optional.of(new DefaultSerializer<>()) : Optional.empty(); + } + + @Override + public Optional> + createAggregatedCommitter() { + return options.transaction + ? Optional.of(new MongodbSinkAggregatedCommitter(options)) + : Optional.empty(); + } + + @Override + public Optional> getAggregatedCommitInfoSerializer() { + return options.transaction ? Optional.of(new DefaultSerializer<>()) : Optional.empty(); + } + + @Override + public Optional> getCommitInfoSerializer() { + return options.transaction ? Optional.of(new DefaultSerializer<>()) : Optional.empty(); + } } diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbWriter.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbWriter.java index 794c3bf04a3..0eb131d44ed 100644 --- a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbWriter.java +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbWriter.java @@ -20,16 +20,19 @@ import org.apache.seatunnel.api.sink.SinkWriter; import org.apache.seatunnel.api.table.type.RowKind; import org.apache.seatunnel.api.table.type.SeaTunnelRow; -import org.apache.seatunnel.connectors.seatunnel.common.sink.AbstractSinkWriter; import org.apache.seatunnel.connectors.seatunnel.mongodb.exception.MongodbConnectorException; import org.apache.seatunnel.connectors.seatunnel.mongodb.internal.MongodbClientProvider; import org.apache.seatunnel.connectors.seatunnel.mongodb.internal.MongodbCollectionProvider; import org.apache.seatunnel.connectors.seatunnel.mongodb.serde.DocumentSerializer; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state.DocumentBulk; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state.MongodbCommitInfo; import org.bson.BsonDocument; import com.mongodb.MongoException; import com.mongodb.client.model.BulkWriteOptions; +import com.mongodb.client.model.InsertOneModel; +import com.mongodb.client.model.UpdateOneModel; import com.mongodb.client.model.WriteModel; import lombok.extern.slf4j.Slf4j; @@ -37,12 +40,14 @@ import java.util.List; import java.util.Optional; import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; +import java.util.stream.Collectors; import java.util.stream.IntStream; import static org.apache.seatunnel.common.exception.CommonErrorCode.WRITER_OPERATION_FAILED; @Slf4j -public class MongodbWriter extends AbstractSinkWriter { +public class MongodbWriter implements SinkWriter { private MongodbClientProvider collectionProvider; @@ -60,6 +65,8 @@ public class MongodbWriter extends AbstractSinkWriter { private volatile long lastSendTime = 0L; + private boolean transaction; + // TODO:Reserve parameters. 
private final SinkWriter.Context context; @@ -84,27 +91,66 @@ private void initOptions(MongodbWriterOptions options) { .build(); this.bulkActions = options.getFlushSize(); this.batchIntervalMs = options.getBatchIntervalMs(); + this.transaction = options.transaction; } @Override public void write(SeaTunnelRow o) { if (o.getRowKind() != RowKind.UPDATE_BEFORE) { bulkRequests.add(serializer.serializeToWriteModel(o)); - if (isOverMaxBatchSizeLimit() || isOverMaxBatchIntervalLimit()) { + if (!transaction && (isOverMaxBatchSizeLimit() || isOverMaxBatchIntervalLimit())) { doBulkWrite(); } } } - @Override - public Optional prepareCommit() { - doBulkWrite(); - return Optional.empty(); + public Optional prepareCommit() { + if (!transaction) { + doBulkWrite(); + return Optional.empty(); + } + + List bsonDocuments = new ArrayList<>(); + AtomicInteger counter = new AtomicInteger(); + + bulkRequests.stream() + .map(this::convertModelToBsonDocument) + .collect( + Collectors.groupingBy( + it -> counter.getAndIncrement() / DocumentBulk.BUFFER_SIZE)) + .values() + .stream() + .map(this::convertBsonDocumentListToDocumentBulk) + .forEach(bsonDocuments::add); + + bulkRequests.clear(); + + return Optional.of(new MongodbCommitInfo(bsonDocuments)); + } + + private BsonDocument convertModelToBsonDocument(WriteModel model) { + if (model instanceof InsertOneModel) { + return ((InsertOneModel) model).getDocument(); + } else if (model instanceof UpdateOneModel) { + return (BsonDocument) ((UpdateOneModel) model).getUpdate(); + } + return null; + } + + private DocumentBulk convertBsonDocumentListToDocumentBulk(List documentList) { + DocumentBulk documentBulk = new DocumentBulk(); + documentList.forEach(documentBulk::add); + return documentBulk; } + @Override + public void abortPrepare() {} + @Override public void close() { - doBulkWrite(); + if (!transaction) { + doBulkWrite(); + } if (collectionProvider != null) { collectionProvider.close(); } diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbWriterOptions.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbWriterOptions.java index be8becd3275..e9b82647756 100644 --- a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbWriterOptions.java +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/MongodbWriterOptions.java @@ -44,16 +44,19 @@ public class MongodbWriterOptions implements Serializable { protected final long retryInterval; + protected final boolean transaction; + public MongodbWriterOptions( String connectString, String database, String collection, int flushSize, - Long batchIntervalMs, + long batchIntervalMs, boolean upsertEnable, String[] primaryKey, int retryMax, - Long retryInterval) { + long retryInterval, + boolean transaction) { this.connectString = connectString; this.database = database; this.collection = collection; @@ -63,6 +66,7 @@ public MongodbWriterOptions( this.primaryKey = primaryKey; this.retryMax = retryMax; this.retryInterval = retryInterval; + this.transaction = transaction; } public static Builder builder() { @@ -89,6 +93,8 @@ public static class Builder { protected long retryInterval; + protected boolean transaction; + public Builder withConnectString(String connectString) { this.connectString = connectString; return this; @@ -134,6 +140,11 @@ public Builder 
withRetryInterval(Long retryInterval) { return this; } + public Builder withTransaction(boolean transaction) { + this.transaction = transaction; + return this; + } + public MongodbWriterOptions build() { return new MongodbWriterOptions( connectString, @@ -144,7 +155,8 @@ public MongodbWriterOptions build() { upsertEnable, primaryKey, retryMax, - retryInterval); + retryInterval, + transaction); } } } diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/CommittableTransaction.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/CommittableTransaction.java new file mode 100644 index 00000000000..42b61edbe1a --- /dev/null +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/CommittableTransaction.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.mongodb.sink.commit; + +import org.bson.BsonDocument; + +import com.mongodb.client.MongoCollection; +import com.mongodb.client.TransactionBody; +import com.mongodb.client.result.InsertManyResult; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +public class CommittableTransaction implements TransactionBody, Serializable { + + private static final int BUFFER_INIT_SIZE = 1024; + + protected final MongoCollection collection; + + protected List bufferedDocuments = new ArrayList<>(BUFFER_INIT_SIZE); + + public CommittableTransaction( + MongoCollection collection, List documents) { + this.collection = collection; + this.bufferedDocuments.addAll(documents); + } + + @Override + public Integer execute() { + InsertManyResult result = collection.insertMany(bufferedDocuments); + return result.getInsertedIds().size(); + } +} diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/CommittableUpsertTransaction.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/CommittableUpsertTransaction.java new file mode 100644 index 00000000000..1fa3669e969 --- /dev/null +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/CommittableUpsertTransaction.java @@ -0,0 +1,68 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.mongodb.sink.commit; + +import org.bson.BsonDocument; +import org.bson.conversions.Bson; + +import com.mongodb.bulk.BulkWriteResult; +import com.mongodb.client.MongoCollection; +import com.mongodb.client.model.BulkWriteOptions; +import com.mongodb.client.model.Filters; +import com.mongodb.client.model.UpdateOneModel; +import com.mongodb.client.model.UpdateOptions; + +import java.util.ArrayList; +import java.util.List; + +public class CommittableUpsertTransaction extends CommittableTransaction { + + private final String[] upsertKeys; + private final UpdateOptions updateOptions = new UpdateOptions(); + private final BulkWriteOptions bulkWriteOptions = new BulkWriteOptions(); + + public CommittableUpsertTransaction( + MongoCollection collection, + List documents, + String[] upsertKeys) { + super(collection, documents); + this.upsertKeys = upsertKeys; + updateOptions.upsert(true); + bulkWriteOptions.ordered(true); + } + + @Override + public Integer execute() { + List> upserts = new ArrayList<>(); + for (BsonDocument document : bufferedDocuments) { + List filters = new ArrayList<>(upsertKeys.length); + for (String upsertKey : upsertKeys) { + Object o = document.get("$set").asDocument().get(upsertKey); + Bson eq = Filters.eq(upsertKey, o); + filters.add(eq); + } + Bson filter = Filters.and(filters); + UpdateOneModel updateOneModel = + new UpdateOneModel<>(filter, document, updateOptions); + upserts.add(updateOneModel); + } + + BulkWriteResult bulkWriteResult = collection.bulkWrite(upserts, bulkWriteOptions); + return bulkWriteResult.getUpserts().size() + bulkWriteResult.getInsertedCount(); + } +} diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/MongodbSinkAggregatedCommitter.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/MongodbSinkAggregatedCommitter.java new file mode 100644 index 00000000000..0ee73a3012d --- /dev/null +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/commit/MongodbSinkAggregatedCommitter.java @@ -0,0 +1,167 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.mongodb.sink.commit; + +import org.apache.seatunnel.api.sink.SinkAggregatedCommitter; +import org.apache.seatunnel.connectors.seatunnel.mongodb.internal.MongodbClientProvider; +import org.apache.seatunnel.connectors.seatunnel.mongodb.internal.MongodbCollectionProvider; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.MongodbWriterOptions; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state.DocumentBulk; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state.MongodbAggregatedCommitInfo; +import org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state.MongodbCommitInfo; + +import org.bson.BsonDocument; + +import com.mongodb.ReadConcern; +import com.mongodb.ReadPreference; +import com.mongodb.TransactionOptions; +import com.mongodb.WriteConcern; +import com.mongodb.client.ClientSession; +import com.mongodb.client.MongoClient; +import com.mongodb.client.MongoCollection; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; +import java.util.stream.Stream; + +@Slf4j +public class MongodbSinkAggregatedCommitter + implements SinkAggregatedCommitter { + + private static final long waitingTime = 5_000L; + + private static final long TRANSACTION_TIMEOUT_MS = 60_000L; + + private final boolean enableUpsert; + + private final String[] upsertKeys; + + private final MongodbClientProvider collectionProvider; + + private ClientSession clientSession; + + private MongoClient client; + + public MongodbSinkAggregatedCommitter(MongodbWriterOptions options) { + this.enableUpsert = options.isUpsertEnable(); + this.upsertKeys = options.getPrimaryKey(); + this.collectionProvider = + MongodbCollectionProvider.builder() + .connectionString(options.getConnectString()) + .database(options.getDatabase()) + .collection(options.getCollection()) + .build(); + } + + @Override + public List commit( + List aggregatedCommitInfo) { + return aggregatedCommitInfo.stream() + .map(this::processAggregatedCommitInfo) + .filter( + failedAggregatedCommitInfo -> + !failedAggregatedCommitInfo.getCommitInfos().isEmpty()) + .collect(Collectors.toList()); + } + + private MongodbAggregatedCommitInfo processAggregatedCommitInfo( + MongodbAggregatedCommitInfo aggregatedCommitInfo) { + List failedCommitInfos = + aggregatedCommitInfo.getCommitInfos().stream() + .flatMap( + (Function>>) + this::processCommitInfo) + .filter(failedDocumentBulks -> !failedDocumentBulks.isEmpty()) + .map(MongodbCommitInfo::new) + .collect(Collectors.toList()); + + return new MongodbAggregatedCommitInfo(failedCommitInfos); + } + + private Stream> processCommitInfo(MongodbCommitInfo commitInfo) { + client = collectionProvider.getClient(); + clientSession = client.startSession(); + MongoCollection collection = collectionProvider.getDefaultCollection(); + return Stream.of( + commitInfo.getDocumentBulks().stream() + .filter(bulk -> !bulk.getDocuments().isEmpty()) + .filter( + bulk -> { + try { + CommittableTransaction transaction; + if (enableUpsert) { + transaction = + new CommittableUpsertTransaction( + collection, + bulk.getDocuments(), + upsertKeys); + } else { + transaction = + new CommittableTransaction( + collection, bulk.getDocuments()); + } + + int insertedDocs = + clientSession.withTransaction( + transaction, + TransactionOptions.builder() + 
.readPreference( + ReadPreference.primary()) + .readConcern(ReadConcern.LOCAL) + .writeConcern(WriteConcern.MAJORITY) + .build()); + log.info( + "Inserted {} documents into collection {}.", + insertedDocs, + collection.getNamespace()); + return false; + } catch (Exception e) { + log.error("Failed to commit with Mongo transaction.", e); + return true; + } + }) + .collect(Collectors.toList())); + } + + @Override + public MongodbAggregatedCommitInfo combine(List commitInfos) { + return new MongodbAggregatedCommitInfo(commitInfos); + } + + @Override + public void abort(List aggregatedCommitInfo) {} + + @SneakyThrows + @Override + public void close() { + long deadline = System.currentTimeMillis() + TRANSACTION_TIMEOUT_MS; + while (clientSession.hasActiveTransaction() && System.currentTimeMillis() < deadline) { + // wait for active transaction to finish or timeout + Thread.sleep(waitingTime); + } + if (clientSession != null) { + clientSession.close(); + } + if (client != null) { + client.close(); + } + } +} diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/DocumentBulk.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/DocumentBulk.java new file mode 100644 index 00000000000..72a3d105383 --- /dev/null +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/DocumentBulk.java @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state; + +import org.bson.BsonDocument; + +import lombok.EqualsAndHashCode; +import lombok.ToString; + +import java.io.Serializable; +import java.util.ArrayList; +import java.util.List; + +/** + * DocumentBulk is buffered {@link BsonDocument} in memory, which would be written to MongoDB in a + * single transaction. Due to execution efficiency, each DocumentBulk maybe be limited to a maximum + * size, typically 1,000 documents. But for the transactional mode, the maximum size should not be + * respected because all that data must be written in one transaction. 
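The committer above drives each bulk through ClientSession.withTransaction. A standalone sketch of that driver pattern is shown below; the connection string is a placeholder, and multi-document transactions additionally require a replica set or sharded cluster.

    import com.mongodb.ReadConcern;
    import com.mongodb.ReadPreference;
    import com.mongodb.TransactionOptions;
    import com.mongodb.WriteConcern;
    import com.mongodb.client.ClientSession;
    import com.mongodb.client.MongoClient;
    import com.mongodb.client.MongoClients;

    try (MongoClient client = MongoClients.create("mongodb://localhost:27017");
            ClientSession session = client.startSession()) {
        TransactionOptions txnOptions =
                TransactionOptions.builder()
                        .readPreference(ReadPreference.primary())
                        .readConcern(ReadConcern.LOCAL)
                        .writeConcern(WriteConcern.MAJORITY)
                        .build();
        // The driver retries the body on transient transaction errors. Writes only
        // join the transaction when the ClientSession is passed to them, e.g.
        // collection.insertMany(session, documents).
        Integer inserted = session.withTransaction(() -> 0, txnOptions);
    }
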
+ */ +@ToString +@EqualsAndHashCode +public class DocumentBulk implements Serializable { + + public static final int BUFFER_SIZE = 1024; + + private final List bufferedDocuments; + + public DocumentBulk() { + bufferedDocuments = new ArrayList<>(BUFFER_SIZE); + } + + public void add(BsonDocument document) { + if (bufferedDocuments.size() == BUFFER_SIZE) { + throw new IllegalStateException("DocumentBulk is already full"); + } + bufferedDocuments.add(document); + } + + public int size() { + return bufferedDocuments.size(); + } + + public List getDocuments() { + return bufferedDocuments; + } +} diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/MongodbAggregatedCommitInfo.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/MongodbAggregatedCommitInfo.java new file mode 100644 index 00000000000..6b97d616af0 --- /dev/null +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/MongodbAggregatedCommitInfo.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state; + +import lombok.AllArgsConstructor; +import lombok.Data; + +import java.io.Serializable; +import java.util.List; + +@Data +@AllArgsConstructor +public class MongodbAggregatedCommitInfo implements Serializable { + List commitInfos; +} diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/MongodbCommitInfo.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/MongodbCommitInfo.java new file mode 100644 index 00000000000..052cd4c5a87 --- /dev/null +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/sink/state/MongodbCommitInfo.java @@ -0,0 +1,30 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
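A short sketch of how a writer might roll incoming documents into DocumentBulk instances of at most BUFFER_SIZE entries; the helper method is hypothetical, since the writer-side buffering is not part of this hunk.

    import org.bson.BsonDocument;
    import java.util.ArrayList;
    import java.util.List;

    static List<DocumentBulk> toBulks(List<BsonDocument> incoming) {
        List<DocumentBulk> bulks = new ArrayList<>();
        DocumentBulk current = new DocumentBulk();
        for (BsonDocument doc : incoming) {
            if (current.size() == DocumentBulk.BUFFER_SIZE) {  // full: start a new bulk
                bulks.add(current);
                current = new DocumentBulk();
            }
            current.add(doc);
        }
        if (current.size() > 0) {
            bulks.add(current);
        }
        return bulks;
    }
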
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.mongodb.sink.state; + +import lombok.AllArgsConstructor; +import lombok.Data; + +import java.io.Serializable; +import java.util.List; + +@Data +@AllArgsConstructor +public class MongodbCommitInfo implements Serializable { + List documentBulks; +} diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/source/MongodbSource.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/source/MongodbSource.java index e68796faa9e..d611a9bd53b 100644 --- a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/source/MongodbSource.java +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/source/MongodbSource.java @@ -46,6 +46,7 @@ import com.google.auto.service.AutoService; import java.util.ArrayList; +import java.util.List; import static org.apache.seatunnel.connectors.seatunnel.mongodb.config.MongodbConfig.CONNECTOR_IDENTITY; @@ -111,6 +112,17 @@ public void prepare(Config pluginConfig) throws PrepareFailException { splitStrategyBuilder.setMatchQuery( BsonDocument.parse(pluginConfig.getString(MongodbConfig.MATCH_QUERY.key()))); } + + List fallbackKeys = MongodbConfig.MATCH_QUERY.getFallbackKeys(); + fallbackKeys.forEach( + key -> { + if (pluginConfig.hasPath(key)) { + splitStrategyBuilder.setMatchQuery( + BsonDocument.parse( + pluginConfig.getString(MongodbConfig.MATCH_QUERY.key()))); + } + }); + if (pluginConfig.hasPath(MongodbConfig.SPLIT_KEY.key())) { splitStrategyBuilder.setSplitKey(pluginConfig.getString(MongodbConfig.SPLIT_KEY.key())); } diff --git a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/source/config/MongodbReadOptions.java b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/source/config/MongodbReadOptions.java index f0020eb41ad..faffe15bc34 100644 --- a/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/source/config/MongodbReadOptions.java +++ b/seatunnel-connectors-v2/connector-mongodb/src/main/java/org/apache/seatunnel/connectors/seatunnel/mongodb/source/config/MongodbReadOptions.java @@ -22,10 +22,10 @@ import java.io.Serializable; -import static com.google.common.base.Preconditions.checkArgument; import static org.apache.seatunnel.connectors.seatunnel.mongodb.config.MongodbConfig.CURSOR_NO_TIMEOUT; import static org.apache.seatunnel.connectors.seatunnel.mongodb.config.MongodbConfig.FETCH_SIZE; import static org.apache.seatunnel.connectors.seatunnel.mongodb.config.MongodbConfig.MAX_TIME_MIN; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; /** The configuration class for MongoDB source. 
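On the fallback keys added to MongodbSource.prepare(): the conventional pattern reads the value from whichever fallback key is actually present, roughly as sketched below. This is a sketch of that intent, not the exact committed hunk, which re-reads MATCH_QUERY.key().

    List<String> fallbackKeys = MongodbConfig.MATCH_QUERY.getFallbackKeys();
    for (String key : fallbackKeys) {
        if (pluginConfig.hasPath(key)) {
            splitStrategyBuilder.setMatchQuery(
                    BsonDocument.parse(pluginConfig.getString(key)));
            break;  // first matching fallback wins
        }
    }
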
*/ @EqualsAndHashCode diff --git a/seatunnel-connectors-v2/connector-paimon/pom.xml b/seatunnel-connectors-v2/connector-paimon/pom.xml index 5d415d90c18..8f6fe7c8827 100644 --- a/seatunnel-connectors-v2/connector-paimon/pom.xml +++ b/seatunnel-connectors-v2/connector-paimon/pom.xml @@ -30,7 +30,7 @@ SeaTunnel : Connectors V2 : Paimon - 0.4-SNAPSHOT + 0.4.0-incubating diff --git a/seatunnel-connectors-v2/connector-pulsar/src/main/java/org/apache/seatunnel/connectors/seatunnel/pulsar/source/enumerator/cursor/start/MessageIdStartCursor.java b/seatunnel-connectors-v2/connector-pulsar/src/main/java/org/apache/seatunnel/connectors/seatunnel/pulsar/source/enumerator/cursor/start/MessageIdStartCursor.java index b70c8a540a8..c539f7102dc 100644 --- a/seatunnel-connectors-v2/connector-pulsar/src/main/java/org/apache/seatunnel/connectors/seatunnel/pulsar/source/enumerator/cursor/start/MessageIdStartCursor.java +++ b/seatunnel-connectors-v2/connector-pulsar/src/main/java/org/apache/seatunnel/connectors/seatunnel/pulsar/source/enumerator/cursor/start/MessageIdStartCursor.java @@ -24,7 +24,7 @@ import org.apache.pulsar.client.api.PulsarClientException; import org.apache.pulsar.client.impl.MessageIdImpl; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; /** This cursor would left pulsar start consuming from a specific message id. */ public class MessageIdStartCursor implements StartCursor { diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisConfig.java b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisConfig.java index c777d237827..511cbe4aa99 100644 --- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisConfig.java +++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisConfig.java @@ -102,6 +102,12 @@ public enum HashKeyParseMode { .withDescription( "hash key parse mode, support all or kv, default value is all"); + public static final Option EXPIRE = + Options.key("expire") + .longType() + .defaultValue(-1L) + .withDescription("Set redis expiration time."); + public enum Format { JSON, // TEXT will be supported later diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisDataType.java b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisDataType.java index 64772b5381d..a315e0cdae0 100644 --- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisDataType.java +++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisDataType.java @@ -30,8 +30,9 @@ public enum RedisDataType { KEY { @Override - public void set(Jedis jedis, String key, String value) { + public void set(Jedis jedis, String key, String value, long expire) { jedis.set(key, value); + expire(jedis, key, expire); } @Override @@ -41,9 +42,10 @@ public List get(Jedis jedis, String key) { }, HASH { @Override - public void set(Jedis jedis, String key, String value) { + public void set(Jedis jedis, String key, String value, long expire) { Map fieldsMap = JsonUtils.toMap(value); jedis.hset(key, fieldsMap); + expire(jedis, 
key, expire); } @Override @@ -54,8 +56,9 @@ public List get(Jedis jedis, String key) { }, LIST { @Override - public void set(Jedis jedis, String key, String value) { + public void set(Jedis jedis, String key, String value, long expire) { jedis.lpush(key, value); + expire(jedis, key, expire); } @Override @@ -65,8 +68,9 @@ public List get(Jedis jedis, String key) { }, SET { @Override - public void set(Jedis jedis, String key, String value) { + public void set(Jedis jedis, String key, String value, long expire) { jedis.sadd(key, value); + expire(jedis, key, expire); } @Override @@ -77,8 +81,9 @@ public List get(Jedis jedis, String key) { }, ZSET { @Override - public void set(Jedis jedis, String key, String value) { + public void set(Jedis jedis, String key, String value, long expire) { jedis.zadd(key, 1, value); + expire(jedis, key, expire); } @Override @@ -91,7 +96,13 @@ public List get(Jedis jedis, String key) { return Collections.emptyList(); } - public void set(Jedis jedis, String key, String value) { + private static void expire(Jedis jedis, String key, long expire) { + if (expire > 0) { + jedis.expire(key, expire); + } + } + + public void set(Jedis jedis, String key, String value, long expire) { // do nothing } } diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisParameters.java b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisParameters.java index c8bb879d0f5..8954b4da2a1 100644 --- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisParameters.java +++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/config/RedisParameters.java @@ -47,6 +47,7 @@ public class RedisParameters implements Serializable { private RedisConfig.RedisMode mode; private RedisConfig.HashKeyParseMode hashKeyParseMode; private List redisNodes = Collections.emptyList(); + private long expire = RedisConfig.EXPIRE.defaultValue(); public void buildWithConfig(Config config) { // set host @@ -89,6 +90,9 @@ public void buildWithConfig(Config config) { if (config.hasPath(RedisConfig.KEY_PATTERN.key())) { this.keysPattern = config.getString(RedisConfig.KEY_PATTERN.key()); } + if (config.hasPath(RedisConfig.EXPIRE.key())) { + this.expire = config.getLong(RedisConfig.EXPIRE.key()); + } // set redis data type try { String dataType = config.getString(RedisConfig.DATA_TYPE.key()); diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkFactory.java b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkFactory.java index e68a893f79c..22ae1568740 100644 --- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkFactory.java +++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkFactory.java @@ -41,7 +41,8 @@ public OptionRule optionRule() { RedisConfig.AUTH, RedisConfig.USER, RedisConfig.KEY_PATTERN, - RedisConfig.FORMAT) + RedisConfig.FORMAT, + RedisConfig.EXPIRE) .conditional(RedisConfig.MODE, RedisConfig.RedisMode.CLUSTER, RedisConfig.NODES) .build(); } diff --git a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkWriter.java 
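The expire(...) helper applies a TTL only when the configured value is positive; the option defaults to -1, meaning no expiration. A minimal Jedis sketch of the same pattern, assuming a Jedis version whose expire(key, seconds) overload accepts a long:

    import redis.clients.jedis.Jedis;

    try (Jedis jedis = new Jedis("localhost", 6379)) {   // host/port are placeholders
        long expire = 60L;                               // seconds; -1 disables the TTL
        jedis.set("st:key", "value");
        if (expire > 0) {
            jedis.expire("st:key", expire);
        }
    }
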
b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkWriter.java index 657e3aaa565..80b1449b9d6 100644 --- a/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkWriter.java +++ b/seatunnel-connectors-v2/connector-redis/src/main/java/org/apache/seatunnel/connectors/seatunnel/redis/sink/RedisSinkWriter.java @@ -59,7 +59,8 @@ public void write(SeaTunnelRow element) throws IOException { } else { key = keyField; } - redisDataType.set(jedis, key, data); + long expire = redisParameters.getExpire(); + redisDataType.set(jedis, key, data, expire); } @Override diff --git a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/catalog/StarRocksCatalog.java b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/catalog/StarRocksCatalog.java index 1828c23fa25..24f437048ea 100644 --- a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/catalog/StarRocksCatalog.java +++ b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/catalog/StarRocksCatalog.java @@ -60,7 +60,7 @@ import java.util.Optional; import java.util.Set; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; public class StarRocksCatalog implements Catalog { diff --git a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/client/source/StarRocksRowBatchReader.java b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/client/source/StarRocksRowBatchReader.java index 4a192705894..2ea7e98c7e5 100644 --- a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/client/source/StarRocksRowBatchReader.java +++ b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/client/source/StarRocksRowBatchReader.java @@ -50,7 +50,7 @@ import java.util.ArrayList; import java.util.List; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; @Slf4j public class StarRocksRowBatchReader { diff --git a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/config/StarRocksSinkOptions.java b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/config/StarRocksSinkOptions.java index 5f66524ee18..ee9e5171c29 100644 --- a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/config/StarRocksSinkOptions.java +++ b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/config/StarRocksSinkOptions.java @@ -145,7 +145,8 @@ public interface StarRocksSinkOptions { Options.key("save_mode") .enumType(DataSaveMode.class) .defaultValue(DataSaveMode.KEEP_SCHEMA_AND_DATA) - .withDescription("save_mode"); + .withDescription( + "Table structure and data processing methods that already exist on the target end"); Option CUSTOM_SQL = 
Options.key("custom_sql").stringType().noDefaultValue().withDescription("custom_sql"); diff --git a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializer.java b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializer.java index d91a83920df..5bf15c533a1 100644 --- a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializer.java +++ b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializer.java @@ -17,13 +17,12 @@ package org.apache.seatunnel.connectors.seatunnel.starrocks.serialize; -import org.apache.seatunnel.api.table.type.SeaTunnelDataType; import org.apache.seatunnel.api.table.type.SeaTunnelRow; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import org.apache.seatunnel.api.table.type.SqlType; import org.apache.seatunnel.common.utils.JsonUtils; -import java.util.HashMap; +import java.util.LinkedHashMap; import java.util.Map; public class StarRocksJsonSerializer extends StarRocksBaseSerializer @@ -40,17 +39,18 @@ public StarRocksJsonSerializer(SeaTunnelRowType seaTunnelRowType, boolean enable @Override public String serialize(SeaTunnelRow row) { - Map rowMap = new HashMap<>(row.getFields().length); + Map rowMap = new LinkedHashMap<>(row.getFields().length); for (int i = 0; i < row.getFields().length; i++) { - SeaTunnelDataType fieldType = seaTunnelRowType.getFieldType(i); + SqlType sqlType = seaTunnelRowType.getFieldType(i).getSqlType(); Object value; - if (fieldType.getSqlType() == SqlType.ARRAY - || fieldType.getSqlType() == SqlType.MAP - || fieldType.getSqlType() == SqlType.ROW - || fieldType.getSqlType() == SqlType.MULTIPLE_ROW) { - // For struct type, we cannot transform to JsonString - // Since the whole rowMap will be transformed to JsonString + if (sqlType == SqlType.ARRAY + || sqlType == SqlType.MAP + || sqlType == SqlType.ROW + || sqlType == SqlType.MULTIPLE_ROW) { + // If the field type is complex type, we should keep the origin value. + // It will be transformed to json string in the next step + // JsonUtils.toJsonString(rowMap). 
value = row.getField(i); } else { value = convert(seaTunnelRowType.getFieldType(i), row.getField(i)); diff --git a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/sink/StarRocksSinkFactory.java b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/sink/StarRocksSinkFactory.java index dcc9cef7412..e6f1ed1fbad 100644 --- a/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/sink/StarRocksSinkFactory.java +++ b/seatunnel-connectors-v2/connector-starrocks/src/main/java/org/apache/seatunnel/connectors/seatunnel/starrocks/sink/StarRocksSinkFactory.java @@ -57,6 +57,7 @@ public OptionRule optionRule() { StarRocksSinkOptions.RETRY_BACKOFF_MULTIPLIER_MS, StarRocksSinkOptions.STARROCKS_CONFIG, StarRocksSinkOptions.ENABLE_UPSERT_DELETE, + StarRocksSinkOptions.SAVE_MODE, StarRocksSinkOptions.SAVE_MODE_CREATE_TEMPLATE) .conditional( StarRocksSinkOptions.SAVE_MODE, diff --git a/seatunnel-connectors-v2/connector-starrocks/src/test/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializerTest.java b/seatunnel-connectors-v2/connector-starrocks/src/test/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializerTest.java new file mode 100644 index 00000000000..6e0d9476441 --- /dev/null +++ b/seatunnel-connectors-v2/connector-starrocks/src/test/java/org/apache/seatunnel/connectors/seatunnel/starrocks/serialize/StarRocksJsonSerializerTest.java @@ -0,0 +1,56 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.starrocks.serialize; + +import org.apache.seatunnel.api.table.type.ArrayType; +import org.apache.seatunnel.api.table.type.BasicType; +import org.apache.seatunnel.api.table.type.MapType; +import org.apache.seatunnel.api.table.type.SeaTunnelDataType; +import org.apache.seatunnel.api.table.type.SeaTunnelRow; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +import java.util.Collections; + +public class StarRocksJsonSerializerTest { + + @Test + public void serialize() { + String[] filedNames = {"id", "name", "array", "map"}; + SeaTunnelDataType[] filedTypes = { + BasicType.LONG_TYPE, + BasicType.STRING_TYPE, + ArrayType.STRING_ARRAY_TYPE, + new MapType<>(BasicType.STRING_TYPE, BasicType.STRING_TYPE) + }; + + SeaTunnelRowType seaTunnelRowType = new SeaTunnelRowType(filedNames, filedTypes); + StarRocksJsonSerializer starRocksJsonSerializer = + new StarRocksJsonSerializer(seaTunnelRowType, false); + Object[] fields = { + 1, "Tom", new String[] {"tag1", "tag2"}, Collections.singletonMap("key1", "value1") + }; + SeaTunnelRow seaTunnelRow = new SeaTunnelRow(fields); + String jsonString = starRocksJsonSerializer.serialize(seaTunnelRow); + Assertions.assertEquals( + "{\"id\":1,\"name\":\"Tom\",\"array\":[\"tag1\",\"tag2\"],\"map\":{\"key1\":\"value1\"}}", + jsonString); + } +} diff --git a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/config/TablestoreConfig.java b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/config/TablestoreConfig.java index f64eb8473b0..3e1714c5516 100644 --- a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/config/TablestoreConfig.java +++ b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/config/TablestoreConfig.java @@ -50,11 +50,6 @@ public class TablestoreConfig implements Serializable { .stringType() .defaultValue("25") .withDescription(" Tablestore batch_size"); - public static final Option BATCH_INTERVAL_MS = - Options.key("batch_interval_ms") - .stringType() - .defaultValue("1000") - .withDescription(" Tablestore batch_interval_ms"); public static final Option PRIMARY_KEYS = Options.key("primary_keys") .stringType() diff --git a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/config/TablestoreOptions.java b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/config/TablestoreOptions.java index ba6c0089395..7b2aa6bae67 100644 --- a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/config/TablestoreOptions.java +++ b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/config/TablestoreOptions.java @@ -25,7 +25,6 @@ import java.io.Serializable; import java.util.List; -import static org.apache.seatunnel.connectors.seatunnel.tablestore.config.TablestoreConfig.BATCH_INTERVAL_MS; import static org.apache.seatunnel.connectors.seatunnel.tablestore.config.TablestoreConfig.BATCH_SIZE; @Data @@ -45,7 +44,6 @@ public class TablestoreOptions implements Serializable { private List primaryKeys; public int batchSize = 
Integer.parseInt(BATCH_SIZE.defaultValue()); - public int batchIntervalMs = Integer.parseInt(BATCH_INTERVAL_MS.defaultValue()); public TablestoreOptions(Config config) { this.endpoint = config.getString(TablestoreConfig.END_POINT.key()); @@ -58,8 +56,5 @@ public TablestoreOptions(Config config) { if (config.hasPath(BATCH_SIZE.key())) { this.batchSize = config.getInt(BATCH_SIZE.key()); } - if (config.hasPath(TablestoreConfig.BATCH_INTERVAL_MS.key())) { - this.batchIntervalMs = config.getInt(TablestoreConfig.BATCH_INTERVAL_MS.key()); - } } } diff --git a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreSinkClient.java b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreSinkClient.java index e3b6f2fbdf3..0637b9b038c 100644 --- a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreSinkClient.java +++ b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreSinkClient.java @@ -27,22 +27,15 @@ import com.alicloud.openservices.tablestore.model.BatchWriteRowRequest; import com.alicloud.openservices.tablestore.model.BatchWriteRowResponse; import com.alicloud.openservices.tablestore.model.RowPutChange; -import com.google.common.util.concurrent.ThreadFactoryBuilder; import lombok.extern.slf4j.Slf4j; import java.io.IOException; import java.util.ArrayList; import java.util.List; -import java.util.concurrent.Executors; -import java.util.concurrent.ScheduledExecutorService; -import java.util.concurrent.ScheduledFuture; -import java.util.concurrent.TimeUnit; @Slf4j public class TablestoreSinkClient { private final TablestoreOptions tablestoreOptions; - private ScheduledExecutorService scheduler; - private ScheduledFuture scheduledFuture; private volatile boolean initialize; private volatile Exception flushException; private SyncClient syncClient; @@ -64,24 +57,6 @@ private void tryInit() throws IOException { tablestoreOptions.getAccessKeySecret(), tablestoreOptions.getInstanceName()); - scheduler = - Executors.newSingleThreadScheduledExecutor( - new ThreadFactoryBuilder() - .setNameFormat("Tablestore-sink-output-%s") - .build()); - scheduledFuture = - scheduler.scheduleAtFixedRate( - () -> { - try { - flush(); - } catch (IOException e) { - flushException = e; - } - }, - tablestoreOptions.getBatchIntervalMs(), - tablestoreOptions.getBatchIntervalMs(), - TimeUnit.MILLISECONDS); - initialize = true; } @@ -96,17 +71,13 @@ public void write(RowPutChange rowPutChange) throws IOException { } public void close() throws IOException { - if (scheduledFuture != null) { - scheduledFuture.cancel(false); - scheduler.shutdown(); - } if (syncClient != null) { flush(); syncClient.shutdown(); } } - synchronized void flush() throws IOException { + synchronized void flush() { checkFlushException(); if (batchList.isEmpty()) { return; diff --git a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreSinkFactory.java b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreSinkFactory.java index efe39a08c4a..674f641ad64 100644 --- a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreSinkFactory.java +++ 
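With the scheduled flusher removed, TablestoreSinkClient only flushes on size, on close, and (in the next hunk) from the writer's prepareCommit. A sketch of the size-triggered path; the write(...) body below is a plausible reconstruction for illustration, not the exact committed code.

    public void write(RowPutChange rowPutChange) throws IOException {
        tryInit();
        checkFlushException();
        batchList.add(rowPutChange);
        if (batchList.size() >= tablestoreOptions.getBatchSize()) {
            flush();  // size-triggered; the periodic flush thread no longer exists
        }
    }
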
b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreSinkFactory.java @@ -26,7 +26,6 @@ import static org.apache.seatunnel.connectors.seatunnel.tablestore.config.TablestoreConfig.ACCESS_KEY_ID; import static org.apache.seatunnel.connectors.seatunnel.tablestore.config.TablestoreConfig.ACCESS_KEY_SECRET; -import static org.apache.seatunnel.connectors.seatunnel.tablestore.config.TablestoreConfig.BATCH_INTERVAL_MS; import static org.apache.seatunnel.connectors.seatunnel.tablestore.config.TablestoreConfig.BATCH_SIZE; import static org.apache.seatunnel.connectors.seatunnel.tablestore.config.TablestoreConfig.END_POINT; import static org.apache.seatunnel.connectors.seatunnel.tablestore.config.TablestoreConfig.INSTANCE_NAME; @@ -51,7 +50,7 @@ public OptionRule optionRule() { ACCESS_KEY_SECRET, PRIMARY_KEYS, CatalogTableUtil.SCHEMA) - .optional(BATCH_INTERVAL_MS, BATCH_SIZE) + .optional(BATCH_SIZE) .build(); } } diff --git a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreWriter.java b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreWriter.java index 929a421f7f5..22bfe1be27f 100644 --- a/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreWriter.java +++ b/seatunnel-connectors-v2/connector-tablestore/src/main/java/org/apache/seatunnel/connectors/seatunnel/tablestore/sink/TablestoreWriter.java @@ -25,6 +25,7 @@ import org.apache.seatunnel.connectors.seatunnel.tablestore.serialize.SeaTunnelRowSerializer; import java.io.IOException; +import java.util.Optional; public class TablestoreWriter extends AbstractSinkWriter { @@ -46,4 +47,10 @@ public void write(SeaTunnelRow element) throws IOException { public void close() throws IOException { tablestoreSinkClient.close(); } + + @Override + public Optional prepareCommit() { + tablestoreSinkClient.flush(); + return super.prepareCommit(); + } } diff --git a/seatunnel-core/seatunnel-core-starter/src/main/java/org/apache/seatunnel/core/starter/utils/ConfigBuilder.java b/seatunnel-core/seatunnel-core-starter/src/main/java/org/apache/seatunnel/core/starter/utils/ConfigBuilder.java index ed66b550a04..ad063acac8a 100644 --- a/seatunnel-core/seatunnel-core-starter/src/main/java/org/apache/seatunnel/core/starter/utils/ConfigBuilder.java +++ b/seatunnel-core/seatunnel-core-starter/src/main/java/org/apache/seatunnel/core/starter/utils/ConfigBuilder.java @@ -69,6 +69,12 @@ public static Config of(@NonNull Path filePath) { return config; } + public static Config of(@NonNull Map objectMap) { + log.info("Loading config file from objectMap"); + Config config = ConfigFactory.parseMap(objectMap); + return ConfigShadeUtils.decryptConfig(config); + } + public static Config of(@NonNull ConfigAdapter configAdapter, @NonNull Path filePath) { log.info("With config adapter spi {}", configAdapter.getClass().getName()); try { diff --git a/seatunnel-core/seatunnel-flink-starter/seatunnel-flink-starter-common/src/main/java/org/apache/seatunnel/core/starter/flink/execution/FlinkExecution.java b/seatunnel-core/seatunnel-flink-starter/seatunnel-flink-starter-common/src/main/java/org/apache/seatunnel/core/starter/flink/execution/FlinkExecution.java index a3282cc4a1e..5a4050d884d 100644 --- 
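The new ConfigBuilder.of(Map) overload builds a Config straight from key/value pairs and then applies ConfigShadeUtils.decryptConfig. A minimal sketch with the Typesafe Config API; plain coordinates are shown here, while SeaTunnel uses its shaded copy under org.apache.seatunnel.shade.

    import com.typesafe.config.Config;
    import com.typesafe.config.ConfigFactory;

    import java.util.HashMap;
    import java.util.Map;

    Map<String, Object> objectMap = new HashMap<>();
    objectMap.put("job.mode", "BATCH");            // dotted keys become config paths
    objectMap.put("execution.parallelism", 1);

    Config config = ConfigFactory.parseMap(objectMap);
    // ConfigBuilder.of(objectMap) additionally runs the configured config-shade
    // decryption over the parsed Config before returning it.
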
a/seatunnel-core/seatunnel-flink-starter/seatunnel-flink-starter-common/src/main/java/org/apache/seatunnel/core/starter/flink/execution/FlinkExecution.java +++ b/seatunnel-core/seatunnel-flink-starter/seatunnel-flink-starter-common/src/main/java/org/apache/seatunnel/core/starter/flink/execution/FlinkExecution.java @@ -26,6 +26,7 @@ import org.apache.seatunnel.common.Constants; import org.apache.seatunnel.common.config.Common; import org.apache.seatunnel.common.config.TypesafeConfigUtils; +import org.apache.seatunnel.common.constants.JobMode; import org.apache.seatunnel.common.utils.SeaTunnelException; import org.apache.seatunnel.core.starter.exception.TaskExecuteException; import org.apache.seatunnel.core.starter.execution.PluginExecuteProcessor; @@ -33,6 +34,7 @@ import org.apache.seatunnel.core.starter.execution.TaskExecution; import org.apache.seatunnel.core.starter.flink.FlinkStarter; +import org.apache.flink.api.common.RuntimeExecutionMode; import org.apache.flink.streaming.api.datastream.DataStream; import org.apache.flink.types.Row; @@ -111,6 +113,12 @@ public void execute() throws TaskExecuteException { "Flink Execution Plan: {}", flinkRuntimeEnvironment.getStreamExecutionEnvironment().getExecutionPlan()); log.info("Flink job name: {}", flinkRuntimeEnvironment.getJobName()); + if (!flinkRuntimeEnvironment.isStreaming()) { + flinkRuntimeEnvironment + .getStreamExecutionEnvironment() + .setRuntimeMode(RuntimeExecutionMode.BATCH); + log.info("Flink job Mode: {}", JobMode.BATCH); + } try { flinkRuntimeEnvironment .getStreamExecutionEnvironment() diff --git a/seatunnel-core/seatunnel-flink-starter/seatunnel-flink-starter-common/src/main/java/org/apache/seatunnel/core/starter/flink/execution/SourceExecuteProcessor.java b/seatunnel-core/seatunnel-flink-starter/seatunnel-flink-starter-common/src/main/java/org/apache/seatunnel/core/starter/flink/execution/SourceExecuteProcessor.java index a3897a526e9..6bcc5fe8939 100644 --- a/seatunnel-core/seatunnel-flink-starter/seatunnel-flink-starter-common/src/main/java/org/apache/seatunnel/core/starter/flink/execution/SourceExecuteProcessor.java +++ b/seatunnel-core/seatunnel-flink-starter/seatunnel-flink-starter-common/src/main/java/org/apache/seatunnel/core/starter/flink/execution/SourceExecuteProcessor.java @@ -71,13 +71,15 @@ public List> execute(List> upstreamDataStreams) } else { sourceFunction = new SeaTunnelParallelSource(internalSource); } + boolean bounded = + internalSource.getBoundedness() + == org.apache.seatunnel.api.source.Boundedness.BOUNDED; DataStreamSource sourceStream = addSource( executionEnvironment, sourceFunction, "SeaTunnel " + internalSource.getClass().getSimpleName(), - internalSource.getBoundedness() - == org.apache.seatunnel.api.source.Boundedness.BOUNDED); + bounded); Config pluginConfig = pluginConfigs.get(i); if (pluginConfig.hasPath(CommonOptions.PARALLELISM.key())) { int parallelism = pluginConfig.getInt(CommonOptions.PARALLELISM.key()); diff --git a/seatunnel-core/seatunnel-spark-starter/seatunnel-spark-starter-common/src/main/java/org/apache/seatunnel/core/starter/spark/execution/TransformExecuteProcessor.java b/seatunnel-core/seatunnel-spark-starter/seatunnel-spark-starter-common/src/main/java/org/apache/seatunnel/core/starter/spark/execution/TransformExecuteProcessor.java index 179598b3a61..fc9be559257 100644 --- a/seatunnel-core/seatunnel-spark-starter/seatunnel-spark-starter-common/src/main/java/org/apache/seatunnel/core/starter/spark/execution/TransformExecuteProcessor.java +++ 
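FlinkExecution now switches bounded (non-streaming) jobs to Flink's batch runtime mode. The underlying Flink API, shown in isolation:

    import org.apache.flink.api.common.RuntimeExecutionMode;
    import org.apache.flink.streaming.api.environment.StreamExecutionEnvironment;

    StreamExecutionEnvironment env = StreamExecutionEnvironment.getExecutionEnvironment();
    // Bounded pipelines get batch scheduling and blocking shuffles instead of the
    // streaming runtime; all sources must be bounded for this mode to apply cleanly.
    env.setRuntimeMode(RuntimeExecutionMode.BATCH);
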
b/seatunnel-core/seatunnel-spark-starter/seatunnel-spark-starter-common/src/main/java/org/apache/seatunnel/core/starter/spark/execution/TransformExecuteProcessor.java @@ -185,7 +185,6 @@ public Row next() { return null; } seaTunnelRow = outputRowConverter.convert(seaTunnelRow); - return new GenericRowWithSchema(seaTunnelRow.getFields(), structType); } catch (Exception e) { throw new TaskExecuteException("Row convert failed, caused: " + e.getMessage(), e); diff --git a/seatunnel-dist/pom.xml b/seatunnel-dist/pom.xml index 913eebd0509..a6c1f6f997b 100644 --- a/seatunnel-dist/pom.xml +++ b/seatunnel-dist/pom.xml @@ -165,6 +165,18 @@ ${project.version} provided + + org.apache.seatunnel + connector-file-jindo-oss + ${project.version} + provided + + + org.apache.seatunnel + connector-file-cos + ${project.version} + provided + org.apache.seatunnel connector-assert @@ -219,6 +231,18 @@ ${project.version} provided + + org.apache.seatunnel + connector-cdc-mysql + ${project.version} + provided + + + org.apache.seatunnel + connector-cdc-mongodb + ${project.version} + provided + org.apache.seatunnel connector-cdc-sqlserver @@ -244,6 +268,13 @@ provided + + org.apache.seatunnel + connector-paimon + ${project.version} + provided + + com.aliyun.phoenix @@ -326,6 +357,21 @@ provided + + net.snowflake + snowflake-jdbc + ${snowflake.version} + provided + + + + + io.netty + netty-buffer + ${netty-buffer.version} + provided + + com.ibm.informix ifx-changestream-client diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mongodb-e2e/src/test/java/mongodb/MongodbCDCIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mongodb-e2e/src/test/java/mongodb/MongodbCDCIT.java index dd7f985f176..c01b36ef188 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mongodb-e2e/src/test/java/mongodb/MongodbCDCIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mongodb-e2e/src/test/java/mongodb/MongodbCDCIT.java @@ -34,7 +34,9 @@ import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.TestTemplate; import org.testcontainers.containers.Container; +import org.testcontainers.containers.output.Slf4jLogConsumer; import org.testcontainers.lifecycle.Startables; +import org.testcontainers.utility.DockerLoggerFactory; import com.mongodb.client.MongoClient; import com.mongodb.client.MongoClients; @@ -78,7 +80,7 @@ public class MongodbCDCIT extends TestSuiteBase implements TestResource { // ---------------------------------------------------------------------------- // mysql - private static final String MYSQL_HOST = "mysql_cdc_e2e"; + private static final String MYSQL_HOST = "mysql_e2e"; private static final String MYSQL_USER_NAME = "st_user"; @@ -104,8 +106,10 @@ private static MySqlContainer createMySqlContainer() { mySqlContainer.withDatabaseName(MYSQL_DATABASE); mySqlContainer.withUsername(MYSQL_USER_NAME); mySqlContainer.withPassword(MYSQL_USER_PASSWORD); + mySqlContainer.withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger("Mysql-Docker-Image"))); // For local test use - // mySqlContainer.setPortBindings(Collections.singletonList("3308:3306")); + mySqlContainer.setPortBindings(Collections.singletonList("3310:3306")); return mySqlContainer; } @@ -134,6 +138,9 @@ public void startUp() { mongodbContainer = new MongoDBContainer(NETWORK); // For local test use mongodbContainer.setPortBindings(Collections.singletonList("27017:27017")); + mongodbContainer.withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger("Mongodb-Docker-Image"))); + 
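The MongoDB CDC IT now attaches an SLF4J log consumer to both containers and binds fixed host ports for local debugging. A generic Testcontainers sketch of those two calls; the image name is a placeholder, and fixed port bindings can clash on shared CI hosts.

    import org.testcontainers.containers.GenericContainer;
    import org.testcontainers.containers.output.Slf4jLogConsumer;
    import org.testcontainers.utility.DockerLoggerFactory;

    import java.util.Collections;

    GenericContainer<?> mysql =
            new GenericContainer<>("mysql:8.0")  // image name is illustrative
                    .withLogConsumer(
                            new Slf4jLogConsumer(
                                    DockerLoggerFactory.getLogger("Mysql-Docker-Image")));
    mysql.setPortBindings(Collections.singletonList("3310:3306"));
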
Startables.deepStart(Stream.of(mongodbContainer)).join(); mongodbContainer.executeCommandFileInSeparateDatabase(MONGODB_DATABASE); initConnection(); @@ -213,6 +220,7 @@ private List> querySql() { for (int i = 1; i <= columnCount; i++) { objects.add(resultSet.getObject(i)); } + log.info("Print mysql sink data:" + objects); result.add(objects); } return result; diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mongodb-e2e/src/test/resources/mongodbcdc_to_mysql.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mongodb-e2e/src/test/resources/mongodbcdc_to_mysql.conf index 7e4a492390b..614380ab304 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mongodb-e2e/src/test/resources/mongodbcdc_to_mysql.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mongodb-e2e/src/test/resources/mongodbcdc_to_mysql.conf @@ -14,15 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. # -###### -###### This config file is a demonstration of streaming processing in seatunnel config -###### env { # You can set engine configuration here execution.parallelism = 1 job.mode = "STREAMING" - execution.checkpoint.interval = 5000 + checkpoint.interval = 5000 } source { @@ -45,11 +42,10 @@ source { sink { jdbc { - url = "jdbc:mysql://mysql_cdc_e2e:3306?useSSL=false&useUnicode=true&characterEncoding=UTF-8&allowPublicKeyRetrieval=false&useJDBCCompliantTimezoneShift=true&useLegacyDatetimeCode=false&serverTimezone=GMT%2B8" + url = "jdbc:mysql://mysql_e2e:3306/mongodb_cdc" driver = "com.mysql.cj.jdbc.Driver" user = "st_user" password = "seatunnel" - generate_sink_sql = true # You need to configure both database and table database = mongodb_cdc diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mysql-e2e/src/test/resources/mysqlcdc_to_mysql.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mysql-e2e/src/test/resources/mysqlcdc_to_mysql.conf index e8d85aecc5c..0adf2f7e64d 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mysql-e2e/src/test/resources/mysqlcdc_to_mysql.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-mysql-e2e/src/test/resources/mysqlcdc_to_mysql.conf @@ -22,7 +22,7 @@ env { # You can set engine configuration here execution.parallelism = 1 job.mode = "STREAMING" - execution.checkpoint.interval = 5000 + checkpoint.interval = 5000 } source { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-sqlserver-e2e/src/test/resources/sqlservercdc_to_console.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-sqlserver-e2e/src/test/resources/sqlservercdc_to_console.conf index c4ac06877b1..9d3f041ede1 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-sqlserver-e2e/src/test/resources/sqlservercdc_to_console.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-cdc-sqlserver-e2e/src/test/resources/sqlservercdc_to_console.conf @@ -51,8 +51,8 @@ sink { user = "sa" password = "Password!" 
generate_sink_sql = true - database = "" - table = "column_type_test.dbo.full_types_sink" + database = "column_type_test" + table = "dbo.full_types_sink" batch_size = 1 primary_keys = ["id"] } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/pom.xml b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/pom.xml new file mode 100644 index 00000000000..aa51e1cc820 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/pom.xml @@ -0,0 +1,48 @@ + + + + 4.0.0 + + org.apache.seatunnel + seatunnel-connector-v2-e2e + ${revision} + + + connector-file-cos-e2e + SeaTunnel : E2E : Connector V2 : File Cos + + + + org.apache.seatunnel + connector-fake + ${project.version} + test + + + org.apache.seatunnel + connector-file-cos + ${project.version} + test + + + org.apache.seatunnel + connector-assert + ${project.version} + test + + + diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/cos/CosFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/cos/CosFileIT.java new file mode 100644 index 00000000000..aaa2c1a2763 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/cos/CosFileIT.java @@ -0,0 +1,76 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.e2e.connector.file.cos; + +import org.apache.seatunnel.e2e.common.TestSuiteBase; +import org.apache.seatunnel.e2e.common.container.TestContainer; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Disabled; +import org.junit.jupiter.api.TestTemplate; +import org.testcontainers.containers.Container; + +import java.io.IOException; + +@Disabled +public class CosFileIT extends TestSuiteBase { + + @TestTemplate + public void testCosFileWriteAndRead(TestContainer container) + throws IOException, InterruptedException { + // test cos excel file + Container.ExecResult excelWriteResult = + container.executeJob("/excel/fake_to_cos_excel.conf"); + Assertions.assertEquals(0, excelWriteResult.getExitCode(), excelWriteResult.getStderr()); + Container.ExecResult excelReadResult = + container.executeJob("/excel/cos_excel_to_assert.conf"); + Assertions.assertEquals(0, excelReadResult.getExitCode(), excelReadResult.getStderr()); + + // test cos text file + Container.ExecResult textWriteResult = + container.executeJob("/text/fake_to_cos_file_text.conf"); + Assertions.assertEquals(0, textWriteResult.getExitCode()); + Container.ExecResult textReadResult = + container.executeJob("/text/cos_file_text_to_assert.conf"); + Assertions.assertEquals(0, textReadResult.getExitCode()); + + // test cos json file + Container.ExecResult jsonWriteResult = + container.executeJob("/json/fake_to_cos_file_json.conf"); + Assertions.assertEquals(0, jsonWriteResult.getExitCode()); + Container.ExecResult jsonReadResult = + container.executeJob("/json/cos_file_json_to_assert.conf"); + Assertions.assertEquals(0, jsonReadResult.getExitCode()); + + // test cos orc file + Container.ExecResult orcWriteResult = + container.executeJob("/orc/fake_to_cos_file_orc.conf"); + Assertions.assertEquals(0, orcWriteResult.getExitCode()); + Container.ExecResult orcReadResult = + container.executeJob("/orc/cos_file_orc_to_assert.conf"); + Assertions.assertEquals(0, orcReadResult.getExitCode()); + + // test cos parquet file + Container.ExecResult parquetWriteResult = + container.executeJob("/parquet/fake_to_cos_file_parquet.conf"); + Assertions.assertEquals(0, parquetWriteResult.getExitCode()); + Container.ExecResult parquetReadResult = + container.executeJob("/parquet/cos_file_parquet_to_assert.conf"); + Assertions.assertEquals(0, parquetReadResult.getExitCode()); + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/cos_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/cos_excel_to_assert.conf new file mode 100644 index 00000000000..b71709318ec --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/cos_excel_to_assert.conf @@ -0,0 +1,116 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + CosFile { + path = "/read/excel" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + result_table_name = "fake" + file_format_type = excel + delimiter = ; + skip_header_row_number = 1 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/fake_to_cos_excel.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/fake_to_cos_excel.conf new file mode 100644 index 00000000000..4c603f5633c --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/excel/fake_to_cos_excel.conf @@ -0,0 +1,82 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + FakeSource { + result_table_name = "fake" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + CosFile { + path="/sink/execl" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_name_expression = "${transactionId}_${now}" + file_format_type = "excel" + filename_time_format = "yyyy.MM.dd" + is_enable_transaction = true + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/json/cos_file_json_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/json/cos_file_json_to_assert.conf new file mode 100644 index 00000000000..d88761799b1 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/json/cos_file_json_to_assert.conf @@ -0,0 +1,114 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
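The CosFile sinks in these configs name output files from file_name_expression = "${transactionId}_${now}" together with filename_time_format = "yyyy.MM.dd". Assuming ${now} is rendered with that time format (the exact file-name layout is up to the connector), the date segment can be previewed with standard java.time formatting:

    // Illustrative only: shows the "${now}" portion of the sink file name under
    // filename_time_format = "yyyy.MM.dd"; the surrounding layout is an assumption.
    import java.time.LocalDate;
    import java.time.format.DateTimeFormatter;

    public class FilenameTimeFormatExample {
        public static void main(String[] args) {
            DateTimeFormatter fmt = DateTimeFormatter.ofPattern("yyyy.MM.dd");
            String nowSegment = LocalDate.of(2023, 8, 29).format(fmt);
            System.out.println(nowSegment); // prints 2023.08.29
        }
    }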
+# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + CosFile { + path = "/read/json" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + file_format_type = "json" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + C_MAP = "map" + C_ARRAY = "array" + C_STRING = string + C_BOOLEAN = boolean + C_TINYINT = tinyint + C_SMALLINT = smallint + C_INT = int + C_BIGINT = bigint + C_FLOAT = float + C_DOUBLE = double + C_BYTES = bytes + C_DATE = date + C_DECIMAL = "decimal(38, 18)" + C_TIMESTAMP = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/json/fake_to_cos_file_json.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/json/fake_to_cos_file_json.conf new file mode 100644 index 00000000000..20f54863d6d --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/json/fake_to_cos_file_json.conf @@ -0,0 +1,83 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + FakeSource { + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + CosFile { + path="/sink/json" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + row_delimiter = "\n" + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_name_expression = "${transactionId}_${now}" + file_format_type = "json" + filename_time_format = "yyyy.MM.dd" + is_enable_transaction = true + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/orc/cos_file_orc_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/orc/cos_file_orc_to_assert.conf new file mode 100644 index 00000000000..1041997ed68 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/orc/cos_file_orc_to_assert.conf @@ -0,0 +1,80 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + CosFile { + path = "/read/orc" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + file_format_type = "orc" + result_table_name = "fake" + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/orc/fake_to_cos_file_orc.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/orc/fake_to_cos_file_orc.conf new file mode 100644 index 00000000000..879993b4ea9 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/orc/fake_to_cos_file_orc.conf @@ -0,0 +1,84 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + FakeSource { + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + CosFile { + path="/sink/orc" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + row_delimiter = "\n" + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_name_expression = "${transactionId}_${now}" + file_format_type = "orc" + filename_time_format = "yyyy.MM.dd" + is_enable_transaction = true + compress_codec = "zlib" + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/parquet/cos_file_parquet_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/parquet/cos_file_parquet_to_assert.conf new file mode 100644 index 00000000000..8bf9c171ce8 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/parquet/cos_file_parquet_to_assert.conf @@ -0,0 +1,80 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + CosFile { + path = "/read/parquet" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + file_format_type = "parquet" + result_table_name = "fake" + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/parquet/fake_to_cos_file_parquet.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/parquet/fake_to_cos_file_parquet.conf new file mode 100644 index 00000000000..bb86e5f8b2e --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/parquet/fake_to_cos_file_parquet.conf @@ -0,0 +1,84 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + FakeSource { + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + CosFile { + path="/sink/parquet" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + row_delimiter = "\n" + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_name_expression = "${transactionId}_${now}" + file_format_type = "parquet" + filename_time_format = "yyyy.MM.dd" + is_enable_transaction = true + compress_codec = "gzip" + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/text/cos_file_text_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/text/cos_file_text_to_assert.conf new file mode 100644 index 00000000000..d53a046079c --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/text/cos_file_text_to_assert.conf @@ -0,0 +1,114 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + CosFile { + path = "/read/text" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + file_format_type = "text" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/text/fake_to_cos_file_text.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/text/fake_to_cos_file_text.conf new file mode 100644 index 00000000000..f93af2e212e --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-cos-e2e/src/test/resources/text/fake_to_cos_file_text.conf @@ -0,0 +1,84 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + FakeSource { + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + result_table_name = "fake" + } +} + +sink { + CosFile { + path="/sink/text" + bucket = "cosn://seatunnel-test" + secret_id = "dummy" + secret_key = "dummy" + region = "ap-chengdu" + row_delimiter = "\n" + partition_dir_expression = "${k0}=${v0}" + is_partition_field_write_in_file = true + file_name_expression = "${transactionId}_${now}" + file_format_type = "text" + filename_time_format = "yyyy.MM.dd" + is_enable_transaction = true + compress_codec = "lzo" + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/ftp/FtpFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/ftp/FtpFileIT.java index 5fc0e486091..15a58ebf082 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/ftp/FtpFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/ftp/FtpFileIT.java @@ -21,23 +21,20 @@ import org.apache.seatunnel.e2e.common.TestSuiteBase; import org.apache.seatunnel.e2e.common.container.EngineType; import org.apache.seatunnel.e2e.common.container.TestContainer; +import org.apache.seatunnel.e2e.common.container.TestHelper; import org.apache.seatunnel.e2e.common.junit.DisabledOnContainer; import org.apache.seatunnel.e2e.common.util.ContainerUtil; import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.TestTemplate; -import org.testcontainers.containers.Container; import org.testcontainers.containers.GenericContainer; import org.testcontainers.containers.output.Slf4jLogConsumer; import org.testcontainers.lifecycle.Startables; -import org.testcontainers.utility.MountableFile; import lombok.extern.slf4j.Slf4j; import java.io.IOException; -import java.nio.file.Path; import java.util.Collections; import java.util.stream.Stream; @@ -87,19 +84,26 @@ public void startUp() throws Exception { Startables.deepStart(Stream.of(ftpContainer)).join(); log.info("ftp container started"); - Path jsonPath = ContainerUtil.getResourcesFile("/json/e2e.json").toPath(); - Path textPath = ContainerUtil.getResourcesFile("/text/e2e.txt").toPath(); - Path excelPath = ContainerUtil.getResourcesFile("/excel/e2e.xlsx").toPath(); - - ftpContainer.copyFileToContainer( - MountableFile.forHostPath(jsonPath), - "/home/vsftpd/seatunnel/tmp/seatunnel/read/json/name=tyrantlucifer/hobby=coding/e2e.json"); - ftpContainer.copyFileToContainer( - 
MountableFile.forHostPath(textPath), - "/home/vsftpd/seatunnel/tmp/seatunnel/read/text/name=tyrantlucifer/hobby=coding/e2e.txt"); - ftpContainer.copyFileToContainer( - MountableFile.forHostPath(excelPath), - "/home/vsftpd/seatunnel/tmp/seatunnel/read/excel/name=tyrantlucifer/hobby=coding/e2e.xlsx"); + ContainerUtil.copyFileIntoContainers( + "/json/e2e.json", + "/home/vsftpd/seatunnel/tmp/seatunnel/read/json/name=tyrantlucifer/hobby=coding/e2e.json", + ftpContainer); + + ContainerUtil.copyFileIntoContainers( + "/text/e2e.txt", + "/home/vsftpd/seatunnel/tmp/seatunnel/read/text/name=tyrantlucifer/hobby=coding/e2e.txt", + ftpContainer); + + ContainerUtil.copyFileIntoContainers( + "/excel/e2e.xlsx", + "/home/vsftpd/seatunnel/tmp/seatunnel/read/excel/name=tyrantlucifer/hobby=coding/e2e.xlsx", + ftpContainer); + + ContainerUtil.copyFileIntoContainers( + "/excel/e2e.xlsx", + "/home/vsftpd/seatunnel/tmp/seatunnel/read/excel_filter/name=tyrantlucifer/hobby=coding/e2e_filter.xlsx", + ftpContainer); + ftpContainer.execInContainer("sh", "-c", "chmod -R 777 /home/vsftpd/seatunnel/"); ftpContainer.execInContainer("sh", "-c", "chown -R ftp:ftp /home/vsftpd/seatunnel/"); } @@ -107,51 +111,31 @@ public void startUp() throws Exception { @TestTemplate public void testFtpFileReadAndWrite(TestContainer container) throws IOException, InterruptedException { + TestHelper helper = new TestHelper(container); // test write ftp excel file - Container.ExecResult excelWriteResult = - container.executeJob("/excel/fake_source_to_ftp_excel.conf"); - Assertions.assertEquals(0, excelWriteResult.getExitCode(), excelWriteResult.getStderr()); + helper.execute("/excel/fake_source_to_ftp_excel.conf"); // test read ftp excel file - Container.ExecResult excelReadResult = - container.executeJob("/excel/ftp_excel_to_assert.conf"); - Assertions.assertEquals(0, excelReadResult.getExitCode(), excelReadResult.getStderr()); + helper.execute("/excel/ftp_excel_to_assert.conf"); // test read ftp excel file with projection - Container.ExecResult excelProjectionReadResult = - container.executeJob("/excel/ftp_excel_projection_to_assert.conf"); - Assertions.assertEquals( - 0, excelProjectionReadResult.getExitCode(), excelProjectionReadResult.getStderr()); + helper.execute("/excel/ftp_excel_projection_to_assert.conf"); + // test read ftp excel file with filter + helper.execute("/excel/ftp_filter_excel_to_assert.conf"); // test write ftp text file - Container.ExecResult textWriteResult = - container.executeJob("/text/fake_to_ftp_file_text.conf"); - Assertions.assertEquals(0, textWriteResult.getExitCode()); + helper.execute("/text/fake_to_ftp_file_text.conf"); // test read skip header - Container.ExecResult textWriteAndSkipResult = - container.executeJob("/text/ftp_file_text_skip_headers.conf"); - Assertions.assertEquals(0, textWriteAndSkipResult.getExitCode()); + helper.execute("/text/ftp_file_text_skip_headers.conf"); // test read ftp text file - Container.ExecResult textReadResult = - container.executeJob("/text/ftp_file_text_to_assert.conf"); - Assertions.assertEquals(0, textReadResult.getExitCode()); + helper.execute("/text/ftp_file_text_to_assert.conf"); // test read ftp text file with projection - Container.ExecResult textProjectionResult = - container.executeJob("/text/ftp_file_text_projection_to_assert.conf"); - Assertions.assertEquals(0, textProjectionResult.getExitCode()); + helper.execute("/text/ftp_file_text_projection_to_assert.conf"); // test write ftp json file - Container.ExecResult jsonWriteResult = - 
container.executeJob("/json/fake_to_ftp_file_json.conf"); - Assertions.assertEquals(0, jsonWriteResult.getExitCode()); + helper.execute("/json/fake_to_ftp_file_json.conf"); // test read ftp json file - Container.ExecResult jsonReadResult = - container.executeJob("/json/ftp_file_json_to_assert.conf"); - Assertions.assertEquals(0, jsonReadResult.getExitCode()); + helper.execute("/json/ftp_file_json_to_assert.conf"); // test write ftp parquet file - Container.ExecResult parquetWriteResult = - container.executeJob("/parquet/fake_to_ftp_file_parquet.conf"); - Assertions.assertEquals(0, parquetWriteResult.getExitCode()); + helper.execute("/parquet/fake_to_ftp_file_parquet.conf"); // test write ftp orc file - Container.ExecResult orcWriteResult = - container.executeJob("/orc/fake_to_ftp_file_orc.conf"); - Assertions.assertEquals(0, orcWriteResult.getExitCode()); + helper.execute("/orc/fake_to_ftp_file_orc.conf"); } @AfterAll diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_filter_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_filter_excel_to_assert.conf new file mode 100644 index 00000000000..6af42f6f3d6 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-ftp-e2e/src/test/resources/excel/ftp_filter_excel_to_assert.conf @@ -0,0 +1,141 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" + + # You can set spark configuration here + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + FtpFile { + host = "ftp" + port = 21 + user = seatunnel + password = pass + path = "/tmp/seatunnel/read/excel_filter" + result_table_name = "ftp" + file_format_type = excel + delimiter = ; + skip_header_row_number = 1 + file_filter_pattern = "e2e_filter.*" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + } +} + + +sink { + Assert { + source_table_name = "ftp" + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = name + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = hobby + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} + diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java index f5c220deabd..aed35767263 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/local/LocalFileIT.java @@ -21,17 +21,14 @@ import org.apache.seatunnel.e2e.common.container.ContainerExtendedFactory; import org.apache.seatunnel.e2e.common.container.TestContainer; import org.apache.seatunnel.e2e.common.container.TestContainerId; +import org.apache.seatunnel.e2e.common.container.TestHelper; import org.apache.seatunnel.e2e.common.junit.DisabledOnContainer; import org.apache.seatunnel.e2e.common.junit.TestContainerExtension; import org.apache.seatunnel.e2e.common.util.ContainerUtil; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.TestTemplate; -import org.testcontainers.containers.Container; -import org.testcontainers.utility.MountableFile; import java.io.IOException; -import java.nio.file.Path; @DisabledOnContainer( value = {TestContainerId.SPARK_2_4}, @@ -43,88 +40,70 @@ public class LocalFileIT extends TestSuiteBase { @TestContainerExtension private final ContainerExtendedFactory extendedFactory = container -> { - Path jsonPath = ContainerUtil.getResourcesFile("/json/e2e.json").toPath(); - 
Path orcPath = ContainerUtil.getResourcesFile("/orc/e2e.orc").toPath(); - Path parquetPath = ContainerUtil.getResourcesFile("/parquet/e2e.parquet").toPath(); - Path textPath = ContainerUtil.getResourcesFile("/text/e2e.txt").toPath(); - Path excelPath = ContainerUtil.getResourcesFile("/excel/e2e.xlsx").toPath(); - container.copyFileToContainer( - MountableFile.forHostPath(jsonPath), - "/seatunnel/read/json/name=tyrantlucifer/hobby=coding/e2e.json"); - container.copyFileToContainer( - MountableFile.forHostPath(orcPath), - "/seatunnel/read/orc/name=tyrantlucifer/hobby=coding/e2e.orc"); - container.copyFileToContainer( - MountableFile.forHostPath(parquetPath), - "/seatunnel/read/parquet/name=tyrantlucifer/hobby=coding/e2e.parquet"); - container.copyFileToContainer( - MountableFile.forHostPath(textPath), - "/seatunnel/read/text/name=tyrantlucifer/hobby=coding/e2e.txt"); - container.copyFileToContainer( - MountableFile.forHostPath(excelPath), - "/seatunnel/read/excel/name=tyrantlucifer/hobby=coding/e2e.xlsx"); + ContainerUtil.copyFileIntoContainers( + "/json/e2e.json", + "/seatunnel/read/json/name=tyrantlucifer/hobby=coding/e2e.json", + container); + + ContainerUtil.copyFileIntoContainers( + "/text/e2e.txt", + "/seatunnel/read/text/name=tyrantlucifer/hobby=coding/e2e.txt", + container); + + ContainerUtil.copyFileIntoContainers( + "/excel/e2e.xlsx", + "/seatunnel/read/excel/name=tyrantlucifer/hobby=coding/e2e.xlsx", + container); + + ContainerUtil.copyFileIntoContainers( + "/orc/e2e.orc", + "/seatunnel/read/orc/name=tyrantlucifer/hobby=coding/e2e.orc", + container); + + ContainerUtil.copyFileIntoContainers( + "/parquet/e2e.parquet", + "/seatunnel/read/parquet/name=tyrantlucifer/hobby=coding/e2e.parquet", + container); + + ContainerUtil.copyFileIntoContainers( + "/excel/e2e.xlsx", + "/seatunnel/read/excel_filter/name=tyrantlucifer/hobby=coding/e2e_filter.xlsx", + container); }; @TestTemplate public void testLocalFileReadAndWrite(TestContainer container) throws IOException, InterruptedException { - Container.ExecResult excelWriteResult = - container.executeJob("/excel/fake_to_local_excel.conf"); - Assertions.assertEquals(0, excelWriteResult.getExitCode(), excelWriteResult.getStderr()); - Container.ExecResult excelReadResult = - container.executeJob("/excel/local_excel_to_assert.conf"); - Assertions.assertEquals(0, excelReadResult.getExitCode(), excelReadResult.getStderr()); - Container.ExecResult excelProjectionReadResult = - container.executeJob("/excel/local_excel_projection_to_assert.conf"); - Assertions.assertEquals( - 0, excelProjectionReadResult.getExitCode(), excelProjectionReadResult.getStderr()); + TestHelper helper = new TestHelper(container); + + helper.execute("/excel/fake_to_local_excel.conf"); + helper.execute("/excel/local_excel_to_assert.conf"); + helper.execute("/excel/local_excel_projection_to_assert.conf"); // test write local text file - Container.ExecResult textWriteResult = - container.executeJob("/text/fake_to_local_file_text.conf"); - Assertions.assertEquals(0, textWriteResult.getExitCode()); + helper.execute("/text/fake_to_local_file_text.conf"); // test read skip header - Container.ExecResult textWriteAndSkipResult = - container.executeJob("/text/local_file_text_skip_headers.conf"); - Assertions.assertEquals(0, textWriteAndSkipResult.getExitCode()); + helper.execute("/text/local_file_text_skip_headers.conf"); // test read local text file - Container.ExecResult textReadResult = - container.executeJob("/text/local_file_text_to_assert.conf"); - Assertions.assertEquals(0, 
textReadResult.getExitCode()); + helper.execute("/text/local_file_text_to_assert.conf"); // test read local text file with projection - Container.ExecResult textProjectionResult = - container.executeJob("/text/local_file_text_projection_to_assert.conf"); - Assertions.assertEquals(0, textProjectionResult.getExitCode()); + helper.execute("/text/local_file_text_projection_to_assert.conf"); // test write local json file - Container.ExecResult jsonWriteResult = - container.executeJob("/json/fake_to_local_file_json.conf"); - Assertions.assertEquals(0, jsonWriteResult.getExitCode()); + helper.execute("/json/fake_to_local_file_json.conf"); // test read local json file - Container.ExecResult jsonReadResult = - container.executeJob("/json/local_file_json_to_assert.conf"); - Assertions.assertEquals(0, jsonReadResult.getExitCode()); + helper.execute("/json/local_file_json_to_assert.conf"); // test write local orc file - Container.ExecResult orcWriteResult = - container.executeJob("/orc/fake_to_local_file_orc.conf"); - Assertions.assertEquals(0, orcWriteResult.getExitCode()); + helper.execute("/orc/fake_to_local_file_orc.conf"); // test read local orc file - Container.ExecResult orcReadResult = - container.executeJob("/orc/local_file_orc_to_assert.conf"); - Assertions.assertEquals(0, orcReadResult.getExitCode()); + helper.execute("/orc/local_file_orc_to_assert.conf"); // test read local orc file with projection - Container.ExecResult orcProjectionResult = - container.executeJob("/orc/local_file_orc_projection_to_assert.conf"); - Assertions.assertEquals(0, orcProjectionResult.getExitCode()); + helper.execute("/orc/local_file_orc_projection_to_assert.conf"); // test write local parquet file - Container.ExecResult parquetWriteResult = - container.executeJob("/parquet/fake_to_local_file_parquet.conf"); - Assertions.assertEquals(0, parquetWriteResult.getExitCode()); + helper.execute("/parquet/fake_to_local_file_parquet.conf"); // test read local parquet file - Container.ExecResult parquetReadResult = - container.executeJob("/parquet/local_file_parquet_to_assert.conf"); - Assertions.assertEquals(0, parquetReadResult.getExitCode()); + helper.execute("/parquet/local_file_parquet_to_assert.conf"); // test read local parquet file with projection - Container.ExecResult parquetProjectionResult = - container.executeJob("/parquet/local_file_parquet_projection_to_assert.conf"); - Assertions.assertEquals(0, parquetProjectionResult.getExitCode()); + helper.execute("/parquet/local_file_parquet_projection_to_assert.conf"); + // test read filtered local file + helper.execute("/excel/local_filter_excel_to_assert.conf"); } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_filter_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_filter_excel_to_assert.conf new file mode 100644 index 00000000000..86039b44dbf --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-local-e2e/src/test/resources/excel/local_filter_excel_to_assert.conf @@ -0,0 +1,131 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + execution.parallelism = 1 + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local + job.mode = "BATCH" +} + +source { + LocalFile { + path = "/seatunnel/read/excel_filter" + result_table_name = "fake" + file_format_type = excel + delimiter = ; + skip_header_row_number = 1 + file_filter_pattern = "e2e_filter.*" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + Assert { + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = name + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = hobby + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/fstp/SftpFileIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/fstp/SftpFileIT.java index 82d1be73db1..e5fbcb5f5ef 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/fstp/SftpFileIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/java/org/apache/seatunnel/e2e/connector/file/fstp/SftpFileIT.java @@ -21,22 +21,19 @@ import org.apache.seatunnel.e2e.common.TestSuiteBase; import org.apache.seatunnel.e2e.common.container.TestContainer; import org.apache.seatunnel.e2e.common.container.TestContainerId; +import org.apache.seatunnel.e2e.common.container.TestHelper; import org.apache.seatunnel.e2e.common.junit.DisabledOnContainer; import org.apache.seatunnel.e2e.common.util.ContainerUtil; import org.junit.jupiter.api.AfterAll; -import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.BeforeAll; import org.junit.jupiter.api.TestTemplate; -import org.testcontainers.containers.Container; import org.testcontainers.containers.GenericContainer; import org.testcontainers.lifecycle.Startables; -import org.testcontainers.utility.MountableFile; import lombok.extern.slf4j.Slf4j; import 
java.io.IOException; -import java.nio.file.Path; import java.util.Collections; import java.util.stream.Stream; @@ -75,61 +72,54 @@ public void startUp() throws Exception { sftpContainer.start(); Startables.deepStart(Stream.of(sftpContainer)).join(); log.info("Sftp container started"); - Path jsonPath = ContainerUtil.getResourcesFile("/json/e2e.json").toPath(); - Path textPath = ContainerUtil.getResourcesFile("/text/e2e.txt").toPath(); - Path excelPath = ContainerUtil.getResourcesFile("/excel/e2e.xlsx").toPath(); - sftpContainer.copyFileToContainer( - MountableFile.forHostPath(jsonPath), - "/home/seatunnel/tmp/seatunnel/read/json/name=tyrantlucifer/hobby=coding/e2e.json"); - sftpContainer.copyFileToContainer( - MountableFile.forHostPath(textPath), - "/home/seatunnel/tmp/seatunnel/read/text/name=tyrantlucifer/hobby=coding/e2e.txt"); - sftpContainer.copyFileToContainer( - MountableFile.forHostPath(excelPath), - "/home/seatunnel/tmp/seatunnel/read/excel/name=tyrantlucifer/hobby=coding/e2e.xlsx"); + + ContainerUtil.copyFileIntoContainers( + "/json/e2e.json", + "/home/seatunnel/tmp/seatunnel/read/json/name=tyrantlucifer/hobby=coding/e2e.json", + sftpContainer); + + ContainerUtil.copyFileIntoContainers( + "/text/e2e.txt", + "/home/seatunnel/tmp/seatunnel/read/text/name=tyrantlucifer/hobby=coding/e2e.txt", + sftpContainer); + + ContainerUtil.copyFileIntoContainers( + "/excel/e2e.xlsx", + "/home/seatunnel/tmp/seatunnel/read/excel/name=tyrantlucifer/hobby=coding/e2e.xlsx", + sftpContainer); + + ContainerUtil.copyFileIntoContainers( + "/excel/e2e.xlsx", + "/home/seatunnel/tmp/seatunnel/read/excel_filter/name=tyrantlucifer/hobby=coding/e2e_filter.xlsx", + sftpContainer); + sftpContainer.execInContainer("sh", "-c", "chown -R seatunnel /home/seatunnel/tmp/"); } @TestTemplate public void testSftpFileReadAndWrite(TestContainer container) throws IOException, InterruptedException { + TestHelper helper = new TestHelper(container); // test write sftp excel file - Container.ExecResult excelWriteResult = - container.executeJob("/excel/fakesource_to_sftp_excel.conf"); - Assertions.assertEquals(0, excelWriteResult.getExitCode(), excelWriteResult.getStderr()); + helper.execute("/excel/fakesource_to_sftp_excel.conf"); // test read sftp excel file - Container.ExecResult excelReadResult = - container.executeJob("/excel/sftp_excel_to_assert.conf"); - Assertions.assertEquals(0, excelReadResult.getExitCode(), excelReadResult.getStderr()); + helper.execute("/excel/sftp_excel_to_assert.conf"); // test read sftp excel file with projection - Container.ExecResult excelProjectionReadResult = - container.executeJob("/excel/sftp_excel_projection_to_assert.conf"); - Assertions.assertEquals( - 0, excelProjectionReadResult.getExitCode(), excelProjectionReadResult.getStderr()); + helper.execute("/excel/sftp_excel_projection_to_assert.conf"); + // test read sftp excel file with filter pattern + helper.execute("/excel/sftp_filter_excel_to_assert.conf"); // test write sftp text file - Container.ExecResult textWriteResult = - container.executeJob("/text/fake_to_sftp_file_text.conf"); - Assertions.assertEquals(0, textWriteResult.getExitCode()); + helper.execute("/text/fake_to_sftp_file_text.conf"); // test read skip header - Container.ExecResult textWriteAndSkipResult = - container.executeJob("/text/sftp_file_text_skip_headers.conf"); - Assertions.assertEquals(0, textWriteAndSkipResult.getExitCode()); + helper.execute("/text/sftp_file_text_skip_headers.conf"); // test read sftp text file - Container.ExecResult textReadResult = - 
container.executeJob("/text/sftp_file_text_to_assert.conf"); - Assertions.assertEquals(0, textReadResult.getExitCode()); + helper.execute("/text/sftp_file_text_to_assert.conf"); // test read sftp text file with projection - Container.ExecResult textProjectionResult = - container.executeJob("/text/sftp_file_text_projection_to_assert.conf"); - Assertions.assertEquals(0, textProjectionResult.getExitCode()); + helper.execute("/text/sftp_file_text_projection_to_assert.conf"); // test write sftp json file - Container.ExecResult jsonWriteResult = - container.executeJob("/json/fake_to_sftp_file_json.conf"); - Assertions.assertEquals(0, jsonWriteResult.getExitCode()); + helper.execute("/json/fake_to_sftp_file_json.conf"); // test read sftp json file - Container.ExecResult jsonReadResult = - container.executeJob("/json/sftp_file_json_to_assert.conf"); - Assertions.assertEquals(0, jsonReadResult.getExitCode()); + helper.execute("/json/sftp_file_json_to_assert.conf"); } @AfterAll diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_filter_excel_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_filter_excel_to_assert.conf new file mode 100644 index 00000000000..b6cd92f712a --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-file-sftp-e2e/src/test/resources/excel/sftp_filter_excel_to_assert.conf @@ -0,0 +1,132 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" + + # You can set spark configuration here + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + SftpFile { + path = "tmp/seatunnel/read/excel_filter" + result_table_name = "sftp" + file_format_type = excel + host = "sftp" + port = 22 + user = seatunnel + password = pass + delimiter = ";" + file_filter_pattern = "e2e_filter.*" + skip_header_row_number = 1 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_tinyint = tinyint + c_smallint = smallint + c_int = int + c_bigint = bigint + c_float = float + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(38, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + Assert { + source_table_name = "sftp" + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 5 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} + + + + + + + + + + + diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseITBase.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseITBase.java new file mode 100644 index 00000000000..b8202e697a1 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseITBase.java @@ -0,0 +1,147 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
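The *_filter_*_to_assert.conf files above exercise the new file_filter_pattern option with the value "e2e_filter.*", against directories into which the IT setup copies only the e2e_filter.xlsx fixture. If, as the name and fixtures suggest, the pattern is matched as a regular expression against file names (an assumption; the connector's matching rules are not shown in this patch), the effect is:

    // Illustrative only: assumes file_filter_pattern is applied as a Java regular
    // expression to candidate file names.
    import java.util.regex.Pattern;

    public class FileFilterPatternExample {
        public static void main(String[] args) {
            Pattern filter = Pattern.compile("e2e_filter.*");
            System.out.println(filter.matcher("e2e_filter.xlsx").matches()); // true  -> read
            System.out.println(filter.matcher("e2e.xlsx").matches());        // false -> skipped
        }
    }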
+ */ + +package org.apache.seatunnel.connectors.seatunnel.jdbc; + +import org.apache.seatunnel.api.table.type.SeaTunnelRow; +import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; + +import org.apache.commons.lang3.tuple.Pair; + +import org.junit.jupiter.api.Assertions; +import org.testcontainers.shaded.org.apache.commons.io.IOUtils; + +import java.io.InputStream; +import java.nio.charset.StandardCharsets; +import java.sql.ResultSet; +import java.sql.Statement; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Objects; + +public abstract class JdbcOceanBaseITBase extends AbstractJdbcIT { + + private static final String OCEANBASE_DATABASE = "seatunnel"; + private static final String OCEANBASE_SOURCE = "source"; + private static final String OCEANBASE_SINK = "sink"; + + private static final String OCEANBASE_JDBC_TEMPLATE = "jdbc:oceanbase://" + HOST + ":%s"; + private static final String OCEANBASE_DRIVER_CLASS = "com.oceanbase.jdbc.Driver"; + + abstract String imageName(); + + abstract String host(); + + abstract int port(); + + abstract String username(); + + abstract String password(); + + abstract List configFile(); + + abstract String createSqlTemplate(); + + abstract String[] getFieldNames(); + + @Override + JdbcCase getJdbcCase() { + Map containerEnv = new HashMap<>(); + String jdbcUrl = String.format(OCEANBASE_JDBC_TEMPLATE, port()); + Pair> testDataSet = initTestData(); + String[] fieldNames = testDataSet.getKey(); + + String insertSql = insertTable(OCEANBASE_DATABASE, OCEANBASE_SOURCE, fieldNames); + + return JdbcCase.builder() + .dockerImage(imageName()) + .networkAliases(host()) + .containerEnv(containerEnv) + .driverClass(OCEANBASE_DRIVER_CLASS) + .host(HOST) + .port(port()) + .localPort(port()) + .jdbcTemplate(OCEANBASE_JDBC_TEMPLATE) + .jdbcUrl(jdbcUrl) + .userName(username()) + .password(password()) + .database(OCEANBASE_DATABASE) + .sourceTable(OCEANBASE_SOURCE) + .sinkTable(OCEANBASE_SINK) + .createSql(createSqlTemplate()) + .configFile(configFile()) + .insertSql(insertSql) + .testData(testDataSet) + .build(); + } + + @Override + void compareResult() { + String sourceSql = + String.format( + "select * from %s.%s order by 1", OCEANBASE_DATABASE, OCEANBASE_SOURCE); + String sinkSql = + String.format("select * from %s.%s order by 1", OCEANBASE_DATABASE, OCEANBASE_SINK); + try { + Statement sourceStatement = connection.createStatement(); + Statement sinkStatement = connection.createStatement(); + ResultSet sourceResultSet = sourceStatement.executeQuery(sourceSql); + ResultSet sinkResultSet = sinkStatement.executeQuery(sinkSql); + Assertions.assertEquals( + sourceResultSet.getMetaData().getColumnCount(), + sinkResultSet.getMetaData().getColumnCount()); + while (sourceResultSet.next()) { + if (sinkResultSet.next()) { + for (String column : getFieldNames()) { + Object source = sourceResultSet.getObject(column); + Object sink = sinkResultSet.getObject(column); + if (!Objects.deepEquals(source, sink)) { + InputStream sourceAsciiStream = sourceResultSet.getBinaryStream(column); + InputStream sinkAsciiStream = sinkResultSet.getBinaryStream(column); + String sourceValue = + IOUtils.toString(sourceAsciiStream, StandardCharsets.UTF_8); + String sinkValue = + IOUtils.toString(sinkAsciiStream, StandardCharsets.UTF_8); + Assertions.assertEquals(sourceValue, sinkValue); + } + } + } + } + sourceResultSet.last(); + sinkResultSet.last(); + } catch (Exception e) { + throw new RuntimeException("Compare result error", e); + } + } + + 
@Override + String driverUrl() { + return "https://repo1.maven.org/maven2/com/oceanbase/oceanbase-client/2.4.3/oceanbase-client-2.4.3.jar"; + } + + @Override + protected void createSchemaIfNeeded() { + String sql = "CREATE DATABASE IF NOT EXISTS " + OCEANBASE_DATABASE; + try { + connection.prepareStatement(sql).executeUpdate(); + } catch (Exception e) { + throw new SeaTunnelRuntimeException( + JdbcITErrorCode.CREATE_TABLE_FAILED, "Fail to execute sql " + sql, e); + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseMysqlIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseMysqlIT.java new file mode 100644 index 00000000000..548fecaee66 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseMysqlIT.java @@ -0,0 +1,256 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
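JdbcOceanBaseITBase.compareResult (earlier in this file) first compares each column with Objects.deepEquals and falls back to reading both values as binary streams only when that check fails. Presumably this covers columns where the driver hands back objects whose equality is not content-based, or different Java types carrying the same stored bytes; a small self-contained illustration of why a deepEquals mismatch is not necessarily a data mismatch:

    import java.nio.charset.StandardCharsets;
    import java.util.Objects;

    // A byte[] and a String never compare equal by deepEquals, even when they hold
    // the same UTF-8 content, so the test re-reads both sides as streams and compares
    // the decoded text instead.
    public class DeepEqualsFallbackExample {
        public static void main(String[] args) {
            byte[] source = "test".getBytes(StandardCharsets.UTF_8);
            String sink = "test";
            System.out.println(Objects.deepEquals(source, sink));                        // false
            System.out.println(new String(source, StandardCharsets.UTF_8).equals(sink)); // true
        }
    }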
+ */ + +package org.apache.seatunnel.connectors.seatunnel.jdbc; + +import org.apache.seatunnel.api.table.type.SeaTunnelRow; + +import org.apache.commons.lang3.tuple.Pair; + +import org.junit.jupiter.api.Disabled; +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.containers.wait.strategy.Wait; +import org.testcontainers.utility.DockerLoggerFactory; + +import com.google.common.collect.Lists; + +import java.math.BigDecimal; +import java.sql.Date; +import java.sql.Timestamp; +import java.time.Duration; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.List; + +@Disabled("Disabled due to insufficient hardware resources in the CI environment") +public class JdbcOceanBaseMysqlIT extends JdbcOceanBaseITBase { + + @Override + String imageName() { + return "oceanbase/oceanbase-ce:4.0.0.0"; + } + + @Override + String host() { + return "e2e_oceanbase_mysql"; + } + + @Override + int port() { + return 2881; + } + + @Override + String username() { + return "root"; + } + + @Override + String password() { + return ""; + } + + @Override + List configFile() { + return Lists.newArrayList("/jdbc_oceanbase_mysql_source_and_sink.conf"); + } + + @Override + String createSqlTemplate() { + return "CREATE TABLE IF NOT EXISTS %s\n" + + "(\n" + + " `c_bit_1` bit(1) DEFAULT NULL,\n" + + " `c_bit_8` bit(8) DEFAULT NULL,\n" + + " `c_bit_16` bit(16) DEFAULT NULL,\n" + + " `c_bit_32` bit(32) DEFAULT NULL,\n" + + " `c_bit_64` bit(64) DEFAULT NULL,\n" + + " `c_boolean` tinyint(1) DEFAULT NULL,\n" + + " `c_tinyint` tinyint(4) DEFAULT NULL,\n" + + " `c_tinyint_unsigned` tinyint(3) unsigned DEFAULT NULL,\n" + + " `c_smallint` smallint(6) DEFAULT NULL,\n" + + " `c_smallint_unsigned` smallint(5) unsigned DEFAULT NULL,\n" + + " `c_mediumint` mediumint(9) DEFAULT NULL,\n" + + " `c_mediumint_unsigned` mediumint(8) unsigned DEFAULT NULL,\n" + + " `c_int` int(11) DEFAULT NULL,\n" + + " `c_integer` int(11) DEFAULT NULL,\n" + + " `c_bigint` bigint(20) DEFAULT NULL,\n" + + " `c_bigint_unsigned` bigint(20) unsigned DEFAULT NULL,\n" + + " `c_decimal` decimal(20, 0) DEFAULT NULL,\n" + + " `c_decimal_unsigned` decimal(38, 18) DEFAULT NULL,\n" + + " `c_float` float DEFAULT NULL,\n" + + " `c_float_unsigned` float unsigned DEFAULT NULL,\n" + + " `c_double` double DEFAULT NULL,\n" + + " `c_double_unsigned` double unsigned DEFAULT NULL,\n" + + " `c_char` char(1) DEFAULT NULL,\n" + + " `c_tinytext` tinytext,\n" + + " `c_mediumtext` mediumtext,\n" + + " `c_text` text,\n" + + " `c_varchar` varchar(255) DEFAULT NULL,\n" + + " `c_json` json DEFAULT NULL,\n" + + " `c_longtext` longtext,\n" + + " `c_date` date DEFAULT NULL,\n" + + " `c_datetime` datetime DEFAULT NULL,\n" + + " `c_timestamp` timestamp NULL DEFAULT NULL,\n" + + " `c_tinyblob` tinyblob,\n" + + " `c_mediumblob` mediumblob,\n" + + " `c_blob` blob,\n" + + " `c_longblob` longblob,\n" + + " `c_varbinary` varbinary(255) DEFAULT NULL,\n" + + " `c_binary` binary(1) DEFAULT NULL,\n" + + " `c_year` year(4) DEFAULT NULL,\n" + + " `c_int_unsigned` int(10) unsigned DEFAULT NULL,\n" + + " `c_integer_unsigned` int(10) unsigned DEFAULT NULL,\n" + + " `c_bigint_30` BIGINT(40) unsigned DEFAULT NULL,\n" + + " `c_decimal_unsigned_30` DECIMAL(30) unsigned DEFAULT NULL,\n" + + " `c_decimal_30` DECIMAL(30) DEFAULT NULL\n" + + ");"; + } + + @Override + String[] getFieldNames() { + return new String[] { + "c_bit_1", + "c_bit_8", + "c_bit_16", + "c_bit_32", + 
"c_bit_64", + "c_boolean", + "c_tinyint", + "c_tinyint_unsigned", + "c_smallint", + "c_smallint_unsigned", + "c_mediumint", + "c_mediumint_unsigned", + "c_int", + "c_integer", + "c_year", + "c_int_unsigned", + "c_integer_unsigned", + "c_bigint", + "c_bigint_unsigned", + "c_decimal", + "c_decimal_unsigned", + "c_float", + "c_float_unsigned", + "c_double", + "c_double_unsigned", + "c_char", + "c_tinytext", + "c_mediumtext", + "c_text", + "c_varchar", + "c_json", + "c_longtext", + "c_date", + "c_datetime", + "c_timestamp", + "c_tinyblob", + "c_mediumblob", + "c_blob", + "c_longblob", + "c_varbinary", + "c_binary", + "c_bigint_30", + "c_decimal_unsigned_30", + "c_decimal_30", + }; + } + + @Override + Pair> initTestData() { + String[] fieldNames = getFieldNames(); + + List rows = new ArrayList<>(); + BigDecimal bigintValue = new BigDecimal("2844674407371055000"); + BigDecimal decimalValue = new BigDecimal("999999999999999999999999999899"); + for (int i = 0; i < 100; i++) { + byte byteArr = Integer.valueOf(i).byteValue(); + SeaTunnelRow row = + new SeaTunnelRow( + new Object[] { + i % 2 == 0 ? (byte) 1 : (byte) 0, + new byte[] {byteArr}, + new byte[] {byteArr, byteArr}, + new byte[] {byteArr, byteArr, byteArr, byteArr}, + new byte[] { + byteArr, byteArr, byteArr, byteArr, byteArr, byteArr, byteArr, + byteArr + }, + i % 2 == 0 ? Boolean.TRUE : Boolean.FALSE, + i, + i, + i, + i, + i, + i, + i, + i, + i, + Long.parseLong("1"), + Long.parseLong("1"), + Long.parseLong("1"), + BigDecimal.valueOf(i, 0), + BigDecimal.valueOf(i, 18), + BigDecimal.valueOf(i, 18), + Float.parseFloat("1.1"), + Float.parseFloat("1.1"), + Double.parseDouble("1.1"), + Double.parseDouble("1.1"), + "f", + String.format("f1_%s", i), + String.format("f1_%s", i), + String.format("f1_%s", i), + String.format("f1_%s", i), + String.format("{\"aa\":\"bb_%s\"}", i), + String.format("f1_%s", i), + Date.valueOf(LocalDate.now()), + Timestamp.valueOf(LocalDateTime.now()), + new Timestamp(System.currentTimeMillis()), + "test".getBytes(), + "test".getBytes(), + "test".getBytes(), + "test".getBytes(), + "test".getBytes(), + "f".getBytes(), + bigintValue.add(BigDecimal.valueOf(i)), + decimalValue.add(BigDecimal.valueOf(i)), + decimalValue.add(BigDecimal.valueOf(i)), + }); + rows.add(row); + } + + return Pair.of(fieldNames, rows); + } + + @Override + GenericContainer initContainer() { + GenericContainer container = + new GenericContainer<>(imageName()) + .withNetwork(NETWORK) + .withNetworkAliases(host()) + .waitingFor(Wait.forLogMessage(".*boot success!.*", 1)) + .withStartupTimeout(Duration.ofMinutes(5)) + .withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger(imageName()))); + + container.setPortBindings(Lists.newArrayList(String.format("%s:%s", port(), port()))); + + return container; + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseOracleIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseOracleIT.java new file mode 100644 index 00000000000..4c3cca5ddc1 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcOceanBaseOracleIT.java @@ -0,0 +1,161 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.connectors.seatunnel.jdbc; + +import org.apache.seatunnel.api.table.type.SeaTunnelRow; + +import org.apache.commons.lang3.tuple.Pair; + +import org.junit.jupiter.api.Disabled; +import org.testcontainers.containers.GenericContainer; + +import com.google.common.collect.Lists; + +import java.math.BigDecimal; +import java.sql.Date; +import java.sql.Timestamp; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import static org.awaitility.Awaitility.given; + +@Disabled("Oracle mode of OceanBase Enterprise Edition does not provide docker environment") +public class JdbcOceanBaseOracleIT extends JdbcOceanBaseITBase { + + @Override + String imageName() { + return null; + } + + @Override + String host() { + return "e2e_oceanbase_oracle"; + } + + @Override + int port() { + return 2883; + } + + @Override + String username() { + return "root"; + } + + @Override + String password() { + return ""; + } + + @Override + List configFile() { + return Lists.newArrayList("/jdbc_oceanbase_oracle_source_and_sink.conf"); + } + + @Override + GenericContainer initContainer() { + throw new UnsupportedOperationException(); + } + + @Override + public void startUp() { + jdbcCase = getJdbcCase(); + + given().ignoreExceptions() + .await() + .atMost(360, TimeUnit.SECONDS) + .untilAsserted(() -> this.initializeJdbcConnection(jdbcCase.getJdbcUrl())); + + createSchemaIfNeeded(); + createNeededTables(); + insertTestData(); + } + + @Override + public String quoteIdentifier(String field) { + return "\"" + field + "\""; + } + + @Override + String createSqlTemplate() { + return "create table %s\n" + + "(\n" + + " VARCHAR_10_COL varchar2(10),\n" + + " CHAR_10_COL char(10),\n" + + " CLOB_COL clob,\n" + + " NUMBER_3_SF_2_DP number(3, 2),\n" + + " INTEGER_COL integer,\n" + + " FLOAT_COL float(10),\n" + + " REAL_COL real,\n" + + " BINARY_FLOAT_COL binary_float,\n" + + " BINARY_DOUBLE_COL binary_double,\n" + + " DATE_COL date,\n" + + " TIMESTAMP_WITH_3_FRAC_SEC_COL timestamp(3),\n" + + " TIMESTAMP_WITH_LOCAL_TZ timestamp with local time zone\n" + + ")"; + } + + @Override + String[] getFieldNames() { + return new String[] { + "VARCHAR_10_COL", + "CHAR_10_COL", + "CLOB_COL", + "NUMBER_3_SF_2_DP", + "INTEGER_COL", + "FLOAT_COL", + "REAL_COL", + "BINARY_FLOAT_COL", + "BINARY_DOUBLE_COL", + "DATE_COL", + "TIMESTAMP_WITH_3_FRAC_SEC_COL", + "TIMESTAMP_WITH_LOCAL_TZ" + }; + } + + @Override + Pair> initTestData() { + String[] fieldNames = getFieldNames(); + + List rows = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + SeaTunnelRow row = + new SeaTunnelRow( + new Object[] { + String.format("f%s", i), + String.format("f%s", i), + String.format("f%s", i), + BigDecimal.valueOf(1.1), + i, + Float.parseFloat("2.2"), + 
Float.parseFloat("2.2"), + Float.parseFloat("22.2"), + Double.parseDouble("2.2"), + Date.valueOf(LocalDate.now()), + Timestamp.valueOf(LocalDateTime.now()), + Timestamp.valueOf(LocalDateTime.now()) + }); + rows.add(row); + } + + return Pair.of(fieldNames, rows); + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_oceanbase_mysql_source_and_sink.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_oceanbase_mysql_source_and_sink.conf new file mode 100644 index 00000000000..098d3ffae26 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_oceanbase_mysql_source_and_sink.conf @@ -0,0 +1,55 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +###### +###### This config file is a demonstration of streaming processing in seatunnel config +###### + +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" + #execution.checkpoint.interval = 10000 + #execution.checkpoint.data-uri = "hdfs://localhost:9000/checkpoint" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + Jdbc { + driver = com.oceanbase.jdbc.Driver + url = "jdbc:oceanbase://e2e_oceanbase_mysql:2881/seatunnel?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true&serverTimezone=UTC" + user = root + password = "" + query = "SELECT c_bit_1, c_bit_8, c_bit_16, c_bit_32, c_bit_64, c_boolean, c_tinyint, c_tinyint_unsigned, c_smallint, c_smallint_unsigned, c_mediumint, c_mediumint_unsigned, c_int, c_integer, c_bigint, c_bigint_unsigned, c_decimal, c_decimal_unsigned, c_float, c_float_unsigned, c_double, c_double_unsigned, c_char, c_tinytext, c_mediumtext, c_text, c_varchar, c_json, c_longtext, c_date, c_datetime, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary, c_binary, c_year, c_int_unsigned, c_integer_unsigned, c_bigint_30, c_decimal_unsigned_30, c_decimal_30 FROM source" + compatible_mode = "mysql" + } + + # If you would like to get more information about how to configure seatunnel and see full list of source plugins, + # please go to https://seatunnel.apache.org/docs/connector-v2/source/FakeSource +} + +sink { + Jdbc { + driver = com.oceanbase.jdbc.Driver + url = "jdbc:oceanbase://e2e_oceanbase_mysql:2881/seatunnel?useUnicode=true&characterEncoding=UTF-8&rewriteBatchedStatements=true&serverTimezone=UTC" + user = root + password = "" + query = "insert into sink(c_bit_1, c_bit_8, c_bit_16, c_bit_32, c_bit_64, c_boolean, c_tinyint, c_tinyint_unsigned, c_smallint, c_smallint_unsigned, c_mediumint, c_mediumint_unsigned, c_int, c_integer, c_bigint, 
c_bigint_unsigned, c_decimal, c_decimal_unsigned, c_float, c_float_unsigned, c_double, c_double_unsigned, c_char, c_tinytext, c_mediumtext, c_text, c_varchar, c_json, c_longtext, c_date, c_datetime, c_timestamp, c_tinyblob, c_mediumblob, c_blob, c_longblob, c_varbinary, c_binary, c_year, c_int_unsigned, c_integer_unsigned,c_bigint_30,c_decimal_unsigned_30,c_decimal_30) values (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?);" + compatible_mode = "mysql" + } + # If you would like to get more information about how to configure seatunnel and see full list of sink plugins, + # please go to https://seatunnel.apache.org/docs/connector-v2/sink +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_oceanbase_oracle_source_and_sink.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_oceanbase_oracle_source_and_sink.conf new file mode 100644 index 00000000000..bf2b1ccf067 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_oceanbase_oracle_source_and_sink.conf @@ -0,0 +1,53 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
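+#
+# OceanBase Oracle-mode batch job: the Jdbc source reads the listed columns from the
+# "source" table and the Jdbc sink writes them back into "sink", both through the
+# OceanBase driver with compatible_mode = "oracle".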
+# +###### +###### This config file is a demonstration of streaming processing in seatunnel config +###### + +env { + # You can set flink configuration here + execution.parallelism = 1 + job.mode = "BATCH" + #execution.checkpoint.interval = 10000 + #execution.checkpoint.data-uri = "hdfs://localhost:9000/checkpoint" +} + +source { + jdbc{ + # This is a example source plugin **only for test and demonstrate the feature source plugin** + url = "jdbc:oceanbase://e2e_oceanbase_oracle:2883/seatunnel" + driver = com.oceanbase.jdbc.Driver + user = "root" + password = "" + query = "SELECT VARCHAR_10_COL,CHAR_10_COL,CLOB_COL,NUMBER_3_SF_2_DP,INTEGER_COL,FLOAT_COL,REAL_COL,BINARY_FLOAT_COL,BINARY_DOUBLE_COL,DATE_COL,TIMESTAMP_WITH_3_FRAC_SEC_COL,TIMESTAMP_WITH_LOCAL_TZ FROM source" + compatible_mode = "oracle" + } +} + +transform { +} + +sink { + jdbc{ + url = "jdbc:oceanbase://e2e_oceanbase_oracle:2883/seatunnel" + driver = com.oceanbase.jdbc.Driver + user = "root" + password = "" + query = "INSERT INTO sink (VARCHAR_10_COL,CHAR_10_COL,CLOB_COL,NUMBER_3_SF_2_DP,INTEGER_COL,FLOAT_COL,REAL_COL,BINARY_FLOAT_COL,BINARY_DOUBLE_COL,DATE_COL,TIMESTAMP_WITH_3_FRAC_SEC_COL,TIMESTAMP_WITH_LOCAL_TZ) VALUES(?,?,?,?,?,?,?,?,?,?,?,?)" + compatible_mode = "oracle" + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/selectdb-jdbc-to-selectdb.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/selectdb-jdbc-to-selectdb.conf index 99c7c8dc8fc..9795d4132ed 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/selectdb-jdbc-to-selectdb.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/selectdb-jdbc-to-selectdb.conf @@ -36,7 +36,7 @@ transform { sink { SelectDBCloud { load-url = "selectdb_e2e:8030" - base-url = "jdbc:mysql://selectdb_e2e:9030" + jdbc-url = "selectdb_e2e:9030" username = "admin" password = "" cluster-name = "cluster" diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/pom.xml b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/pom.xml index 81ecdc29882..8628e2b80b6 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/pom.xml +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/pom.xml @@ -52,8 +52,30 @@ ${testcontainer.version} test + + org.testcontainers + oracle-xe + ${testcontainer.version} + test + + + org.testcontainers + mysql + ${testcontainer.version} + test + + + mysql + mysql-connector-java + test + + + com.oracle.database.jdbc + ojdbc8 + test + org.postgresql postgresql diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcPostgresIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcPostgresIT.java index 6a3eb231b27..f66ef615d7b 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcPostgresIT.java +++ 
b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcPostgresIT.java @@ -95,7 +95,9 @@ public class JdbcPostgresIT extends TestSuiteBase implements TestResource { + " multilinestring geometry(MULTILINESTRING, 4326),\n" + " multipolygon geometry(MULTIPOLYGON, 4326),\n" + " geometrycollection geometry(GEOMETRYCOLLECTION, 4326),\n" - + " geog geography(POINT, 4326)\n" + + " geog geography(POINT, 4326),\n" + + " json_col json NOT NULL,\n" + + " jsonb_col jsonb NOT NULL\n" + ")"; private static final String PG_SINK_DDL = "CREATE TABLE IF NOT EXISTS pg_e2e_sink_table (\n" @@ -126,7 +128,9 @@ public class JdbcPostgresIT extends TestSuiteBase implements TestResource { + " multilinestring varchar(2000) NULL,\n" + " multipolygon varchar(2000) NULL,\n" + " geometrycollection varchar(2000) NULL,\n" - + " geog varchar(2000) NULL\n" + + " geog varchar(2000) NULL,\n" + + " json_col json NOT NULL \n," + + " jsonb_col jsonb NOT NULL\n" + " )"; private static final String SOURCE_SQL = "select \n" @@ -157,8 +161,10 @@ public class JdbcPostgresIT extends TestSuiteBase implements TestResource { + "multilinestring,\n" + "multipolygon,\n" + "geometrycollection,\n" - + "geog\n" - + " from pg_e2e_source_table"; + + "geog,\n" + + "json_col,\n" + + "jsonb_col\n" + + "from pg_e2e_source_table"; private static final String SINK_SQL = "select\n" + " gid,\n" @@ -188,7 +194,9 @@ public class JdbcPostgresIT extends TestSuiteBase implements TestResource { + " cast(multilinestring as geometry) as multilinestring,\n" + " cast(multipolygon as geometry) as multilinestring,\n" + " cast(geometrycollection as geometry) as geometrycollection,\n" - + " cast(geog as geography) as geog\n" + + " cast(geog as geography) as geog,\n" + + " json_col,\n" + + " jsonb_col\n" + "from\n" + " pg_e2e_sink_table"; @@ -279,7 +287,9 @@ private void initializeJdbcTable() { + " multilinestring,\n" + " multipolygon,\n" + " geometrycollection,\n" - + " geog\n" + + " geog,\n" + + " json_col,\n" + + " jsonb_col \n" + " )\n" + "VALUES\n" + " (\n" @@ -330,7 +340,9 @@ private void initializeJdbcTable() { + " 'GEOMETRYCOLLECTION(POINT(-122.3462 47.5921), LINESTRING(-122.3460 47.5924, -122.3457 47.5924))',\n" + " 4326\n" + " ),\n" - + " ST_GeographyFromText('POINT(-122.3452 47.5925)')\n" + + " ST_GeographyFromText('POINT(-122.3452 47.5925)'),\n" + + " '{\"key\":\"test\"}',\n" + + " '{\"key\":\"test\"}'\n" + " )"); } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcSinkCDCChangelogIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcSinkCDCChangelogIT.java index 2a29c1cb5fe..dd812efb12b 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcSinkCDCChangelogIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcSinkCDCChangelogIT.java @@ -126,6 +126,16 @@ public void testSinkCDCChangelog(TestContainer container) Stream.>of(Arrays.asList(1L, "A_1", 100), Arrays.asList(3L, "C", 100)) .collect(Collectors.toSet()); Assertions.assertIterableEquals(expected, actual); + try (Connection connection = + DriverManager.getConnection( + 
postgreSQLContainer.getJdbcUrl(), + postgreSQLContainer.getUsername(), + postgreSQLContainer.getPassword())) { + try (Statement statement = connection.createStatement()) { + statement.execute("truncate table sink"); + log.info("testSinkCDCChangelog truncate table sink"); + } + } } private void initializeJdbcTable() { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink.conf index 1c7417f8a55..7a34a4f49c0 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink.conf @@ -28,7 +28,7 @@ source{ password = "test" query ="""select gid, text_col, varchar_col, char_col, boolean_col, smallint_col, integer_col, bigint_col, decimal_col, numeric_col, real_col, double_precision_col, smallserial_col, serial_col, bigserial_col, date_col, timestamp_col, bpchar_col, age, name, point, linestring, polygon_colums, multipoint, - multilinestring, multipolygon, geometrycollection, geog from pg_e2e_source_table""" + multilinestring, multipolygon, geometrycollection, geog, json_col, jsonb_col from pg_e2e_source_table""" } } @@ -36,12 +36,12 @@ source{ sink { Jdbc { driver = org.postgresql.Driver - url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF" + url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF&stringtype=unspecified" user = test password = test generate_sink_sql = true database = test - table = "public.pg_e2e_sink_table" + table = public.pg_e2e_sink_table primary_keys = ["gid"] } } \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_parallel.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_parallel.conf index 25df382c4af..58feafe102a 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_parallel.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_parallel.conf @@ -28,7 +28,7 @@ source{ password = "test" query ="""select gid, text_col, varchar_col, char_col, boolean_col, smallint_col, integer_col, bigint_col, decimal_col, numeric_col, real_col, double_precision_col, smallserial_col, serial_col, bigserial_col, date_col, timestamp_col, bpchar_col, age, name, point, linestring, polygon_colums, multipoint, - multilinestring, multipolygon, geometrycollection, geog from pg_e2e_source_table""" + multilinestring, multipolygon, geometrycollection, geog, json_col, jsonb_col from pg_e2e_source_table""" partition_column= "gid" result_table_name = "jdbc" @@ -40,14 +40,14 @@ transform { sink { jdbc { - url = "jdbc:postgresql://postgresql:5432/test" + url = "jdbc:postgresql://postgresql:5432/test?stringtype=unspecified" driver = "org.postgresql.Driver" user = "test" password = "test" connection_check_timeout_sec = 100 query ="""INSERT INTO pg_e2e_sink_table ( gid, text_col, varchar_col, char_col, boolean_col, smallint_col, integer_col, 
bigint_col, decimal_col, numeric_col, real_col, double_precision_col, smallserial_col, serial_col, bigserial_col, date_col, timestamp_col, bpchar_col, age, name, point, - linestring, polygon_colums, multipoint, multilinestring, multipolygon, geometrycollection, geog ) - VALUES( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )""" + linestring, polygon_colums, multipoint, multilinestring, multipolygon, geometrycollection, geog, json_col, jsonb_col) + VALUES( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )""" } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_parallel_upper_lower.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_parallel_upper_lower.conf index 46f1b43022b..4a98ab64776 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_parallel_upper_lower.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_parallel_upper_lower.conf @@ -28,7 +28,7 @@ source{ password = "test" query ="""select gid, text_col, varchar_col, char_col, boolean_col, smallint_col, integer_col, bigint_col, decimal_col, numeric_col, real_col, double_precision_col, smallserial_col, serial_col, bigserial_col, date_col, timestamp_col, bpchar_col, age, name, point, linestring, polygon_colums, multipoint, - multilinestring, multipolygon, geometrycollection, geog from pg_e2e_source_table""" + multilinestring, multipolygon, geometrycollection, geog, json_col, jsonb_col from pg_e2e_source_table""" partition_column= "gid" result_table_name = "jdbc" @@ -43,7 +43,7 @@ transform { sink { jdbc { - url = "jdbc:postgresql://postgresql:5432/test" + url = "jdbc:postgresql://postgresql:5432/test?stringtype=unspecified" driver = "org.postgresql.Driver" user = "test" @@ -51,7 +51,7 @@ sink { connection_check_timeout_sec = 100 query ="""INSERT INTO pg_e2e_sink_table ( gid, text_col, varchar_col, char_col, boolean_col, smallint_col, integer_col, bigint_col, decimal_col, numeric_col, real_col, double_precision_col, smallserial_col, serial_col, bigserial_col, date_col, timestamp_col, bpchar_col, age, name, point, - linestring, polygon_colums, multipoint, multilinestring, multipolygon, geometrycollection, geog ) - VALUES( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )""" + linestring, polygon_colums, multipoint, multilinestring, multipolygon, geometrycollection, geog, json_col, jsonb_col ) + VALUES( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
)""" } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_xa.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_xa.conf index ba32ca81bc1..d135b19376a 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_xa.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_postgres_source_and_sink_xa.conf @@ -29,7 +29,7 @@ source { password = "test" query ="""select gid, text_col, varchar_col, char_col, boolean_col, smallint_col, integer_col, bigint_col, decimal_col, numeric_col, real_col, double_precision_col, smallserial_col, serial_col, bigserial_col, date_col, timestamp_col, bpchar_col, age, name, point, linestring, polygon_colums, multipoint, - multilinestring, multipolygon, geometrycollection, geog from pg_e2e_source_table""" + multilinestring, multipolygon, geometrycollection, geog, json_col, jsonb_col from pg_e2e_source_table""" } } @@ -38,15 +38,15 @@ transform { sink { jdbc { - url = "jdbc:postgresql://postgresql:5432/test" + url = "jdbc:postgresql://postgresql:5432/test?stringtype=unspecified" driver = "org.postgresql.Driver" user = "test" password = "test" max_retries = 0 query ="""INSERT INTO pg_e2e_sink_table ( gid, text_col, varchar_col, char_col, boolean_col, smallint_col, integer_col, bigint_col, decimal_col, numeric_col, real_col, double_precision_col, smallserial_col, serial_col, bigserial_col, date_col, timestamp_col, bpchar_col, age, name, point, - linestring, polygon_colums, multipoint, multilinestring, multipolygon, geometrycollection, geog ) - VALUES( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? )""" + linestring, polygon_colums, multipoint, multilinestring, multipolygon, geometrycollection, geog, json_col, jsonb_col ) + VALUES( ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ? 
)""" is_exactly_once = "true" diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_sink_cdc_changelog.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_sink_cdc_changelog.conf index 5a48476171e..e0742a04f4c 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_sink_cdc_changelog.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-3/src/test/resources/jdbc_sink_cdc_changelog.conf @@ -66,7 +66,7 @@ sink { password = test generate_sink_sql = true database = test - table = "public.sink" + table = public.sink primary_keys = ["pk_id"] } } \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/pom.xml b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/pom.xml new file mode 100644 index 00000000000..99bbff4fa23 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/pom.xml @@ -0,0 +1,96 @@ + + + + 4.0.0 + + org.apache.seatunnel + connector-jdbc-e2e + ${revision} + + + connector-jdbc-e2e-part-4 + SeaTunnel : E2E : Connector V2 : Jdbc : Part 4 + + + + org.apache.seatunnel + connector-jdbc-e2e-common + ${project.version} + test-jar + test + + + + + org.testcontainers + postgresql + ${testcontainer.version} + test + + + net.snowflake + snowflake-jdbc + test + + + org.testcontainers + mssqlserver + ${testcontainer.version} + test + + + org.testcontainers + oracle-xe + ${testcontainer.version} + test + + + org.testcontainers + mysql + ${testcontainer.version} + test + + + + + mysql + mysql-connector-java + test + + + com.oracle.database.jdbc + ojdbc8 + test + + + org.postgresql + postgresql + test + + + com.microsoft.sqlserver + mssql-jdbc + test + + + com.vertica.jdbc + vertica-jdbc + test + + + + diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcMySqlCreateTableIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcMySqlCreateTableIT.java new file mode 100644 index 00000000000..cdc6fe1992a --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcMySqlCreateTableIT.java @@ -0,0 +1,471 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.jdbc; + +import org.apache.seatunnel.api.table.catalog.CatalogTable; +import org.apache.seatunnel.api.table.catalog.TablePath; +import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; +import org.apache.seatunnel.common.utils.JdbcUrlUtil; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.mysql.MySqlCatalog; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.oracle.OracleCatalog; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.oracle.OracleURLParser; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.psql.PostgresCatalog; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.sqlserver.SqlServerCatalog; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.sqlserver.SqlServerURLParser; +import org.apache.seatunnel.e2e.common.TestResource; +import org.apache.seatunnel.e2e.common.TestSuiteBase; +import org.apache.seatunnel.e2e.common.container.ContainerExtendedFactory; +import org.apache.seatunnel.e2e.common.container.EngineType; +import org.apache.seatunnel.e2e.common.container.TestContainer; +import org.apache.seatunnel.e2e.common.junit.DisabledOnContainer; +import org.apache.seatunnel.e2e.common.junit.TestContainerExtension; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.TestTemplate; +import org.testcontainers.containers.Container; +import org.testcontainers.containers.MSSQLServerContainer; +import org.testcontainers.containers.MySQLContainer; +import org.testcontainers.containers.OracleContainer; +import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.containers.wait.strategy.Wait; +import org.testcontainers.lifecycle.Startables; +import org.testcontainers.utility.DockerImageName; +import org.testcontainers.utility.DockerLoggerFactory; + +import com.google.common.collect.Lists; +import lombok.extern.slf4j.Slf4j; + +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.stream.Stream; + +@Slf4j +@DisabledOnContainer( + value = {}, + type = {EngineType.SPARK, EngineType.FLINK}, + disabledReason = "Currently SPARK and FLINK do not support cdc") +public class JdbcMySqlCreateTableIT extends TestSuiteBase implements TestResource { + private static final String SQLSERVER_IMAGE = "mcr.microsoft.com/mssql/server:2022-latest"; + private static final String SQLSERVER_CONTAINER_HOST = "sqlserver"; + private static final int SQLSERVER_CONTAINER_PORT = 14333; + private static final String DRIVER_CLASS = "com.microsoft.sqlserver.jdbc.SQLServerDriver"; + + private static final String PG_IMAGE = "postgis/postgis"; + private static final String PG_DRIVER_JAR = + "https://repo1.maven.org/maven2/org/postgresql/postgresql/42.3.3/postgresql-42.3.3.jar"; + private static final String PG_JDBC_JAR = + "https://repo1.maven.org/maven2/net/postgis/postgis-jdbc/2.5.1/postgis-jdbc-2.5.1.jar"; + private static final String PG_GEOMETRY_JAR = + "https://repo1.maven.org/maven2/net/postgis/postgis-geometry/2.5.1/postgis-geometry-2.5.1.jar"; + + private static final String MYSQL_IMAGE = "mysql:latest"; + private static final String MYSQL_CONTAINER_HOST = "mysql-e2e"; + private static final String MYSQL_DATABASE = "auto"; + + private static final String MYSQL_USERNAME = "root"; + private 
static final String PASSWORD = "Abc!@#135_seatunnel"; + private static final int MYSQL_PORT = 33061; + // private static final String MYSQL_URL = "jdbc:mysql://" + HOST + ":%s/%s?useSSL=false"; + + private static final String MYSQL_DRIVER_CLASS = "com.mysql.cj.jdbc.Driver"; + + private static final String ORACLE_IMAGE = "gvenzl/oracle-xe:21-slim-faststart"; + private static final String ORACLE_NETWORK_ALIASES = "e2e_oracleDb"; + private static final String ORACLE_DRIVER_CLASS = "oracle.jdbc.OracleDriver"; + private static final int ORACLE_PORT = 15211; + // private static final String ORACLE_URL = "jdbc:oracle:thin:@" + HOST + ":%s/%s"; + private static final String USERNAME = "testUser"; + private static final String DATABASE = "TESTUSER"; + + private PostgreSQLContainer POSTGRESQL_CONTAINER; + + private MSSQLServerContainer sqlserver_container; + private MySQLContainer mysql_container; + private OracleContainer oracle_container; + + private static final String mysqlCheck = + "SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema = 'auto' AND table_name = 'mysql_auto_create_mysql') AS table_exists"; + private static final String sqlserverCheck = + "IF EXISTS (\n" + + " SELECT 1\n" + + " FROM testauto.sys.tables t\n" + + " JOIN testauto.sys.schemas s ON t.schema_id = s.schema_id\n" + + " WHERE t.name = 'mysql_auto_create_sql' AND s.name = 'dbo'\n" + + ")\n" + + " SELECT 1 AS table_exists;\n" + + "ELSE\n" + + " SELECT 0 AS table_exists;"; + private static final String pgCheck = + "SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'mysql_auto_create_pg') AS table_exists;\n"; + private static final String oracleCheck = + "SELECT CASE WHEN EXISTS(SELECT 1 FROM user_tables WHERE table_name = 'mysql_auto_create_oracle') THEN 1 ELSE 0 END AS table_exists FROM DUAL;\n"; + + String driverSqlServerUrl() { + return "https://repo1.maven.org/maven2/com/microsoft/sqlserver/mssql-jdbc/9.4.1.jre8/mssql-jdbc-9.4.1.jre8.jar"; + } + + private static final String CREATE_SQL_DATABASE = + "IF NOT EXISTS (\n" + + " SELECT name \n" + + " FROM sys.databases \n" + + " WHERE name = N'testauto'\n" + + ")\n" + + "CREATE DATABASE testauto;\n"; + + private static final String CREATE_TABLE_SQL = + "CREATE TABLE IF NOT EXISTS mysql_auto_create\n" + + "(\n " + + "`id` int(11) NOT NULL AUTO_INCREMENT,\n" + + " `f_binary` binary(64) DEFAULT NULL,\n" + + " `f_smallint` smallint(6) DEFAULT NULL,\n" + + " `f_smallint_unsigned` smallint(5) unsigned DEFAULT NULL,\n" + + " `f_mediumint` mediumint(9) DEFAULT NULL,\n" + + " `f_mediumint_unsigned` mediumint(8) unsigned DEFAULT NULL,\n" + + " `f_int` int(11) DEFAULT NULL,\n" + + " `f_int_unsigned` int(10) unsigned DEFAULT NULL,\n" + + " `f_integer` int(11) DEFAULT NULL,\n" + + " `f_integer_unsigned` int(10) unsigned DEFAULT NULL,\n" + + " `f_bigint` bigint(20) DEFAULT NULL,\n" + + " `f_bigint_unsigned` bigint(20) unsigned DEFAULT NULL,\n" + + " `f_numeric` decimal(10,0) DEFAULT NULL,\n" + + " `f_decimal` decimal(10,0) DEFAULT NULL,\n" + + " `f_float` float DEFAULT NULL,\n" + + " `f_double` double DEFAULT NULL,\n" + + " `f_double_precision` double DEFAULT NULL,\n" + + " `f_tinytext` tinytext COLLATE utf8mb4_unicode_ci,\n" + + " `f_varchar` varchar(100) COLLATE utf8mb4_unicode_ci DEFAULT NULL,\n" + + " `f_datetime` datetime DEFAULT NULL,\n" + + " `f_timestamp` timestamp NULL DEFAULT NULL,\n" + + " `f_bit1` bit(1) DEFAULT NULL,\n" + + " `f_bit64` bit(64) DEFAULT NULL,\n" + + " `f_char` char(1) COLLATE 
utf8mb4_unicode_ci DEFAULT NULL,\n" + + " `f_enum` enum('enum1','enum2','enum3') COLLATE utf8mb4_unicode_ci DEFAULT NULL,\n" + + " `f_real` double DEFAULT NULL,\n" + + " `f_tinyint` tinyint(4) DEFAULT NULL,\n" + + " `f_bigint8` bigint(8) DEFAULT NULL,\n" + + " `f_bigint1` bigint(1) DEFAULT NULL,\n" + + " `f_data` date DEFAULT NULL,\n" + + " PRIMARY KEY (`id`)\n" + + ");"; + + private String getInsertSql = + "INSERT INTO mysql_auto_create" + + "(id, f_binary, f_smallint, f_smallint_unsigned, f_mediumint, f_mediumint_unsigned, f_int, f_int_unsigned, f_integer, f_integer_unsigned, f_bigint, f_bigint_unsigned, f_numeric, f_decimal, f_float, f_double, f_double_precision, f_tinytext, f_varchar, f_datetime, f_timestamp, f_bit1, f_bit64, f_char, f_enum, f_real, f_tinyint, f_bigint8, f_bigint1, f_data)\n" + + "VALUES(575, 0x654458436C70336B7357000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000, 194, 549, 633, 835, 719, 253, 742, 265, 806, 736, 474, 254, 120.8, 476.42, 264.95, 'In other words, Navicat provides the ability for data in different databases and/or schemas to be kept up-to-date so that each repository contains the same information.', 'jF9X70ZqH4', '2011-10-20 23:10:08', '2017-09-10 19:33:51', 1, b'0001001101100000001010010100010111000010010110110101110011111100', 'u', 'enum2', 876.55, 25, 503, 1, '2011-03-06');\n"; + + @TestContainerExtension + private final ContainerExtendedFactory extendedSqlServerFactory = + container -> { + Container.ExecResult extraCommands = + container.execInContainer( + "bash", + "-c", + "mkdir -p /tmp/seatunnel/plugins/Jdbc/lib && cd /tmp/seatunnel/plugins/Jdbc/lib && curl -O " + + PG_DRIVER_JAR + + " && curl -O " + + PG_JDBC_JAR + + " && curl -O " + + PG_GEOMETRY_JAR + + " && curl -O " + + MYSQL_DRIVER_CLASS + + " && curl -O " + + ORACLE_DRIVER_CLASS + + " && curl -O " + + driverSqlserverUrl() + + " && curl -O " + + driverMySqlUrl() + + " && curl -O " + + driverOracleUrl()); + // Assertions.assertEquals(0, extraCommands.getExitCode()); + }; + + String driverMySqlUrl() { + return "https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.0.32/mysql-connector-j-8.0.32.jar"; + } + + String driverOracleUrl() { + return "https://repo1.maven.org/maven2/com/oracle/database/jdbc/ojdbc8/12.2.0.1/ojdbc8-12.2.0.1.jar"; + } + + String driverSqlserverUrl() { + return "https://repo1.maven.org/maven2/com/microsoft/sqlserver/mssql-jdbc/9.4.1.jre8/mssql-jdbc-9.4.1.jre8.jar"; + } + + void initContainer() throws ClassNotFoundException { + DockerImageName imageName = DockerImageName.parse(SQLSERVER_IMAGE); + sqlserver_container = + new MSSQLServerContainer<>(imageName) + .withNetwork(TestSuiteBase.NETWORK) + .withNetworkAliases(SQLSERVER_CONTAINER_HOST) + .withPassword(PASSWORD) + .acceptLicense() + .withLogConsumer( + new Slf4jLogConsumer( + DockerLoggerFactory.getLogger(SQLSERVER_IMAGE))); + + sqlserver_container.setPortBindings( + Lists.newArrayList(String.format("%s:%s", SQLSERVER_CONTAINER_PORT, 1433))); + + try { + Class.forName(sqlserver_container.getDriverClassName()); + } catch (ClassNotFoundException e) { + throw new SeaTunnelRuntimeException( + JdbcITErrorCode.DRIVER_NOT_FOUND, "Not found suitable driver for mssql", e); + } + + // ============= PG + POSTGRESQL_CONTAINER = + new PostgreSQLContainer<>( + DockerImageName.parse(PG_IMAGE) + .asCompatibleSubstituteFor("postgres")) + .withNetwork(TestSuiteBase.NETWORK) + .withNetworkAliases("postgresql") + .withDatabaseName("pg") + .withUsername(USERNAME) + 
.withPassword(PASSWORD) + .withCommand("postgres -c max_prepared_transactions=100") + .withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger(PG_IMAGE))); + POSTGRESQL_CONTAINER.setPortBindings( + Lists.newArrayList(String.format("%s:%s", 54323, 5432))); + // Startables.deepStart(Stream.of(POSTGRESQL_CONTAINER)).join(); + log.info("PostgreSQL container started"); + Class.forName(POSTGRESQL_CONTAINER.getDriverClassName()); + + log.info("pg data initialization succeeded. Procedure"); + DockerImageName mysqlImageName = DockerImageName.parse(MYSQL_IMAGE); + mysql_container = + new MySQLContainer<>(mysqlImageName) + .withUsername(MYSQL_USERNAME) + .withPassword(PASSWORD) + .withDatabaseName(MYSQL_DATABASE) + .withNetwork(NETWORK) + .withNetworkAliases(MYSQL_CONTAINER_HOST) + .withExposedPorts(MYSQL_PORT) + .waitingFor(Wait.forHealthcheck()) + .withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger(MYSQL_IMAGE))); + + mysql_container.setPortBindings( + Lists.newArrayList(String.format("%s:%s", MYSQL_PORT, 3306))); + DockerImageName oracleImageName = DockerImageName.parse(ORACLE_IMAGE); + oracle_container = + new OracleContainer(oracleImageName) + .withDatabaseName(DATABASE) + .withUsername(USERNAME) + .withPassword(PASSWORD) + .withNetwork(NETWORK) + .withNetworkAliases(ORACLE_NETWORK_ALIASES) + .withExposedPorts(ORACLE_PORT) + .withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger(ORACLE_IMAGE))); + oracle_container.withCommand( + "bash", + "-c", + "echo \"CREATE USER admin IDENTIFIED BY admin; GRANT DBA TO admin;\" | sqlplus / as sysdba"); + oracle_container.setPortBindings( + Lists.newArrayList(String.format("%s:%s", ORACLE_PORT, 1521))); + Startables.deepStart( + Stream.of( + POSTGRESQL_CONTAINER, + sqlserver_container, + mysql_container, + oracle_container)) + .join(); + } + + @Override + @BeforeAll + public void startUp() throws Exception { + initContainer(); + initializeSqlJdbcTable(); + initializeJdbcTable(); + } + + static JdbcUrlUtil.UrlInfo sqlParse = + SqlServerURLParser.parse("jdbc:sqlserver://localhost:14333;database=testauto"); + static JdbcUrlUtil.UrlInfo MysqlUrlInfo = + JdbcUrlUtil.getUrlInfo("jdbc:mysql://localhost:33061/auto?useSSL=false"); + static JdbcUrlUtil.UrlInfo pg = JdbcUrlUtil.getUrlInfo("jdbc:postgresql://localhost:54323/pg"); + static JdbcUrlUtil.UrlInfo oracle = + OracleURLParser.parse("jdbc:oracle:thin:@localhost:15211/TESTUSER"); + + @TestTemplate + public void testAutoCreateTable(TestContainer container) + throws IOException, InterruptedException { + TablePath tablePathMySql = TablePath.of("auto", "mysql_auto_create"); + TablePath tablePathMySql_Mysql = TablePath.of("auto", "mysql_auto_create_mysql"); + TablePath tablePathSQL = TablePath.of("testauto", "dbo", "mysql_auto_create_sql"); + TablePath tablePathPG = TablePath.of("pg", "public", "mysql_auto_create_pg"); + TablePath tablePathOracle = TablePath.of("TESTUSER", "mysql_auto_create_oracle"); + + SqlServerCatalog sqlServerCatalog = + new SqlServerCatalog("sqlserver", "sa", PASSWORD, sqlParse, "dbo"); + MySqlCatalog mySqlCatalog = new MySqlCatalog("mysql", "root", PASSWORD, MysqlUrlInfo); + PostgresCatalog postgresCatalog = + new PostgresCatalog("postgres", "testUser", PASSWORD, pg, "public"); + OracleCatalog oracleCatalog = + new OracleCatalog("oracle", "admin", "admin", oracle, "TESTUSER"); + mySqlCatalog.open(); + sqlServerCatalog.open(); + postgresCatalog.open(); + // oracleCatalog.open(); + + CatalogTable mysqlTable = mySqlCatalog.getTable(tablePathMySql); + 
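+        // The MySQL table definition fetched above is used as the template for
+        // auto-creating equivalent tables in SQL Server, PostgreSQL and MySQL below;
+        // the Oracle branch stays commented out.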
+ sqlServerCatalog.createTable(tablePathSQL, mysqlTable, true); + postgresCatalog.createTable(tablePathPG, mysqlTable, true); + // oracleCatalog.createTable(tablePathOracle, mysqlTable, true); + mySqlCatalog.createTable(tablePathMySql_Mysql, mysqlTable, true); + + Assertions.assertTrue(checkMysql(mysqlCheck)); + // Assertions.assertTrue(checkOracle(oracleCheck)); + Assertions.assertTrue(checkSqlServer(sqlserverCheck)); + Assertions.assertTrue(checkPG(pgCheck)); + + // delete table + log.info("delete table"); + mySqlCatalog.dropTable(tablePathMySql_Mysql, true); + sqlServerCatalog.dropTable(tablePathSQL, true); + postgresCatalog.dropTable(tablePathPG, true); + // oracleCatalog.dropTable(tablePathOracle, true); + mySqlCatalog.dropTable(tablePathMySql, true); + + sqlServerCatalog.close(); + mySqlCatalog.close(); + postgresCatalog.close(); + // delete table + } + + @Override + public void tearDown() throws Exception { + + sqlserver_container.close(); + mysql_container.close(); + oracle_container.close(); + POSTGRESQL_CONTAINER.close(); + } + + private Connection getJdbcSqlServerConnection() throws SQLException { + return DriverManager.getConnection( + sqlserver_container.getJdbcUrl(), + sqlserver_container.getUsername(), + sqlserver_container.getPassword()); + } + + private Connection getJdbcMySqlConnection() throws SQLException { + return DriverManager.getConnection( + mysql_container.getJdbcUrl(), + mysql_container.getUsername(), + mysql_container.getPassword()); + } + + private Connection getJdbcPgConnection() throws SQLException { + return DriverManager.getConnection( + POSTGRESQL_CONTAINER.getJdbcUrl(), + POSTGRESQL_CONTAINER.getUsername(), + POSTGRESQL_CONTAINER.getPassword()); + } + + private Connection getJdbcOracleConnection() throws SQLException { + return DriverManager.getConnection( + oracle_container.getJdbcUrl(), + oracle_container.getUsername(), + oracle_container.getPassword()); + } + + private void initializeSqlJdbcTable() { + try (Connection connection = getJdbcSqlServerConnection()) { + Statement statement = connection.createStatement(); + statement.execute(CREATE_SQL_DATABASE); + // statement.executeBatch(); + } catch (SQLException e) { + throw new RuntimeException("Initializing PostgreSql table failed!", e); + } + } + + private void initializeJdbcTable() { + try (Connection connection = getJdbcMySqlConnection()) { + Statement statement = connection.createStatement(); + statement.execute(CREATE_TABLE_SQL); + statement.execute(getInsertSql); + + // statement.executeBatch(); + } catch (SQLException e) { + throw new RuntimeException("Initializing PostgreSql table failed!", e); + } + } + + private boolean checkMysql(String sql) { + try (Connection connection = getJdbcMySqlConnection()) { + ResultSet resultSet = connection.createStatement().executeQuery(sql); + boolean tableExists = false; + if (resultSet.next()) { + tableExists = resultSet.getBoolean(1); + } + return tableExists; + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + private boolean checkPG(String sql) { + try (Connection connection = getJdbcPgConnection()) { + ResultSet resultSet = connection.createStatement().executeQuery(sql); + boolean tableExists = false; + if (resultSet.next()) { + tableExists = resultSet.getBoolean(1); + } + return tableExists; + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + private boolean checkSqlServer(String sql) { + try (Connection connection = getJdbcSqlServerConnection()) { + ResultSet resultSet = 
connection.createStatement().executeQuery(sql); + boolean tableExists = false; + if (resultSet.next()) { + tableExists = resultSet.getInt(1) == 1; + } + return tableExists; + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + private boolean checkOracle(String sql) { + try (Connection connection = getJdbcOracleConnection()) { + ResultSet resultSet = connection.createStatement().executeQuery(sql); + boolean tableExists = false; + if (resultSet.next()) { + tableExists = resultSet.getInt(1) == 1; + } + return tableExists; + } catch (SQLException e) { + throw new RuntimeException(e); + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcSqlServerCreateTableIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcSqlServerCreateTableIT.java new file mode 100644 index 00000000000..35a2338b260 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-4/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcSqlServerCreateTableIT.java @@ -0,0 +1,482 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.connectors.seatunnel.jdbc; + +import org.apache.seatunnel.api.table.catalog.CatalogTable; +import org.apache.seatunnel.api.table.catalog.TablePath; +import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; +import org.apache.seatunnel.common.utils.JdbcUrlUtil; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.mysql.MySqlCatalog; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.oracle.OracleCatalog; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.oracle.OracleURLParser; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.psql.PostgresCatalog; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.sqlserver.SqlServerCatalog; +import org.apache.seatunnel.connectors.seatunnel.jdbc.catalog.sqlserver.SqlServerURLParser; +import org.apache.seatunnel.e2e.common.TestResource; +import org.apache.seatunnel.e2e.common.TestSuiteBase; +import org.apache.seatunnel.e2e.common.container.ContainerExtendedFactory; +import org.apache.seatunnel.e2e.common.container.EngineType; +import org.apache.seatunnel.e2e.common.container.TestContainer; +import org.apache.seatunnel.e2e.common.junit.DisabledOnContainer; +import org.apache.seatunnel.e2e.common.junit.TestContainerExtension; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.TestTemplate; +import org.testcontainers.containers.Container; +import org.testcontainers.containers.MSSQLServerContainer; +import org.testcontainers.containers.MySQLContainer; +import org.testcontainers.containers.OracleContainer; +import org.testcontainers.containers.PostgreSQLContainer; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.containers.wait.strategy.Wait; +import org.testcontainers.lifecycle.Startables; +import org.testcontainers.utility.DockerImageName; +import org.testcontainers.utility.DockerLoggerFactory; + +import com.google.common.collect.Lists; +import lombok.extern.slf4j.Slf4j; + +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.stream.Stream; + +@Slf4j +@DisabledOnContainer( + value = {}, + type = {EngineType.SPARK, EngineType.FLINK}, + disabledReason = "Currently SPARK and FLINK do not support cdc") +public class JdbcSqlServerCreateTableIT extends TestSuiteBase implements TestResource { + + private static final String SQLSERVER_IMAGE = "mcr.microsoft.com/mssql/server:2022-latest"; + private static final String SQLSERVER_CONTAINER_HOST = "sqlserver-e2e"; + private static final int SQLSERVER_CONTAINER_PORT = 1433; + private static final String SQLSERVER_URL = + "jdbc:sqlserver://" + AbstractJdbcIT.HOST + ":%s;encrypt=false;"; + private static final String DRIVER_CLASS = "com.microsoft.sqlserver.jdbc.SQLServerDriver"; + + private static final String CREATE_DATABASE = + "IF NOT EXISTS (\n" + + " SELECT name \n" + + " FROM sys.databases \n" + + " WHERE name = N'testauto'\n" + + ")\n" + + "CREATE DATABASE testauto;\n"; + + private static final String CREATE_TABLE_SQL = + "IF NOT EXISTS (SELECT * FROM testauto.sys.tables WHERE name = 'sqlserver_auto_create' AND schema_id = SCHEMA_ID('dbo'))\n" + + "BEGIN\n" + + "CREATE TABLE testauto.dbo.sqlserver_auto_create (\n" + + " c1 bigint NOT NULL,\n" + + " c2 bit NULL,\n" + + " c3 decimal(18) NULL,\n" + + " c4 decimal(18,2) NULL,\n" + + " c5 real NULL,\n" + + " c6 float(53) 
NULL,\n" + + " c7 int NULL,\n" + + " c8 money NULL,\n" + + " c9 numeric(18) NULL,\n" + + " c10 numeric(18,2) NULL,\n" + + " c11 real NULL,\n" + + " c12 smallint NULL,\n" + + " c13 smallmoney NULL,\n" + + " c14 tinyint NULL,\n" + + " c15 char(10) NULL,\n" + + " c16 varchar(50) NULL,\n" + + " c17 varchar(max) NULL,\n" + + " c18 text NULL,\n" + + " c19 nchar(10) NULL,\n" + + " c20 nvarchar(50) NULL,\n" + + " c21 nvarchar(max) NULL,\n" + + " c22 ntext NULL,\n" + + " c25 varbinary(max) NULL,\n" + + " c26 image NULL,\n" + + " c27 datetime NULL,\n" + + " c28 datetime2(7) NULL,\n" + + " c29 datetimeoffset(7) NULL,\n" + + " c30 smalldatetime NULL,\n" + + " c31 date NULL,\n" + + " PRIMARY KEY CLUSTERED (c1)\n" + + ") \n" + + "END"; + + private String username; + + private String password; + + private String getInsertSql = + "INSERT INTO testauto.dbo.sqlserver_auto_create\n" + + "(c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11, c12, c13, c14, c15, c16, c17, c18, c19, c20, c21, c22, c25, c26, c27, c28, c29, c30, c31)\n" + + "VALUES(8, 1, 714, 876.63, 368.74686, 61.59519333775628, 97, 7.1403, 497, 727.56, 303.78827, 654, 620.8399, 181, N'qEVAoi6KLU', N'1Y7QDYF6me', N'Navicat allows you to transfer data from one database and/or schema to another with detailed analytical process. Instead of wondering when your next vacation is, maybe you should set up a life you don’t need to escape from. I will greet this day with love in my heart. HTTP Tunneling is a method for connecting to a server that uses the same protocol (http://) and the same port (port 80) as a web server does. Export Wizard allows you to export data from tables, collections, views, or query results to any available formats. Always keep your eyes open. Keep watching. Because whatever you see can inspire you. After logged in the Navicat Cloud feature, the Navigation pane will be divided into Navicat Cloud and My Connections sections. Navicat Cloud could not connect and access your databases. By which it means, it could only store your connection settings, queries, model files, and virtual group; your database passwords and data (e.g. tables, views, etc) will not be stored to Navicat Cloud. Always keep your eyes open. Keep watching. Because whatever you see can inspire you. With its well-designed Graphical User Interface(GUI), Navicat lets you quickly and easily create, organize, access and share information in a secure and easy way. Anyone who has ever made anything of importance was disciplined. After logged in the Navicat Cloud feature, the Navigation pane will be divided into Navicat Cloud and My Connections sections. If you wait, all that happens is you get older. Navicat Data Modeler enables you to build high-quality conceptual, logical and physical data models for a wide variety of audiences. Navicat Monitor requires a repository to store alerts and metrics for historical analysis. There is no way to happiness. Happiness is the way. To connect to a database or schema, simply double-click it in the pane. Anyone who has never made a mistake has never tried anything new. If your Internet Service Provider (ISP) does not provide direct access to its server, Secure Tunneling Protocol (SSH) / HTTP is another solution. Navicat 15 has added support for the system-wide dark mode. You will succeed because most people are lazy. Success consists of going from failure to failure without loss of enthusiasm. SSH serves to prevent such vulnerabilities and allows you to access a remote server''s shell without compromising security. 
Navicat provides a wide range advanced features, such as compelling code editing capabilities, smart code-completion, SQL formatting, and more. Navicat provides powerful tools for working with queries: Query Editor for editing the query text directly, and Query Builder, Find Builder or Aggregate Builder for building queries visually. The Synchronize to Database function will give you a full picture of all database differences. If the plan doesn’t work, change the plan, but never the goal. You can select any connections, objects or projects, and then select the corresponding buttons on the Information Pane. The Main Window consists of several toolbars and panes for you to work on connections, database objects and advanced tools. Actually it is just in an idea when feel oneself can achieve and cannot achieve. The Main Window consists of several toolbars and panes for you to work on connections, database objects and advanced tools. After logged in the Navicat Cloud feature, the Navigation pane will be divided into Navicat Cloud and My Connections sections. Anyone who has never made a mistake has never tried anything new. Navicat Monitor is a safe, simple and agentless remote server monitoring tool that is packed with powerful features to make your monitoring effective as possible. The Main Window consists of several toolbars and panes for you to work on connections, database objects and advanced tools. Navicat provides a wide range advanced features, such as compelling code editing capabilities, smart code-completion, SQL formatting, and more. Champions keep playing until they get it right. If it scares you, it might be a good thing to try. It can also manage cloud databases such as Amazon Redshift, Amazon RDS, Alibaba Cloud. Features in Navicat are sophisticated enough to provide professional developers for all their specific needs, yet easy to learn for users who are new to database server. To connect to a database or schema, simply double-click it in the pane. A query is used to extract data from the database in a readable format according to the user''s request. To successfully establish a new connection to local/remote server - no matter via SSL or SSH, set the database login information in the General tab. SQL Editor allows you to create and edit SQL text, prepare and execute selected queries. Navicat is a multi-connections Database Administration tool allowing you to connect to MySQL, Oracle, PostgreSQL, SQLite, SQL Server, MariaDB and/or MongoDB databases, making database administration to multiple kinds of database so easy. Secure Sockets Layer(SSL) is a protocol for transmitting private documents via the Internet. I may not have gone where I intended to go, but I think I have ended up where I needed to be. Navicat Cloud provides a cloud service for synchronizing connections, queries, model files and virtual group information from Navicat, other Navicat family members, different machines and different platforms. To connect to a database or schema, simply double-click it in the pane. With its well-designed Graphical User Interface(GUI), Navicat lets you quickly and easily create, organize, access and share information in a secure and easy way. I may not have gone where I intended to go, but I think I have ended up where I needed to be. Anyone who has ever made anything of importance was disciplined. Actually it is just in an idea when feel oneself can achieve and cannot achieve. 
Instead of wondering when your next vacation is, maybe you should set up a life you don’t need to escape from. It wasn’t raining when Noah built the ark. You must be the change you wish to see in the world. SQL Editor allows you to create and edit SQL text, prepare and execute selected queries. Navicat provides a wide range advanced features, such as compelling code editing capabilities, smart code-completion, SQL formatting, and more. To start working with your server in Navicat, you should first establish a connection or several connections using the Connection window. SSH serves to prevent such vulnerabilities and allows you to access a remote server''s shell without compromising security. In the Objects tab, you can use the List List, Detail Detail and ER Diagram ER Diagram buttons to change the object view. Genius is an infinite capacity for taking pains. Typically, it is employed as an encrypted version of Telnet. Secure Sockets Layer(SSL) is a protocol for transmitting private documents via the Internet. You cannot save people, you can just love them. You cannot save people, you can just love them. Navicat provides a wide range advanced features, such as compelling code editing capabilities, smart code-completion, SQL formatting, and more. To connect to a database or schema, simply double-click it in the pane. Navicat provides a wide range advanced features, such as compelling code editing capabilities, smart code-completion, SQL formatting, and more. Navicat Monitor requires a repository to store alerts and metrics for historical analysis. How we spend our days is, of course, how we spend our lives. Instead of wondering when your next vacation is, maybe you should set up a life you don’t need to escape from. To start working with your server in Navicat, you should first establish a connection or several connections using the Connection window. Always keep your eyes open. Keep watching. Because whatever you see can inspire you. Navicat Data Modeler enables you to build high-quality conceptual, logical and physical data models for a wide variety of audiences. Navicat Cloud could not connect and access your databases. By which it means, it could only store your connection settings, queries, model files, and virtual group; your database passwords and data (e.g. tables, views, etc) will not be stored to Navicat Cloud. I may not have gone where I intended to go, but I think I have ended up where I needed to be. The reason why a great man is great is that he resolves to be a great man. Export Wizard allows you to export data from tables, collections, views, or query results to any available formats. Navicat 15 has added support for the system-wide dark mode. Actually it is just in an idea when feel oneself can achieve and cannot achieve. SSH serves to prevent such vulnerabilities and allows you to access a remote server''s shell without compromising security. Difficult circumstances serve as a textbook of life for people. Flexible settings enable you to set up a custom key for comparison and synchronization. It collects process metrics such as CPU load, RAM usage, and a variety of other resources over SSH/SNMP. It wasn’t raining when Noah built the ark. SQL Editor allows you to create and edit SQL text, prepare and execute selected queries. You can select any connections, objects or projects, and then select the corresponding buttons on the Information Pane.', N'Actually it is just in an idea when feel oneself can achieve and cannot achieve. 
A man is not old until regrets take the place of dreams. With its well-designed Graphical User Interface(GUI), Navicat lets you quickly and easily create, organize, access and share information in a secure and easy way.', N'j8OKNCrsFb', N'KTLmoNjIiI', N'All the Navicat Cloud objects are located under different projects. You can share the project to other Navicat Cloud accounts for collaboration. Navicat Data Modeler is a powerful and cost-effective database design tool which helps you build high-quality conceptual, logical and physical data models. After logged in the Navicat Cloud feature, the Navigation pane will be divided into Navicat Cloud and My Connections sections. Navicat Cloud provides a cloud service for synchronizing connections, queries, model files and virtual group information from Navicat, other Navicat family members, different machines and different platforms. Secure Sockets Layer(SSL) is a protocol for transmitting private documents via the Internet. To successfully establish a new connection to local/remote server - no matter via SSL, SSH or HTTP, set the database login information in the General tab. Champions keep playing until they get it right. It is used while your ISPs do not allow direct connections, but allows establishing HTTP connections. With its well-designed Graphical User Interface(GUI), Navicat lets you quickly and easily create, organize, access and share information in a secure and easy way. Navicat allows you to transfer data from one database and/or schema to another with detailed analytical process. You must be the change you wish to see in the world. Navicat provides a wide range advanced features, such as compelling code editing capabilities, smart code-completion, SQL formatting, and more. Anyone who has never made a mistake has never tried anything new. Navicat allows you to transfer data from one database and/or schema to another with detailed analytical process. I may not have gone where I intended to go, but I think I have ended up where I needed to be. Typically, it is employed as an encrypted version of Telnet. Secure SHell (SSH) is a program to log in into another computer over a network, execute commands on a remote server, and move files from one machine to another. Success consists of going from failure to failure without loss of enthusiasm. Sometimes you win, sometimes you learn. Navicat 15 has added support for the system-wide dark mode. It provides strong authentication and secure encrypted communications between two hosts, known as SSH Port Forwarding (Tunneling), over an insecure network.', N'To connect to a database or schema, simply double-click it in the pane. If you wait, all that happens is you get older. Always keep your eyes open. Keep watching. Because whatever you see can inspire you. Import Wizard allows you to import data to tables/collections from CSV, TXT, XML, DBF and more. Success consists of going from failure to failure without loss of enthusiasm. A query is used to extract data from the database in a readable format according to the user''s request. Anyone who has never made a mistake has never tried anything new. To successfully establish a new connection to local/remote server - no matter via SSL or SSH, set the database login information in the General tab. SQL Editor allows you to create and edit SQL text, prepare and execute selected queries. Navicat Monitor is a safe, simple and agentless remote server monitoring tool that is packed with powerful features to make your monitoring effective as possible. 
I will greet this day with love in my heart. How we spend our days is, of course, how we spend our lives. You can select any connections, objects or projects, and then select the corresponding buttons on the Information Pane. Remember that failure is an event, not a person. The Information Pane shows the detailed object information, project activities, the DDL of database objects, object dependencies, membership of users/roles and preview. Navicat authorizes you to make connection to remote servers running on different platforms (i.e. Windows, macOS, Linux and UNIX), and supports PAM and GSSAPI authentication. Secure Sockets Layer(SSL) is a protocol for transmitting private documents via the Internet. The Information Pane shows the detailed object information, project activities, the DDL of database objects, object dependencies, membership of users/roles and preview. You can select any connections, objects or projects, and then select the corresponding buttons on the Information Pane. The On Startup feature allows you to control what tabs appear when you launch Navicat. The first step is as good as half over. Always keep your eyes open. Keep watching. Because whatever you see can inspire you. Champions keep playing until they get it right. If the Show objects under schema in navigation pane option is checked at the Preferences window, all database objects are also displayed in the pane. To successfully establish a new connection to local/remote server - no matter via SSL, SSH or HTTP, set the database login information in the General tab. It provides strong authentication and secure encrypted communications between two hosts, known as SSH Port Forwarding (Tunneling), over an insecure network. Navicat is a multi-connections Database Administration tool allowing you to connect to MySQL, Oracle, PostgreSQL, SQLite, SQL Server, MariaDB and/or MongoDB databases, making database administration to multiple kinds of database so easy. It wasn’t raining when Noah built the ark. A comfort zone is a beautiful place, but nothing ever grows there. Navicat Cloud provides a cloud service for synchronizing connections, queries, model files and virtual group information from Navicat, other Navicat family members, different machines and different platforms. The past has no power over the present moment. Creativity is intelligence having fun. Navicat authorizes you to make connection to remote servers running on different platforms (i.e. Windows, macOS, Linux and UNIX), and supports PAM and GSSAPI authentication. HTTP Tunneling is a method for connecting to a server that uses the same protocol (http://) and the same port (port 80) as a web server does. Difficult circumstances serve as a textbook of life for people. A comfort zone is a beautiful place, but nothing ever grows there. I may not have gone where I intended to go, but I think I have ended up where I needed to be. It wasn’t raining when Noah built the ark. Navicat Cloud could not connect and access your databases. By which it means, it could only store your connection settings, queries, model files, and virtual group; your database passwords and data (e.g. tables, views, etc) will not be stored to Navicat Cloud. What you get by achieving your goals is not as important as what you become by achieving your goals. Difficult circumstances serve as a textbook of life for people. There is no way to happiness. Happiness is the way. Genius is an infinite capacity for taking pains. If the plan doesn’t work, change the plan, but never the goal. 
Genius is an infinite capacity for taking pains.', 0xFFD8FFE000104A46494600010100000100010000FFDB004300080606070605080707070909080A0C140D0C0B0B0C1912130F141D1A1F1E1D1A1C1C20242E2720222C231C1C2837292C30313434341F27393D38323C2E333432FFDB0043010909090C0B0C180D0D1832211C213232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232FFC00011080140014003012200021101031101FFC4001C0001010003000301000000000000000000000705060801020304FFC400441000010302020605070A0309010000000000010203040506110712213141B23651617172153542748191B113142223326282A1C1D152C2F016172433535493A2D2E1FFC4001A010100030101010000000000000000000000030405020601FFC400311101000201020306040602030000000000000102030411051231213233517181133441D114156191B1C152A12442F0FFDA000C03010002110311003F00A8000CB42000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000031376C4D66B1ECB857C5149FE922EB3FB3E8A66B976EE34FAED2ED0C6E54A1B64F3A6796B4AF48D3BD32D6FD09F1E97365EDA55DD715EDD214604827D2E5D9CEFA8B7D1313AA4D77FC150F9B34B57C45FA7456F54EC63D3F98B1F966A3CBFDA5FC36458C130A4D2FB55CD6D65A1513D27C3367EE6AA7EA6DB67C7161BD3DB14158914EECB286A1351CABD49C157B115487268F3E38DED5476C37AF586C4002B2300000035CC478D2DD862AA1A7AD86AA47CACD76AC2C6AA226796DCDC8754C76C96E5AC6F2FB5ACDA7686C60D0FF00BDAB0FFB4B97FC6CFF00D99FC398B2DB89D93AD124CC7C2A88E8E6444764BB953255D84B7D366A579AD5DA1D5B15EB1BCC33A0020700000000003E35752CA3A39EAA4472B218DD2391BBD511335CBDC693FDED587FDA5CBFE367FEC971E0C997B691BBAAD2D6EEC37C06A763D20DA6FF00758EDD4B4F5AC9A4472A3A56311BB133E0E5EA36C39C98AF8E796F1B496ACD676900070E40000000000000000000000FCD5F5F4D6CA09AB6AE548A085BACF72FF5BD57622088999DA0EAF171B951DAA89F575D3B21819BDCE5DFD889C57B1091626D255C2E6E7D35A95F45499AA7CA22E52BD3B57D1F67BCC1E29C515789AE4E9A473994AC5CA0833D8C4EB5EB72F153027A0D270FAE388B64ED9FE1A1874F15EDB7579739CF72B9CAAAE55CD55576AA9E0D830EE0EBAE247A3E9A248A95172754CBB189DDC5CBDDEDC8A45B345963A4622D73E7AE932DBACE58D9EC46AE7F9A96336B70E19DAD3DBE5092F9E94EC945C1D110612C3D4EC4632CB42A89FEA428F5F7BB353DA4C2D87E46EABACB6F44FBB4ED6FC10A9F9B63FF001945F8BAF939D416DB968C30FD6B55695B350C9B72589EAE6AAF6A3B3F722A133C4983AE786A4D6A8624D48E5C99531A7D155EA5EA5FEB696F06BB0E69DAB3B4FEA9699E97EC86430B6906E1637B29EB1CFACB7A6CD472E6F8D3EEAAFC176756459ADD71A4BAD0C7594533658244CD1C9C3B17A97B0E68366C198AE6C35736A48E73ADF33B29E3DF97DF4ED4FCD3675655F5BA0AE489BE38DADFCA3CD822D1CD5EABD83D63919344C9637A3E37B51CD735734545DCA87B1E7D9E122D2EF9E6DFEAEBCCA574916977CF36FF00575E652F70DF988F74FA6F1213B32787EF75187EF305C29F6EA2E52333D8F62EF6FF005C723180F496AC5A26B3D25A531131B4BA6682BA9EE541056D2BF5E09988F63BB3F73F411FD19E29F27D7791AAE4CA9AA5D9C2E72FD893ABB9DF1CBAD4B01E5755A79C1926B3D3E8CACB8E71DB600057460000C7DF7A3D72F5597914E6E3A46FBD1EB97AACBC8A7371B9C23B965ED27496DBA35E9C51F824E452E842F46BD38A3F049C8A5D0ABC57C78F4FBA2D577FD80019AAC00000000000000000000120D286237565C9B65A77AFCDE9575A6C9763E45E1F853F355EA2A379B8B2D166ACB83F25482257A22F15E09ED5C90E6F9A692A2792695EAF92472BDEE5DEAAAB9AA9ABC2F045AF3927E9FCAD6971EF3CD3F47A1BB603C17E5F9D6BEBDAE6DBA17648DDCB3BBA93B138AFB138E5A9DB6825BA5CE9A861FF00327912345EACD77FB379D1D6FA082D96F828A99BAB0C0C46353F55ED5DE5DE23AA9C34E5A7594FA8CB348DA3ACBED1451C10B2186364713111AD631A888D44DC8889B90F700F3ACE00000F8D552C15B4B25354C4D96191BAAF639334543EC044EDDB039FF18E1A7E19BD3A06E6EA49915F4EF5FE1E2D5ED4FD9789AF17AC7D644BCE16A8D4667514A8B3C59266AB926D4F6A67B3AF2221476CAFB8AAA51515454E4B92FC8C4AECBBF243D368B53F1716F69ED8EAD3C3979E9BCAAFA2DBF3ABAD32DAA77AACB47B62555DAB1AF0F62FE4A8
86FE4A702E11C4768C41057D452B29E9B55CC952495359CD54E0899EDCF25DB96E2AC626BA2919A671CEF13E4A59E2BCFBD4245A5DF3CDBFD5D7994AE922D2EF9E6DFEAEBCCA77C37E623DDF74DE242767DA6A59E9E385F2C6AD64ECF948D57739B9AA669ED453E25629F0CB71268B6DAD89A9F3EA763DF4EEEB5D7766DEE5F8E46F67CF187966DD26765FC99229B6E9422AA2A2A2AA2A6E542EB80F142621B324750FCEBE95119367BDE9C1FEDE3DBEC214F63A37B98F6AB5CD5C95AA992A2F5192C3F7BA8C3F7982E14FB751729199EC7B177B7FAE3911EB34D19F1ED1D63A39CD8FE257F574703F3D05753DCA820ADA57EBC13311EC7767EE7E83CBCC4C4ED2CBE80000C7DF7A3D72F5597914E6E3A46FBD1EB97AACBC8A7371B9C23B965ED27496DBA35E9C51F824E452E842F46BD38A3F049C8A5D0ABC57C78F4FBA2D577FD80019AAC00000000000000000000D0F4AD5CB4F8661A46B9116A67447275B5A99FC7548D14AD2FCDAD5B6A83F82391FEF544FE526A7A5E1D4E5D3C7EAD3D346D8E1BE68AADA95588E6AD7A66DA48736F63DDB13F2D62CA4EB4454C8CB35C2AB2DB254246BF85A8BFCE514C7E237E6D44FE9D8A7A8B6F924001490000000000A88A992ED43C22235A88888889B111381E4000000245A5DF3CDBFD5D7994AE922D2EF9E6DFEAEBCCA5EE1BF311EE9F4DE242765F700F41ED9E0773B88117DC03D07B6781DCEE3478B7831EBFD4AC6AFB91EAD2749D85BE6D51E5DA48FEA6672254B53D17F07772F1EDEF27074DD5D2C35D492D2D4C692432B558F6AF14539F3135826C397A96865CDD1FDA8645F4D8BB97BF82F6A0E1BAAE7AFC2B758FE1F74D979A3967AC365D1AE29F265C7C91572654954EFAA739764727ECED89DF9769643978B9E00C53FDA0B47C854C99DC29511B2AAEF91BC1FFA2F6F7A10713D2ED3F1ABEFF747AAC5FF00786DC0031D4D8FBEF47AE5EAB2F229CDC748DF7A3D72F5597914E6E37384772CBDA4E92DB746BD38A3F049C8A5D085E8D7A7147E093914BA1578AF8F1E9F745AAEFF00B00033558000000000000000000011DD2DBF3C4B46CCB751B573EF7BFF00634028BA5D8952F36F9783A9D5BEE77FF49D1EA7433FF1EAD4C1E1C2D7A2C6A3707AAA7A552F55F7221BB1A2E8A25D7C293B38C756F4F7B5ABFA9BD1E7F59E3DFD59F9BC4900056460000000000000000122D2EF9E6DFEAEBCCA574916977CF36FF575E652F70DF988F74FA6F1213B2FB807A0F6CF03B9DC408BEE01E83DB3C0EE771A3C5BC18F5FEA56357DC8F56C86B58DB0CB712595CD89A9F3EA7CDF4EEEB5E2DEE5F8E46CA0C3C792D8ED17AF5851ADA6B3BC397DEC746F731ED56B9AB92B5532545EA32162BCD4586EF05C29B6BA35C9CC55C91ED5DED5FEB7E4BC0DDB49D85BE6D51E5DA48FEA6672254B53D17F07772F1EDEF2707A9C5929A8C5CDF49EAD5A5A3257774BDBAE14D75B7C15D48FD78266EB357E28BDA8B9A2F71FA88DE8D714F932E3E48AB932A4AA77D539CBB2393F676C4EFCBB4B21E6F55A79C1926BF4FA3372E39C76D98FBEF47AE5EAB2F229CDC748DF7A3D72F5597914E6E35384772CB5A4E92DB746BD38A3F049C8A5D085E8D7A7147E093914BA1578AF8F1E9F745AAEFF00B000335580000000000000000000135D2F522BA82D9589BA395F12FE24454E452505F71EDBBCA583AB98D4CE485A93B3667F676AFE599023D170CBF360E5F2968E96DBD36F254B443588B1DD289576A2B256A7BD17E0D29E41B47D744B5E2FA557B91B154A2D3BD57EF6597FD91A5E4CCE278F973CCF9AB6A6BB64DFCC001415C000000003D5AF63F3D4735D92AB5725CF254E07A54D445474B2D4CEE46C51315EF72F0444CD4E709EE9572DD6A2E31CD2C13CD23A45746F5454D65CF2CD0B9A4D1CEA37EDDB64D8B0CE4DDD280845AF1E628A79A1822AE75566E463639D88FD655D889ADF6BF32EACD7F936FCA6AEBE49ADABBB3E391C6A74B7D3CC734C76BE65C538FABD8916977CF36FF575E652BA48B4BBE79B7FABAF3292F0DF988F775A6F1213B2FB807A0F6CF03B9DC408BEE01E83DB3C0EE771A3C5BC18F5FEA56357DC8F56C80030141F1ABA586BA925A5A98D248656AB1ED5E28A73E626B04D872F52D0CB9BA3FB50C8BE9B1772F7F05ED43A24D6B1B6196E24B2B9B1353E7D4F9BE9DDD6BC5BDCBF1C8BDA0D57C1C9B5BBB3FF00B74F832F25B69E9281973C018A7FB4168F90A9933B852A236555DF23783FF45EDEF421AF63A37B98F6AB5CD5C95AA992A2F5190B15E6A2C37782E14DB5D1AE4E62AE48F6AEF6AFF5BF25E06D6B34D19F1ED1D63A2EE6C7F12BB7D5D017DE8F5CBD565E4539B8E88ACB8535D7075657523F5E09A8A57357F0AE68BDA8B9A2F71CEE54E131315BC4F9A1D246D12DB746BD38A3F049C8A5D085E8D7A7147E093914BA1538AF8F1E9F745AAEFF00B0003355800000000000000000001E1CD6BDAAD72239AA992A2A668A873A624B43AC5
882AE8151518C7E712AF162ED6AFBBF3453A30D174958656ED6B4B9D2C7AD5746DFA4889B5F16F54F66D5F797F876A23165E5B74958D364E5B6D3F5465AE731C8E6AAA391734545DA8A5FB06E248F11D8D92B9EDF9E4288CA9626F4770765D4B967EF4E0400C958EF95B87EE4CADA27A23D1355EC77D97B7A950D8D6E97F114DA3AC745CCD8BE257F5747835FC398C2D789226A412A4557966FA6917E92777F1276A7B723603CD5E96A5B96D1B4B32D59ACED2000E5F000D4716E3BA2C3F1494D4CE654DCB2C92245CDB1AF5BD7F4DFDDBCEF1E2BE5B72D2379755ACDA76862B4A3889B4B6E6D969E4FF115393A6CBD18D3877AAA7B917AC909F6ABABA8AFAB96AEAA574B3CAED67BDDBD54F9318E91ED631AAE7397246A266AABD47A8D3608C18E29FBB4F1638C75D9B6E8E2CEEB9E2A867735160A2FAF7AAFF17A09DF9EDF617335AC1187530ED8238E56E5593E52D42F145E0DF626CEFCCD94C0D767F8D9A663A47642867C9CF7ECE8122D2EF9E6DFEAEBCCA574916977CF36FF00575E653AE1BF311EEFBA6F1213B2FB807A0F6CF03B9DC408BEE01E83DB3C0EE771A3C5BC18F5FEA56357DC8F56C80030140000125D27616F9B547976923FA999C8952D4F45FC1DDCBC7B7BC9C1D37574B0D7524B4B531A490CAD563DABC514E7CC4D609B0E5EA5A1973747F6A1917D362EE5EFE0BDA86FF0DD573D7E15BAC7F0D0D365E68E59EB0CB612C51E4DB7DCAD156FFF0009554F2FC92AAEC8E4D45FC9DB13BF2ED35100D0AE3AD6D368FAAC456226663EADB746BD38A3F049C8A5D085E8D7A7147E093914BA185C57C78F4FBA86ABBFEC000CD560000000000000000000000004871EE047DBE496EF6A8D5D48E557CF0B536C2BC5C9F77E1DDBA787509A1E25D19D0DD1CFAAB53D94554BB56354FAA7AF727D9F66CEC36747C4A2239337EFF75CC3A9DA396E8DB5CE6391CD554722E68A8BB514DAAD7A45C456C8D235A9655C6D4C91B54DD754FC48A8E5F6A98DBAE15BDD955CB5B6F99B1B76FCB3135E3CBAF593627B72530C6ACD7167AF6ED685B98ADE3CD4C8B4C13237EBACB1BDDD6CA856A7E6D513E97E7731529ECD1C6FE0B2542BD3DC8D4266083F2FD36FBF2FFB947F87C7E4DA2EBA41C457563A3755A52C4EDECA56EA7FDB6BBF33570676CF83AF97BD57D2D0BDB03B2FAF9BE8332EB455DFECCC9E23160AFD2B0936AD23C9822B1A3FC0AFA47C77ABB44AD9D36D3D3BD36B3EFB93AFA9386FDFBB3385F47D6FB03995552A9595EDDA9239B93235FBA9D7DABB7B8DC0C7D6711E789C78BA79A9E6D473472D000192A8122D2EF9E6DFEAEBCCA574916977CF36FF00575E652F70DF988F74FA6F1213B2FB807A0F6CF03B9DC408BEE01E83DB3C0EE771A3C5BC18F5FEA56357DC8F56C80030140000035AC6D865B892CAE6C4D4F9F53E6FA7775AF16F72FC723650778F25B1DA2F5EB0FB5B4D677872FBD8E8DEE63DAAD7357256AA64A8BD47828FA4EC2DF36A8F2ED247F5333912A5A9E8BF83BB978F6F79383D560CD5CD8E2F56B63BC5EBCD0DB746BD38A3F049C8A5D085E8D7A7147E093914BA189C57C78F4FBA8EABBFEC000CD5600000000000000000000000000000C7D5D86D15EED6AAB6524CFF00E27C2D577BF2CCC803EC5A6BDB1244CC746B52600C2F2FDAB4B13C32BDBF071E19A3EC2D1BB59B6A6AAFDE9A4727B95C6CC097F119BFCE7F7977F12FE72C751582CF6E735F476CA48646EC491B126B27E2DE644022B5A6D3BCCEEE26667A8003E00000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000FFD9, 
0xFFD8FFE000104A46494600010100000100010000FFDB004300080606070605080707070909080A0C140D0C0B0B0C1912130F141D1A1F1E1D1A1C1C20242E2720222C231C1C2837292C30313434341F27393D38323C2E333432FFDB0043010909090C0B0C180D0D1832211C213232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232323232FFC00011080140014003012200021101031101FFC4001C0001000301010101010000000000000000000506070804030201FFC40049100100010302020507070906040700000000010203040511063107213641615171727481B1B2121314152223911735547393A1B3C1D224528292D1F03242626325335355A2A4E2FFC400190101000301010000000000000000000000000304050201FFC400251101000201030304030100000000000000000102030411321231332141718113226151FFDA000C03010002110311003F00D24064AD000000000000000000000000000000000000000000000000000000000000000004F5472DC0148C8E94347C4C9B98F7F0353B77AD5534574556A8DE2639C7FC6F9FE56342FD1352FD9D1FD697F064FF00117E6C7FEAF622B40E20C1E23D3E733066B8A69AA68AEDDC888AE89F18899F3A551CC4C4ED29226263780078F400000011FAD6B389A0E99733F36AAA2D5131114D11BD554CF2888EF9553F2B1A17E89A97ECE8FEB775C57B46F58716C95ACED32BD8A4E374A1A46664DBC7C7C0D4EE5EB954534514DAA37999FF001AED1D71CB6797A5A9CA1ED6F5B719079751CEB5A669D919D7A9AEAB562DCDCAA28889AA623C9BA9DF958D0BF44D4BF6747F5BDAE3BDFD6B0F2D92B5E52BD8AA689D20695AF6A96F4FC5C7CDA2F5C8AA62ABB45314F546FDD54F916B796A5AB3B5A1ED6D168DE001CBA000000054758E91348D1755BFA764E3E6D77ACCC45555BA289A677889EADEA8F2BC5F958D0BF44D4BF6747F5A58C192637884739A913B4CAF63E183976F3F031B32D45516F22D537688AA3AE22A8898DFC7ADF745D9D800F40000000000000019A749DC2FF396FEBEC3B7F6E888A72A9A639C728AFD9CA7C36F232C74E5CB745EB55DAB94C576EBA669AA9AA378989E712C138C786EBE1BD6ABB34C4CE25EDEBC7AE7FBBDF4CF8C72FC27BDA3A4CDBC744A86A716D3D70FEF06F11D5C39AE517ABAA7E877B6A322988DFECF755B79639FE3E56F74574DCA29AE8AA2AA2A8DE9AA99DE263CB0E626B1D187134E4E3CE859773EF6CD3F2B1A667AEAA3BE9F67778798D5E1DE3AE0D2E5DA7A25A300CE5F0000145E9238A3EABD3BEAAC4B9B666553F6EAA67AEDDBEFF6CF5C79B7F0778E937B4561C5EF14AF54A91C7DC4DF5F6B33631AE7CAC0C599A6D4C4F557577D5FCA3C23C552167E08E18AF88B5889BD44FD031E62ABF57F7BC94479FDDBF835FF005C54FE432FF6C97FECAE1D19F0ACE359FAF732DFDEDDA76C5A663AE9A279D7EDE51E1BF95A3BF94D34D14C534C4534C46D1111B4443FAC8C992725BAA5A98E914AF4C21B8B7B23AB7AB57EE73D3A178B7B23AB7AB57EE73D2F68B8CA9EAF942D9D1BF6DF0FD0B9F04B7261BD1BF6DF0FD0B9F04B7243ACF27D25D270FB00545A00000060DC7FDB8D4FD2A3F874AB4B2F1FF6E353F4A8FE1D2AD36B170AFC43232739F974570D765B48F52B3F0425117C35D96D23D4ACFC109463DF94B56BC6001CBA00000000000000010BC53C3F6B88F45BB8756D4DFA7EDD8B93FF002D71CBD93CA5343DADA6B3BC3C988B46D2E65C8C7BB8993731EFDB9B77AD553457455CE2639C3F787977F0332CE5E35C9B77ACD715D154774C34CE93B85FE72DFD7D876FEDD1114E5534C738E515FB394F86DE4658D9C592325376564A4E3B6CE89E1CD72CF10E8B633ED6D4D557D9BB6E277F915C738FE71E13095615C0DC4F3C3DACC517EB9FA064CC517A3BA89EEAFD9DFE133E0DD62626378EB865E7C5F8EDFC6861C9F92BFD004299E1D6355C7D134ABFA8654FDDDAA778A639D73DD4C78CCB9EB54D4B2357D4F233F2AADEF5FAFE54EDCA23944478446D1EC5A3A43E279D6B56FA0E357BE0E255311B72B97394D5E68E51EDF2A98D4D2E1E8AF54F7966EA32F5DB68ED0F4E9F8191A9EA1630B168F977EF5514D31FCE7C239BA0787B44B1C3FA359C0B1B4CD31F2AED7B6DF395CF3ABFDF7442AFD1BF0ACE9983F5B6651B65E4D3F754CC75DBB7FEB3CFCDB78AF8ADAACDD73D31DA1634D8BA63AA7BC802A2D21B8B7B23AB7AB57EE73D3A178B7B23AB7AB57EE73D34745C654357CA16CE8DFB6F87E85CF825B930DE8DFB6F87E85CF825B921D6793E92E9387D802A2D000000306E3FEDC6A7E951FC3A55A5978FFB71A9FA547F0E9569B58B857E2191939CFCBA2B86BB2DA47A959F821288BE1AECB691EA567E084A31EFCA5AB5E3000E5D0000000000000000003F372DD17AD576AE5315DBAE99A6AA6A8DE26279C4B
01E2EE1DB9C37ADDCC6DA6716E6F731EB9EFA37E533E58E53EC9EF74020B8B7876DF126897317AA326DFDE63D73DD5EDCA7C2794FE3DC9F4F97F1DBD7B4A0CF8BAEBE9DDCFCD8BA35E269D4B4E9D272AE6F958B4C7CD4CCF5D76FFD69EA8F34C78B20BD66E63DFB962F51345DB754D15D1573A6A89DA625E9D2B52BFA46A98F9F8D3B5DB15FCA8F18E531E698DE3DAD2CD8E3253651C59271DB7749299D21F134E8BA47D0B1AE6D9B99134C4C4F5DBB7CA6AF3F747B67B960B3AEE0DEE1E8D6FE76230FE666ECCF7C6DCE3CFBC4C6DE560BAEEB17F5DD632350BFD5372AFB34F7514C754447B1434D87AAFBDBB42E6A32F4D768F7472E1C01C2BF5F6A7F4BCAA2274FC5AA26B8AA3AAE55CE29F37299F0EAEF57748D2B235AD52C6062D3BDCBB56DBF7531DF54F84475BA0B47D2B1F44D2AC69F8B1F776A9DA6A9E75CF7D53E332B5A9CDD15E98EF2ADA7C5D73BCF687B8065B480010DC5BD91D5BD5ABF739E9D0BC5BD91D5BD5ABF739E9A3A2E32A1ABE50B6746FDB7C3F42E7C12DC986F46FDB7C3F42E7C12DC90EB3C9F49749C3EC0151680000018371FF6E353F4A8FE1D2AD2CBC7FDB8D4FD2A3F874AB4DAC5C2BF10C8C9CE7E5D15C35D96D23D4ACFC109445F0D765B48F52B3F042518F7E52D5AF180072E800000000000000000000198F49DC2FBC7D7F876FAE36A72A9A63F0AFDD13ECF165EE9BBD66DE458B966F514D76AE5334574551BC55131B4C4B02E2EE1DAF86F5CB98D1BD58D73EF31EB9EFA26794F8C729FC7BDA3A4CDD51D12A1A9C5B4F5C3C14EB39B4E87568F177FB1D57A2F4D3DFF002B6E5E6EFDBCAF00BDF471C2D1AAE7CEAB9746F898B5C7CDD33CAE5CE7F84754FB63C566F6AE3ACDA55EB59BDA2AB8F47DC2BF5169BF4DCAA36CFCAA6266263AED51CE29F3F7CFB23B972063DEF37B754B56958AC6D000E5D00021B8B7B23AB7AB57EE73D3A178B7B23AB7AB57EE73D34745C654357CA16CE8DFB6F87E85CF825B930DE8DFB6F87E85CF825B921D6793E92E9387D802A2D000000306E3FEDC6A7E951FC3A55A5978FF00B71A9FA547F0E9569B58B857E2191939CFCBA2B86BB2DA47A959F821288BE1AECB691EA567E084A31EFCA5AB5E3000E5D0000000000000000000000C9FA5CFCE5A6FEA6BF7C35864FD2E7E72D37F535FBE1634BE5841A9F1CB396CFD16764ABF5AAFDD4B186CFD16764ABF5AAFDD4AE6B3C6A9A5F22EE032DA400000087E2CA66AE12D5A23F45B93FB9CF2E91D62CFD2744CFB1FFAB8D728FC6998737347453FACC286AE3F685A3A3BAE69E39D3E37DA2A8B913FB3A9BB39F783B263178C34BB957544DF8A3FCDF67F9BA0916B63F789FE25D24FE9200A6B4000000C0F8EEBF97C6DA9CFFDCA63F0A2985752BC4D91F4BE28D52F44EF1564DC889F08AA623F74229B78E36A4431EF3BDA65D17C3B4CD1C31A4D3546D31876627FC909379F06CFD1B4FC6B1B6DF376A9A36F34443D0C5B4EF332D7AC6D1000F1E8000000000000000000000C9FA5CFCE5A6FEA6BF7C35864FD2E7E72D37F535FBE1634BE5841A9F1CB396CFD16764ABF5AAFDD4B186CFD16764ABF5AAFDD4AE6B3C6A9A5F22EE032DA40000004C44C6D3D70E6DD5B0A74DD5F330A63FF0022F556E3CD13D5FB9D24C77A52D1E70F5EB7A9514CFCD6653B55311D515D31113F8C6DFBD6F477DAF35FF557555DEB13FE28F62F578F916EFDB9DAE5BAA2BA67C9313BC3A474ECEB5A9E9D8F9D6277B57EDC574F86F1CBCF1C9CD4BD700F1AD1A255F566A354C605CAB7A2E6DBFCD553CF7FFA67F77E2B3AAC537AEF1DE1069B2452DB4F696C83F16AEDBBD6A9BB6AE5372DD51BD35513BC4C784BF6CB6880008FD7353B7A3E89979F72A88F99B7334EFDF572A63DB3B43DB76EDBB36AABB76E536EDD31BD55573B44478CB1CE3FE32A35DBB4E9DA7D754E0DAABE5575F2F9EABBBFC31FEFB92E1C5392DB7B22CB92295DFDD489999999999999E73291E1FC29D438874FC588DFE72FD1157A3BEF3FBB746B43E8AB45AAFEA77F58B94FDD635336AD4CC73B95475EDE6A67FF00943572DFA2932CDC75EABC435B018AD7000000000000000000000000193F4B9F9CB4DFD4D7EF86B0C9FA5CFCE5A6FEA6BF7C2C697CB08353E39672D9FA2CEC957EB55FBA9630D9FA2CEC957EB55FBA95CD678D534BE45DC065B4800000046EBDA2E3F106917B4FC999A62BEBA2B88DE68AA39551FEFCA921EC4CC4EF0F262263697376ADA4E668BA8DCC1CEB5345DA394F7571DD5533DF12F13A275DE1DD3B88B0FE8F9D6779A77F9BBB4F5576E7C27F972651AE746FACE99555730E9FAC31E3AE26D47DB8F3D1CE7D9BB4F16A6B78DADE92CECBA7B56778F5841E91C4BABE873FF87E6DCB76FBED4ED5513FE19EAF6C75ADF8BD2DE7D14ED97A663DE9F2DAAEAB7EFF0094CF6ED9BB8F76AB57ADD76EE533B554574CC4C4F8C4BF09AD8B1DFD6611D72DEBE912D3E7A5F9DBAB43EBF5BFFF000F0E5F4B5A9DC8DB134F
C5B3E3726AB93FC99F0E234D8A3D9D4EA324FBA5756E24D635B9DB50CEBB768EEB71B5347F96368F6A29F4B18F7F2AF53671ECDCBD76AFF868B74CD554FB2175D07A32D5350AE9BBA9FF0061C6E7F26769BB579A3BBDBD7E0EED6A638F5F47115BE49F4F556B40D033788B52A70F0E9EAE772ECC7D9B74F967FD3BDBEE93A663E8DA5D8C0C5A76B5669DB79E754F7D53E333D6FE693A3E0E89854E26058A6D5B8EB99FF9AB9F2D53DF2F73373E79CB3B4766861C318E379EE00AE9C000000000000000000000000000000000000000000001F0C9C2C5CDA3E4656359BF47F76EDB8AA3F7A22F70570DDF9DEBD231E27FE889A3DD309E1D45AD1DA5CCD6B3DE158FC9DF0AFF00ED7FFD8BBFD4F558E0BE1BC7AFE551A4634CFF00DC89AE3F0AB74E8F672DE7DE5E7E3A47B43E38D898D876A2D62E3DAB16E39516A88A63F087D81C3B0000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000000007FFD9, '2006-02-27 05:15:03.000', '2019-08-14 17:36:43.000', N'2003-05-14 08:07:42 +00:00', '1900-06-19 00:00:00.000', '2005-05-29');\n"; + + private static final String PG_IMAGE = "postgis/postgis"; + private static final String PG_DRIVER_JAR = + "https://repo1.maven.org/maven2/org/postgresql/postgresql/42.3.3/postgresql-42.3.3.jar"; + private static final String PG_JDBC_JAR = + "https://repo1.maven.org/maven2/net/postgis/postgis-jdbc/2.5.1/postgis-jdbc-2.5.1.jar"; + private static final String PG_GEOMETRY_JAR = + "https://repo1.maven.org/maven2/net/postgis/postgis-geometry/2.5.1/postgis-geometry-2.5.1.jar"; + + private static final String MYSQL_IMAGE = "mysql:latest"; + private static final String MYSQL_CONTAINER_HOST = "mysql-e2e"; + private static final String MYSQL_DATABASE = "auto"; + + private static final String MYSQL_USERNAME = "root"; + private static final String MYSQL_PASSWORD = "Abc!@#135_seatunnel"; + private static final int MYSQL_PORT = 3306; + // private static final String MYSQL_URL = "jdbc:mysql://" + HOST + ":%s/%s?useSSL=false"; + + private static final String MYSQL_DRIVER_CLASS = "com.mysql.cj.jdbc.Driver"; + + private static final String ORACLE_IMAGE = "gvenzl/oracle-xe:21-slim-faststart"; + private static final String ORACLE_NETWORK_ALIASES = "e2e_oracleDb"; + private static final String ORACLE_DRIVER_CLASS = "oracle.jdbc.OracleDriver"; + private static final int ORACLE_PORT = 1521; + // private static final String ORACLE_URL = "jdbc:oracle:thin:@" + HOST + ":%s/%s"; + private static final String USERNAME = "testUser"; + private static final String PASSWORD = "Abc!@#135_seatunnel"; + private static final String DATABASE = "TESTUSER"; + private static final String SOURCE_TABLE = "E2E_TABLE_SOURCE"; + private static final String SINK_TABLE = "E2E_TABLE_SINK"; + + private PostgreSQLContainer POSTGRESQL_CONTAINER; + + private MSSQLServerContainer sqlserver_container; + private MySQLContainer mysql_container; + private OracleContainer oracle_container; + + private static final String mysqlCheck = + "SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema = 'auto' AND table_name = 'sqlserver_auto_create_mysql') AS table_exists"; + private static final String sqlserverCheck = + "IF EXISTS (\n" + + " SELECT 1\n" + + " FROM 
testauto.sys.tables t\n" + + " JOIN testauto.sys.schemas s ON t.schema_id = s.schema_id\n" + + " WHERE t.name = 'sqlserver_auto_create_sql' AND s.name = 'dbo'\n" + + ")\n" + + " SELECT 1 AS table_exists;\n" + + "ELSE\n" + + " SELECT 0 AS table_exists;"; + private static final String pgCheck = + "SELECT EXISTS(SELECT 1 FROM information_schema.tables WHERE table_schema = 'public' AND table_name = 'sqlserver_auto_create_pg') AS table_exists;\n"; + private static final String oracleCheck = + "SELECT CASE WHEN EXISTS(SELECT 1 FROM user_tables WHERE table_name = 'sqlserver_auto_create_oracle') THEN 1 ELSE 0 END AS table_exists FROM DUAL;\n"; + + String driverMySqlUrl() { + return "https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.0.32/mysql-connector-j-8.0.32.jar"; + } + + String driverOracleUrl() { + return "https://repo1.maven.org/maven2/com/oracle/database/jdbc/ojdbc8/12.2.0.1/ojdbc8-12.2.0.1.jar"; + } + + String driverSqlserverUrl() { + return "https://repo1.maven.org/maven2/com/microsoft/sqlserver/mssql-jdbc/9.4.1.jre8/mssql-jdbc-9.4.1.jre8.jar"; + } + + static JdbcUrlUtil.UrlInfo sqlParse = + SqlServerURLParser.parse("jdbc:sqlserver://localhost:1433;database=testauto"); + static JdbcUrlUtil.UrlInfo MysqlUrlInfo = + JdbcUrlUtil.getUrlInfo("jdbc:mysql://localhost:3306/auto?useSSL=false"); + static JdbcUrlUtil.UrlInfo pg = JdbcUrlUtil.getUrlInfo("jdbc:postgresql://localhost:5432/pg"); + static JdbcUrlUtil.UrlInfo oracle = + OracleURLParser.parse("jdbc:oracle:thin:@localhost:1521/TESTUSER"); + + @TestContainerExtension + private final ContainerExtendedFactory extendedSqlServerFactory = + container -> { + Container.ExecResult extraCommands = + container.execInContainer( + "bash", + "-c", + "mkdir -p /tmp/seatunnel/plugins/Jdbc/lib && cd /tmp/seatunnel/plugins/Jdbc/lib && curl -O " + + PG_DRIVER_JAR + + " && curl -O " + + PG_JDBC_JAR + + " && curl -O " + + PG_GEOMETRY_JAR + + " && curl -O " + + MYSQL_DRIVER_CLASS + + " && curl -O " + + ORACLE_DRIVER_CLASS + + " && curl -O " + + driverSqlserverUrl() + + " && curl -O " + + driverMySqlUrl() + + " && curl -O " + + driverOracleUrl()); + // Assertions.assertEquals(0, extraCommands.getExitCode()); + }; + + void initContainer() throws ClassNotFoundException { + DockerImageName imageName = DockerImageName.parse(SQLSERVER_IMAGE); + sqlserver_container = + new MSSQLServerContainer<>(imageName) + .withNetwork(TestSuiteBase.NETWORK) + .withNetworkAliases(SQLSERVER_CONTAINER_HOST) + .withPassword(PASSWORD) + .acceptLicense() + .withLogConsumer( + new Slf4jLogConsumer( + DockerLoggerFactory.getLogger(SQLSERVER_IMAGE))); + + sqlserver_container.setPortBindings( + Lists.newArrayList( + String.format( + "%s:%s", SQLSERVER_CONTAINER_PORT, SQLSERVER_CONTAINER_PORT))); + + try { + Class.forName(sqlserver_container.getDriverClassName()); + } catch (ClassNotFoundException e) { + throw new SeaTunnelRuntimeException( + JdbcITErrorCode.DRIVER_NOT_FOUND, "Not found suitable driver for mssql", e); + } + + username = sqlserver_container.getUsername(); + password = sqlserver_container.getPassword(); + // ============= PG + POSTGRESQL_CONTAINER = + new PostgreSQLContainer<>( + DockerImageName.parse(PG_IMAGE) + .asCompatibleSubstituteFor("postgres")) + .withNetwork(TestSuiteBase.NETWORK) + .withNetworkAliases("postgre-e2e") + .withDatabaseName("pg") + .withUsername(USERNAME) + .withPassword(PASSWORD) + .withCommand("postgres -c max_prepared_transactions=100") + .withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger(PG_IMAGE))); + 
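+                // The containers are pinned to fixed host ports (1433, 5432, 3306, 1521) via
+                // setPortBindings so the localhost-based catalog URLs declared above
+                // (sqlParse, MysqlUrlInfo, pg, oracle) can reach them from the test JVM.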
POSTGRESQL_CONTAINER.setPortBindings( + Lists.newArrayList(String.format("%s:%s", 5432, 5432))); + + log.info("PostgreSQL container started"); + Class.forName(POSTGRESQL_CONTAINER.getDriverClassName()); + + log.info("pg data initialization succeeded. Procedure"); + DockerImageName mysqlImageName = DockerImageName.parse(MYSQL_IMAGE); + mysql_container = + new MySQLContainer<>(mysqlImageName) + .withUsername(MYSQL_USERNAME) + .withPassword(MYSQL_PASSWORD) + .withDatabaseName(MYSQL_DATABASE) + .withNetwork(NETWORK) + .withNetworkAliases(MYSQL_CONTAINER_HOST) + .withExposedPorts(MYSQL_PORT) + .waitingFor(Wait.forHealthcheck()) + .withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger(MYSQL_IMAGE))); + + mysql_container.setPortBindings( + Lists.newArrayList(String.format("%s:%s", MYSQL_PORT, MYSQL_PORT))); + + DockerImageName oracleImageName = DockerImageName.parse(ORACLE_IMAGE); + oracle_container = + new OracleContainer(oracleImageName) + .withDatabaseName(DATABASE) + .withUsername(USERNAME) + .withPassword(PASSWORD) + .withNetwork(NETWORK) + .withNetworkAliases(ORACLE_NETWORK_ALIASES) + .withExposedPorts(ORACLE_PORT) + .withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger(ORACLE_IMAGE))); + oracle_container.withCommand( + "bash", + "-c", + "echo \"CREATE USER admin IDENTIFIED BY admin; GRANT DBA TO admin;\" | sqlplus / as sysdba"); + oracle_container.setPortBindings( + Lists.newArrayList(String.format("%s:%s", ORACLE_PORT, ORACLE_PORT))); + Startables.deepStart( + Stream.of( + POSTGRESQL_CONTAINER, + sqlserver_container, + mysql_container, + oracle_container)) + .join(); + + log.info(" container is up "); + } + + @Override + @BeforeAll + public void startUp() throws Exception { + initContainer(); + + initializeJdbcTable(); + } + + @TestTemplate + public void testAutoCreateTable(TestContainer container) + throws IOException, InterruptedException { + + TablePath tablePathSQL = TablePath.of("testauto", "dbo", "sqlserver_auto_create"); + TablePath tablePathSQL_Sql = TablePath.of("testauto", "dbo", "sqlserver_auto_create_sql"); + TablePath tablePathMySql = TablePath.of("auto", "sqlserver_auto_create_mysql"); + TablePath tablePathPG = TablePath.of("pg", "public", "sqlserver_auto_create_pg"); + TablePath tablePathOracle = TablePath.of("TESTUSER", "sqlserver_auto_create_oracle"); + + SqlServerCatalog sqlServerCatalog = + new SqlServerCatalog("sqlserver", "sa", password, sqlParse, "dbo"); + MySqlCatalog mySqlCatalog = new MySqlCatalog("mysql", "root", PASSWORD, MysqlUrlInfo); + PostgresCatalog postgresCatalog = + new PostgresCatalog("postgres", "testUser", PASSWORD, pg, "public"); + OracleCatalog oracleCatalog = + new OracleCatalog("oracle", "admin", "admin", oracle, "TESTUSER"); + mySqlCatalog.open(); + sqlServerCatalog.open(); + postgresCatalog.open(); + // oracleCatalog.open(); + + CatalogTable sqlServerCatalogTable = sqlServerCatalog.getTable(tablePathSQL); + + sqlServerCatalog.createTable(tablePathSQL_Sql, sqlServerCatalogTable, true); + postgresCatalog.createTable(tablePathPG, sqlServerCatalogTable, true); + // oracleCatalog.createTable(tablePathOracle, sqlServerCatalogTable, true); + mySqlCatalog.createTable(tablePathMySql, sqlServerCatalogTable, true); + + Assertions.assertTrue(checkMysql(mysqlCheck)); + // Assertions.assertTrue(checkOracle(oracleCheck)); + Assertions.assertTrue(checkSqlServer(sqlserverCheck)); + Assertions.assertTrue(checkPG(pgCheck)); + + // delete table + log.info("delete table"); + sqlServerCatalog.dropTable(tablePathSQL_Sql, true); + 
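+        // Drop the source table seeded by initializeJdbcTable() as well, so the containers
+        // are left clean if the test is re-run.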
sqlServerCatalog.dropTable(tablePathSQL, true); + postgresCatalog.dropTable(tablePathPG, true); + // oracleCatalog.dropTable(tablePathOracle, true); + mySqlCatalog.dropTable(tablePathMySql, true); + + sqlServerCatalog.close(); + mySqlCatalog.close(); + postgresCatalog.close(); + } + + @Override + public void tearDown() throws Exception { + if (sqlserver_container != null) { + sqlserver_container.close(); + } + if (mysql_container != null) { + mysql_container.close(); + } + if (oracle_container != null) { + oracle_container.close(); + } + if (POSTGRESQL_CONTAINER != null) { + POSTGRESQL_CONTAINER.close(); + } + } + + private Connection getJdbcSqlServerConnection() throws SQLException { + return DriverManager.getConnection( + sqlserver_container.getJdbcUrl(), + sqlserver_container.getUsername(), + sqlserver_container.getPassword()); + } + + private Connection getJdbcMySqlConnection() throws SQLException { + return DriverManager.getConnection( + mysql_container.getJdbcUrl(), + mysql_container.getUsername(), + mysql_container.getPassword()); + } + + private Connection getJdbcPgConnection() throws SQLException { + return DriverManager.getConnection( + POSTGRESQL_CONTAINER.getJdbcUrl(), + POSTGRESQL_CONTAINER.getUsername(), + POSTGRESQL_CONTAINER.getPassword()); + } + + private Connection getJdbcOracleConnection() throws SQLException { + return DriverManager.getConnection( + oracle_container.getJdbcUrl(), + oracle_container.getUsername(), + oracle_container.getPassword()); + } + + private void initializeJdbcTable() { + try (Connection connection = getJdbcSqlServerConnection()) { + Statement statement = connection.createStatement(); + statement.execute(CREATE_DATABASE); + statement.execute(CREATE_TABLE_SQL); + statement.execute(getInsertSql); + // statement.executeBatch(); + } catch (SQLException e) { + throw new RuntimeException("Initializing PostgreSql table failed!", e); + } + } + + private boolean checkMysql(String sql) { + try (Connection connection = getJdbcMySqlConnection()) { + ResultSet resultSet = connection.createStatement().executeQuery(sql); + boolean tableExists = false; + if (resultSet.next()) { + tableExists = resultSet.getBoolean(1); + } + return tableExists; + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + private boolean checkPG(String sql) { + try (Connection connection = getJdbcPgConnection()) { + ResultSet resultSet = connection.createStatement().executeQuery(sql); + boolean tableExists = false; + if (resultSet.next()) { + tableExists = resultSet.getBoolean(1); + } + return tableExists; + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + private boolean checkSqlServer(String sql) { + try (Connection connection = getJdbcSqlServerConnection()) { + ResultSet resultSet = connection.createStatement().executeQuery(sql); + boolean tableExists = false; + if (resultSet.next()) { + tableExists = resultSet.getInt(1) == 1; + } + return tableExists; + } catch (SQLException e) { + throw new RuntimeException(e); + } + } + + private boolean checkOracle(String sql) { + try (Connection connection = getJdbcOracleConnection()) { + ResultSet resultSet = connection.createStatement().executeQuery(sql); + boolean tableExists = false; + if (resultSet.next()) { + tableExists = resultSet.getInt(1) == 1; + } + return tableExists; + } catch (SQLException e) { + throw new RuntimeException(e); + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/pom.xml 
b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/pom.xml
new file mode 100644
index 00000000000..fb4923020af
--- /dev/null
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/pom.xml
@@ -0,0 +1,49 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+    Licensed to the Apache Software Foundation (ASF) under one or more
+    contributor license agreements.  See the NOTICE file distributed with
+    this work for additional information regarding copyright ownership.
+    The ASF licenses this file to You under the Apache License, Version 2.0
+    (the "License"); you may not use this file except in compliance with
+    the License.  You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+    Unless required by applicable law or agreed to in writing, software
+    distributed under the License is distributed on an "AS IS" BASIS,
+    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+    See the License for the specific language governing permissions and
+    limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+    <parent>
+        <groupId>org.apache.seatunnel</groupId>
+        <artifactId>connector-jdbc-e2e</artifactId>
+        <version>${revision}</version>
+    </parent>
+
+    <artifactId>connector-jdbc-e2e-part-5</artifactId>
+    <name>SeaTunnel : E2E : Connector V2 : Jdbc : Part 5</name>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.apache.seatunnel</groupId>
+            <artifactId>connector-jdbc-e2e-common</artifactId>
+            <version>${project.version}</version>
+            <type>test-jar</type>
+            <scope>test</scope>
+        </dependency>
+
+        <!-- drivers -->
+        <dependency>
+            <groupId>mysql</groupId>
+            <artifactId>mysql-connector-java</artifactId>
+            <scope>test</scope>
+        </dependency>
+        <dependency>
+            <groupId>com.dameng</groupId>
+            <artifactId>DmJdbcDriver18</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+</project>
diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDmIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDmIT.java
similarity index 100%
rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDmIT.java
rename to seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDmIT.java
diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDmUpsetIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDmUpsetIT.java
new file mode 100644
index 00000000000..65339431548
--- /dev/null
+++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDmUpsetIT.java
@@ -0,0 +1,258 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.seatunnel.connectors.seatunnel.jdbc; + +import org.apache.seatunnel.api.table.type.SeaTunnelRow; +import org.apache.seatunnel.common.exception.SeaTunnelRuntimeException; + +import org.apache.commons.lang3.tuple.Pair; + +import org.testcontainers.containers.GenericContainer; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.utility.DockerLoggerFactory; + +import com.google.common.collect.Lists; + +import java.math.BigDecimal; +import java.sql.Date; +import java.sql.Statement; +import java.sql.Timestamp; +import java.time.LocalDate; +import java.time.LocalDateTime; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +public class JdbcDmUpsetIT extends AbstractJdbcIT { + + private static final String DM_IMAGE = "laglangyue/dmdb8"; + private static final String DM_CONTAINER_HOST = "e2e_dmdb_upset"; + + private static final String DM_DATABASE = "SYSDBA"; + private static final String DM_SOURCE = "E2E_TABLE_SOURCE_UPSET"; + private static final String DM_SINK = "E2E_TABLE_SINK_UPSET"; + private static final String DM_USERNAME = "SYSDBA"; + private static final String DM_PASSWORD = "SYSDBA"; + private static final int DOCKET_PORT = 5236; + private static final int JDBC_PORT = 5336; + private static final String DM_URL = "jdbc:dm://" + HOST + ":%s"; + + private static final String DRIVER_CLASS = "dm.jdbc.driver.DmDriver"; + + private static final List CONFIG_FILE = + Lists.newArrayList("/jdbc_dm_source_and_dm_upset_sink.conf"); + private static final String CREATE_SQL = + "create table if not exists %s" + + "(\n" + + " DM_BIT BIT,\n" + + " DM_INT INT,\n" + + " DM_INTEGER INTEGER,\n" + + " DM_TINYINT TINYINT,\n" + + "\n" + + " DM_BYTE BYTE,\n" + + " DM_SMALLINT SMALLINT,\n" + + " DM_BIGINT BIGINT,\n" + + "\n" + + " DM_NUMBER NUMBER,\n" + + " DM_DECIMAL DECIMAL,\n" + + " DM_FLOAT FLOAT,\n" + + " DM_DOUBLE_PRECISION DOUBLE PRECISION,\n" + + " DM_DOUBLE DOUBLE,\n" + + "\n" + + " DM_CHAR CHAR,\n" + + " DM_VARCHAR VARCHAR,\n" + + " DM_VARCHAR2 VARCHAR2,\n" + + " DM_TEXT TEXT,\n" + + " DM_LONG LONG,\n" + + "\n" + + " DM_TIMESTAMP TIMESTAMP,\n" + + " DM_DATETIME DATETIME,\n" + + " DM_DATE DATE\n" + + ")"; + private static final String CREATE_SINKTABLE_SQL = + "create table if not exists %s" + + "(\n" + + " DM_BIT BIT,\n" + + " DM_INT INT,\n" + + " DM_INTEGER INTEGER,\n" + + " DM_TINYINT TINYINT,\n" + + "\n" + + " DM_BYTE BYTE,\n" + + " DM_SMALLINT SMALLINT,\n" + + " DM_BIGINT BIGINT,\n" + + "\n" + + " DM_NUMBER NUMBER,\n" + + " DM_DECIMAL DECIMAL,\n" + + " DM_FLOAT FLOAT,\n" + + " DM_DOUBLE_PRECISION DOUBLE PRECISION,\n" + + " DM_DOUBLE DOUBLE,\n" + + "\n" + + " DM_CHAR CHAR,\n" + + " DM_VARCHAR VARCHAR,\n" + + " DM_VARCHAR2 VARCHAR2,\n" + + " DM_TEXT TEXT,\n" + + " DM_LONG LONG,\n" + + "\n" + + " DM_TIMESTAMP TIMESTAMP,\n" + + " DM_DATETIME DATETIME,\n" + + " DM_DATE DATE,\n" + + " CONSTRAINT DMPKID PRIMARY KEY (DM_BIT) \n" + + ")"; + + @Override + JdbcCase getJdbcCase() { + Map containerEnv = new HashMap<>(); + String jdbcUrl = String.format(DM_URL, JDBC_PORT); + Pair> testDataSet = initTestData(); + String[] fieldNames = testDataSet.getKey(); + + String insertSql = insertTable(DM_DATABASE, DM_SOURCE, fieldNames); + + return JdbcCase.builder() + .dockerImage(DM_IMAGE) + .networkAliases(DM_CONTAINER_HOST) + .containerEnv(containerEnv) + .driverClass(DRIVER_CLASS) + .host(HOST) + .port(DOCKET_PORT) + .localPort(DOCKET_PORT) + .jdbcTemplate(DM_URL) + .jdbcUrl(jdbcUrl) + 
.userName(DM_USERNAME) + .password(DM_PASSWORD) + .database(DM_DATABASE) + .sourceTable(DM_SOURCE) + .sinkTable(DM_SINK) + .createSql(CREATE_SQL) + .configFile(CONFIG_FILE) + .insertSql(insertSql) + .testData(testDataSet) + .build(); + } + + @Override + void compareResult() {} + + @Override + protected void createNeededTables() { + try (Statement statement = connection.createStatement()) { + String createTemplate = jdbcCase.getCreateSql(); + + String createSource = + String.format( + createTemplate, + buildTableInfoWithSchema( + jdbcCase.getDatabase(), jdbcCase.getSourceTable())); + String createSink = + String.format( + CREATE_SINKTABLE_SQL, + buildTableInfoWithSchema( + jdbcCase.getDatabase(), jdbcCase.getSinkTable())); + + statement.execute(createSource); + statement.execute(createSink); + connection.commit(); + } catch (Exception exception) { + throw new SeaTunnelRuntimeException(JdbcITErrorCode.CREATE_TABLE_FAILED, exception); + } + } + + @Override + String driverUrl() { + return "https://repo1.maven.org/maven2/com/dameng/DmJdbcDriver18/8.1.1.193/DmJdbcDriver18-8.1.1.193.jar"; + } + + @Override + Pair> initTestData() { + String[] fieldNames = + new String[] { + "DM_BIT", + "DM_INT", + "DM_INTEGER", + "DM_TINYINT", + "DM_BYTE", + "DM_SMALLINT", + "DM_BIGINT", + "DM_NUMBER", + "DM_DECIMAL", + "DM_FLOAT", + "DM_DOUBLE_PRECISION", + "DM_DOUBLE", + "DM_CHAR", + "DM_VARCHAR", + "DM_VARCHAR2", + "DM_TEXT", + "DM_LONG", + "DM_TIMESTAMP", + "DM_DATETIME", + "DM_DATE" + }; + + List rows = new ArrayList<>(); + for (int i = 0; i < 100; i++) { + SeaTunnelRow row = + new SeaTunnelRow( + new Object[] { + i % 2 == 0 ? (byte) 1 : (byte) 0, + i, + i, + Short.valueOf("1"), + Byte.valueOf("1"), + i, + Long.parseLong("1"), + BigDecimal.valueOf(i, 18), + BigDecimal.valueOf(i, 18), + Float.parseFloat("1.1"), + Double.parseDouble("1.1"), + Double.parseDouble("1.1"), + 'f', + String.format("f1_%s", i), + String.format("f1_%s", i), + String.format("f1_%s", i), + String.format("{\"aa\":\"bb_%s\"}", i), + Timestamp.valueOf(LocalDateTime.now()), + new Timestamp(System.currentTimeMillis()), + Date.valueOf(LocalDate.now()) + }); + rows.add(row); + } + + return Pair.of(fieldNames, rows); + } + + @Override + protected GenericContainer initContainer() { + GenericContainer container = + new GenericContainer<>(DM_IMAGE) + .withNetwork(NETWORK) + .withNetworkAliases(DM_CONTAINER_HOST) + .withLogConsumer( + new Slf4jLogConsumer(DockerLoggerFactory.getLogger(DM_IMAGE))); + container.setPortBindings( + Lists.newArrayList(String.format("%s:%s", JDBC_PORT, DOCKET_PORT))); + + return container; + } + + @Override + public String quoteIdentifier(String field) { + return "\"" + field + "\""; + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisIT.java similarity index 100% rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisIT.java rename to seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisIT.java diff --git 
a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisdbIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisdbIT.java similarity index 100% rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisdbIT.java rename to seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcDorisdbIT.java diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGBase8aIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGBase8aIT.java similarity index 100% rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGBase8aIT.java rename to seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGBase8aIT.java diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGreenplumIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGreenplumIT.java similarity index 100% rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGreenplumIT.java rename to seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/java/org/apache/seatunnel/connectors/seatunnel/jdbc/JdbcGreenplumIT.java diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/doris-jdbc-to-doris.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/doris-jdbc-to-doris.conf similarity index 100% rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/doris-jdbc-to-doris.conf rename to seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/doris-jdbc-to-doris.conf diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_dm_source_and_dm_upset_sink.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_dm_source_and_dm_upset_sink.conf new file mode 100644 index 00000000000..81104b79077 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_dm_source_and_dm_upset_sink.conf @@ -0,0 +1,49 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + execution.parallelism = 1 + job.mode = "BATCH" +} + +source { + Jdbc { + url = "jdbc:dm://e2e_dmdb_upset:5236" + driver = "dm.jdbc.driver.DmDriver" + connection_check_timeout_sec = 1000 + user = "SYSDBA" + password = "SYSDBA" + query = "select * from SYSDBA.E2E_TABLE_SOURCE_UPSET" + } + +} + +sink { + Jdbc { + url = "jdbc:dm://e2e_dmdb_upset:5236" + driver = "dm.jdbc.driver.DmDriver" + connection_check_timeout_sec = 1000 + user = "SYSDBA" + password = "SYSDBA" + database = "SYSDBA" + primary_keys = ["DM_BIT"] + table = "E2E_TABLE_SINK_UPSET" + generate_sink_sql = true + query = "" + } +} + diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_dm_source_and_sink.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_dm_source_and_sink.conf similarity index 100% rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_dm_source_and_sink.conf rename to seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_dm_source_and_sink.conf diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_doris_source_and_sink.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_doris_source_and_sink.conf similarity index 100% rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_doris_source_and_sink.conf rename to seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_doris_source_and_sink.conf diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_gbase8a_source_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_gbase8a_source_to_assert.conf similarity index 100% rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_gbase8a_source_to_assert.conf rename to seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_gbase8a_source_to_assert.conf diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_greenplum_source_and_sink.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_greenplum_source_and_sink.conf similarity index 100% rename from seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-2/src/test/resources/jdbc_greenplum_source_and_sink.conf rename to 
seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/connector-jdbc-e2e-part-5/src/test/resources/jdbc_greenplum_source_and_sink.conf diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/pom.xml b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/pom.xml index f803a4c61e8..1dabc25490b 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/pom.xml +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-jdbc-e2e/pom.xml @@ -31,6 +31,8 @@ connector-jdbc-e2e-part-1 connector-jdbc-e2e-part-2 connector-jdbc-e2e-part-3 + connector-jdbc-e2e-part-4 + connector-jdbc-e2e-part-5 diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/pom.xml b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/pom.xml index 81cbb785698..fa2e1930cce 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/pom.xml +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/pom.xml @@ -92,6 +92,11 @@ postgresql test + + mysql + mysql-connector-java + test + org.testcontainers mysql diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/CanalToKafkaIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/CanalToKafkaIT.java index 0d8bb567ae4..9afe0ce332f 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/CanalToKafkaIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/CanalToKafkaIT.java @@ -293,6 +293,17 @@ public void testCanalFormatKafkaCdcToPgsql(TestContainer container) Arrays.asList(107, "rocks", "box of assorted rocks", "7.88"), Arrays.asList(108, "jacket", "water resistent black wind breaker", "0.1")); Assertions.assertIterableEquals(expected, actual); + + try (Connection connection = + DriverManager.getConnection( + POSTGRESQL_CONTAINER.getJdbcUrl(), + POSTGRESQL_CONTAINER.getUsername(), + POSTGRESQL_CONTAINER.getPassword())) { + try (Statement statement = connection.createStatement()) { + statement.execute("truncate table sink"); + LOG.info("testCanalFormatKafkaCdcToPgsql truncate table sink"); + } + } } private void initKafkaConsumer() { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/DebeziumToKafkaIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/DebeziumToKafkaIT.java index e76a4459963..da1ee137192 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/DebeziumToKafkaIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/DebeziumToKafkaIT.java @@ -314,6 +314,17 @@ public void testDebeziumFormatKafkaCdcToPgsql(TestContainer container) 108, "jacket", "water resistent black wind breaker", "0.1")) .collect(Collectors.toSet()); Assertions.assertIterableEquals(expected, actual); + + try (Connection connection = + DriverManager.getConnection( + POSTGRESQL_CONTAINER.getJdbcUrl(), + POSTGRESQL_CONTAINER.getUsername(), + POSTGRESQL_CONTAINER.getPassword())) { + try (Statement statement = connection.createStatement()) { + statement.execute("truncate table sink"); + LOG.info("testDebeziumFormatKafkaCdcToPgsql truncate 
table sink"); + } + } } public void initializeSourceTableData() throws Exception { diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/KafkaConnectToKafkaIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/KafkaConnectToKafkaIT.java new file mode 100644 index 00000000000..591049917f8 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/java/org/apache/seatunnel/e2e/connector/kafka/KafkaConnectToKafkaIT.java @@ -0,0 +1,282 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.e2e.connector.kafka; + +import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode; +import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; + +import org.apache.seatunnel.connectors.seatunnel.cdc.mysql.testutils.MySqlContainer; +import org.apache.seatunnel.connectors.seatunnel.cdc.mysql.testutils.MySqlVersion; +import org.apache.seatunnel.e2e.common.TestResource; +import org.apache.seatunnel.e2e.common.TestSuiteBase; +import org.apache.seatunnel.e2e.common.container.ContainerExtendedFactory; +import org.apache.seatunnel.e2e.common.container.EngineType; +import org.apache.seatunnel.e2e.common.container.TestContainer; +import org.apache.seatunnel.e2e.common.junit.DisabledOnContainer; +import org.apache.seatunnel.e2e.common.junit.TestContainerExtension; + +import org.apache.kafka.clients.producer.KafkaProducer; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.clients.producer.ProducerRecord; +import org.apache.kafka.common.serialization.ByteArraySerializer; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.TestTemplate; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.testcontainers.containers.Container; +import org.testcontainers.containers.KafkaContainer; +import org.testcontainers.containers.output.Slf4jLogConsumer; +import org.testcontainers.lifecycle.Startables; +import org.testcontainers.utility.DockerImageName; +import org.testcontainers.utility.DockerLoggerFactory; + +import com.google.common.collect.Lists; +import lombok.SneakyThrows; +import lombok.extern.slf4j.Slf4j; + +import java.io.IOException; +import java.sql.Connection; +import java.sql.DriverManager; +import java.sql.ResultSet; +import java.sql.SQLException; +import java.sql.Statement; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; +import java.util.Properties; +import java.util.concurrent.TimeUnit; +import java.util.stream.Stream; + +import static org.awaitility.Awaitility.given; + +@Slf4j 
+@DisabledOnContainer( + value = {}, + type = {EngineType.SPARK}) +public class KafkaConnectToKafkaIT extends TestSuiteBase implements TestResource { + private static final Logger LOG = LoggerFactory.getLogger(KafkaConnectToKafkaIT.class); + private final ObjectMapper objectMapper = new ObjectMapper(); + // kafka + private static final String KAFKA_IMAGE_NAME = "confluentinc/cp-kafka:latest"; + + private static final String KAFKA_JDBC_TOPIC = "jdbc_source_record"; + + private static final String KAFKA_HOST = "kafka_connect_source_record"; + + private static KafkaContainer KAFKA_CONTAINER; + + private KafkaProducer kafkaProducer; + + // -----------------------------------mysql----------------------------------------- + private static MySqlContainer MYSQL_CONTAINER; + private static final String MYSQL_DATABASE = "seatunnel"; + private static final String MYSQL_HOST = "kafka_to_mysql_e2e"; + private static final int MYSQL_PORT = 3306; + private static final String MYSQL_DRIVER_JAR = + "https://repo1.maven.org/maven2/com/mysql/mysql-connector-j/8.0.32/mysql-connector-j-8.0.32.jar"; + + @TestContainerExtension + private final ContainerExtendedFactory extendedFactory = + container -> { + Container.ExecResult extraCommands = + container.execInContainer( + "bash", + "-c", + "mkdir -p /tmp/seatunnel/plugins/Jdbc/lib && cd /tmp/seatunnel/plugins/Jdbc/lib && curl -O " + + MYSQL_DRIVER_JAR); + Assertions.assertEquals(0, extraCommands.getExitCode()); + }; + + private static MySqlContainer createMySqlContainer(MySqlVersion version) { + MySqlContainer mySqlContainer = + new MySqlContainer(version) + .withConfigurationOverride("docker/server-gtids/my.cnf") + .withSetupSQL("docker/setup.sql") + .withNetwork(NETWORK) + .withNetworkAliases(MYSQL_HOST) + .withDatabaseName("seatunnel") + .withUsername("st_user") + .withPassword("seatunnel") + .withLogConsumer(new Slf4jLogConsumer(LOG)); + mySqlContainer.setPortBindings( + com.google.common.collect.Lists.newArrayList( + String.format("%s:%s", MYSQL_PORT, MYSQL_PORT))); + return mySqlContainer; + } + + private void createKafkaContainer() { + KAFKA_CONTAINER = + new KafkaContainer(DockerImageName.parse(KAFKA_IMAGE_NAME)) + .withNetwork(NETWORK) + .withNetworkAliases(KAFKA_HOST) + .withLogConsumer( + new Slf4jLogConsumer( + DockerLoggerFactory.getLogger(KAFKA_IMAGE_NAME))); + } + + @BeforeAll + @Override + public void startUp() { + + LOG.info("The first stage: Starting Kafka containers..."); + createKafkaContainer(); + Startables.deepStart(Stream.of(KAFKA_CONTAINER)).join(); + LOG.info("Kafka Containers are started"); + + given().ignoreExceptions() + .atLeast(100, TimeUnit.MILLISECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted(this::initKafkaProducer); + + LOG.info("The second stage: Starting Mysql containers..."); + MYSQL_CONTAINER = createMySqlContainer(MySqlVersion.V8_0); + Startables.deepStart(Stream.of(MYSQL_CONTAINER)).join(); + LOG.info("Mysql Containers are started"); + + given().ignoreExceptions() + .await() + .atLeast(100, TimeUnit.MILLISECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted(this::initializeDatabase); + + given().ignoreExceptions() + .await() + .atLeast(100, TimeUnit.MILLISECONDS) + .pollInterval(500, TimeUnit.MILLISECONDS) + .atMost(2, TimeUnit.MINUTES) + .untilAsserted(this::initializeJdbcTable); + + log.info("Write 3 records to topic " + KAFKA_JDBC_TOPIC); + generateConnectJdbcRecord(); + } + + @TestTemplate + public void 
testJdbcRecordKafkaToMysql(TestContainer container) + throws IOException, InterruptedException, SQLException { + Container.ExecResult execResult = + container.executeJob("/kafkasource_jdbc_record_to_mysql.conf"); + Assertions.assertEquals(0, execResult.getExitCode(), execResult.getStderr()); + List actual = new ArrayList<>(); + try (Connection connection = + DriverManager.getConnection( + MYSQL_CONTAINER.getJdbcUrl(), + MYSQL_CONTAINER.getUsername(), + MYSQL_CONTAINER.getPassword())) { + try (Statement statement = connection.createStatement()) { + ResultSet resultSet = + statement.executeQuery("select * from seatunnel.jdbc_sink order by id"); + while (resultSet.next()) { + List row = + Arrays.asList( + resultSet.getInt("id"), + resultSet.getString("name"), + resultSet.getString("description"), + resultSet.getString("weight")); + actual.add(row); + } + } + } + List expected = + Lists.newArrayList( + Arrays.asList(15, "test", "test", "20"), + Arrays.asList(16, "test-001", "test", "30"), + Arrays.asList(18, "sdc", "sdc", "sdc")); + Assertions.assertIterableEquals(expected, actual); + + try (Connection connection = + DriverManager.getConnection( + MYSQL_CONTAINER.getJdbcUrl(), + MYSQL_CONTAINER.getUsername(), + MYSQL_CONTAINER.getPassword())) { + try (Statement statement = connection.createStatement()) { + statement.execute("truncate table seatunnel.jdbc_sink"); + LOG.info("testJdbcRecordKafkaToMysql truncate table sink"); + } + } + } + + @SneakyThrows + public void generateConnectJdbcRecord() { + String[] jdbcSourceRecords = { + "{\"schema\":{\"type\":\"struct\",\"fields\":[{\"type\":\"int64\",\"optional\":false,\"field\":\"id\"},{\"type\":\"string\",\"optional\":true,\"field\":\"name\"},{\"type\":\"string\",\"optional\":true,\"field\":\"description\"},{\"type\":\"string\",\"optional\":true,\"field\":\"weight\"}],\"optional\":false,\"name\":\"test_database_001.seatunnel_test_cdc\"},\"payload\":{\"id\":15,\"name\":\"test\",\"description\":\"test\",\"weight\":\"20\"}}", + "{\"schema\":{\"type\":\"struct\",\"fields\":[{\"type\":\"int64\",\"optional\":false,\"field\":\"id\"},{\"type\":\"string\",\"optional\":true,\"field\":\"name\"},{\"type\":\"string\",\"optional\":true,\"field\":\"description\"},{\"type\":\"string\",\"optional\":true,\"field\":\"weight\"}],\"optional\":false,\"name\":\"test_database_001.seatunnel_test_cdc\"},\"payload\":{\"id\":16,\"name\":\"test-001\",\"description\":\"test\",\"weight\":\"30\"}}", + "{\"schema\":{\"type\":\"struct\",\"fields\":[{\"type\":\"int64\",\"optional\":false,\"field\":\"id\"},{\"type\":\"string\",\"optional\":true,\"field\":\"name\"},{\"type\":\"string\",\"optional\":true,\"field\":\"description\"},{\"type\":\"string\",\"optional\":true,\"field\":\"weight\"}],\"optional\":false,\"name\":\"test_database_001.seatunnel_test_cdc\"},\"payload\":{\"id\":18,\"name\":\"sdc\",\"description\":\"sdc\",\"weight\":\"sdc\"}}" + }; + for (String value : jdbcSourceRecords) { + JsonNode jsonNode = objectMapper.readTree(value); + byte[] bytes = objectMapper.writeValueAsBytes(jsonNode); + ProducerRecord producerRecord = + new ProducerRecord<>(KAFKA_JDBC_TOPIC, null, bytes); + kafkaProducer.send(producerRecord).get(); + } + } + + private void initKafkaProducer() { + Properties props = new Properties(); + String bootstrapServers = KAFKA_CONTAINER.getBootstrapServers(); + props.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers); + props.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); + 
props.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class); + kafkaProducer = new KafkaProducer<>(props); + } + + @Override + public void tearDown() { + MYSQL_CONTAINER.close(); + KAFKA_CONTAINER.close(); + } + + protected void initializeDatabase() { + try (Connection connection = + DriverManager.getConnection( + MYSQL_CONTAINER.getJdbcUrl(), + MYSQL_CONTAINER.getUsername(), + MYSQL_CONTAINER.getPassword())) { + Statement statement = connection.createStatement(); + String sql = "CREATE DATABASE IF NOT EXISTS " + MYSQL_DATABASE; + statement.execute(sql); + } catch (SQLException e) { + throw new RuntimeException("Initializing Mysql database failed!", e); + } + } + + private void initializeJdbcTable() { + try (Connection connection = + DriverManager.getConnection( + MYSQL_CONTAINER.getJdbcUrl(), + MYSQL_CONTAINER.getUsername(), + MYSQL_CONTAINER.getPassword())) { + Statement statement = connection.createStatement(); + String jdbcSink = + "CREATE TABLE IF NOT EXISTS seatunnel.jdbc_sink(\n" + + "id INT NOT NULL PRIMARY KEY,\n" + + "name varchar(255),\n" + + "description varchar(255),\n" + + "weight varchar(255)" + + ")"; + statement.execute(jdbcSink); + } catch (SQLException e) { + throw new RuntimeException("Initializing Mysql table failed!", e); + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/canalFormatIT/kafka_source_canal_cdc_to_pgsql.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/canalFormatIT/kafka_source_canal_cdc_to_pgsql.conf index 9ce69a2344c..2f7249dbdbd 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/canalFormatIT/kafka_source_canal_cdc_to_pgsql.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/canalFormatIT/kafka_source_canal_cdc_to_pgsql.conf @@ -47,13 +47,14 @@ source { sink { Jdbc { + driver = org.postgresql.Driver url = "jdbc:postgresql://postgresql:5432/test?loggerLevel=OFF" user = test password = test generate_sink_sql = true - database = public - table = sink + database = test + table = public.sink primary_keys = ["id"] } } \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/kafkasource_debezium_cdc_to_pgsql.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/kafkasource_debezium_cdc_to_pgsql.conf index a0531b2345a..2d56fb7879d 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/kafkasource_debezium_cdc_to_pgsql.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/kafkasource_debezium_cdc_to_pgsql.conf @@ -55,8 +55,8 @@ sink { user = test password = test generate_sink_sql = true - database = public - table = sink + database = test + table = public.sink primary_keys = ["id"] } } \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/kafkasource_jdbc_record_to_mysql.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/kafkasource_jdbc_record_to_mysql.conf new file mode 100644 index 00000000000..36ae276e034 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-kafka-e2e/src/test/resources/kafkasource_jdbc_record_to_mysql.conf @@ -0,0 +1,63 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. 
See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +###### +###### This config file is a demonstration of streaming processing in seatunnel config +###### + +env { + execution.parallelism = 1 + job.mode = "BATCH" + + #spark config + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + Kafka { + bootstrap.servers = "kafka_connect_source_record:9092" + topic = "jdbc_source_record" + result_table_name = "kafka_table" + start_mode = earliest + schema = { + fields { + id = "int" + name = "string" + description = "string" + weight = "string" + } + }, + format = COMPATIBLE_KAFKA_CONNECT_JSON + } +} + + +sink { + Jdbc { + driver = com.mysql.cj.jdbc.Driver + url = "jdbc:mysql://kafka_to_mysql_e2e:3306/seatunnel" + user = st_user + password = seatunnel + generate_sink_sql = true + database = seatunnel + table = jdbc_sink + primary_keys = ["id"] + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/AbstractMongodbIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/AbstractMongodbIT.java index 4c85c0d097e..5dbe7cf3479 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/AbstractMongodbIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/AbstractMongodbIT.java @@ -82,6 +82,11 @@ public abstract class AbstractMongodbIT extends TestSuiteBase implements TestRes protected static final String MONGODB_CDC_RESULT_TABLE = "test_cdc_table"; + protected static final String MONGODB_TRANSACTION_SINK_TABLE = + "test_source_transaction_sink_table"; + protected static final String MONGODB_TRANSACTION_UPSERT_TABLE = + "test_source_upsert_transaction_table"; + protected GenericContainer mongodbContainer; protected MongoClient client; diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/MongodbIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/MongodbIT.java index ce25b2062b6..fb643455a6e 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/MongodbIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/java/org/apache/seatunnel/e2e/connector/v2/mongodb/MongodbIT.java @@ -140,4 +140,65 @@ public void testMongodbSourceSplit(TestContainer container) .collect(Collectors.toList())); clearDate(MONGODB_SPLIT_RESULT_TABLE); } + + @TestTemplate + public void 
testCompatibleParameters(TestContainer container) + throws IOException, InterruptedException { + // `upsert-key` compatible test + Container.ExecResult insertResult = + container.executeJob("/updateIT/fake_source_to_updateMode_insert_mongodb.conf"); + Assertions.assertEquals(0, insertResult.getExitCode(), insertResult.getStderr()); + + Container.ExecResult updateResult = + container.executeJob("/compatibleParametersIT/fake_source_to_update_mongodb.conf"); + Assertions.assertEquals(0, updateResult.getExitCode(), updateResult.getStderr()); + + Container.ExecResult assertResult = + container.executeJob("/updateIT/update_mongodb_to_assert.conf"); + Assertions.assertEquals(0, assertResult.getExitCode(), assertResult.getStderr()); + + clearDate(MONGODB_UPDATE_TABLE); + + // `matchQuery` compatible test + Container.ExecResult queryResult = + container.executeJob("/matchIT/mongodb_matchQuery_source_to_assert.conf"); + Assertions.assertEquals(0, queryResult.getExitCode(), queryResult.getStderr()); + + Assertions.assertIterableEquals( + TEST_MATCH_DATASET.stream() + .filter(x -> x.get("c_int").equals(2)) + .peek(e -> e.remove("_id")) + .collect(Collectors.toList()), + readMongodbData(MONGODB_MATCH_RESULT_TABLE).stream() + .peek(e -> e.remove("_id")) + .collect(Collectors.toList())); + clearDate(MONGODB_MATCH_RESULT_TABLE); + } + + @TestTemplate + public void testTransactionSinkAndUpsert(TestContainer container) + throws IOException, InterruptedException { + Container.ExecResult insertResult = + container.executeJob("/transactionIT/fake_source_to_transaction_sink_mongodb.conf"); + Assertions.assertEquals(0, insertResult.getExitCode(), insertResult.getStderr()); + + Container.ExecResult assertSinkResult = + container.executeJob( + "/transactionIT/mongodb_source_transaction_sink_to_assert.conf"); + Assertions.assertEquals(0, assertSinkResult.getExitCode(), assertSinkResult.getStderr()); + + Container.ExecResult upsertResult = + container.executeJob( + "/transactionIT/fake_source_to_transaction_upsert_mongodb.conf"); + Assertions.assertEquals(0, upsertResult.getExitCode(), upsertResult.getStderr()); + + Container.ExecResult assertUpsertResult = + container.executeJob( + "/transactionIT/mongodb_source_transaction_upsert_to_assert.conf"); + Assertions.assertEquals( + 0, assertUpsertResult.getExitCode(), assertUpsertResult.getStderr()); + + clearDate(MONGODB_TRANSACTION_SINK_TABLE); + clearDate(MONGODB_TRANSACTION_UPSERT_TABLE); + } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/compatibleParametersIT/fake_source_to_update_mongodb.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/compatibleParametersIT/fake_source_to_update_mongodb.conf new file mode 100644 index 00000000000..ef5bf5b88e1 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/compatibleParametersIT/fake_source_to_update_mongodb.conf @@ -0,0 +1,103 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +env { + execution.parallelism = 1 + job.mode = "BATCH" + #spark config + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + FakeSource { + row.num = 5 + int.template = [2] + result_table_name = "mongodb_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + MongoDB { + uri = "mongodb://e2e_mongodb:27017/test_db?retryWrites=true" + database = "test_db" + collection = "test_update_table" + upsert-enable = true + // compatible parameters + upsert-key = ["c_int"] + source_table_name = "mongodb_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/compatibleParametersIT/mongodb_matchQuery_source_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/compatibleParametersIT/mongodb_matchQuery_source_to_assert.conf new file mode 100644 index 00000000000..5b7e73344ea --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/compatibleParametersIT/mongodb_matchQuery_source_to_assert.conf @@ -0,0 +1,93 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + job.mode = "BATCH" + #spark config + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + MongoDB { + uri = "mongodb://e2e_mongodb:27017/test_db" + database = "test_db" + collection = "test_match_op_db" + result_table_name = "mongodb_table" + // compatible parameters + matchQuery = "{c_int: 2}" + cursor.no-timeout = true + fetch.size = 1000 + max.time-min = 100 + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + } + } + } + } +} + +sink { + Console { + source_table_name = "mongodb_table" + } + MongoDB { + uri = "mongodb://e2e_mongodb:27017/test_db?retryWrites=true" + database = "test_db" + collection = "test_match_op_result_db" + source_table_name = "mongodb_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + } + } + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/fake_source_to_transaction_sink_mongodb.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/fake_source_to_transaction_sink_mongodb.conf new file mode 100644 index 00000000000..67947eb956c --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/fake_source_to_transaction_sink_mongodb.conf @@ -0,0 +1,102 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + job.mode = "BATCH" + #spark config + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + FakeSource { + row.num = 50 + int.template = [3] + split.num = 5 + split.read-interval = 100 + result_table_name = "mongodb_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + MongoDB { + uri = "mongodb://e2e_mongodb:27017" + database = "test_db" + collection = "test_source_transaction_sink_table" + transaction = true + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/fake_source_to_transaction_upsert_mongodb.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/fake_source_to_transaction_upsert_mongodb.conf new file mode 100644 index 00000000000..53a98fe28a4 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/fake_source_to_transaction_upsert_mongodb.conf @@ -0,0 +1,104 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + job.mode = "BATCH" + #spark config + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + FakeSource { + row.num = 50 + int.template = [2] + split.num = 5 + split.read-interval = 100 + result_table_name = "mongodb_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + MongoDB { + uri = "mongodb://e2e_mongodb:27017" + database = "test_db" + collection = "test_source_upsert_transaction_table" + transaction = true + upsert-enable = true + primary-key = ["c_int"] + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/mongodb_source_transaction_sink_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/mongodb_source_transaction_sink_to_assert.conf new file mode 100644 index 00000000000..f453ff5dfef --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/mongodb_source_transaction_sink_to_assert.conf @@ -0,0 +1,115 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + job.mode = "BATCH" + #spark config + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + MongoDB { + uri = "mongodb://e2e_mongodb:27017/test_db" + database = "test_db" + collection = "test_source_transaction_sink_table" + cursor.no-timeout = true + result_table_name = "mongodb_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + Console { + source_table_name = "mongodb_table" + } + Assert { + source_table_name = "mongodb_table" + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 50 + }, + { + rule_type = MIN_ROW + rule_value = 50 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/mongodb_source_transaction_upsert_to_assert.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/mongodb_source_transaction_upsert_to_assert.conf new file mode 100644 index 00000000000..0a5f8e5e1e0 --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-mongodb-e2e/src/test/resources/transactionIT/mongodb_source_transaction_upsert_to_assert.conf @@ -0,0 +1,115 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + job.mode = "BATCH" + #spark config + spark.app.name = "SeaTunnel" + spark.executor.instances = 1 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + MongoDB { + uri = "mongodb://e2e_mongodb:27017/test_db" + database = "test_db" + collection = "test_source_upsert_transaction_table" + cursor.no-timeout = true + result_table_name = "mongodb_table" + schema = { + fields { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + c_row = { + c_map = "map" + c_array = "array" + c_string = string + c_boolean = boolean + c_int = int + c_bigint = bigint + c_double = double + c_bytes = bytes + c_date = date + c_decimal = "decimal(33, 18)" + c_timestamp = timestamp + } + } + } + } +} + +sink { + Console { + source_table_name = "mongodb_table" + } + Assert { + source_table_name = "mongodb_table" + rules { + row_rules = [ + { + rule_type = MAX_ROW + rule_value = 1 + }, + { + rule_type = MIN_ROW + rule_value = 1 + } + ], + field_rules = [ + { + field_name = c_string + field_type = string + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_boolean + field_type = boolean + field_value = [ + { + rule_type = NOT_NULL + } + ] + }, + { + field_name = c_double + field_type = double + field_value = [ + { + rule_type = NOT_NULL + } + ] + } + ] + } + } +} diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/java/org/apache/seatunnel/e2e/connector/pulsar/CanalToPulsarIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/java/org/apache/seatunnel/e2e/connector/pulsar/CanalToPulsarIT.java index 716bd7dc90e..ec8fd481380 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/java/org/apache/seatunnel/e2e/connector/pulsar/CanalToPulsarIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/java/org/apache/seatunnel/e2e/connector/pulsar/CanalToPulsarIT.java @@ -337,5 +337,16 @@ void testCanalFormatMessages(TestContainer container) Arrays.asList(107, "rocks", "box of assorted rocks", "7.88"), Arrays.asList(108, "jacket", "water resistent black wind breaker", "0.1")); Assertions.assertIterableEquals(expected, actual); + + try (Connection connection = + DriverManager.getConnection( + POSTGRESQL_CONTAINER.getJdbcUrl(), + POSTGRESQL_CONTAINER.getUsername(), + POSTGRESQL_CONTAINER.getPassword())) { + try (Statement statement = connection.createStatement()) { + statement.execute("truncate table sink"); + LOG.info("testSinkCDCChangelog truncate table sink"); + } + } } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/java/org/apache/seatunnel/e2e/connector/pulsar/PulsarBatchIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/java/org/apache/seatunnel/e2e/connector/pulsar/PulsarBatchIT.java index b1ea69efac6..092f37f9bc4 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/java/org/apache/seatunnel/e2e/connector/pulsar/PulsarBatchIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/java/org/apache/seatunnel/e2e/connector/pulsar/PulsarBatchIT.java @@ -56,6 +56,7 @@ import lombok.extern.slf4j.Slf4j; import java.io.IOException; +import java.time.Duration; import java.util.ArrayList; import java.util.List; import 
java.util.concurrent.TimeUnit; @@ -113,6 +114,7 @@ public void startUp() throws Exception { new PulsarContainer(DockerImageName.parse(PULSAR_IMAGE_NAME)) .withNetwork(NETWORK) .withNetworkAliases(PULSAR_HOST) + .withStartupTimeout(Duration.ofMinutes(3)) .withLogConsumer( new Slf4jLogConsumer( DockerLoggerFactory.getLogger(PULSAR_IMAGE_NAME))); diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/resources/cdc_canal_pulsar_to_pg.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/resources/cdc_canal_pulsar_to_pg.conf index c287be67658..3ace667579e 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/resources/cdc_canal_pulsar_to_pg.conf +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-pulsar-e2e/src/test/resources/cdc_canal_pulsar_to_pg.conf @@ -58,8 +58,8 @@ sink { user = test password = test generate_sink_sql = true - database = public - table = sink + database = test + table = public.sink primary_keys = ["id"] } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-redis-e2e/src/test/java/org/apache/seatunnel/e2e/connector/redis/RedisIT.java b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-redis-e2e/src/test/java/org/apache/seatunnel/e2e/connector/redis/RedisIT.java index 808f6860337..bd4a9063ba1 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-redis-e2e/src/test/java/org/apache/seatunnel/e2e/connector/redis/RedisIT.java +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-redis-e2e/src/test/java/org/apache/seatunnel/e2e/connector/redis/RedisIT.java @@ -192,4 +192,15 @@ public void testRedis(TestContainer container) throws IOException, InterruptedEx jedis.del("key_list"); Assertions.assertEquals(0, jedis.llen("key_list")); } + + @TestTemplate + public void testRedisWithExpire(TestContainer container) + throws IOException, InterruptedException { + Container.ExecResult execResult = container.executeJob("/redis-to-redis-expire.conf"); + Assertions.assertEquals(0, execResult.getExitCode()); + Assertions.assertEquals(100, jedis.llen("key_list")); + // Clear data to prevent data duplication in the next TestContainer + Thread.sleep(60 * 1000); + Assertions.assertEquals(0, jedis.llen("key_list")); + } } diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-redis-e2e/src/test/resources/redis-to-redis-expire.conf b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-redis-e2e/src/test/resources/redis-to-redis-expire.conf new file mode 100644 index 00000000000..4a42bd3a46a --- /dev/null +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/connector-redis-e2e/src/test/resources/redis-to-redis-expire.conf @@ -0,0 +1,50 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +env { + execution.parallelism = 1 + job.mode = "BATCH" + shade.identifier = "base64" + + #spark config + spark.app.name = "SeaTunnel" + spark.executor.instances = 2 + spark.executor.cores = 1 + spark.executor.memory = "1g" + spark.master = local +} + +source { + Redis { + host = "redis-e2e" + port = 6379 + auth = "U2VhVHVubmVs" + keys = "key_test*" + data_type = key + } +} + +sink { + Redis { + host = "redis-e2e" + port = 6379 + auth = "U2VhVHVubmVs" + key = "key_list" + data_type = list + expire = 30 + } +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-connector-v2-e2e/pom.xml b/seatunnel-e2e/seatunnel-connector-v2-e2e/pom.xml index 65798fee100..8644b551b2f 100644 --- a/seatunnel-e2e/seatunnel-connector-v2-e2e/pom.xml +++ b/seatunnel-e2e/seatunnel-connector-v2-e2e/pom.xml @@ -36,6 +36,7 @@ connector-influxdb-e2e connector-amazondynamodb-e2e connector-file-local-e2e + connector-file-cos-e2e connector-file-sftp-e2e connector-cassandra-e2e connector-neo4j-e2e diff --git a/seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/common/container/TestHelper.java b/seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/common/container/TestHelper.java new file mode 100644 index 00000000000..a88723f8201 --- /dev/null +++ b/seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/common/container/TestHelper.java @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.e2e.common.container; + +import org.junit.jupiter.api.Assertions; +import org.testcontainers.containers.Container; + +import java.io.IOException; + +public class TestHelper { + private final TestContainer container; + + public TestHelper(TestContainer container) { + this.container = container; + } + + public void execute(String file) throws IOException, InterruptedException { + execute(0, file); + } + + public void execute(int expectedResult, String file) throws IOException, InterruptedException { + Container.ExecResult result = container.executeJob(file); + Assertions.assertEquals(expectedResult, result.getExitCode(), result.getStderr()); + } +} diff --git a/seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/common/util/ContainerUtil.java b/seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/common/util/ContainerUtil.java index 92d6100a7ce..fa5660a1700 100644 --- a/seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/common/util/ContainerUtil.java +++ b/seatunnel-e2e/seatunnel-e2e-common/src/test/java/org/apache/seatunnel/e2e/common/util/ContainerUtil.java @@ -246,4 +246,10 @@ public static List<TestContainer> discoverTestContainers() { throw new FactoryException("Could not load service provider for containers.", e); } } + + public static void copyFileIntoContainers( + String fileName, String targetPath, GenericContainer<?> container) { + Path path = getResourcesFile(fileName).toPath(); + container.copyFileToContainer(MountableFile.forHostPath(path), targetPath); + } } diff --git a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/pom.xml b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/pom.xml index a1315565349..20a2e612a6f 100644 --- a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/pom.xml +++ b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/pom.xml @@ -92,6 +92,18 @@ ${netty-buffer.version} test + + org.apache.seatunnel + seatunnel-transforms-v2 + ${project.version} + test + + + org.apache.seatunnel + seatunnel-api + ${project.version} + test + diff --git a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/ClusterFaultToleranceIT.java b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/ClusterFaultToleranceIT.java index a7067c7c3a5..33bf2ba7b24 100644 --- a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/ClusterFaultToleranceIT.java +++ b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/ClusterFaultToleranceIT.java @@ -50,7 +50,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; /** * Cluster fault tolerance test. 
Test the job recovery capability and data consistency assurance diff --git a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/ClusterFaultToleranceTwoPipelineIT.java b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/ClusterFaultToleranceTwoPipelineIT.java index e99940defec..f60a5374fb9 100644 --- a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/ClusterFaultToleranceTwoPipelineIT.java +++ b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/ClusterFaultToleranceTwoPipelineIT.java @@ -48,7 +48,7 @@ import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; /** * Cluster fault tolerance test. Test the job which have two pipelines can recovery capability and diff --git a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/JobExecutionIT.java b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/JobExecutionIT.java index 4609a10dc4c..cba498e9992 100644 --- a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/JobExecutionIT.java +++ b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/JobExecutionIT.java @@ -24,13 +24,13 @@ import org.apache.seatunnel.engine.client.job.JobExecutionEnvironment; import org.apache.seatunnel.engine.common.config.ConfigProvider; import org.apache.seatunnel.engine.common.config.JobConfig; +import org.apache.seatunnel.engine.core.job.JobResult; import org.apache.seatunnel.engine.core.job.JobStatus; import org.apache.seatunnel.engine.server.SeaTunnelServerStarter; -import org.awaitility.Awaitility; -import org.junit.jupiter.api.AfterAll; +import org.junit.jupiter.api.AfterEach; import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; import com.hazelcast.client.config.ClientConfig; @@ -38,15 +38,18 @@ import lombok.extern.slf4j.Slf4j; import java.util.concurrent.CompletableFuture; +import java.util.concurrent.ExecutionException; import java.util.concurrent.TimeUnit; +import static org.awaitility.Awaitility.await; + @Slf4j public class JobExecutionIT { private static HazelcastInstanceImpl hazelcastInstance; - @BeforeAll - public static void beforeClass() throws Exception { + @BeforeEach + public void beforeClass() { hazelcastInstance = SeaTunnelServerStarter.createHazelcastInstance( TestUtils.getClusterName("JobExecutionIT")); @@ -79,13 +82,9 @@ public void testExecuteJob() throws Exception { final ClientJobProxy clientJobProxy = jobExecutionEnv.execute(); CompletableFuture objectCompletableFuture = - CompletableFuture.supplyAsync( - () -> { - return clientJobProxy.waitForJobComplete(); - }); + CompletableFuture.supplyAsync(clientJobProxy::waitForJobComplete); - Awaitility.await() - .atMost(600000, TimeUnit.MILLISECONDS) + await().atMost(600000, TimeUnit.MILLISECONDS) .untilAsserted( () -> Assertions.assertTrue( @@ -110,17 +109,12 @@ public void cancelJobTest() throws Exception { final ClientJobProxy clientJobProxy = 
jobExecutionEnv.execute(); JobStatus jobStatus1 = clientJobProxy.getJobStatus(); Assertions.assertFalse(jobStatus1.isEndState()); - ClientJobProxy finalClientJobProxy = clientJobProxy; CompletableFuture objectCompletableFuture = - CompletableFuture.supplyAsync( - () -> { - return finalClientJobProxy.waitForJobComplete(); - }); + CompletableFuture.supplyAsync(clientJobProxy::waitForJobComplete); Thread.sleep(1000); clientJobProxy.cancelJob(); - Awaitility.await() - .atMost(20000, TimeUnit.MILLISECONDS) + await().atMost(20000, TimeUnit.MILLISECONDS) .untilAsserted( () -> Assertions.assertTrue( @@ -129,8 +123,53 @@ public void cancelJobTest() throws Exception { objectCompletableFuture.get()))); } - @AfterAll - static void afterClass() { + @Test + public void testGetErrorInfo() throws ExecutionException, InterruptedException { + Common.setDeployMode(DeployMode.CLIENT); + String filePath = TestUtils.getResource("batch_fakesource_to_console_error.conf"); + JobConfig jobConfig = new JobConfig(); + jobConfig.setName("fake_to_console_error"); + ClientConfig clientConfig = ConfigProvider.locateAndGetClientConfig(); + clientConfig.setClusterName(TestUtils.getClusterName("JobExecutionIT")); + SeaTunnelClient engineClient = new SeaTunnelClient(clientConfig); + JobExecutionEnvironment jobExecutionEnv = + engineClient.createExecutionContext(filePath, jobConfig); + final ClientJobProxy clientJobProxy = jobExecutionEnv.execute(); + CompletableFuture completableFuture = + CompletableFuture.supplyAsync(clientJobProxy::waitForJobComplete); + await().atMost(600000, TimeUnit.MILLISECONDS) + .untilAsserted(() -> Assertions.assertTrue(completableFuture.isDone())); + + JobResult result = clientJobProxy.getJobResultCache(); + Assertions.assertEquals(result.getStatus(), JobStatus.FAILED); + Assertions.assertTrue(result.getError().startsWith("java.lang.NumberFormatException")); + } + + @Test + public void testExpiredJobWasDeleted() throws Exception { + Common.setDeployMode(DeployMode.CLIENT); + String filePath = TestUtils.getResource("batch_fakesource_to_file.conf"); + JobConfig jobConfig = new JobConfig(); + jobConfig.setName("job_expire"); + + ClientConfig clientConfig = ConfigProvider.locateAndGetClientConfig(); + clientConfig.setClusterName(TestUtils.getClusterName("JobExecutionIT")); + SeaTunnelClient engineClient = new SeaTunnelClient(clientConfig); + JobExecutionEnvironment jobExecutionEnv = + engineClient.createExecutionContext(filePath, jobConfig); + + final ClientJobProxy clientJobProxy = jobExecutionEnv.execute(); + + Assertions.assertEquals(clientJobProxy.waitForJobComplete(), JobStatus.FINISHED); + await().atMost(65, TimeUnit.SECONDS) + .untilAsserted( + () -> + Assertions.assertThrowsExactly( + NullPointerException.class, clientJobProxy::getJobStatus)); + } + + @AfterEach + void afterClass() { if (hazelcastInstance != null) { hazelcastInstance.shutdown(); } diff --git a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/RestApiIT.java b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/RestApiIT.java index 5f4e97ac8d5..d38d1c732f1 100644 --- a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/RestApiIT.java +++ b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/java/org/apache/seatunnel/engine/e2e/RestApiIT.java @@ -22,10 +22,12 @@ import org.apache.seatunnel.engine.client.SeaTunnelClient; import 
org.apache.seatunnel.engine.client.job.ClientJobProxy; import org.apache.seatunnel.engine.client.job.JobExecutionEnvironment; +import org.apache.seatunnel.engine.common.Constant; import org.apache.seatunnel.engine.common.config.ConfigProvider; import org.apache.seatunnel.engine.common.config.JobConfig; import org.apache.seatunnel.engine.common.config.SeaTunnelConfig; import org.apache.seatunnel.engine.core.job.JobStatus; +import org.apache.seatunnel.engine.server.SeaTunnelServer; import org.apache.seatunnel.engine.server.SeaTunnelServerStarter; import org.apache.seatunnel.engine.server.rest.RestConstant; @@ -37,6 +39,7 @@ import com.hazelcast.client.config.ClientConfig; import com.hazelcast.instance.impl.HazelcastInstanceImpl; +import io.restassured.response.Response; import lombok.extern.slf4j.Slf4j; import java.util.concurrent.TimeUnit; @@ -131,6 +134,75 @@ public void testSystemMonitoringInformation() { .statusCode(200); } + @Test + public void testSubmitJob() { + String requestBody = + "{\n" + + " \"env\": {\n" + + " \"job.mode\": \"batch\"\n" + + " },\n" + + " \"source\": [\n" + + " {\n" + + " \"plugin_name\": \"FakeSource\",\n" + + " \"result_table_name\": \"fake\",\n" + + " \"row.num\": 100,\n" + + " \"schema\": {\n" + + " \"fields\": {\n" + + " \"name\": \"string\",\n" + + " \"age\": \"int\",\n" + + " \"card\": \"int\"\n" + + " }\n" + + " }\n" + + " }\n" + + " ],\n" + + " \"transform\": [\n" + + " ],\n" + + " \"sink\": [\n" + + " {\n" + + " \"plugin_name\": \"Console\",\n" + + " \"source_table_name\": [\"fake\"]\n" + + " }\n" + + " ]\n" + + "}"; + String parameters = "jobId=1&jobName=test&isStartWithSavePoint=false"; + // Only jobName is compared because jobId is randomly generated if isStartWithSavePoint is + // false + Response response = + given().body(requestBody) + .post( + HOST + + hazelcastInstance + .getCluster() + .getLocalMember() + .getAddress() + .getPort() + + RestConstant.SUBMIT_JOB_URL + + "?" + + parameters); + + response.then().statusCode(200).body("jobName", equalTo("test")); + String jobId = response.getBody().jsonPath().getString("jobId"); + SeaTunnelServer seaTunnelServer = + (SeaTunnelServer) + hazelcastInstance + .node + .getNodeExtension() + .createExtensionServices() + .get(Constant.SEATUNNEL_SERVICE_NAME); + JobStatus jobStatus = + seaTunnelServer.getCoordinatorService().getJobStatus(Long.parseLong(jobId)); + Assertions.assertEquals(JobStatus.RUNNING, jobStatus); + Awaitility.await() + .atMost(2, TimeUnit.MINUTES) + .untilAsserted( + () -> + Assertions.assertEquals( + JobStatus.FINISHED, + seaTunnelServer + .getCoordinatorService() + .getJobStatus(Long.parseLong(jobId)))); + } + @AfterAll static void afterClass() { if (hazelcastInstance != null) { diff --git a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/batch_fakesource_to_console_error.conf b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/batch_fakesource_to_console_error.conf new file mode 100644 index 00000000000..5fb9b3b80b6 --- /dev/null +++ b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/batch_fakesource_to_console_error.conf @@ -0,0 +1,50 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
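The testSubmitJob case above posts a job configuration as JSON to the engine's new REST submission endpoint and then polls the CoordinatorService until the job finishes. Below is a plain-Java client sketch of the same call; the port (5801) and the literal /hazelcast/rest/maps/submit-job path are assumptions, since the test resolves both from the local Hazelcast member and RestConstant.SUBMIT_JOB_URL (Java 11+ HttpClient used for brevity):

import java.net.URI;
import java.net.http.HttpClient;
import java.net.http.HttpRequest;
import java.net.http.HttpResponse;

public class SubmitJobSketch {
    public static void main(String[] args) throws Exception {
        // A trimmed-down version of the job config used in the test.
        String body = "{ \"env\": { \"job.mode\": \"batch\" },"
                + " \"source\": [ { \"plugin_name\": \"FakeSource\", \"result_table_name\": \"fake\","
                + " \"row.num\": 100, \"schema\": { \"fields\": { \"name\": \"string\", \"age\": \"int\" } } } ],"
                + " \"sink\": [ { \"plugin_name\": \"Console\", \"source_table_name\": [\"fake\"] } ] }";
        HttpRequest request = HttpRequest.newBuilder()
                .uri(URI.create("http://localhost:5801/hazelcast/rest/maps/submit-job"
                        + "?jobName=test&isStartWithSavePoint=false"))
                .POST(HttpRequest.BodyPublishers.ofString(body))
                .build();
        HttpResponse<String> response = HttpClient.newHttpClient()
                .send(request, HttpResponse.BodyHandlers.ofString());
        // On success the response JSON carries jobId and jobName, as asserted in the test.
        System.out.println(response.statusCode() + " " + response.body());
    }
}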
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +###### +###### This config file is a demonstration of streaming processing in seatunnel config +###### + +env { + job.mode = "BATCH" +} + +source { + # This is a example source plugin **only for test and demonstrate the feature source plugin** + FakeSource { + result_table_name = "fake" + schema { + fields { + id = "int" + name = "string" + age = "int" + } + } + } +} +transform { + sql { + source_table_name = "fake" + result_table_name = "fake1" + query ="select cast(name as int) as name, id,age from fake" + } +} +sink { + console { + source_table_name = "fake1" + } + +} \ No newline at end of file diff --git a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/cluster_batch_fake_to_localfile_two_pipeline_template.conf b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/cluster_batch_fake_to_localfile_two_pipeline_template.conf index 7e01c01c984..e94f7ae672e 100644 --- a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/cluster_batch_fake_to_localfile_two_pipeline_template.conf +++ b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/cluster_batch_fake_to_localfile_two_pipeline_template.conf @@ -134,7 +134,7 @@ sink { field_delimiter = "\t" row_delimiter = "\n" file_name_expression = "${transactionId}_${now}" - file_format = "text" + file_format_type = "text" filename_time_format = "yyyy.MM.dd" is_enable_transaction = true save_mode = "error" diff --git a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/seatunnel.yaml b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/seatunnel.yaml index 16b9f55c30d..4276fc87916 100644 --- a/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/seatunnel.yaml +++ b/seatunnel-e2e/seatunnel-engine-e2e/connector-seatunnel-e2e-base/src/test/resources/seatunnel.yaml @@ -17,6 +17,7 @@ seatunnel: engine: + history-job-expire-minutes: 1 backup-count: 2 queue-type: blockingqueue print-execution-info-interval: 10 @@ -25,8 +26,6 @@ seatunnel: checkpoint: interval: 300000 timeout: 10000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: localfile max-retained: 3 diff --git a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/copy_transform.conf b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/copy_transform.conf index 25ca4ce5f9a..b937b0a8cbe 100644 --- a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/copy_transform.conf +++ b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/copy_transform.conf @@ -30,6 +30,11 @@ source { fields { id = "int" name = "string" + c_row = { + c_row = { + c_int = int + } + } } } } @@ -49,6 +54,7 @@ transform { id_1 = "id" name2 = "name" name3 = "name" + 
c_row_1 = "c_row" } } } diff --git a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_exclude_delete.conf b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_exclude_delete.conf index f7fc0f6e0e1..8fdf195b037 100644 --- a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_exclude_delete.conf +++ b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_exclude_delete.conf @@ -31,6 +31,11 @@ source { id = "int" name = "string" age = "int" + c_row = { + c_row = { + c_int = int + } + } } } } diff --git a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_exclude_insert.conf b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_exclude_insert.conf index cc36417788b..9fc0e577cb8 100644 --- a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_exclude_insert.conf +++ b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_exclude_insert.conf @@ -31,6 +31,11 @@ source { id = "int" name = "string" age = "int" + c_row = { + c_row = { + c_int = int + } + } } } } diff --git a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_include_insert.conf b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_include_insert.conf index d1fbf79bea2..72d1e38cd44 100644 --- a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_include_insert.conf +++ b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_row_kind_include_insert.conf @@ -31,6 +31,11 @@ source { id = "int" name = "string" age = "int" + c_row = { + c_row = { + c_int = int + } + } } } } diff --git a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_transform.conf b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_transform.conf index 56439b4414f..c869c70a77b 100644 --- a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_transform.conf +++ b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/filter_transform.conf @@ -31,6 +31,11 @@ source { id = "int" name = "string" age = "int" + c_row = { + c_row = { + c_int = int + } + } } } } @@ -40,7 +45,7 @@ transform { Filter { source_table_name = "fake" result_table_name = "fake1" - fields = ["age", "name"] + fields = ["age", "name", "c_row"] } } diff --git a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/split_transform.conf b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/split_transform.conf index 61e10f694ac..7ad9fbf8f4a 100644 --- a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/split_transform.conf +++ b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-1/src/test/resources/split_transform.conf @@ -31,6 +31,11 @@ source { id = "int" name 
= "string" age = "int" + c_row = { + c_row = { + c_int = int + } + } } } } diff --git a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/field_mapper_transform.conf b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/field_mapper_transform.conf index c2d1f225f2b..59d19f3ee74 100644 --- a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/field_mapper_transform.conf +++ b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/field_mapper_transform.conf @@ -34,6 +34,11 @@ source { string1 = "string" int1 = "int" c_bigint = "bigint" + c_row = { + c_row = { + c_int = int + } + } } } } @@ -48,6 +53,7 @@ transform { age = age_as int1 = int1_as name = name + c_row = c_row } } } diff --git a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/sql_transform.conf b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/sql_transform.conf index c5f7c4047e7..78e21280f0d 100644 --- a/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/sql_transform.conf +++ b/seatunnel-e2e/seatunnel-transforms-v2-e2e/seatunnel-transforms-v2-e2e-part-2/src/test/resources/sql_transform.conf @@ -36,6 +36,11 @@ source { c_map = "map" c_array = "array" c_decimal = "decimal(30, 8)" + c_row = { + c_row = { + c_int = int + } + } } } } @@ -46,7 +51,7 @@ transform { source_table_name = "fake" result_table_name = "fake1" # the query table name must same as field 'source_table_name' - query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal from fake" + query = "select id, regexp_replace(name, '.+', 'b') as name, age+1 as age, pi() as pi, c_timestamp, c_date, c_map, c_array, c_decimal, c_row from fake" } # The SQL transform support base function and criteria operation # But the complex SQL unsupported yet, include: multi source table/rows JOIN and AGGREGATE operation and the like diff --git a/seatunnel-engine/seatunnel-engine-client/src/main/java/org/apache/seatunnel/engine/client/job/ClientJobProxy.java b/seatunnel-engine/seatunnel-engine-client/src/main/java/org/apache/seatunnel/engine/client/job/ClientJobProxy.java index 641e738ad23..5bb81378cdf 100644 --- a/seatunnel-engine/seatunnel-engine-client/src/main/java/org/apache/seatunnel/engine/client/job/ClientJobProxy.java +++ b/seatunnel-engine/seatunnel-engine-client/src/main/java/org/apache/seatunnel/engine/client/job/ClientJobProxy.java @@ -18,7 +18,10 @@ package org.apache.seatunnel.engine.client.job; import org.apache.seatunnel.common.utils.ExceptionUtils; +import org.apache.seatunnel.common.utils.RetryUtils; import org.apache.seatunnel.engine.client.SeaTunnelHazelcastClient; +import org.apache.seatunnel.engine.common.Constant; +import org.apache.seatunnel.engine.common.utils.ExceptionUtil; import org.apache.seatunnel.engine.common.utils.PassiveCompletableFuture; import org.apache.seatunnel.engine.core.job.Job; import org.apache.seatunnel.engine.core.job.JobImmutableInformation; @@ -89,8 +92,19 @@ private void submitJob(JobImmutableInformation jobImmutableInformation) { @Override public JobStatus waitForJobComplete() { try { - PassiveCompletableFuture jobFuture = doWaitForJobComplete(); - jobResult = jobFuture.get(); + jobResult = + RetryUtils.retryWithException( + () -> { + 
PassiveCompletableFuture jobFuture = + doWaitForJobComplete(); + return jobFuture.get(); + }, + new RetryUtils.RetryMaterial( + 100000, + true, + exception -> + ExceptionUtil.isOperationNeedRetryException(exception), + Constant.OPERATION_RETRY_SLEEP)); if (jobResult == null) { LOGGER.severe( "Unable to obtain the status of the job, it may have been running during the last cluster shutdown."); diff --git a/seatunnel-engine/seatunnel-engine-client/src/main/java/org/apache/seatunnel/engine/client/job/JobExecutionEnvironment.java b/seatunnel-engine/seatunnel-engine-client/src/main/java/org/apache/seatunnel/engine/client/job/JobExecutionEnvironment.java index bf3169e4c80..3f870c61216 100644 --- a/seatunnel-engine/seatunnel-engine-client/src/main/java/org/apache/seatunnel/engine/client/job/JobExecutionEnvironment.java +++ b/seatunnel-engine/seatunnel-engine-client/src/main/java/org/apache/seatunnel/engine/client/job/JobExecutionEnvironment.java @@ -18,55 +18,19 @@ package org.apache.seatunnel.engine.client.job; import org.apache.seatunnel.api.common.JobContext; -import org.apache.seatunnel.api.env.EnvCommonOptions; -import org.apache.seatunnel.common.config.Common; -import org.apache.seatunnel.common.utils.FileUtils; import org.apache.seatunnel.engine.client.SeaTunnelHazelcastClient; import org.apache.seatunnel.engine.common.config.JobConfig; -import org.apache.seatunnel.engine.common.exception.SeaTunnelEngineException; -import org.apache.seatunnel.engine.common.utils.IdGenerator; -import org.apache.seatunnel.engine.core.dag.actions.Action; -import org.apache.seatunnel.engine.core.dag.logical.LogicalDag; -import org.apache.seatunnel.engine.core.dag.logical.LogicalDagGenerator; +import org.apache.seatunnel.engine.core.job.AbstractJobEnvironment; import org.apache.seatunnel.engine.core.job.JobImmutableInformation; import org.apache.seatunnel.engine.core.parse.MultipleTableJobConfigParser; -import org.apache.commons.lang3.tuple.ImmutablePair; - -import com.hazelcast.logging.ILogger; -import com.hazelcast.logging.Logger; - -import java.io.IOException; -import java.net.MalformedURLException; -import java.net.URL; -import java.nio.file.Files; -import java.nio.file.Path; import java.util.ArrayList; -import java.util.Collections; -import java.util.HashSet; -import java.util.List; -import java.util.Set; import java.util.concurrent.ExecutionException; -import java.util.stream.Collectors; - -public class JobExecutionEnvironment { - - private static final ILogger LOGGER = Logger.getLogger(JobExecutionEnvironment.class); - - private final boolean isStartWithSavePoint; - - private final JobConfig jobConfig; - - private final List actions = new ArrayList<>(); - - private final Set jarUrls = new HashSet<>(); - private final List commonPluginJars = new ArrayList<>(); +public class JobExecutionEnvironment extends AbstractJobEnvironment { private final String jobFilePath; - private final IdGenerator idGenerator; - private final SeaTunnelHazelcastClient seaTunnelHazelcastClient; private final JobClient jobClient; @@ -78,35 +42,12 @@ public JobExecutionEnvironment( SeaTunnelHazelcastClient seaTunnelHazelcastClient, boolean isStartWithSavePoint, Long jobId) { - this.jobConfig = jobConfig; + super(jobConfig, isStartWithSavePoint); this.jobFilePath = jobFilePath; - this.idGenerator = new IdGenerator(); this.seaTunnelHazelcastClient = seaTunnelHazelcastClient; this.jobClient = new JobClient(seaTunnelHazelcastClient); - this.isStartWithSavePoint = isStartWithSavePoint; this.jobConfig.setJobContext( new 
JobContext(isStartWithSavePoint ? jobId : jobClient.getNewJobId())); - this.commonPluginJars.addAll(searchPluginJars()); - this.commonPluginJars.addAll( - new ArrayList<>( - Common.getThirdPartyJars( - jobConfig - .getEnvOptions() - .getOrDefault(EnvCommonOptions.JARS.key(), "") - .toString()) - .stream() - .map(Path::toUri) - .map( - uri -> { - try { - return uri.toURL(); - } catch (MalformedURLException e) { - throw new SeaTunnelEngineException( - "the uri of jar illegal:" + uri, e); - } - }) - .collect(Collectors.toList()))); - LOGGER.info("add common jar in plugins :" + commonPluginJars); } public JobExecutionEnvironment( @@ -117,27 +58,12 @@ public JobExecutionEnvironment( } /** Search all jars in SEATUNNEL_HOME/plugins */ - private Set searchPluginJars() { - try { - if (Files.exists(Common.pluginRootDir())) { - return new HashSet<>(FileUtils.searchJarFiles(Common.pluginRootDir())); - } - } catch (IOException | SeaTunnelEngineException e) { - LOGGER.warning( - String.format("Can't search plugin jars in %s.", Common.pluginRootDir()), e); - } - return Collections.emptySet(); - } - - private MultipleTableJobConfigParser getJobConfigParser() { + @Override + protected MultipleTableJobConfigParser getJobConfigParser() { return new MultipleTableJobConfigParser( jobFilePath, idGenerator, jobConfig, commonPluginJars, isStartWithSavePoint); } - private LogicalDagGenerator getLogicalDagGenerator() { - return new LogicalDagGenerator(actions, jobConfig, idGenerator); - } - public ClientJobProxy execute() throws ExecutionException, InterruptedException { JobImmutableInformation jobImmutableInformation = new JobImmutableInformation( @@ -150,11 +76,4 @@ public ClientJobProxy execute() throws ExecutionException, InterruptedException return jobClient.createJobProxy(jobImmutableInformation); } - - private LogicalDag getLogicalDag() { - ImmutablePair, Set> immutablePair = getJobConfigParser().parse(); - actions.addAll(immutablePair.getLeft()); - jarUrls.addAll(immutablePair.getRight()); - return getLogicalDagGenerator().generate(); - } } diff --git a/seatunnel-engine/seatunnel-engine-client/src/test/resources/batch_fakesource_to_file.conf b/seatunnel-engine/seatunnel-engine-client/src/test/resources/batch_fakesource_to_file.conf index 4f5cffa4a92..181a9fc1ad7 100644 --- a/seatunnel-engine/seatunnel-engine-client/src/test/resources/batch_fakesource_to_file.conf +++ b/seatunnel-engine/seatunnel-engine-client/src/test/resources/batch_fakesource_to_file.conf @@ -52,7 +52,7 @@ sink { partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_name_expression = "${transactionId}_${now}" - file_format = "text" + file_format_type = "text" sink_columns = ["name", "age"] filename_time_format = "yyyy.MM.dd" is_enable_transaction = true diff --git a/seatunnel-engine/seatunnel-engine-client/src/test/resources/batch_fakesource_to_file_complex.conf b/seatunnel-engine/seatunnel-engine-client/src/test/resources/batch_fakesource_to_file_complex.conf index c687fc1cf24..3a44886274e 100644 --- a/seatunnel-engine/seatunnel-engine-client/src/test/resources/batch_fakesource_to_file_complex.conf +++ b/seatunnel-engine/seatunnel-engine-client/src/test/resources/batch_fakesource_to_file_complex.conf @@ -63,7 +63,7 @@ sink { partition_dir_expression = "${k0}=${v0}" is_partition_field_write_in_file = true file_name_expression = "${transactionId}_${now}" - file_format = "text" + file_format_type = "text" sink_columns = ["name", "age"] filename_time_format = "yyyy.MM.dd" is_enable_transaction = true diff 
--git a/seatunnel-engine/seatunnel-engine-client/src/test/resources/client_test.conf b/seatunnel-engine/seatunnel-engine-client/src/test/resources/client_test.conf index 92e159c2a24..a4404b9f918 100644 --- a/seatunnel-engine/seatunnel-engine-client/src/test/resources/client_test.conf +++ b/seatunnel-engine/seatunnel-engine-client/src/test/resources/client_test.conf @@ -63,7 +63,7 @@ sink { partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" - file_format="text" + file_format_type="text" sink_columns=["name","age"] filename_time_format="yyyy.MM.dd" is_enable_transaction=true diff --git a/seatunnel-engine/seatunnel-engine-client/src/test/resources/seatunnel.yaml b/seatunnel-engine/seatunnel-engine-client/src/test/resources/seatunnel.yaml index ea5b5ac2307..4678cfed3d5 100644 --- a/seatunnel-engine/seatunnel-engine-client/src/test/resources/seatunnel.yaml +++ b/seatunnel-engine/seatunnel-engine-client/src/test/resources/seatunnel.yaml @@ -24,8 +24,6 @@ seatunnel: checkpoint: interval: 6000 timeout: 7000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: hdfs max-retained: 3 diff --git a/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/EngineConfig.java b/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/EngineConfig.java index edc18a0b15e..e162b428bb4 100644 --- a/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/EngineConfig.java +++ b/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/EngineConfig.java @@ -32,6 +32,7 @@ @Data @SuppressWarnings("checkstyle:MagicNumber") public class EngineConfig { + private int backupCount = ServerConfigOptions.BACKUP_COUNT.defaultValue(); private int printExecutionInfoInterval = ServerConfigOptions.PRINT_EXECUTION_INFO_INTERVAL.defaultValue(); @@ -50,6 +51,8 @@ public class EngineConfig { private CheckpointConfig checkpointConfig = ServerConfigOptions.CHECKPOINT.defaultValue(); private QueueType queueType = ServerConfigOptions.QUEUE_TYPE.defaultValue(); + private int historyJobExpireMinutes = + ServerConfigOptions.HISTORY_JOB_EXPIRE_MINUTES.defaultValue(); public void setBackupCount(int newBackupCount) { checkBackupCount(newBackupCount, 0); @@ -82,6 +85,13 @@ public void setTaskExecutionThreadShareMode(ThreadShareMode taskExecutionThreadS this.taskExecutionThreadShareMode = taskExecutionThreadShareMode; } + public void setHistoryJobExpireMinutes(int historyJobExpireMinutes) { + checkPositive( + historyJobExpireMinutes, + ServerConfigOptions.HISTORY_JOB_EXPIRE_MINUTES + " must be > 0"); + this.historyJobExpireMinutes = historyJobExpireMinutes; + } + public EngineConfig setQueueType(QueueType queueType) { checkNotNull(queueType); this.queueType = queueType; diff --git a/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelDomConfigProcessor.java b/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelDomConfigProcessor.java index a901fbb5e6a..2010d1f4155 100644 --- a/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelDomConfigProcessor.java +++ b/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelDomConfigProcessor.java @@ -131,6 +131,11 @@ private void 
parseEngineConfig(Node engineNode, SeaTunnelConfig config) { engineConfig.setSlotServiceConfig(parseSlotServiceConfig(node)); } else if (ServerConfigOptions.CHECKPOINT.key().equals(name)) { engineConfig.setCheckpointConfig(parseCheckpointConfig(node)); + } else if (ServerConfigOptions.HISTORY_JOB_EXPIRE_MINUTES.key().equals(name)) { + engineConfig.setHistoryJobExpireMinutes( + getIntegerValue( + ServerConfigOptions.HISTORY_JOB_EXPIRE_MINUTES.key(), + getTextContent(node))); } else { LOGGER.warning("Unrecognized element: " + name); } @@ -156,16 +161,6 @@ private CheckpointConfig parseCheckpointConfig(Node checkpointNode) { getIntegerValue( ServerConfigOptions.SCHEMA_CHANGE_CHECKPOINT_TIMEOUT.key(), getTextContent(node))); - } else if (ServerConfigOptions.CHECKPOINT_MAX_CONCURRENT.key().equals(name)) { - checkpointConfig.setMaxConcurrentCheckpoints( - getIntegerValue( - ServerConfigOptions.CHECKPOINT_MAX_CONCURRENT.key(), - getTextContent(node))); - } else if (ServerConfigOptions.CHECKPOINT_TOLERABLE_FAILURE.key().equals(name)) { - checkpointConfig.setTolerableFailureCheckpoints( - getIntegerValue( - ServerConfigOptions.CHECKPOINT_TOLERABLE_FAILURE.key(), - getTextContent(node))); } else if (ServerConfigOptions.CHECKPOINT_STORAGE.key().equals(name)) { checkpointConfig.setStorage(parseCheckpointStorageConfig(node)); } else { diff --git a/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/server/CheckpointConfig.java b/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/server/CheckpointConfig.java index 30325369685..83bd9ba335f 100644 --- a/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/server/CheckpointConfig.java +++ b/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/server/CheckpointConfig.java @@ -21,7 +21,7 @@ import java.io.Serializable; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; @Data @SuppressWarnings("checkstyle:MagicNumber") @@ -33,10 +33,6 @@ public class CheckpointConfig implements Serializable { private long checkpointTimeout = ServerConfigOptions.CHECKPOINT_TIMEOUT.defaultValue(); private long schemaChangeCheckpointTimeout = ServerConfigOptions.SCHEMA_CHANGE_CHECKPOINT_TIMEOUT.defaultValue(); - private int maxConcurrentCheckpoints = - ServerConfigOptions.CHECKPOINT_MAX_CONCURRENT.defaultValue(); - private int tolerableFailureCheckpoints = - ServerConfigOptions.CHECKPOINT_TOLERABLE_FAILURE.defaultValue(); private CheckpointStorageConfig storage = ServerConfigOptions.CHECKPOINT_STORAGE.defaultValue(); @@ -57,21 +53,7 @@ public void setCheckpointTimeout(long checkpointTimeout) { public void setSchemaChangeCheckpointTimeout(long checkpointTimeout) { checkArgument( checkpointTimeout >= MINIMAL_CHECKPOINT_TIME, - "The minimum checkpoint timeout is 10 mills."); + "The minimum checkpoint timeout is 10 ms."); this.schemaChangeCheckpointTimeout = checkpointTimeout; } - - public void setMaxConcurrentCheckpoints(int maxConcurrentCheckpoints) { - checkArgument( - maxConcurrentCheckpoints >= 1, - "The minimum number of concurrent checkpoints is 1."); - this.maxConcurrentCheckpoints = maxConcurrentCheckpoints; - } - - public void setTolerableFailureCheckpoints(int tolerableFailureCheckpoints) { - checkArgument( - maxConcurrentCheckpoints >= 0, - "The number of tolerance failed checkpoints 
must be a natural number."); - this.tolerableFailureCheckpoints = tolerableFailureCheckpoints; - } } diff --git a/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/server/ServerConfigOptions.java b/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/server/ServerConfigOptions.java index 2de8acad012..486f11878e5 100644 --- a/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/server/ServerConfigOptions.java +++ b/seatunnel-engine/seatunnel-engine-common/src/main/java/org/apache/seatunnel/engine/common/config/server/ServerConfigOptions.java @@ -92,18 +92,6 @@ public class ServerConfigOptions { .withDescription( "The timeout (in milliseconds) for a schema change checkpoint."); - public static final Option CHECKPOINT_MAX_CONCURRENT = - Options.key("max-concurrent") - .intType() - .defaultValue(1) - .withDescription("The maximum number of concurrent checkpoints."); - - public static final Option CHECKPOINT_TOLERABLE_FAILURE = - Options.key("tolerable-failure") - .intType() - .defaultValue(0) - .withDescription("The tolerable failure number of a checkpoint."); - public static final Option CHECKPOINT_STORAGE_TYPE = Options.key("type") .stringType() @@ -145,4 +133,9 @@ public class ServerConfigOptions { .type(new TypeReference>() {}) .noDefaultValue() .withDescription("The checkpoint storage instance configuration."); + public static final Option HISTORY_JOB_EXPIRE_MINUTES = + Options.key("history-job-expire-minutes") + .intType() + .defaultValue(1440) + .withDescription("The expire time of history jobs.time unit minute"); } diff --git a/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml b/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml index e5d92281da7..cc14d81eafa 100644 --- a/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml +++ b/seatunnel-engine/seatunnel-engine-common/src/main/resources/seatunnel.yaml @@ -25,8 +25,6 @@ seatunnel: checkpoint: interval: 300000 timeout: 10000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: hdfs max-retained: 3 diff --git a/seatunnel-engine/seatunnel-engine-common/src/test/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelConfigParserTest.java b/seatunnel-engine/seatunnel-engine-common/src/test/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelConfigParserTest.java index 1263a572325..ed6853e39b4 100644 --- a/seatunnel-engine/seatunnel-engine-common/src/test/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelConfigParserTest.java +++ b/seatunnel-engine/seatunnel-engine-common/src/test/java/org/apache/seatunnel/engine/common/config/YamlSeaTunnelConfigParserTest.java @@ -53,12 +53,6 @@ public void testSeaTunnelConfig() { Assertions.assertEquals( 7000, config.getEngineConfig().getCheckpointConfig().getCheckpointTimeout()); - Assertions.assertEquals( - 5, config.getEngineConfig().getCheckpointConfig().getMaxConcurrentCheckpoints()); - - Assertions.assertEquals( - 2, config.getEngineConfig().getCheckpointConfig().getTolerableFailureCheckpoints()); - Assertions.assertEquals( "hdfs", config.getEngineConfig().getCheckpointConfig().getStorage().getStorage()); diff --git a/seatunnel-engine/seatunnel-engine-common/src/test/resources/seatunnel.yaml b/seatunnel-engine/seatunnel-engine-common/src/test/resources/seatunnel.yaml index 4f6ce5f4ef1..8453bdeecaa 100644 --- 
a/seatunnel-engine/seatunnel-engine-common/src/test/resources/seatunnel.yaml +++ b/seatunnel-engine/seatunnel-engine-common/src/test/resources/seatunnel.yaml @@ -25,8 +25,6 @@ seatunnel: checkpoint: interval: 6000 timeout: 7000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: hdfs max-retained: 3 diff --git a/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/dag/actions/ShufflePartitionStrategy.java b/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/dag/actions/ShufflePartitionStrategy.java index 4b69eba2271..45144d210f3 100644 --- a/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/dag/actions/ShufflePartitionStrategy.java +++ b/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/dag/actions/ShufflePartitionStrategy.java @@ -35,7 +35,7 @@ import java.util.stream.Collectors; import java.util.stream.Stream; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; @Slf4j @SuperBuilder diff --git a/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/job/AbstractJobEnvironment.java b/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/job/AbstractJobEnvironment.java new file mode 100644 index 00000000000..3509903c088 --- /dev/null +++ b/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/job/AbstractJobEnvironment.java @@ -0,0 +1,114 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
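The new engine option history-job-expire-minutes (default 1440) bounds how long finished-job information is retained; the engine-e2e seatunnel.yaml above shortens it to 1 so testExpiredJobWasDeleted can observe the eviction within its 65-second await. A programmatic sketch of the same knob; in a real deployment the value comes from seatunnel.yaml via YamlSeaTunnelDomConfigProcessor, and the no-arg EngineConfig construction here is only for illustration:

import org.apache.seatunnel.engine.common.config.EngineConfig;

public class HistoryJobExpireSketch {
    public static void main(String[] args) {
        EngineConfig engineConfig = new EngineConfig();
        // Must be > 0; history entries for jobs finished longer ago than this are expired.
        engineConfig.setHistoryJobExpireMinutes(1);
    }
}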
+ */ + +package org.apache.seatunnel.engine.core.job; + +import org.apache.seatunnel.api.env.EnvCommonOptions; +import org.apache.seatunnel.common.config.Common; +import org.apache.seatunnel.common.utils.FileUtils; +import org.apache.seatunnel.engine.common.config.JobConfig; +import org.apache.seatunnel.engine.common.exception.SeaTunnelEngineException; +import org.apache.seatunnel.engine.common.utils.IdGenerator; +import org.apache.seatunnel.engine.core.dag.actions.Action; +import org.apache.seatunnel.engine.core.dag.logical.LogicalDag; +import org.apache.seatunnel.engine.core.dag.logical.LogicalDagGenerator; +import org.apache.seatunnel.engine.core.parse.MultipleTableJobConfigParser; + +import org.apache.commons.lang3.tuple.ImmutablePair; + +import com.hazelcast.logging.ILogger; +import com.hazelcast.logging.Logger; + +import java.io.IOException; +import java.net.MalformedURLException; +import java.net.URL; +import java.nio.file.Files; +import java.nio.file.Path; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashSet; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; + +public abstract class AbstractJobEnvironment { + protected static ILogger LOGGER = null; + + protected final boolean isStartWithSavePoint; + + protected final List actions = new ArrayList<>(); + protected final Set jarUrls = new HashSet<>(); + + protected final JobConfig jobConfig; + + protected final IdGenerator idGenerator; + + protected final List commonPluginJars = new ArrayList<>(); + + public AbstractJobEnvironment(JobConfig jobConfig, boolean isStartWithSavePoint) { + LOGGER = Logger.getLogger(getClass().getName()); + this.jobConfig = jobConfig; + this.isStartWithSavePoint = isStartWithSavePoint; + this.idGenerator = new IdGenerator(); + this.commonPluginJars.addAll(searchPluginJars()); + this.commonPluginJars.addAll( + new ArrayList<>( + Common.getThirdPartyJars( + jobConfig + .getEnvOptions() + .getOrDefault(EnvCommonOptions.JARS.key(), "") + .toString()) + .stream() + .map(Path::toUri) + .map( + uri -> { + try { + return uri.toURL(); + } catch (MalformedURLException e) { + throw new SeaTunnelEngineException( + "the uri of jar illegal:" + uri, e); + } + }) + .collect(Collectors.toList()))); + LOGGER.info("add common jar in plugins :" + commonPluginJars); + } + + protected Set searchPluginJars() { + try { + if (Files.exists(Common.pluginRootDir())) { + return new HashSet<>(FileUtils.searchJarFiles(Common.pluginRootDir())); + } + } catch (IOException | SeaTunnelEngineException e) { + LOGGER.warning( + String.format("Can't search plugin jars in %s.", Common.pluginRootDir()), e); + } + return Collections.emptySet(); + } + + protected abstract MultipleTableJobConfigParser getJobConfigParser(); + + protected LogicalDagGenerator getLogicalDagGenerator() { + return new LogicalDagGenerator(actions, jobConfig, idGenerator); + } + + protected LogicalDag getLogicalDag() { + ImmutablePair, Set> immutablePair = getJobConfigParser().parse(); + actions.addAll(immutablePair.getLeft()); + jarUrls.addAll(immutablePair.getRight()); + return getLogicalDagGenerator().generate(); + } +} diff --git a/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/parse/MultipleTableJobConfigParser.java b/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/parse/MultipleTableJobConfigParser.java index 22ff01c7c85..db8d74a28fb 100644 --- 
a/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/parse/MultipleTableJobConfigParser.java +++ b/seatunnel-engine/seatunnel-engine-core/src/main/java/org/apache/seatunnel/engine/core/parse/MultipleTableJobConfigParser.java @@ -127,6 +127,22 @@ public MultipleTableJobConfigParser( new JobConfigParser(idGenerator, commonPluginJars, isStartWithSavePoint); } + public MultipleTableJobConfigParser( + Config seaTunnelJobConfig, + IdGenerator idGenerator, + JobConfig jobConfig, + List commonPluginJars, + boolean isStartWithSavePoint) { + this.idGenerator = idGenerator; + this.jobConfig = jobConfig; + this.commonPluginJars = commonPluginJars; + this.isStartWithSavePoint = isStartWithSavePoint; + this.seaTunnelJobConfig = seaTunnelJobConfig; + this.envOptions = ReadonlyConfig.fromConfig(seaTunnelJobConfig.getConfig("env")); + this.fallbackParser = + new JobConfigParser(idGenerator, commonPluginJars, isStartWithSavePoint); + } + public ImmutablePair, Set> parse() { List sourceConfigs = TypesafeConfigUtils.getConfigList( @@ -241,7 +257,12 @@ private void fillJobConfig() { || jobConfig.getName().equals(Constants.LOGO)) { jobConfig.setName(envOptions.get(EnvCommonOptions.JOB_NAME)); } - envOptions.toMap().forEach((k, v) -> jobConfig.getEnvOptions().put(k, v)); + envOptions + .toMap() + .forEach( + (k, v) -> { + jobConfig.getEnvOptions().put(k, v); + }); } private static boolean isFallback( diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/CoordinatorService.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/CoordinatorService.java index 6a92918bc4a..8d47dd70164 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/CoordinatorService.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/CoordinatorService.java @@ -39,9 +39,11 @@ import org.apache.seatunnel.engine.server.execution.ExecutionState; import org.apache.seatunnel.engine.server.execution.TaskExecutionState; import org.apache.seatunnel.engine.server.execution.TaskGroupLocation; +import org.apache.seatunnel.engine.server.execution.TaskLocation; import org.apache.seatunnel.engine.server.master.JobHistoryService; import org.apache.seatunnel.engine.server.master.JobMaster; import org.apache.seatunnel.engine.server.metrics.JobMetricsUtil; +import org.apache.seatunnel.engine.server.metrics.SeaTunnelMetricsContext; import org.apache.seatunnel.engine.server.resourcemanager.ResourceManager; import org.apache.seatunnel.engine.server.resourcemanager.ResourceManagerFactory; import org.apache.seatunnel.engine.server.resourcemanager.resource.SlotProfile; @@ -59,6 +61,7 @@ import lombok.NonNull; import java.util.ArrayList; +import java.util.HashMap; import java.util.HashSet; import java.util.List; import java.util.Map; @@ -136,6 +139,8 @@ public class CoordinatorService { */ private IMap> ownedSlotProfilesIMap; + private IMap> metricsImap; + /** If this node is a master node */ private volatile boolean isActive = false; @@ -201,6 +206,7 @@ private void initCoordinatorService() { nodeEngine.getHazelcastInstance().getMap(Constant.IMAP_STATE_TIMESTAMPS); ownedSlotProfilesIMap = nodeEngine.getHazelcastInstance().getMap(Constant.IMAP_OWNED_SLOT_PROFILES); + metricsImap = nodeEngine.getHazelcastInstance().getMap(Constant.IMAP_RUNNING_JOB_METRICS); jobHistoryService = new JobHistoryService( @@ -213,7 +219,8 @@ private void 
initCoordinatorService() { .getMap(Constant.IMAP_FINISHED_JOB_METRICS), nodeEngine .getHazelcastInstance() - .getMap(Constant.IMAP_FINISHED_JOB_VERTEX_INFO)); + .getMap(Constant.IMAP_FINISHED_JOB_VERTEX_INFO), + engineConfig.getHistoryJobExpireMinutes()); List> collect = runningJobInfoIMap.entrySet().stream() @@ -266,6 +273,7 @@ private void restoreJobFromMasterActiveSwitch(@NonNull Long jobId, @NonNull JobI runningJobStateTimestampsIMap, ownedSlotProfilesIMap, runningJobInfoIMap, + metricsImap, engineConfig); // If Job Status is CANCELLING , set needRestore to false @@ -441,6 +449,7 @@ public PassiveCompletableFuture submitJob(long jobId, Data jobImmutableInf runningJobStateTimestampsIMap, ownedSlotProfilesIMap, runningJobInfoIMap, + metricsImap, engineConfig); executorService.submit( () -> { diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/NodeExtension.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/NodeExtension.java index d4137955c8b..37e00cffab2 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/NodeExtension.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/NodeExtension.java @@ -21,6 +21,7 @@ import org.apache.seatunnel.engine.server.log.Log4j2HttpGetCommandProcessor; import org.apache.seatunnel.engine.server.log.Log4j2HttpPostCommandProcessor; import org.apache.seatunnel.engine.server.rest.RestHttpGetCommandProcessor; +import org.apache.seatunnel.engine.server.rest.RestHttpPostCommandProcessor; import com.hazelcast.cluster.ClusterState; import com.hazelcast.instance.impl.DefaultNodeExtension; @@ -79,6 +80,7 @@ public TextCommandService createTextCommandService() { register(HTTP_GET, new Log4j2HttpGetCommandProcessor(this)); register(HTTP_POST, new Log4j2HttpPostCommandProcessor(this)); register(HTTP_GET, new RestHttpGetCommandProcessor(this)); + register(HTTP_POST, new RestHttpPostCommandProcessor(this)); } }; } diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointCoordinator.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointCoordinator.java index c584b55b9e3..2dff42792cd 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointCoordinator.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointCoordinator.java @@ -117,7 +117,6 @@ public class CheckpointCoordinator { private final CheckpointConfig coordinatorConfig; - private int tolerableFailureCheckpoints; private transient ScheduledExecutorService scheduler; private final AtomicLong latestTriggerTimestamp = new AtomicLong(0); @@ -165,7 +164,6 @@ public CheckpointCoordinator( this.runningJobStateIMap = runningJobStateIMap; this.plan = plan; this.coordinatorConfig = checkpointConfig; - this.tolerableFailureCheckpoints = coordinatorConfig.getTolerableFailureCheckpoints(); this.pendingCheckpoints = new ConcurrentHashMap<>(); this.completedCheckpoints = new ArrayDeque<>(coordinatorConfig.getStorage().getMaxRetainedCheckpoints() + 1); @@ -392,7 +390,6 @@ protected void tryTriggerPendingCheckpoint(CheckpointType checkpointType) { if (checkpointType.notFinalCheckpoint() && checkpointType.notSchemaChangeCheckpoint()) { if (currentTimestamp - 
latestTriggerTimestamp.get() < coordinatorConfig.getCheckpointInterval() - || pendingCounter.get() >= coordinatorConfig.getMaxConcurrentCheckpoints() || !isAllTaskReady) { return; } @@ -531,16 +528,9 @@ private void startTriggerPendingCheckpoint( if (pendingCheckpoints.get(pendingCheckpoint.getCheckpointId()) != null && !pendingCheckpoint.isFullyAcknowledged()) { - if (tolerableFailureCheckpoints-- <= 0 - || pendingCheckpoint - .getCheckpointType() - .isSchemaChangeCheckpoint()) { - LOG.info( - "timeout checkpoint: " - + pendingCheckpoint.getInfo()); - handleCoordinatorError( - CheckpointCloseReason.CHECKPOINT_EXPIRED, null); - } + LOG.info("timeout checkpoint: " + pendingCheckpoint.getInfo()); + handleCoordinatorError( + CheckpointCloseReason.CHECKPOINT_EXPIRED, null); } }, checkpointTimeout, @@ -746,12 +736,6 @@ public synchronized void completePendingCheckpoint(CompletedCheckpoint completed notifyCompleted(completedCheckpoint); pendingCheckpoints.remove(checkpointId); pendingCounter.decrementAndGet(); - if (pendingCheckpoints.size() + 1 == coordinatorConfig.getMaxConcurrentCheckpoints()) { - // latest checkpoint completed time > checkpoint interval - if (completedCheckpoint.getCheckpointType().notFinalCheckpoint()) { - scheduleTriggerPendingCheckpoint(0L); - } - } if (isCompleted()) { cleanPendingCheckpoint(CheckpointCloseReason.CHECKPOINT_COORDINATOR_COMPLETED); if (latestCompletedCheckpoint.getCheckpointType().isSavepoint()) { diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointManager.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointManager.java index ca77b8c45e7..cd58da1dd9e 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointManager.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/CheckpointManager.java @@ -49,6 +49,7 @@ import com.hazelcast.spi.impl.operationservice.impl.InvocationFuture; import lombok.extern.slf4j.Slf4j; +import java.util.Arrays; import java.util.Map; import java.util.concurrent.CompletableFuture; import java.util.concurrent.ExecutorService; @@ -169,6 +170,9 @@ public PassiveCompletableFuture triggerSavepoint(int pipeli } public void reportedPipelineRunning(int pipelineId, boolean alreadyStarted) { + log.info( + "reported pipeline running stack: " + + Arrays.toString(Thread.currentThread().getStackTrace())); getCheckpointCoordinator(pipelineId).restoreCoordinator(alreadyStarted); } diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/TaskStatistics.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/TaskStatistics.java index 03e55339d46..3b00db5546b 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/TaskStatistics.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/TaskStatistics.java @@ -21,8 +21,8 @@ import java.util.Arrays; import java.util.List; -import static com.google.common.base.Preconditions.checkArgument; -import static com.google.common.base.Preconditions.checkNotNull; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; +import static 
org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkNotNull; public class TaskStatistics implements Serializable { /** ID of the task the statistics belong to. */ diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/operation/CheckpointErrorReportOperation.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/operation/CheckpointErrorReportOperation.java index 072f179c627..967e1572311 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/operation/CheckpointErrorReportOperation.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/checkpoint/operation/CheckpointErrorReportOperation.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package org.apache.seatunnel.engine.server.checkpoint.operation; import org.apache.seatunnel.common.utils.ExceptionUtils; diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/execution/ExecutionPlanGenerator.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/execution/ExecutionPlanGenerator.java index e590128edf2..6f2332c8526 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/execution/ExecutionPlanGenerator.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/execution/ExecutionPlanGenerator.java @@ -54,7 +54,7 @@ import java.util.Optional; import java.util.Set; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; @Slf4j public class ExecutionPlanGenerator { diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/execution/PipelineGenerator.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/execution/PipelineGenerator.java index 17bd2509a8b..f0a92a66914 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/execution/PipelineGenerator.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/execution/PipelineGenerator.java @@ -29,7 +29,7 @@ import java.util.Map; import java.util.stream.Collectors; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; public class PipelineGenerator { /** The action & vertex ID needs to be regenerated because 
of split pipeline. */ diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PhysicalVertex.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PhysicalVertex.java index c005944499c..8b2a3995f0d 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PhysicalVertex.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PhysicalVertex.java @@ -477,11 +477,6 @@ public void cancel() { } else if (ExecutionState.CANCELING.equals(runningJobStateIMap.get(taskGroupLocation))) { noticeTaskExecutionServiceCancel(); } - - LOGGER.info( - String.format( - "can not cancel task %s because it is in state %s ", - taskFullName, getExecutionState())); } @SuppressWarnings("checkstyle:MagicNumber") diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PipelineLocation.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PipelineLocation.java index 45609e5cef0..c7a2c3caaec 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PipelineLocation.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/PipelineLocation.java @@ -25,6 +25,7 @@ @AllArgsConstructor @Data public class PipelineLocation implements Serializable { + private static final long serialVersionUID = 2510281765212372549L; private long jobId; private int pipelineId; } diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/SubPlan.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/SubPlan.java index d61f0fad742..fefe7209f5a 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/SubPlan.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/dag/physical/SubPlan.java @@ -53,7 +53,7 @@ public class SubPlan { private static final ILogger LOGGER = Logger.getLogger(SubPlan.class); /** The max num pipeline can restore. 
*/ - public static final int PIPELINE_MAX_RESTORE_NUM = 2; // TODO should set by config + public static final int PIPELINE_MAX_RESTORE_NUM = 3; // TODO should set by config private final List physicalVertexList; @@ -332,6 +332,9 @@ private void turnToEndState(@NonNull PipelineStatus endState) throws Exception { exception -> ExceptionUtil.isOperationNeedRetryException(exception), Constant.OPERATION_RETRY_SLEEP)); this.currPipelineStatus = endState; + LOGGER.info( + String.format( + "%s turn to end state %s.", pipelineFullName, currPipelineStatus)); } } @@ -511,11 +514,17 @@ private void resetPipelineState() throws Exception { LOGGER.severe(message); throw new IllegalStateException(message); } - + LOGGER.info( + String.format( + "Reset pipeline %s state to %s", + getPipelineFullName(), PipelineStatus.CREATED)); updateStateTimestamps(PipelineStatus.CREATED); runningJobStateIMap.set(pipelineLocation, PipelineStatus.CREATED); this.currPipelineStatus = PipelineStatus.CREATED; - ; + LOGGER.info( + String.format( + "Reset pipeline %s state to %s complete", + getPipelineFullName(), PipelineStatus.CREATED)); return null; }, new RetryUtils.RetryMaterial( diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/execution/TaskGroupLocation.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/execution/TaskGroupLocation.java index 83686745a8e..6dc7cadad6f 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/execution/TaskGroupLocation.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/execution/TaskGroupLocation.java @@ -30,6 +30,7 @@ @Data @AllArgsConstructor public class TaskGroupLocation implements Serializable { + private static final long serialVersionUID = -8321526709920799751L; private final long jobId; private final int pipelineId; diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/job/JobImmutableInformationEnv.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/job/JobImmutableInformationEnv.java new file mode 100644 index 00000000000..4dd72e31cb8 --- /dev/null +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/job/JobImmutableInformationEnv.java @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.engine.server.job; + +import org.apache.seatunnel.shade.com.typesafe.config.Config; + +import org.apache.seatunnel.api.common.JobContext; +import org.apache.seatunnel.engine.common.Constant; +import org.apache.seatunnel.engine.common.config.JobConfig; +import org.apache.seatunnel.engine.core.job.AbstractJobEnvironment; +import org.apache.seatunnel.engine.core.job.JobImmutableInformation; +import org.apache.seatunnel.engine.core.parse.MultipleTableJobConfigParser; + +import com.hazelcast.instance.impl.Node; +import com.hazelcast.spi.impl.NodeEngineImpl; + +import java.util.ArrayList; + +public class JobImmutableInformationEnv extends AbstractJobEnvironment { + private final Config seaTunnelJobConfig; + + private final NodeEngineImpl nodeEngine; + + private final Long jobId; + + public JobImmutableInformationEnv( + JobConfig jobConfig, + Config seaTunnelJobConfig, + Node node, + boolean isStartWithSavePoint, + Long jobId) { + super(jobConfig, isStartWithSavePoint); + this.seaTunnelJobConfig = seaTunnelJobConfig; + this.nodeEngine = node.getNodeEngine(); + this.jobConfig.setJobContext( + new JobContext( + isStartWithSavePoint + ? jobId + : nodeEngine + .getHazelcastInstance() + .getFlakeIdGenerator(Constant.SEATUNNEL_ID_GENERATOR_NAME) + .newId())); + this.jobId = Long.valueOf(jobConfig.getJobContext().getJobId()); + } + + public Long getJobId() { + return jobId; + } + + @Override + protected MultipleTableJobConfigParser getJobConfigParser() { + return new MultipleTableJobConfigParser( + seaTunnelJobConfig, idGenerator, jobConfig, commonPluginJars, isStartWithSavePoint); + } + + public JobImmutableInformation build() { + return new JobImmutableInformation( + Long.parseLong(jobConfig.getJobContext().getJobId()), + jobConfig.getName(), + isStartWithSavePoint, + nodeEngine.getSerializationService().toData(getLogicalDag()), + jobConfig, + new ArrayList<>(jarUrls)); + } +} diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobHistoryService.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobHistoryService.java index dda9a2d0f3f..686d2a04fe8 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobHistoryService.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobHistoryService.java @@ -76,20 +76,22 @@ public class JobHistoryService { * finishedJobStateImap key is jobId and value is jobState(json) JobStateData Indicates the * status of the job, pipeline, and task */ - // TODO need to limit the amount of storage private final IMap finishedJobStateImap; private final IMap finishedJobMetricsImap; private final ObjectMapper objectMapper; + private final int finishedJobExpireTime; + public JobHistoryService( IMap runningJobStateIMap, ILogger logger, Map runningJobMasterMap, IMap finishedJobStateImap, IMap finishedJobMetricsImap, - IMap finishedJobVertexInfoImap) { + IMap finishedJobVertexInfoImap, + int finishedJobExpireTime) { this.runningJobStateIMap = runningJobStateIMap; this.logger = logger; this.runningJobMasterMap = runningJobMasterMap; @@ -98,6 +100,7 @@ public JobHistoryService( this.finishedJobDAGInfoImap = finishedJobVertexInfoImap; this.objectMapper = new ObjectMapper(); this.objectMapper.configure(SerializationFeature.FAIL_ON_EMPTY_BEANS, false); + this.finishedJobExpireTime = finishedJobExpireTime; } // Gets the status of a 
running and completed job @@ -168,14 +171,15 @@ public String getJobDetailStateAsString(Long jobId) { public void storeFinishedJobState(JobMaster jobMaster) { JobState jobState = toJobStateMapper(jobMaster, false); jobState.setFinishTime(System.currentTimeMillis()); - finishedJobStateImap.put(jobState.jobId, jobState, 14, TimeUnit.DAYS); + jobState.setErrorMessage(jobMaster.getErrorMessage()); + finishedJobStateImap.put(jobState.jobId, jobState, finishedJobExpireTime, TimeUnit.MINUTES); } @SuppressWarnings("checkstyle:MagicNumber") public void storeFinishedPipelineMetrics(long jobId, JobMetrics metrics) { finishedJobMetricsImap.computeIfAbsent(jobId, key -> JobMetrics.of(new HashMap<>())); JobMetrics newMetrics = finishedJobMetricsImap.get(jobId).merge(metrics); - finishedJobMetricsImap.put(jobId, newMetrics, 14, TimeUnit.DAYS); + finishedJobMetricsImap.put(jobId, newMetrics, finishedJobExpireTime, TimeUnit.MINUTES); } private JobState toJobStateMapper(JobMaster jobMaster, boolean simple) { @@ -230,27 +234,31 @@ private JobState toJobStateMapper(JobMaster jobMaster, boolean simple) { JobStatus jobStatus = (JobStatus) runningJobStateIMap.get(jobId); String jobName = jobMaster.getJobImmutableInformation().getJobName(); long submitTime = jobMaster.getJobImmutableInformation().getCreateTime(); - return new JobState(jobId, jobName, jobStatus, submitTime, null, pipelineStateMapperMap); + return new JobState( + jobId, jobName, jobStatus, submitTime, null, pipelineStateMapperMap, null); } public void storeJobInfo(long jobId, JobDAGInfo jobInfo) { - finishedJobDAGInfoImap.put(jobId, jobInfo); + finishedJobDAGInfoImap.put(jobId, jobInfo, finishedJobExpireTime, TimeUnit.MINUTES); } @AllArgsConstructor @Data public static final class JobState implements Serializable { + private static final long serialVersionUID = -1176348098833918960L; private Long jobId; private String jobName; private JobStatus jobStatus; private long submitTime; private Long finishTime; private Map pipelineStateMapperMap; + private String errorMessage; } @AllArgsConstructor @Data public static final class PipelineStateData implements Serializable { + private static final long serialVersionUID = -7875004875757861958L; private PipelineStatus pipelineStatus; private Map executionStateMap; } diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobMaster.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobMaster.java index 04dd4ba1da5..c9a7dddd9c5 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobMaster.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/master/JobMaster.java @@ -147,6 +147,12 @@ public class JobMaster { private CheckpointConfig jobCheckpointConfig; + public String getErrorMessage() { + return errorMessage; + } + + private String errorMessage; + public JobMaster( @NonNull Data jobImmutableInformationData, @NonNull NodeEngine nodeEngine, @@ -157,6 +163,7 @@ public JobMaster( @NonNull IMap runningJobStateTimestampsIMap, @NonNull IMap ownedSlotProfilesIMap, @NonNull IMap runningJobInfoIMap, + @NonNull IMap> metricsImap, EngineConfig engineConfig) { this.jobImmutableInformationData = jobImmutableInformationData; this.nodeEngine = nodeEngine; @@ -172,8 +179,7 @@ public JobMaster( this.runningJobStateTimestampsIMap = runningJobStateTimestampsIMap; this.runningJobInfoIMap = runningJobInfoIMap; 
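Note on the JobHistoryService change above: the hard-coded 14-day retention on the finished-job IMaps is replaced by a configurable finishedJobExpireTime, applied in minutes as a per-entry TTL. The sketch below only illustrates the Hazelcast TTL-put pattern this relies on; the map name, key and value are stand-ins, not the real JobState entries.

import com.hazelcast.core.Hazelcast;
import com.hazelcast.core.HazelcastInstance;
import com.hazelcast.map.IMap;

import java.util.concurrent.TimeUnit;

public class FinishedJobTtlSketch {
    public static void main(String[] args) {
        HazelcastInstance hz = Hazelcast.newHazelcastInstance();
        // Stand-in for finishedJobStateImap; the real map stores JobState values.
        IMap<Long, String> finishedJobState = hz.getMap("finishedJobStateImap");
        int finishedJobExpireTime = 60; // minutes, hypothetical configuration value
        // Entries written with a per-entry TTL are evicted automatically, which is
        // how the history service now bounds the storage the removed TODO pointed at.
        finishedJobState.put(823342L, "FINISHED", finishedJobExpireTime, TimeUnit.MINUTES);
        hz.shutdown();
    }
}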
this.engineConfig = engineConfig; - this.metricsImap = - nodeEngine.getHazelcastInstance().getMap(Constant.IMAP_RUNNING_JOB_METRICS); + this.metricsImap = metricsImap; } public void init(long initializationTimestamp, boolean restart, boolean canRestoreAgain) @@ -258,10 +264,6 @@ private CheckpointConfig createJobCheckpointConfig( CheckpointConfig jobCheckpointConfig = new CheckpointConfig(); jobCheckpointConfig.setCheckpointTimeout(defaultCheckpointConfig.getCheckpointTimeout()); jobCheckpointConfig.setCheckpointInterval(defaultCheckpointConfig.getCheckpointInterval()); - jobCheckpointConfig.setMaxConcurrentCheckpoints( - defaultCheckpointConfig.getMaxConcurrentCheckpoints()); - jobCheckpointConfig.setTolerableFailureCheckpoints( - defaultCheckpointConfig.getTolerableFailureCheckpoints()); CheckpointStorageConfig jobCheckpointStorageConfig = new CheckpointStorageConfig(); jobCheckpointStorageConfig.setStorage(defaultCheckpointConfig.getStorage().getStorage()); @@ -290,6 +292,7 @@ public void initStateFuture() { if (JobStatus.FAILING.equals(v.getStatus())) { physicalPlan.updateJobState(JobStatus.FAILING, JobStatus.FAILED); } + JobMaster.this.errorMessage = v.getError(); JobResult jobResult = new JobResult(physicalPlan.getJobStatus(), v.getError()); cleanJob(); diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/resource/ResourceProfile.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/resource/ResourceProfile.java index bc0734028be..247c1940d8e 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/resource/ResourceProfile.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/resourcemanager/resource/ResourceProfile.java @@ -19,7 +19,7 @@ import java.io.Serializable; -import static com.google.common.base.Preconditions.checkArgument; +import static org.apache.seatunnel.shade.com.google.common.base.Preconditions.checkArgument; public class ResourceProfile implements Serializable { diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestConstant.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestConstant.java index 0a5d8437be3..7776d592b8f 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestConstant.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestConstant.java @@ -21,6 +21,7 @@ public class RestConstant { public static final String RUNNING_JOBS_URL = "/hazelcast/rest/maps/running-jobs"; public static final String RUNNING_JOB_URL = "/hazelcast/rest/maps/running-job"; + public static final String SUBMIT_JOB_URL = "/hazelcast/rest/maps/submit-job"; public static final String SYSTEM_MONITORING_INFORMATION = "/hazelcast/rest/maps/system-monitoring-information"; diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestHttpGetCommandProcessor.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestHttpGetCommandProcessor.java index 1540f99c986..4c1debd6f87 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestHttpGetCommandProcessor.java +++ 
b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestHttpGetCommandProcessor.java @@ -22,6 +22,7 @@ import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; import org.apache.seatunnel.engine.common.Constant; +import org.apache.seatunnel.engine.common.loader.SeaTunnelChildFirstClassLoader; import org.apache.seatunnel.engine.core.dag.logical.LogicalDag; import org.apache.seatunnel.engine.core.job.JobImmutableInformation; import org.apache.seatunnel.engine.core.job.JobInfo; @@ -42,6 +43,7 @@ import com.hazelcast.internal.json.JsonValue; import com.hazelcast.internal.util.JsonUtil; import com.hazelcast.internal.util.StringUtil; +import com.hazelcast.jet.impl.execution.init.CustomClassLoadedObject; import com.hazelcast.map.IMap; import com.hazelcast.spi.impl.NodeEngine; @@ -227,19 +229,21 @@ private JsonObject convertToJson(JobInfo jobInfo, long jobId) { .getNodeEngine() .getSerializationService() .toObject(jobInfo.getJobImmutableInformation())); + + ClassLoader classLoader = + new SeaTunnelChildFirstClassLoader(jobImmutableInformation.getPluginJarsUrls()); LogicalDag logicalDag = - this.textCommandService - .getNode() - .getNodeEngine() - .getSerializationService() - .toObject(jobImmutableInformation.getLogicalDag()); + CustomClassLoadedObject.deserializeWithCustomClassLoader( + this.textCommandService.getNode().getNodeEngine().getSerializationService(), + classLoader, + jobImmutableInformation.getLogicalDag()); String jobMetrics = getSeaTunnelServer().getCoordinatorService().getJobMetrics(jobId).toJsonString(); JobStatus jobStatus = getSeaTunnelServer().getCoordinatorService().getJobStatus(jobId); jobInfoJson - .add("jobId", jobId) + .add("jobId", String.valueOf(jobId)) .add("jobName", logicalDag.getJobConfig().getName()) .add("jobStatus", jobStatus.toString()) .add("envOptions", JsonUtil.toJsonObject(logicalDag.getJobConfig().getEnvOptions())) diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestHttpPostCommandProcessor.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestHttpPostCommandProcessor.java new file mode 100644 index 00000000000..e0edd932032 --- /dev/null +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/rest/RestHttpPostCommandProcessor.java @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
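Aside on the RestHttpGetCommandProcessor change above, where jobId is now emitted as a string: a likely motivation is that job ids come from Hazelcast's FlakeIdGenerator (see JobImmutableInformationEnv earlier in this patch) and can exceed the 2^53 - 1 range that JSON consumers such as JavaScript read losslessly, so returning them as JSON numbers risks silent rounding. A small, self-contained illustration with a made-up id value:

public class JobIdPrecisionSketch {
    public static void main(String[] args) {
        long jobId = 712345678901234567L; // hypothetical flake id, larger than 2^53
        double asJsonNumber = (double) jobId; // what an untyped JSON number degrades to
        System.out.println(jobId);                 // 712345678901234567
        System.out.println((long) asJsonNumber);   // rounded value, precision lost
        System.out.println(String.valueOf(jobId)); // exact when returned as a string
    }
}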
+ */ + +package org.apache.seatunnel.engine.server.rest; + +import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode; +import org.apache.seatunnel.shade.com.typesafe.config.Config; + +import org.apache.seatunnel.engine.common.Constant; +import org.apache.seatunnel.engine.common.config.JobConfig; +import org.apache.seatunnel.engine.common.utils.PassiveCompletableFuture; +import org.apache.seatunnel.engine.core.job.JobImmutableInformation; +import org.apache.seatunnel.engine.server.CoordinatorService; +import org.apache.seatunnel.engine.server.SeaTunnelServer; +import org.apache.seatunnel.engine.server.job.JobImmutableInformationEnv; +import org.apache.seatunnel.engine.server.log.Log4j2HttpPostCommandProcessor; +import org.apache.seatunnel.engine.server.utils.RestUtil; + +import com.hazelcast.internal.ascii.TextCommandService; +import com.hazelcast.internal.ascii.rest.HttpCommandProcessor; +import com.hazelcast.internal.ascii.rest.HttpPostCommand; +import com.hazelcast.internal.json.JsonObject; +import com.hazelcast.internal.serialization.Data; + +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +import static com.hazelcast.internal.ascii.rest.HttpStatusCode.SC_400; +import static com.hazelcast.internal.ascii.rest.HttpStatusCode.SC_500; +import static org.apache.seatunnel.engine.server.rest.RestConstant.SUBMIT_JOB_URL; + +public class RestHttpPostCommandProcessor extends HttpCommandProcessor { + private final Log4j2HttpPostCommandProcessor original; + + public RestHttpPostCommandProcessor(TextCommandService textCommandService) { + this(textCommandService, new Log4j2HttpPostCommandProcessor(textCommandService)); + } + + protected RestHttpPostCommandProcessor( + TextCommandService textCommandService, + Log4j2HttpPostCommandProcessor log4j2HttpPostCommandProcessor) { + super( + textCommandService, + textCommandService.getNode().getLogger(Log4j2HttpPostCommandProcessor.class)); + this.original = log4j2HttpPostCommandProcessor; + } + + @Override + public void handle(HttpPostCommand httpPostCommand) { + String uri = httpPostCommand.getURI(); + try { + if (uri.startsWith(SUBMIT_JOB_URL)) { + handleSubmitJob(httpPostCommand, uri); + } else { + original.handle(httpPostCommand); + } + } catch (IllegalArgumentException e) { + prepareResponse(SC_400, httpPostCommand, exceptionResponse(e)); + } catch (Throwable e) { + logger.warning("An error occurred while handling request " + httpPostCommand, e); + prepareResponse(SC_500, httpPostCommand, exceptionResponse(e)); + } + + this.textCommandService.sendResponse(httpPostCommand); + } + + private SeaTunnelServer getSeaTunnelServer() { + Map extensionServices = + this.textCommandService.getNode().getNodeExtension().createExtensionServices(); + return (SeaTunnelServer) extensionServices.get(Constant.SEATUNNEL_SERVICE_NAME); + } + + private void handleSubmitJob(HttpPostCommand httpPostCommand, String uri) + throws IllegalArgumentException { + Map requestParams = new HashMap<>(); + RestUtil.buildRequestParams(requestParams, uri); + byte[] requestBody = httpPostCommand.getData(); + if (requestBody.length == 0) { + throw new IllegalArgumentException("Request body is empty."); + } + JsonNode requestBodyJsonNode; + try { + requestBodyJsonNode = RestUtil.convertByteToJsonNode(requestBody); + } catch (IOException e) { + throw new IllegalArgumentException("Invalid JSON format in request body."); + } + Config config = RestUtil.buildConfig(requestBodyJsonNode); + JobConfig jobConfig = new JobConfig(); + 
jobConfig.setName(requestParams.get("jobName")); + JobImmutableInformationEnv jobImmutableInformationEnv = + new JobImmutableInformationEnv( + jobConfig, + config, + textCommandService.getNode(), + Boolean.parseBoolean(requestParams.get("isStartWithSavePoint")), + Long.parseLong(requestParams.get("jobId"))); + JobImmutableInformation jobImmutableInformation = jobImmutableInformationEnv.build(); + CoordinatorService coordinatorService = getSeaTunnelServer().getCoordinatorService(); + Data data = + textCommandService + .getNode() + .nodeEngine + .getSerializationService() + .toData(jobImmutableInformation); + PassiveCompletableFuture voidPassiveCompletableFuture = + coordinatorService.submitJob( + Long.parseLong(jobConfig.getJobContext().getJobId()), data); + voidPassiveCompletableFuture.join(); + + Long jobId = jobImmutableInformationEnv.getJobId(); + this.prepareResponse( + httpPostCommand, + new JsonObject().add("jobId", jobId).add("jobName", requestParams.get("jobName"))); + } + + @Override + public void handleRejection(HttpPostCommand httpPostCommand) { + handle(httpPostCommand); + } +} diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/task/group/queue/IntermediateBlockingQueue.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/task/group/queue/IntermediateBlockingQueue.java index 2b2ff57e92a..5b2de4c50c8 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/task/group/queue/IntermediateBlockingQueue.java +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/task/group/queue/IntermediateBlockingQueue.java @@ -57,7 +57,7 @@ public void collect(Collector> collector) throws Exception { @Override public void close() throws IOException { - queue.clear(); + getIntermediateQueue().clear(); } private void handleRecord(Record record, ConsumerWithException> consumer) diff --git a/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/utils/RestUtil.java b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/utils/RestUtil.java new file mode 100644 index 00000000000..d3761366d09 --- /dev/null +++ b/seatunnel-engine/seatunnel-engine-server/src/main/java/org/apache/seatunnel/engine/server/utils/RestUtil.java @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
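Usage note for the new submit-job endpoint wired up above (SUBMIT_JOB_URL plus RestHttpPostCommandProcessor): a client POSTs a SeaTunnel job config as JSON and receives a JSON object with jobId and jobName. The sketch below is a minimal, hedged example; the host and port, the jobId value and the FakeSource/Console config are assumptions for illustration, and the jobId parameter is only honoured when isStartWithSavePoint=true.

import java.io.InputStream;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;

public class SubmitJobRestSketch {
    public static void main(String[] args) throws Exception {
        // Illustrative job config; real plugin options may need to be more complete.
        String body =
                "{\"env\":{\"job.mode\":\"BATCH\"},"
                        + "\"source\":[{\"plugin_name\":\"FakeSource\",\"result_table_name\":\"fake\","
                        + "\"schema\":{\"fields\":{\"name\":\"string\",\"age\":\"int\"}}}],"
                        + "\"sink\":[{\"plugin_name\":\"Console\",\"source_table_name\":\"fake\"}]}";
        // Host/port are assumptions; jobName, jobId and isStartWithSavePoint are the
        // query parameters RestUtil.buildRequestParams recognises.
        URL url = new URL(
                "http://localhost:5801/hazelcast/rest/maps/submit-job?jobName=rest-demo&jobId=823342");
        HttpURLConnection conn = (HttpURLConnection) url.openConnection();
        conn.setRequestMethod("POST");
        conn.setDoOutput(true);
        conn.setRequestProperty("Content-Type", "application/json");
        try (OutputStream out = conn.getOutputStream()) {
            out.write(body.getBytes(StandardCharsets.UTF_8));
        }
        System.out.println("HTTP " + conn.getResponseCode());
        try (InputStream in = conn.getInputStream()) {
            // Expected response shape: {"jobId":...,"jobName":"rest-demo"}
            System.out.println(new String(readAll(in), StandardCharsets.UTF_8));
        }
    }

    private static byte[] readAll(InputStream in) throws Exception {
        java.io.ByteArrayOutputStream buf = new java.io.ByteArrayOutputStream();
        byte[] chunk = new byte[4096];
        int n;
        while ((n = in.read(chunk)) != -1) {
            buf.write(chunk, 0, n);
        }
        return buf.toByteArray();
    }
}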
+ */ + +package org.apache.seatunnel.engine.server.utils; + +import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode; +import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.seatunnel.shade.com.typesafe.config.Config; + +import org.apache.seatunnel.common.Constants; +import org.apache.seatunnel.common.utils.JsonUtils; +import org.apache.seatunnel.core.starter.utils.ConfigBuilder; + +import com.hazelcast.internal.util.StringUtil; + +import java.io.IOException; +import java.util.Map; + +public class RestUtil { + private RestUtil() {} + + private static final ObjectMapper objectMapper = new ObjectMapper(); + + public static JsonNode convertByteToJsonNode(byte[] byteData) throws IOException { + return objectMapper.readTree(byteData); + } + + public static void buildRequestParams(Map requestParams, String uri) { + requestParams.put("jobId", null); + requestParams.put("jobName", Constants.LOGO); + requestParams.put("isStartWithSavePoint", String.valueOf(false)); + uri = StringUtil.stripTrailingSlash(uri); + if (!uri.contains("?")) { + return; + } + int indexEnd = uri.indexOf('?'); + try { + for (String s : uri.substring(indexEnd + 1).split("&")) { + String[] param = s.split("="); + requestParams.put(param[0], param[1]); + } + } catch (IndexOutOfBoundsException e) { + throw new IllegalArgumentException("Invalid Params format in Params."); + } + } + + public static Config buildConfig(JsonNode jsonNode) { + Map objectMap = JsonUtils.toMap(jsonNode); + return ConfigBuilder.of(objectMap); + } +} diff --git a/seatunnel-engine/seatunnel-engine-server/src/test/java/org/apache/seatunnel/engine/server/checkpoint/SavePointTest.java b/seatunnel-engine/seatunnel-engine-server/src/test/java/org/apache/seatunnel/engine/server/checkpoint/SavePointTest.java index 804e5c455e5..fdf02c7513b 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/test/java/org/apache/seatunnel/engine/server/checkpoint/SavePointTest.java +++ b/seatunnel-engine/seatunnel-engine-server/src/test/java/org/apache/seatunnel/engine/server/checkpoint/SavePointTest.java @@ -45,7 +45,6 @@ public class SavePointTest extends AbstractSeaTunnelServerTest { public static long JOB_ID = 823342L; @Test - @Disabled() public void testSavePoint() throws InterruptedException { savePointAndRestore(false); } diff --git a/seatunnel-engine/seatunnel-engine-server/src/test/resources/batch_fakesource_to_file.conf b/seatunnel-engine/seatunnel-engine-server/src/test/resources/batch_fakesource_to_file.conf index 3e71d5dfb17..24339945e79 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/test/resources/batch_fakesource_to_file.conf +++ b/seatunnel-engine/seatunnel-engine-server/src/test/resources/batch_fakesource_to_file.conf @@ -52,7 +52,7 @@ sink { partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" - file_format="text" + file_format_type="text" sink_columns=["name","age"] filename_time_format="yyyy.MM.dd" is_enable_transaction=true diff --git a/seatunnel-engine/seatunnel-engine-server/src/test/resources/batch_fakesource_to_file_complex.conf b/seatunnel-engine/seatunnel-engine-server/src/test/resources/batch_fakesource_to_file_complex.conf index 4e345cf9677..e3e0e00d9b0 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/test/resources/batch_fakesource_to_file_complex.conf +++ b/seatunnel-engine/seatunnel-engine-server/src/test/resources/batch_fakesource_to_file_complex.conf @@ -63,7 +63,7 @@ sink { 
partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" - file_format="text" + file_format_type="text" sink_columns=["name","age"] filename_time_format="yyyy.MM.dd" is_enable_transaction=true diff --git a/seatunnel-engine/seatunnel-engine-server/src/test/resources/seatunnel.yaml b/seatunnel-engine/seatunnel-engine-server/src/test/resources/seatunnel.yaml index 8f22b0613ca..f8739cc4830 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/test/resources/seatunnel.yaml +++ b/seatunnel-engine/seatunnel-engine-server/src/test/resources/seatunnel.yaml @@ -25,8 +25,6 @@ seatunnel: checkpoint: interval: 6000 timeout: 7000 - max-concurrent: 1 - tolerable-failure: 2 storage: type: hdfs max-retained: 3 diff --git a/seatunnel-engine/seatunnel-engine-server/src/test/resources/stream_fakesource_to_file.conf b/seatunnel-engine/seatunnel-engine-server/src/test/resources/stream_fakesource_to_file.conf index 6f86a81d464..d2299ecfd2e 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/test/resources/stream_fakesource_to_file.conf +++ b/seatunnel-engine/seatunnel-engine-server/src/test/resources/stream_fakesource_to_file.conf @@ -52,7 +52,7 @@ sink { partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" - file_format="text" + file_format_type="text" sink_columns=["name","age"] filename_time_format="yyyy.MM.dd" is_enable_transaction=true diff --git a/seatunnel-engine/seatunnel-engine-server/src/test/resources/stream_fakesource_to_file_savepoint.conf b/seatunnel-engine/seatunnel-engine-server/src/test/resources/stream_fakesource_to_file_savepoint.conf index ced94b31e5f..27575029809 100644 --- a/seatunnel-engine/seatunnel-engine-server/src/test/resources/stream_fakesource_to_file_savepoint.conf +++ b/seatunnel-engine/seatunnel-engine-server/src/test/resources/stream_fakesource_to_file_savepoint.conf @@ -55,7 +55,7 @@ sink { partition_dir_expression="${k0}=${v0}" is_partition_field_write_in_file=true file_name_expression="${transactionId}_${now}" - file_format="text" + file_format_type="text" sink_columns=["name","age"] filename_time_format="yyyy.MM.dd" is_enable_transaction=true diff --git a/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/HdfsConfiguration.java b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/HdfsConfiguration.java index 8d41ae848d8..953da3027bd 100644 --- a/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/HdfsConfiguration.java +++ b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/main/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/common/HdfsConfiguration.java @@ -49,6 +49,8 @@ public class HdfsConfiguration extends AbstractConfiguration { private static final String HDFS_IMPL_KEY = "fs.hdfs.impl"; + private static final String SEATUNNEL_HADOOP_PREFIX = "seatunnel.hadoop."; + @Override public Configuration buildConfiguration(Map config) throws CheckpointStorageException { @@ -69,7 +71,15 @@ public Configuration buildConfiguration(Map config) authenticateKerberos(kerberosPrincipal, kerberosKeytabFilePath, hadoopConf); } } - // todo support 
other hdfs optional config keys + // support other hdfs optional config keys + config.entrySet().stream() + .filter(entry -> entry.getKey().startsWith(SEATUNNEL_HADOOP_PREFIX)) + .forEach( + entry -> { + String key = entry.getKey().replace(SEATUNNEL_HADOOP_PREFIX, ""); + String value = entry.getValue(); + hadoopConf.set(key, value); + }); return hadoopConf; } diff --git a/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/test/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/HDFSFileCheckpointTest.java b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/test/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/HDFSFileCheckpointTest.java new file mode 100644 index 00000000000..23a41a2782b --- /dev/null +++ b/seatunnel-engine/seatunnel-engine-storage/checkpoint-storage-plugins/checkpoint-storage-hdfs/src/test/java/org/apache/seatunnel/engine/checkpoint/storage/hdfs/HDFSFileCheckpointTest.java @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ * + */ + +package org.apache.seatunnel.engine.checkpoint.storage.hdfs; + +import org.apache.seatunnel.engine.checkpoint.storage.exception.CheckpointStorageException; + +import org.junit.jupiter.api.BeforeAll; +import org.junit.jupiter.api.Disabled; + +import java.util.HashMap; +import java.util.Map; + +@Disabled( + "HDFS is not available in CI, if you want to run this test, please set up your own HDFS environment") +public class HDFSFileCheckpointTest extends AbstractFileCheckPointTest { + + @BeforeAll + public static void setup() throws CheckpointStorageException { + Map config = new HashMap<>(); + config.put("storage.type", "hdfs"); + config.put("fs.defaultFS", "hdfs://usdp-bing"); + config.put("seatunnel.hadoop.dfs.nameservices", "usdp-bing"); + config.put("seatunnel.hadoop.dfs.ha.namenodes.usdp-bing", "nn1,nn2"); + config.put("seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn1", "usdp-bing-nn1:8020"); + config.put("seatunnel.hadoop.dfs.namenode.rpc-address.usdp-bing.nn2", "usdp-bing-nn2:8020"); + config.put( + "seatunnel.hadoop.dfs.client.failover.proxy.provider.usdp-bing", + "org.apache.hadoop.hdfs.server.namenode.ha.ConfiguredFailoverProxyProvider"); + STORAGE = new HdfsStorage(config); + initStorageData(); + } +} diff --git a/seatunnel-engine/seatunnel-engine-storage/imap-storage-plugins/imap-storage-file/pom.xml b/seatunnel-engine/seatunnel-engine-storage/imap-storage-plugins/imap-storage-file/pom.xml index c39ddda99c8..c7eb61012e6 100644 --- a/seatunnel-engine/seatunnel-engine-storage/imap-storage-plugins/imap-storage-file/pom.xml +++ b/seatunnel-engine/seatunnel-engine-storage/imap-storage-plugins/imap-storage-file/pom.xml @@ -30,6 +30,14 @@ imap-storage-file SeaTunnel : Engine : Storage : IMap Storage Plugins : File + + + 3.0.0 + 2.4.7 + 3.1.4 + 4.1.60.Final + + org.apache.seatunnel @@ -64,24 +72,38 @@ awaitility + org.apache.hadoop hadoop-aliyun + ${hadoop-aliyun.version} + provided + + + net.minidev + json-smart + + net.minidev json-smart + ${json-smart.version} + provided org.apache.hadoop hadoop-aws + ${hadoop-aws.version} + provided io.netty netty-buffer + ${netty-buffer.version} provided diff --git a/seatunnel-formats/pom.xml b/seatunnel-formats/pom.xml index 983a8629ce8..7fc09b356a0 100644 --- a/seatunnel-formats/pom.xml +++ b/seatunnel-formats/pom.xml @@ -30,6 +30,7 @@ seatunnel-format-json seatunnel-format-text seatunnel-format-compatible-debezium-json + seatunnel-format-compatible-connect-json diff --git a/seatunnel-formats/seatunnel-format-compatible-connect-json/pom.xml b/seatunnel-formats/seatunnel-format-compatible-connect-json/pom.xml new file mode 100644 index 00000000000..d3d55457428 --- /dev/null +++ b/seatunnel-formats/seatunnel-format-compatible-connect-json/pom.xml @@ -0,0 +1,62 @@ + + + + 4.0.0 + + org.apache.seatunnel + seatunnel-formats + ${revision} + + + seatunnel-format-compatible-connect-json + SeaTunnel : Formats : Compatible Kafka Connect Json + + 1.6.4.Final + + + + + org.apache.seatunnel + seatunnel-api + ${project.version} + provided + + + + org.apache.seatunnel + seatunnel-format-json + ${project.version} + provided + + + + org.apache.kafka + kafka-clients + 3.2.0 + provided + + + + org.apache.kafka + connect-json + 3.2.0 + provided + + + + + diff --git a/seatunnel-formats/seatunnel-format-compatible-connect-json/src/main/java/org/apache/seatunnel/format/compatible/kafka/connect/json/CompatibleKafkaConnectDeserializationSchema.java 
b/seatunnel-formats/seatunnel-format-compatible-connect-json/src/main/java/org/apache/seatunnel/format/compatible/kafka/connect/json/CompatibleKafkaConnectDeserializationSchema.java new file mode 100644 index 00000000000..8d99da3870b --- /dev/null +++ b/seatunnel-formats/seatunnel-format-compatible-connect-json/src/main/java/org/apache/seatunnel/format/compatible/kafka/connect/json/CompatibleKafkaConnectDeserializationSchema.java @@ -0,0 +1,212 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.seatunnel.format.compatible.kafka.connect.json; + +import org.apache.seatunnel.shade.com.fasterxml.jackson.databind.ObjectMapper; + +import org.apache.seatunnel.api.configuration.ReadonlyConfig; +import org.apache.seatunnel.api.serialization.DeserializationSchema; +import org.apache.seatunnel.api.source.Collector; +import org.apache.seatunnel.api.table.type.RowKind; +import org.apache.seatunnel.api.table.type.SeaTunnelDataType; +import org.apache.seatunnel.api.table.type.SeaTunnelRow; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.common.exception.CommonErrorCode; +import org.apache.seatunnel.common.utils.ReflectionUtils; +import org.apache.seatunnel.format.json.JsonToRowConverters; +import org.apache.seatunnel.format.json.exception.SeaTunnelJsonFormatException; + +import org.apache.kafka.clients.consumer.ConsumerRecord; +import org.apache.kafka.connect.data.Schema; +import org.apache.kafka.connect.data.SchemaAndValue; +import org.apache.kafka.connect.json.JsonConverter; +import org.apache.kafka.connect.json.JsonConverterConfig; +import org.apache.kafka.connect.sink.SinkRecord; + +import com.fasterxml.jackson.databind.JsonNode; +import com.fasterxml.jackson.databind.node.ArrayNode; +import lombok.NonNull; +import lombok.RequiredArgsConstructor; + +import java.io.IOException; +import java.io.UnsupportedEncodingException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.Collections; +import java.util.Map; + +import static com.google.common.base.Preconditions.checkNotNull; + +/** Compatible kafka connect deserialization schema */ +@RequiredArgsConstructor +public class CompatibleKafkaConnectDeserializationSchema + implements DeserializationSchema { + + private static final String INCLUDE_SCHEMA_METHOD = "convertToJsonWithEnvelope"; + private static final String EXCLUDE_SCHEMA_METHOD = "convertToJsonWithoutEnvelope"; + private static final String KAFKA_CONNECT_SINK_RECORD_PAYLOAD = "payload"; + private transient JsonConverter keyConverter; + private transient JsonConverter valueConverter; + private transient Method keyConverterMethod; + private transient Method valueConverterMethod; + private final SeaTunnelRowType seaTunnelRowType; + 
private final JsonToRowConverters.JsonToRowConverter runtimeConverter; + private final boolean keySchemaEnable; + private final boolean valueSchemaEnable; + /** Object mapper for parsing the JSON. */ + private final ObjectMapper objectMapper = new ObjectMapper(); + + public CompatibleKafkaConnectDeserializationSchema( + @NonNull SeaTunnelRowType seaTunnelRowType, + @NonNull ReadonlyConfig config, + boolean failOnMissingField, + boolean ignoreParseErrors) { + + Map configMap = config.toMap(); + this.seaTunnelRowType = seaTunnelRowType; + this.keySchemaEnable = + KafkaConnectJsonFormatOptions.getKeyConverterSchemaEnabled(configMap); + this.valueSchemaEnable = + KafkaConnectJsonFormatOptions.getValueConverterSchemaEnabled(configMap); + + // Runtime converter + this.runtimeConverter = + new JsonToRowConverters(failOnMissingField, ignoreParseErrors) + .createConverter(checkNotNull(seaTunnelRowType)); + } + + @Override + public SeaTunnelRow deserialize(byte[] message) throws IOException { + throw new UnsupportedEncodingException(); + } + + /** + * Deserialize kafka consumer record + * + * @param msg + * @param out + * @throws Exception + */ + public void deserialize(ConsumerRecord msg, Collector out) + throws InvocationTargetException, IllegalAccessException { + tryInitConverter(); + SinkRecord record = convertToSinkRecord(msg); + RowKind rowKind = RowKind.INSERT; + JsonNode jsonNode = + (JsonNode) + valueConverterMethod.invoke( + valueConverter, record.valueSchema(), record.value()); + JsonNode payload = jsonNode.get(KAFKA_CONNECT_SINK_RECORD_PAYLOAD); + if (payload.isArray()) { + ArrayNode arrayNode = (ArrayNode) payload; + for (int i = 0; i < arrayNode.size(); i++) { + SeaTunnelRow row = convertJsonNode(arrayNode.get(i)); + row.setRowKind(rowKind); + out.collect(row); + } + } else { + SeaTunnelRow row = convertJsonNode(payload); + row.setRowKind(rowKind); + out.collect(row); + } + } + + private SeaTunnelRow convertJsonNode(JsonNode jsonNode) { + if (jsonNode.isNull()) { + return null; + } + try { + org.apache.seatunnel.shade.com.fasterxml.jackson.databind.JsonNode jsonData = + objectMapper.readTree(jsonNode.toString()); + return (SeaTunnelRow) runtimeConverter.convert(jsonData); + } catch (Throwable t) { + throw new SeaTunnelJsonFormatException( + CommonErrorCode.JSON_OPERATION_FAILED, + String.format("Failed to deserialize JSON '%s'.", jsonNode), + t); + } + } + + private SinkRecord convertToSinkRecord(ConsumerRecord msg) { + SchemaAndValue keyAndSchema = + (msg.key() == null) + ? SchemaAndValue.NULL + : keyConverter.toConnectData(msg.topic(), msg.headers(), msg.key()); + SchemaAndValue valueAndSchema = + valueConverter.toConnectData(msg.topic(), msg.headers(), msg.value()); + return new SinkRecord( + msg.topic(), + msg.partition(), + keyAndSchema.schema(), + keyAndSchema.value(), + valueAndSchema.schema(), + valueAndSchema.value(), + msg.offset(), + msg.timestamp(), + msg.timestampType(), + null); + } + + @Override + public SeaTunnelDataType getProducedType() { + return seaTunnelRowType; + } + + private void tryInitConverter() { + if (keyConverter == null) { + synchronized (this) { + if (keyConverter == null) { + keyConverter = new JsonConverter(); + keyConverter.configure( + Collections.singletonMap( + JsonConverterConfig.SCHEMAS_ENABLE_CONFIG, keySchemaEnable), + true); + keyConverterMethod = + ReflectionUtils.getDeclaredMethod( + JsonConverter.class, + keySchemaEnable + ? 
INCLUDE_SCHEMA_METHOD + : EXCLUDE_SCHEMA_METHOD, + Schema.class, + Object.class) + .get(); + } + } + } + if (valueConverter == null) { + synchronized (this) { + if (valueConverter == null) { + valueConverter = new JsonConverter(); + valueConverter.configure( + Collections.singletonMap( + JsonConverterConfig.SCHEMAS_ENABLE_CONFIG, valueSchemaEnable), + false); + valueConverterMethod = + ReflectionUtils.getDeclaredMethod( + JsonConverter.class, + valueSchemaEnable + ? INCLUDE_SCHEMA_METHOD + : EXCLUDE_SCHEMA_METHOD, + Schema.class, + Object.class) + .get(); + } + } + } + } +} diff --git a/seatunnel-formats/seatunnel-format-compatible-connect-json/src/main/java/org/apache/seatunnel/format/compatible/kafka/connect/json/KafkaConnectJsonFormatOptions.java b/seatunnel-formats/seatunnel-format-compatible-connect-json/src/main/java/org/apache/seatunnel/format/compatible/kafka/connect/json/KafkaConnectJsonFormatOptions.java new file mode 100644 index 00000000000..05e16e0abb7 --- /dev/null +++ b/seatunnel-formats/seatunnel-format-compatible-connect-json/src/main/java/org/apache/seatunnel/format/compatible/kafka/connect/json/KafkaConnectJsonFormatOptions.java @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
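For context on the CompatibleKafkaConnectDeserializationSchema above and the format options class that follows: with schemas enabled, Kafka Connect's JsonConverter wraps each record in a schema/payload envelope, and the key/value *_converter_schema_enabled options are handed to the converter as its schemas.enable setting. A hedged, standalone sketch; the topic name and record contents are invented.

import org.apache.kafka.connect.data.SchemaAndValue;
import org.apache.kafka.connect.json.JsonConverter;
import org.apache.kafka.connect.json.JsonConverterConfig;

import java.nio.charset.StandardCharsets;
import java.util.Collections;

public class ConnectJsonEnvelopeSketch {
    public static void main(String[] args) {
        // Schema-enabled envelope: the deserializer above reads the "payload" field.
        String envelope =
                "{\"schema\":{\"type\":\"struct\",\"fields\":["
                        + "{\"field\":\"id\",\"type\":\"int32\"},"
                        + "{\"field\":\"name\",\"type\":\"string\"}]},"
                        + "\"payload\":{\"id\":1,\"name\":\"seatunnel\"}}";
        JsonConverter valueConverter = new JsonConverter();
        // value_converter_schema_enabled=true corresponds to schemas.enable=true here.
        valueConverter.configure(
                Collections.singletonMap(JsonConverterConfig.SCHEMAS_ENABLE_CONFIG, true), false);
        SchemaAndValue value =
                valueConverter.toConnectData("demo-topic", envelope.getBytes(StandardCharsets.UTF_8));
        System.out.println(value.schema() + " -> " + value.value());
    }
}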
+ */ + +package org.apache.seatunnel.format.compatible.kafka.connect.json; + +import org.apache.seatunnel.api.configuration.Option; +import org.apache.seatunnel.api.configuration.Options; + +import java.util.Map; + +public class KafkaConnectJsonFormatOptions { + + public static final Option KEY_CONVERTER_SCHEMA_ENABLED = + Options.key("key_converter_schema_enabled") + .booleanType() + .defaultValue(true) + .withDescription("kafka connect key converter schema enabled."); + + public static final Option VALUE_CONVERTER_SCHEMA_ENABLED = + Options.key("value_converter_schema_enabled") + .booleanType() + .defaultValue(true) + .withDescription("kafka connect value converter schema enabled."); + + public static boolean getKeyConverterSchemaEnabled(Map options) { + return Boolean.parseBoolean( + options.getOrDefault(KEY_CONVERTER_SCHEMA_ENABLED.key(), "true")); + } + + public static boolean getValueConverterSchemaEnabled(Map options) { + return Boolean.parseBoolean( + options.getOrDefault(VALUE_CONVERTER_SCHEMA_ENABLED.key(), "true")); + } +} diff --git a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/SQLEngine.java b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/SQLEngine.java index b1e734c31ef..6dfaddca00a 100644 --- a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/SQLEngine.java +++ b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/SQLEngine.java @@ -23,7 +23,11 @@ import java.util.List; public interface SQLEngine { - void init(String inputTableName, SeaTunnelRowType inputRowType, String sql); + void init( + String inputTableName, + String catalogTableName, + SeaTunnelRowType inputRowType, + String sql); SeaTunnelRowType typeMapping(List inputColumnsMapping); diff --git a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/SQLTransform.java b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/SQLTransform.java index 20a07dcee02..9b21c4b6f5c 100644 --- a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/SQLTransform.java +++ b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/SQLTransform.java @@ -115,7 +115,11 @@ protected void setConfig(Config pluginConfig) { @Override public void open() { sqlEngine = SQLEngineFactory.getSQLEngine(engineType); - sqlEngine.init(inputTableName, inputRowType, query); + sqlEngine.init( + inputTableName, + inputCatalogTable != null ? 
inputCatalogTable.getTableId().getTableName() : null, + inputRowType, + query); } private void tryOpen() { diff --git a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/zeta/ZetaSQLEngine.java b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/zeta/ZetaSQLEngine.java index 7a9e633ea3e..b973dca8e09 100644 --- a/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/zeta/ZetaSQLEngine.java +++ b/seatunnel-transforms-v2/src/main/java/org/apache/seatunnel/transform/sql/zeta/ZetaSQLEngine.java @@ -37,6 +37,8 @@ import net.sf.jsqlparser.statement.select.SelectExpressionItem; import net.sf.jsqlparser.statement.select.SelectItem; +import javax.annotation.Nullable; + import java.util.ArrayList; import java.util.Arrays; import java.util.List; @@ -45,6 +47,7 @@ public class ZetaSQLEngine implements SQLEngine { private String inputTableName; + @Nullable private String catalogTableName; private SeaTunnelRowType inputRowType; private String sql; @@ -59,8 +62,13 @@ public class ZetaSQLEngine implements SQLEngine { public ZetaSQLEngine() {} @Override - public void init(String inputTableName, SeaTunnelRowType inputRowType, String sql) { + public void init( + String inputTableName, + String catalogTableName, + SeaTunnelRowType inputRowType, + String sql) { this.inputTableName = inputTableName; + this.catalogTableName = catalogTableName; this.inputRowType = inputRowType; this.sql = sql; diff --git a/seatunnel-transforms-v2/src/test/java/org/apache/seatunnel/transform/sql/zeta/ZetaSQLEngineTest.java b/seatunnel-transforms-v2/src/test/java/org/apache/seatunnel/transform/sql/zeta/ZetaSQLEngineTest.java new file mode 100644 index 00000000000..94e1060af85 --- /dev/null +++ b/seatunnel-transforms-v2/src/test/java/org/apache/seatunnel/transform/sql/zeta/ZetaSQLEngineTest.java @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.seatunnel.transform.sql.zeta; + +import org.apache.seatunnel.api.table.type.BasicType; +import org.apache.seatunnel.api.table.type.SeaTunnelDataType; +import org.apache.seatunnel.api.table.type.SeaTunnelRowType; +import org.apache.seatunnel.transform.exception.TransformException; +import org.apache.seatunnel.transform.sql.SQLEngine; +import org.apache.seatunnel.transform.sql.SQLEngineFactory; + +import org.junit.jupiter.api.Assertions; +import org.junit.jupiter.api.Test; + +public class ZetaSQLEngineTest { + + @Test + public void testCatalogNameAndSourceTableNameBothSupport() { + + SQLEngine sqlEngine = SQLEngineFactory.getSQLEngine(SQLEngineFactory.EngineType.ZETA); + + SeaTunnelRowType rowType = + new SeaTunnelRowType( + new String[] {"id", "name", "age"}, + new SeaTunnelDataType[] { + BasicType.INT_TYPE, BasicType.STRING_TYPE, BasicType.INT_TYPE + }); + sqlEngine.init("test", null, rowType, "select * from test"); + sqlEngine.init("test", "nameFromCatalog", rowType, "select * from test"); + sqlEngine.init("test", "nameFromCatalog", rowType, "select * from nameFromCatalog"); + + Assertions.assertThrows( + TransformException.class, + () -> sqlEngine.init("test", "nameFromCatalog", rowType, "select * from unknown")); + Assertions.assertThrows( + TransformException.class, + () -> sqlEngine.init("test", null, rowType, "select * from unknown")); + } +} diff --git a/seatunnel-translation/seatunnel-translation-spark/seatunnel-translation-spark-common/src/main/java/org/apache/seatunnel/translation/spark/serialization/SeaTunnelRowConverter.java b/seatunnel-translation/seatunnel-translation-spark/seatunnel-translation-spark-common/src/main/java/org/apache/seatunnel/translation/spark/serialization/SeaTunnelRowConverter.java index 51d5c7308bd..15357204cd3 100644 --- a/seatunnel-translation/seatunnel-translation-spark/seatunnel-translation-spark-common/src/main/java/org/apache/seatunnel/translation/spark/serialization/SeaTunnelRowConverter.java +++ b/seatunnel-translation/seatunnel-translation-spark/seatunnel-translation-spark-common/src/main/java/org/apache/seatunnel/translation/spark/serialization/SeaTunnelRowConverter.java @@ -24,7 +24,10 @@ import org.apache.seatunnel.api.table.type.SeaTunnelRow; import org.apache.seatunnel.api.table.type.SeaTunnelRowType; import org.apache.seatunnel.translation.serialization.RowConverter; +import org.apache.seatunnel.translation.spark.utils.TypeConverterUtils; +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema; +import org.apache.spark.sql.types.StructType; import org.apache.spark.unsafe.types.UTF8String; import scala.Tuple2; @@ -51,7 +54,11 @@ public SeaTunnelRowConverter(SeaTunnelDataType dataType) { @Override public SeaTunnelRow convert(SeaTunnelRow seaTunnelRow) throws IOException { validate(seaTunnelRow); - return (SeaTunnelRow) convert(seaTunnelRow, dataType); + GenericRowWithSchema rowWithSchema = (GenericRowWithSchema) convert(seaTunnelRow, dataType); + SeaTunnelRow newRow = new SeaTunnelRow(rowWithSchema.values()); + newRow.setRowKind(seaTunnelRow.getRowKind()); + newRow.setTableId(seaTunnelRow.getTableId()); + return newRow; } private Object convert(Object field, SeaTunnelDataType dataType) { @@ -62,7 +69,7 @@ private Object convert(Object field, SeaTunnelDataType dataType) { case ROW: SeaTunnelRow seaTunnelRow = (SeaTunnelRow) field; SeaTunnelRowType rowType = (SeaTunnelRowType) dataType; - return convert(seaTunnelRow, rowType); + return convertRow(seaTunnelRow, rowType); case DATE: return 
Date.valueOf((LocalDate) field); case TIMESTAMP: @@ -94,16 +101,17 @@ private Object convert(Object field, SeaTunnelDataType dataType) { } } - private SeaTunnelRow convert(SeaTunnelRow seaTunnelRow, SeaTunnelRowType rowType) { + private GenericRowWithSchema convertRow(SeaTunnelRow seaTunnelRow, SeaTunnelRowType rowType) { int arity = rowType.getTotalFields(); Object[] values = new Object[arity]; + StructType schema = (StructType) TypeConverterUtils.convert(rowType); for (int i = 0; i < arity; i++) { Object fieldValue = convert(seaTunnelRow.getField(i), rowType.getFieldType(i)); if (fieldValue != null) { values[i] = fieldValue; } } - return new SeaTunnelRow(values); + return new GenericRowWithSchema(values, schema); } private scala.collection.immutable.HashMap convertMap( @@ -148,6 +156,10 @@ private Object reconvert(Object field, SeaTunnelDataType dataType) { } switch (dataType.getSqlType()) { case ROW: + if (field instanceof GenericRowWithSchema) { + return createFromGenericRow( + (GenericRowWithSchema) field, (SeaTunnelRowType) dataType); + } return reconvert((SeaTunnelRow) field, (SeaTunnelRowType) dataType); case DATE: return ((Date) field).toLocalDate(); @@ -166,6 +178,15 @@ private Object reconvert(Object field, SeaTunnelDataType dataType) { } } + private SeaTunnelRow createFromGenericRow(GenericRowWithSchema row, SeaTunnelRowType type) { + Object[] fields = row.values(); + Object[] newFields = new Object[fields.length]; + for (int idx = 0; idx < fields.length; idx++) { + newFields[idx] = reconvert(fields[idx], type.getFieldType(idx)); + } + return new SeaTunnelRow(newFields); + } + private SeaTunnelRow reconvert(SeaTunnelRow engineRow, SeaTunnelRowType rowType) { int num = engineRow.getFields().length; Object[] fields = new Object[num]; diff --git a/tools/dependencies/known-dependencies.txt b/tools/dependencies/known-dependencies.txt index 3a1e736b68b..70bbd1c0df5 100755 --- a/tools/dependencies/known-dependencies.txt +++ b/tools/dependencies/known-dependencies.txt @@ -22,8 +22,8 @@ protostuff-collectionschema-1.8.0.jar protostuff-core-1.8.0.jar protostuff-runtime-1.8.0.jar scala-library-2.11.12.jar -seatunnel-jackson-2.3.2-SNAPSHOT-optional.jar -seatunnel-guava-2.3.2-SNAPSHOT-optional.jar +seatunnel-jackson-2.3.3-SNAPSHOT-optional.jar +seatunnel-guava-2.3.3-SNAPSHOT-optional.jar slf4j-api-1.7.25.jar jsqlparser-4.5.jar animal-sniffer-annotations-1.17.jar diff --git a/tools/update_modules_check/update_modules_check.py b/tools/update_modules_check/update_modules_check.py index 324a961bc7e..b009fda25a4 100644 --- a/tools/update_modules_check/update_modules_check.py +++ b/tools/update_modules_check/update_modules_check.py @@ -172,10 +172,6 @@ def main(argv): get_cv2_modules(argv[2]) elif argv[1] == "cv2-e2e": get_cv2_e2e_modules(argv[2]) - elif argv[1] == "cv2-flink-e2e": - get_cv2_flink_e2e_modules(argv[2]) - elif argv[1] == "cv2-spark-e2e": - get_cv2_spark_e2e_modules(argv[2]) elif argv[1] == "engine": get_engine_modules(argv[2]) elif argv[1] == "engine-e2e":