apache · ddna1021 · Jul 7, 2023 · Jul 10, 2023 · Jul 11, 2023 · Jul 11, 2023
diff --git a/docs/en/connector-v2/sink/common-options.md b/docs/en/connector-v2/sink/common-options.md
@@ -2,10 +2,11 @@
 
 > Common parameters of sink connectors
 
-|       name        |  type  | required | default value |
-|-------------------|--------|----------|---------------|
-| source_table_name | string | no       | -             |
-| parallelism       | int    | no       | -             |
+|       name        |  type   | required | default value |
+|-------------------|---------|----------|---------------|
+| source_table_name | string  | no       | -             |
+| parallelism       | int     | no       | -             |
+| partition_balance | boolean | no       | false         |
 
 ### source_table_name [string]
 
@@ -19,6 +20,16 @@ When `parallelism` is not specified, the `parallelism` in env is used by default
 
 When parallelism is specified, it will override the parallelism in env.
 
+### partition_balance [boolean]
+
+When `partition_balance` is set to true, the data is repartitioned before it is written to the sink so that each partition is roughly the same size. This avoids problems caused by data skew, at the cost of some extra processing time.
+
+The default value is false. This option is supported on the Spark and Flink engines.
+
+When `partition_balance` is not specified, the `partition_balance` in env is used by default.
+
+When `partition_balance` is specified, it will override the `partition_balance` in env.
+
 ## Examples
 
 ```bash

diff --git a/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/CommonOptions.java b/seatunnel-api/src/main/java/org/apache/seatunnel/api/common/CommonOptions.java
@@ -66,4 +66,13 @@ public interface CommonOptions {
                     .withDescription(
                             "When parallelism is not specified, the parallelism in env is used by default. "
                                     + "When parallelism is specified, it will override the parallelism in env.");
+    /** Opt-in sink flag (default false): repartition data before writing to reduce skew. */
+    Option<Boolean> PARTITION_BALANCE =
+            Options.key("partition_balance")
+                    .booleanType()
+                    .defaultValue(false)
+                    .withDescription(
+                            "When partition_balance is set to true, "
+                                    + "in the sink process, a repartition will be performed first to ensure that the size of each partition is roughly the same, "
+                                    + "which can avoid problems caused by data skew, but it will consume some extra time. The default value is false");
 }
diff --git a/...src/main/java/org/apache/seatunnel/core/starter/flink/execution/SinkExecuteProcessor.java b/...src/main/java/org/apache/seatunnel/core/starter/flink/execution/SinkExecuteProcessor.java
@@ -115,6 +115,12 @@ public List<DataStream<Row>> execute(List<DataStream<Row>> upstreamDataStreams)
                 DataSaveMode dataSaveMode = saveModeSink.getDataSaveMode();
                 saveModeSink.handleSaveMode(dataSaveMode);
             }
+            if (sinkConfig.hasPath(CommonOptions.PARTITION_BALANCE.key())) {
+                Boolean needBalance = sinkConfig.getBoolean(CommonOptions.PARTITION_BALANCE.key());
+                if (needBalance) {
+                    stream = stream.shuffle();
+                }
+            }
             DataStreamSink<Row> dataStreamSink =
                     stream.sinkTo(SinkV1Adapter.wrap(new FlinkSink<>(seaTunnelSink)))
                             .name(seaTunnelSink.getPluginName());

diff --git a/...src/main/java/org/apache/seatunnel/core/starter/spark/execution/SinkExecuteProcessor.java b/...src/main/java/org/apache/seatunnel/core/starter/spark/execution/SinkExecuteProcessor.java
@@ -110,6 +110,12 @@ public List<Dataset<Row>> execute(List<Dataset<Row>> upstreamDataStreams)
                                         CommonOptions.PARALLELISM.key(),
                                         CommonOptions.PARALLELISM.defaultValue());
             }
+            if (sinkConfig.hasPath(CommonOptions.PARTITION_BALANCE.key())) {
+                boolean needBalance = sinkConfig.getBoolean(CommonOptions.PARTITION_BALANCE.key());
+                if (needBalance) {
+                    dataset = dataset.repartition(parallelism);
+                }
+            }
             dataset.sparkSession().read().option(CommonOptions.PARALLELISM.key(), parallelism);
             // TODO modify checkpoint location
             seaTunnelSink.setTypeInfo(