
Enable BigQuery CDC configuration for Python BigQuery sink #32529

Merged · 40 commits · Oct 15, 2024

Commits (40)
745cd58
include CDC configuration on the storage write transform provider
prodriguezdefino Sep 20, 2024
504d2a4
adding the primary key configuration for CDC and tests
prodriguezdefino Sep 20, 2024
a2eabf9
fixing List.of references to use ImmutableList
prodriguezdefino Sep 20, 2024
01703c1
fixing test, missing calling the cdc info row builder() method
prodriguezdefino Sep 20, 2024
51360d5
fix test, add config validations
prodriguezdefino Sep 21, 2024
542a49e
added the xlang params to storage write python wrapper
prodriguezdefino Sep 21, 2024
fe36fe8
adding missing comma
prodriguezdefino Sep 21, 2024
1004f91
shortening property name
prodriguezdefino Sep 21, 2024
de9e948
changing xlang config property
prodriguezdefino Sep 21, 2024
ab40dd9
set use cdc schema property as nullable, added safe retrieval method
prodriguezdefino Sep 21, 2024
27e5634
fixes property name reference and argument type definition
prodriguezdefino Sep 22, 2024
e804618
python format fix
prodriguezdefino Sep 22, 2024
7b7255b
adding xlang IT with BQ
prodriguezdefino Sep 22, 2024
2b901eb
adding missing primary key column to test
prodriguezdefino Sep 22, 2024
bb16979
python format fix
prodriguezdefino Sep 22, 2024
12dce1b
format xlang test
prodriguezdefino Sep 22, 2024
27a86db
more format xlang test fixes
prodriguezdefino Sep 22, 2024
e5766e7
and more format xlang test fixes
prodriguezdefino Sep 22, 2024
09f298a
adding missing import
prodriguezdefino Sep 22, 2024
e972c89
missing self reference
prodriguezdefino Sep 22, 2024
ec3373b
enabled create if needed functionality for CDC python integration, im…
prodriguezdefino Sep 24, 2024
23ea4c3
Update bigquery.py
prodriguezdefino Sep 24, 2024
143566a
triggering the xlang tests
prodriguezdefino Sep 24, 2024
4e390c7
fixing lint
prodriguezdefino Sep 24, 2024
51bae40
addressing few comments
prodriguezdefino Sep 26, 2024
6331691
cdc info is added after row transformation now
prodriguezdefino Sep 27, 2024
56fe2b6
remove not used param
prodriguezdefino Sep 27, 2024
ce85c88
Merge branch 'master' into python_xlang_cdc_bigquery
prodriguezdefino Sep 27, 2024
bd324c5
removed typing information for callable
prodriguezdefino Sep 27, 2024
3e5e31c
adding test for cdc using dicts as input and cdc write callable
prodriguezdefino Sep 27, 2024
0073348
simplifying the xlang configuration from python perspective, will add…
prodriguezdefino Oct 2, 2024
9dd4cfa
spotless apply
prodriguezdefino Oct 2, 2024
4208517
wrong property passed to xlang builder
prodriguezdefino Oct 2, 2024
cd7b6cd
missing self
prodriguezdefino Oct 2, 2024
72d1bf4
fixing xlang it
prodriguezdefino Oct 2, 2024
1487acc
fixes wrong property reference
prodriguezdefino Oct 2, 2024
96882c5
change cdc xlang test to use beam.io.WriteToBigQuery
prodriguezdefino Oct 3, 2024
538ef3c
force another build
prodriguezdefino Oct 3, 2024
1774f8f
modifying comment to trigger build.
prodriguezdefino Oct 4, 2024
1d99fc6
addressing PR comments, included new dicts based test for xlang pytho…
prodriguezdefino Oct 14, 2024
@@ -1,4 +1,5 @@
 
 {
-  "comment": "Modify this file in a trivial way to cause this test suite to run"
+  "comment": "Modify this file in a trivial way to cause this test suite to run",
+  "modification": 1
 }
@@ -1,4 +1,4 @@
 {
   "comment": "Modify this file in a trivial way to cause this test suite to run",
-  "modification": 1
+  "modification": 2
 }
@@ -2276,6 +2276,7 @@ public static Write<RowMutation> applyRowMutations() {
.withFormatFunction(RowMutation::getTableRow)
.withRowMutationInformationFn(RowMutation::getMutationInformation);
}

/**
* A {@link PTransform} that writes a {@link PCollection} containing {@link GenericRecord
* GenericRecords} to a BigQuery table.
@@ -2384,8 +2385,10 @@ public enum Method {
abstract WriteDisposition getWriteDisposition();

abstract Set<SchemaUpdateOption> getSchemaUpdateOptions();

/** Table description. Default is empty. */
abstract @Nullable String getTableDescription();

/** An option to indicate if table validation is desired. Default is true. */
abstract boolean getValidate();

@@ -3472,7 +3475,10 @@ && getStorageApiTriggeringFrequency(bqOptions) != null) {
LOG.error("The Storage API sink does not support the WRITE_TRUNCATE write disposition.");
}
if (getRowMutationInformationFn() != null) {
checkArgument(getMethod() == Method.STORAGE_API_AT_LEAST_ONCE);
checkArgument(
getMethod() == Method.STORAGE_API_AT_LEAST_ONCE,
"When using row updates on BigQuery, StorageWrite API should execute using"
+ " \"at least once\" mode.");
checkArgument(
getCreateDisposition() == CreateDisposition.CREATE_NEVER || getPrimaryKey() != null,
"If specifying CREATE_IF_NEEDED along with row updates, a primary key needs to be specified");
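The two checkArgument calls in this hunk boil down to a pair of preconditions on the write configuration. A minimal Python sketch of those rules (hypothetical validate_cdc_write helper, not part of the Beam API):

```python
# Sketch of the CDC validation rules added above (hypothetical helper):
# row-mutation (CDC) writes must run the Storage Write API in
# at-least-once mode, and CREATE_IF_NEEDED additionally requires a
# non-empty primary key.
def validate_cdc_write(method, create_disposition, primary_key):
    if method != "STORAGE_API_AT_LEAST_ONCE":
        raise ValueError(
            "When using row updates on BigQuery, StorageWrite API should "
            'execute using "at least once" mode.')
    if create_disposition == "CREATE_IF_NEEDED" and not primary_key:
        raise ValueError(
            "If specifying CREATE_IF_NEEDED along with row updates, "
            "a primary key needs to be specified")
```

Note that CREATE_NEVER needs no primary key here: the table already exists, so its constraints are taken as-is.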
@@ -20,13 +20,15 @@
import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkArgument;
import static org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Preconditions.checkNotNull;

import com.google.api.services.bigquery.model.TableConstraints;
import com.google.api.services.bigquery.model.TableSchema;
import com.google.auto.service.AutoService;
import com.google.auto.value.AutoValue;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import javax.annotation.Nullable;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
@@ -37,6 +39,7 @@
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryStorageApiInsertError;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryUtils;
import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;
import org.apache.beam.sdk.io.gcp.bigquery.RowMutationInformation;
import org.apache.beam.sdk.io.gcp.bigquery.TableDestination;
import org.apache.beam.sdk.io.gcp.bigquery.WriteResult;
import org.apache.beam.sdk.io.gcp.bigquery.providers.BigQueryStorageWriteApiSchemaTransformProvider.BigQueryStorageWriteApiSchemaTransformConfiguration;
@@ -62,6 +65,7 @@
import org.apache.beam.sdk.values.ValueInSingleWindow;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.annotations.VisibleForTesting;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.base.Strings;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableList;
import org.apache.beam.vendor.guava.v32_1_2_jre.com.google.common.collect.ImmutableMap;
import org.joda.time.Duration;

@@ -257,6 +261,20 @@ public static Builder builder() {
@Nullable
public abstract ErrorHandling getErrorHandling();

@SchemaFieldDescription(
"This option enables the use of BigQuery CDC functionality. The expected PCollection"
+ " should contain Beam Rows with a schema wrapping the record to be inserted and"
+ " adding the CDC info similar to: {cdc_info: {mutation_type:\"...\", "
+ "change_sequence_number:\"...\"}, record: {...}}")
@Nullable
public abstract Boolean getUseCdcWrites();

@SchemaFieldDescription(
"In the case of using CDC writes and setting CREATE_IF_NEEDED mode for the tables"
+ " a primary key is required.")
@Nullable
public abstract List<String> getCdcWritesPrimaryKey();

/** Builder for {@link BigQueryStorageWriteApiSchemaTransformConfiguration}. */
@AutoValue.Builder
public abstract static class Builder {
@@ -277,6 +295,10 @@ public abstract static class Builder {

public abstract Builder setErrorHandling(ErrorHandling errorHandling);

public abstract Builder setUseCdcWrites(Boolean cdcWrites);

public abstract Builder setCdcWritesPrimaryKey(List<String> pkColumns);

/** Builds a {@link BigQueryStorageWriteApiSchemaTransformConfiguration} instance. */
public abstract BigQueryStorageWriteApiSchemaTransformProvider
.BigQueryStorageWriteApiSchemaTransformConfiguration
@@ -344,14 +366,39 @@ public void process(ProcessContext c) {}

private static class RowDynamicDestinations extends DynamicDestinations<Row, String> {
Schema schema;
String fixedDestination = null;
List<String> primaryKey = null;

RowDynamicDestinations(Schema schema) {
this.schema = schema;
}

RowDynamicDestinations withFixedDestination(String destination) {
this.fixedDestination = destination;
return this;
}

RowDynamicDestinations withPrimaryKey(List<String> primaryKey) {
this.primaryKey = primaryKey;
return this;
}

@Override
public String getDestination(ValueInSingleWindow<Row> element) {
return element.getValue().getString("destination");
return fixedDestination != null
? fixedDestination
: element.getValue().getString("destination");
}

@Override
public TableConstraints getTableConstraints(String destination) {
return Optional.ofNullable(this.primaryKey)
.filter(pk -> !pk.isEmpty())
.map(
pk ->
new TableConstraints()
.setPrimaryKey(new TableConstraints.PrimaryKey().setColumns(pk)))
.orElse(null);
}

@Override
@@ -468,13 +515,15 @@ BigQueryIO.Write<Row> createStorageWriteApiTransform(Schema schema) {

if (configuration.getTable().equals(DYNAMIC_DESTINATIONS)) {
checkArgument(
schema.getFieldNames().equals(Arrays.asList("destination", "record")),
schema.getFieldNames().containsAll(Arrays.asList("destination", "record")),
"When writing to dynamic destinations, we expect Row Schema with a "
+ "\"destination\" string field and a \"record\" Row field.");
write =
write
.to(new RowDynamicDestinations(schema.getField("record").getType().getRowSchema()))
Review comment (Contributor): Can we just instantiate new RowDynamicDestinations(<row schema>, <primary key>) here? And avoid instantiating it again in validateAndIncludeCDCInformation?
.withFormatFunction(row -> BigQueryUtils.toTableRow(row.getRow("record")));
} else if (Optional.ofNullable(configuration.getUseCdcWrites()).orElse(false)) {
write = validateAndIncludeCDCInformation(write, schema);
Review comment (Contributor): Can we make this a separate if block, outside of this if/else chain? We should be able to apply this method to both dynamic destination and single table cases. The only factor should be whether or not useCdcWrites is true.
} else {
write = write.to(configuration.getTable()).useBeamSchema();
}
@@ -498,5 +547,60 @@ BigQueryIO.Write<Row> createStorageWriteApiTransform(Schema schema) {

return write;
}

BigQueryIO.Write<Row> validateAndIncludeCDCInformation(
BigQueryIO.Write<Row> write, Schema schema) {
if (!Strings.isNullOrEmpty(configuration.getCreateDisposition())) {
checkArgument(
BigQueryStorageWriteApiSchemaTransformConfiguration.CREATE_DISPOSITIONS
.get(configuration.getCreateDisposition().toUpperCase())
.equals(CreateDisposition.CREATE_IF_NEEDED)
&& !Optional.ofNullable(configuration.getCdcWritesPrimaryKey())
.orElse(ImmutableList.of())
.isEmpty(),
"When using CDC writes into BigQuery, alongside with CREATE_IF_NEEDED mode,"
+ " a primary key should be provided.");
}
if (configuration.getTable().equals(DYNAMIC_DESTINATIONS)) {
checkArgument(
schema.getFieldNames().contains("destination"),
"When writing to dynamic destinations, we expect Row Schema with a "
+ "\"destination\" string field.");
}
checkArgument(
schema.getFieldNames().containsAll(Arrays.asList("cdc_info", "record")),
"When writing using CDC functionality, we expect Row Schema with a "
+ "\"cdc_info\" Row field and a \"record\" Row field.");
checkArgument(
schema
.getField("cdc_info")
.getType()
.getRowSchema()
.equals(
Schema.builder()
.addStringField("mutation_type")
.addStringField("change_sequence_number")
.build()),
"When writing using CDC functionality, we expect a \"cdc_info\" field of Row type "
+ "with fields \"mutation_type\" and \"change_sequence_number\" of type string.");

RowDynamicDestinations destinations =
new RowDynamicDestinations(schema.getField("record").getType().getRowSchema())
.withPrimaryKey(configuration.getCdcWritesPrimaryKey());
if (!configuration.getTable().equals(DYNAMIC_DESTINATIONS)) {
destinations = destinations.withFixedDestination(configuration.getTable());
}
Review comment (Contributor): Can we remove this duplicated code? (see previous comment)

return write
.to(destinations)
.withFormatFunction(row -> BigQueryUtils.toTableRow(row.getRow("record")))
Review comment (Contributor): Also a duplicated line that we should remove.
.withPrimaryKey(configuration.getCdcWritesPrimaryKey())
.withRowMutationInformationFn(
row ->
RowMutationInformation.of(
RowMutationInformation.MutationType.valueOf(
row.getRow("cdc_info").getString("mutation_type")),
row.getRow("cdc_info").getString("change_sequence_number")));
}
}
}
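Taken together, the schema checks and the withRowMutationInformationFn mapping above fix the element shape the Python wrapper must produce. A sketch of that shape with plain dicts (field names come from the checks above; everything else is illustrative):

```python
# Element shape expected for CDC writes, per the schema checks above:
# a wrapper row with a "cdc_info" sub-row plus the payload under "record".
cdc_row = {
    "cdc_info": {
        "mutation_type": "UPSERT",
        "change_sequence_number": "AAA/1",
    },
    "record": {"id": 1, "name": "alice"},
}

def extract_mutation_info(row):
    # Mirrors the withRowMutationInformationFn lambda in the Java provider:
    # pull the mutation type and change sequence number out of cdc_info.
    info = row["cdc_info"]
    return (info["mutation_type"], info["change_sequence_number"])
```

The "record" sub-row is what actually lands in the table; cdc_info is consumed by the sink and never written.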
@@ -18,11 +18,13 @@
package org.apache.beam.sdk.io.gcp.testing;

import com.google.api.services.bigquery.model.Table;
import com.google.api.services.bigquery.model.TableConstraints;
import com.google.api.services.bigquery.model.TableRow;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.List;
import java.util.Map;
import java.util.Optional;
import java.util.stream.Collectors;
import java.util.stream.IntStream;
import org.apache.beam.sdk.io.gcp.bigquery.TableRowJsonCoder;
@@ -51,12 +53,24 @@ class TableContainer {
this.keyedRows = Maps.newHashMap();
this.ids = new ArrayList<>();
this.sizeBytes = 0L;
// extract primary key information from Table if present
List<String> pkColumns = primaryKeyColumns(table);
this.primaryKeyColumns = pkColumns;
this.primaryKeyColumnIndices = primaryColumnFieldIndices(pkColumns, table);
}

// Only top-level columns supported.
void setPrimaryKeyColumns(List<String> primaryKeyColumns) {
this.primaryKeyColumns = primaryKeyColumns;
static @Nullable List<String> primaryKeyColumns(Table table) {
return Optional.ofNullable(table.getTableConstraints())
.flatMap(constraints -> Optional.ofNullable(constraints.getPrimaryKey()))
.map(TableConstraints.PrimaryKey::getColumns)
.orElse(null);
}

static @Nullable List<Integer> primaryColumnFieldIndices(
@Nullable List<String> primaryKeyColumns, Table table) {
if (primaryKeyColumns == null) {
return null;
}
Map<String, Integer> indices =
IntStream.range(0, table.getSchema().getFields().size())
.boxed()
@@ -65,7 +79,13 @@ void setPrimaryKeyColumns(List<String> primaryKeyColumns) {
for (String columnName : primaryKeyColumns) {
primaryKeyColumnIndices.add(Preconditions.checkStateNotNull(indices.get(columnName)));
}
this.primaryKeyColumnIndices = primaryKeyColumnIndices;
return primaryKeyColumnIndices;
}

// Only top-level columns supported.
void setPrimaryKeyColumns(List<String> primaryKeyColumns) {
this.primaryKeyColumns = primaryKeyColumns;
this.primaryKeyColumnIndices = primaryColumnFieldIndices(primaryKeyColumns, table);
}

@Nullable
@@ -80,7 +100,7 @@ List<Object> getPrimaryKey(TableRow tableRow) {
.stream()
.map(cell -> Preconditions.checkStateNotNull(cell.get("v")))
.collect(Collectors.toList());
;

return Preconditions.checkStateNotNull(primaryKeyColumnIndices).stream()
.map(cellValues::get)
.collect(Collectors.toList());
@@ -91,7 +111,7 @@ List<Object> getPrimaryKey(TableRow tableRow) {

long addRow(TableRow row, String id) {
List<Object> primaryKey = getPrimaryKey(row);
if (primaryKey != null) {
if (primaryKey != null && !primaryKey.isEmpty()) {
if (keyedRows.putIfAbsent(primaryKey, row) != null) {
throw new RuntimeException(
"Primary key validation error! Multiple inserts with the same primary key.");
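The TableContainer changes above do two things: extract primary-key columns null-safely from the table metadata, and make addRow reject a second insert with the same non-empty key. A plain-Python sketch (hypothetical FakeTable using dicts, not the Beam test class or the google-api-client models):

```python
# Sketch of the fake-table bookkeeping above: primary-key columns are
# read null-safely from the table constraints, and add_row rejects a
# duplicate non-empty key (mirroring keyedRows.putIfAbsent != null).
class FakeTable:
    def __init__(self, table):
        constraints = table.get("tableConstraints") or {}
        pk = constraints.get("primaryKey") or {}
        self.primary_key_columns = pk.get("columns") or []
        self.keyed_rows = {}
        self.rows = []

    def add_row(self, row):
        key = tuple(row[c] for c in self.primary_key_columns)
        if key:  # mirrors `primaryKey != null && !primaryKey.isEmpty()`
            if key in self.keyed_rows:
                raise RuntimeError(
                    "Primary key validation error! Multiple inserts with "
                    "the same primary key.")
            self.keyed_rows[key] = row
        else:
            self.rows.append(row)
```

Without constraints the key tuple is empty, so every row takes the plain-append path, matching the `!primaryKey.isEmpty()` guard added in addRow.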