feat: Adds variables for data governance project and bigquery project (#124)

* Adds variable for data governance project

* Shortens data governance project name

* Shortens data ingestion project name

* Adds ensure_access_policy.sh script usage

* Adds project roles for data governance project

* Uses the topic's full ID to grant roles

* Adds dependency for IAM role assignment because of the perimeter

* Adds dependency to wait for VPC propagation

* Exposes resource variables on the dwt-networking module

* Adds dependency for explicit output

* Changes build order

* Creates key in the data governance project

* Removes dependency

* Adds services to VPC

* Adds sleep to wait for VPC destroy propagation

* Forces wait for propagation on destroy

* Adds dependency for regional DLP, increases sleep time on destroy

* Adds dependency for external resources on test

* Removes dependency

* Removes DLP from perimeter

* Removes dependency on test

* Removes dependency on test

* Removes dependency

* Adds data governance project to regional DLP

* Fixes integration tests for KMS

* Uncomment build steps

* Fixes role for Pub/Sub SA

* De-identification template created in data governance project

* Comment build steps

* Update modules/base-data-ingestion/vpc_service_control.tf

* Uncomment steps on build

* Adds project for BigQuery

* Adds BigQuery project to outputs

* Fixes variables for taxonomy

* Moves providers.tf to test folder

* Comment steps already checked

* Uncomment steps

* Removes unnecessary APIs

Co-authored-by: Daniel Andrade <dandrade@ciandt.com>
amandakarina and daniel-cit authored Sep 10, 2021
1 parent 1b81324 commit e68e760
Showing 59 changed files with 365 additions and 823 deletions.
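For module callers, the net effect of these changes is two additional required inputs on the root module. A minimal sketch of an updated call, mirroring `examples/simple_example` below; every value here is a hypothetical placeholder:

```hcl
module "secured_data_warehouse" {
  source = "../.." # same relative source used in examples/simple_example

  org_id                           = "123456789012"           # hypothetical
  project_id                       = "example-ingestion-prj"  # data ingestion project
  data_governance_project_id       = "example-governance-prj" # new input added by this change
  datalake_project_id              = "example-datalake-prj"   # new input: hosts the BigQuery dataset
  terraform_service_account        = "terraform-sa@example-seed-prj.iam.gserviceaccount.com"
  access_context_manager_policy_id = 987654321
  bucket_name                      = "example-ingest-bucket"
  dataset_id                       = "ingest_dataset"
  cmek_keyring_name                = "example-keyring"
  # ... remaining required inputs from the README table omitted for brevity
}
```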
1 change: 1 addition & 0 deletions README.md
@@ -48,6 +48,7 @@ module "secured_data_warehouse" {
| bucket\_name | The name of the bucket being provisioned. | `string` | n/a | yes |
| cmek\_keyring\_name | The Keyring name for the KMS Customer Managed Encryption Keys being provisioned. | `string` | n/a | yes |
| data\_governance\_project\_id | The ID of the project in which the data governance resources will be created. | `string` | n/a | yes |
| datalake\_project\_id | The ID of the project in which the BigQuery dataset will be created. | `string` | n/a | yes |
| dataset\_default\_table\_expiration\_ms | TTL of tables using the dataset in MS. The default value is almost 12 months. | `number` | `31536000000` | no |
| dataset\_description | Dataset description. | `string` | `"Ingest dataset"` | no |
| dataset\_id | Unique ID for the dataset being provisioned. | `string` | n/a | yes |
73 changes: 37 additions & 36 deletions build/int.cloudbuild.yaml

Large diffs are not rendered by default.

4 changes: 3 additions & 1 deletion examples/batch-data-ingestion/README.md
@@ -13,6 +13,8 @@ This example illustrates how to run a Dataflow job that uses the `batch template
|------|-------------|------|---------|:--------:|
| access\_context\_manager\_policy\_id | The id of the default Access Context Manager policy. Can be obtained by running `gcloud access-context-manager policies list --organization YOUR-ORGANIZATION_ID --format="value(name)"`. | `number` | n/a | yes |
| bucket\_force\_destroy | When deleting a bucket, this boolean option will delete all contained objects. If you try to delete a bucket that contains objects, Terraform will fail that run. | `bool` | `false` | no |
| data\_governance\_project\_id | The ID of the project in which the data governance resources will be created. | `string` | n/a | yes |
| datalake\_project\_id | The ID of the project in which the BigQuery dataset will be created. | `string` | n/a | yes |
| org\_id | GCP Organization ID. | `string` | n/a | yes |
| perimeter\_members | The list of all members to be added on perimeter access. Prefix user: (user:email@email.com) or serviceAccount: (serviceAccount:my-service-account@email.com) is required. | `list(string)` | n/a | yes |
| project\_id | The ID of the project in which the service account will be created. | `string` | n/a | yes |
@@ -27,7 +29,7 @@ This example illustrates how to run a Dataflow job that uses the `batch template
| df\_job\_network | The URI of the VPC being created. |
| df\_job\_region | The region of the newly created Dataflow job. |
| df\_job\_subnetwork | The name of the subnetwork used to create the Dataflow job. |
| project\_id | The project's ID. |
| project\_id | The data ingestion project's ID. |
| scheduler\_id | The ID of the Cloud Scheduler job created. |

<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
2 changes: 1 addition & 1 deletion examples/batch-data-ingestion/httpRequest.tmpl
@@ -11,7 +11,7 @@
},
"parameters": {
"inputFilePattern": "${inputFilePattern}",
"outputTable": "${project_id}:${dataset_id}.${table_name}",
"outputTable": "${bigquery_project_id}:${dataset_id}.${table_name}",
"javascriptTextTransformFunctionName": "${javascriptTextTransformFunctionName}",
"JSONPath": "${JSONPath}",
"javascriptTextTransformGcsPath": "${javascriptTextTransformGcsPath}",
8 changes: 6 additions & 2 deletions examples/batch-data-ingestion/main.tf
@@ -33,7 +33,7 @@ locals {
dataflow_service_account = module.data_ingestion.dataflow_controller_service_account_email,
subnetwork_self_link = module.data_ingestion.subnets_self_links[0],
inputFilePattern = "gs://${module.data_ingestion.data_ingest_bucket_names[0]}/cc_records.csv",
project_id = var.project_id,
bigquery_project_id = var.datalake_project_id,
dataset_id = local.dataset_id,
table_name = local.table_name,
javascriptTextTransformFunctionName = "transform",
@@ -50,7 +50,8 @@ module "data_ingestion" {
dataset_id = local.dataset_id
org_id = var.org_id
project_id = var.project_id
data_governance_project_id = var.project_id
data_governance_project_id = var.data_governance_project_id
datalake_project_id = var.datalake_project_id
region = local.region
bucket_location = local.region
dataset_location = local.region
@@ -79,6 +80,9 @@ module "dataflow_tmp_bucket" {
labels = {
"enterprise_data_ingest_bucket" = "true"
}
depends_on = [
module.data_ingestion.access_level_name
]
}

resource "null_resource" "download_sample_cc_into_gcs" {
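Several of the commit messages above mention sleeps to let VPC Service Controls changes propagate before dependent resources are created or destroyed, and the `depends_on = [module.data_ingestion.access_level_name]` entries added in this example serialize resources behind the access level for the same reason. The wait itself is not visible in the hunks shown here; a hedged sketch of how such a delay is commonly modeled with the hashicorp/time provider (an assumption, not necessarily what this module does):

```hcl
# Sketch only: pause after the data-ingestion module (which creates the
# VPC-SC access level) so the perimeter change can propagate, and pause
# again on destroy before the access level is torn down.
resource "time_sleep" "wait_vpc_sc_propagation" {
  create_duration  = "240s" # hypothetical durations
  destroy_duration = "240s"

  depends_on = [module.data_ingestion]
}
```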
2 changes: 1 addition & 1 deletion examples/batch-data-ingestion/outputs.tf
@@ -15,7 +15,7 @@
*/

output "project_id" {
description = "The project's ID."
description = "The data ingestion project's ID."
value = var.project_id
}

10 changes: 10 additions & 0 deletions examples/batch-data-ingestion/variables.tf
@@ -19,6 +19,16 @@ variable "project_id" {
type = string
}

variable "data_governance_project_id" {
description = "The ID of the project in which the data governance resources will be created."
type = string
}

variable "datalake_project_id" {
description = "The ID of the project in which the Bigquery will be created."
type = string
}

variable "org_id" {
description = "GCP Organization ID."
type = string
2 changes: 2 additions & 0 deletions examples/dataflow-with-dlp/README.md
@@ -15,6 +15,8 @@ This example illustrates how to run a Dataflow job that uses the `de_identificat
| access\_context\_manager\_policy\_id | The id of the default Access Context Manager policy. Can be obtained by running `gcloud access-context-manager policies list --organization YOUR-ORGANIZATION_ID --format="value(name)"`. | `number` | n/a | yes |
| bucket\_force\_destroy | When deleting a bucket, this boolean option will delete all contained objects. If you try to delete a bucket that contains objects, Terraform will fail that run. | `bool` | `false` | no |
| crypto\_key | The full resource name of the Cloud KMS key that wraps the data crypto key used by DLP. | `string` | n/a | yes |
| data\_governance\_project\_id | The ID of the project in which the data governance resources will be created. | `string` | n/a | yes |
| datalake\_project\_id | The ID of the project in which the BigQuery dataset will be created. | `string` | n/a | yes |
| org\_id | GCP Organization ID. | `string` | n/a | yes |
| perimeter\_members | The list of all members to be added on perimeter access. Prefix user: (user:email@email.com) or serviceAccount: (serviceAccount:my-service-account@email.com) is required. | `list(string)` | n/a | yes |
| project\_id | The ID of the project in which the service account will be created. | `string` | n/a | yes |
21 changes: 17 additions & 4 deletions examples/dataflow-with-dlp/main.tf
@@ -25,7 +25,8 @@ module "data_ingestion" {
dataset_id = local.dataset_id
org_id = var.org_id
project_id = var.project_id
data_governance_project_id = var.project_id
data_governance_project_id = var.data_governance_project_id
datalake_project_id = var.datalake_project_id
region = local.region
bucket_location = local.region
dataset_location = local.region
@@ -56,6 +57,10 @@ module "dataflow_tmp_bucket" {
labels = {
"enterprise_data_ingest_bucket" = "true"
}

depends_on = [
module.data_ingestion.access_level_name
]
}
resource "random_id" "original_key" {
byte_length = 16
@@ -76,12 +81,16 @@ resource "null_resource" "download_sample_cc_into_gcs" {
EOF

}

depends_on = [
module.data_ingestion.access_level_name
]
}

module "de_identification_template" {
source = "../..//modules/de_identification_template"

project_id = var.project_id
project_id = var.data_governance_project_id
terraform_service_account = var.terraform_service_account
crypto_key = var.crypto_key
wrapped_key = var.wrapped_key
@@ -111,7 +120,11 @@ module "dataflow_job" {
inputFilePattern = "gs://${module.data_ingestion.data_ingest_bucket_names[0]}/cc_records.csv"
datasetName = local.dataset_id
batchSize = 1000
dlpProjectId = var.project_id
deidentifyTemplateName = "projects/${var.project_id}/locations/${local.region}/deidentifyTemplates/${module.de_identification_template.template_id}"
dlpProjectId = var.data_governance_project_id
deidentifyTemplateName = "projects/${var.data_governance_project_id}/locations/${local.region}/deidentifyTemplates/${module.de_identification_template.template_id}"
}

depends_on = [
module.data_ingestion.access_level_name
]
}
10 changes: 10 additions & 0 deletions examples/dataflow-with-dlp/variables.tf
@@ -39,6 +39,16 @@ variable "project_id" {
type = string
}

variable "data_governance_project_id" {
description = "The ID of the project in which the data governance resources will be created."
type = string
}

variable "datalake_project_id" {
description = "The ID of the project in which the Bigquery will be created."
type = string
}

variable "access_context_manager_policy_id" {
description = "The id of the default Access Context Manager policy. Can be obtained by running `gcloud access-context-manager policies list --organization YOUR-ORGANIZATION_ID --format=\"value(name)\"`."
type = number
2 changes: 2 additions & 0 deletions examples/regional-dlp/README.md
@@ -198,6 +198,8 @@ If your user does not have the necessary roles to run the commands above you can
|------|-------------|------|---------|:--------:|
| access\_context\_manager\_policy\_id | The id of the default Access Context Manager policy. Can be obtained by running `gcloud access-context-manager policies list --organization YOUR-ORGANIZATION_ID --format="value(name)"`. | `number` | n/a | yes |
| crypto\_key | The full resource name of the Cloud KMS key that wraps the data crypto key used by DLP. | `string` | n/a | yes |
| data\_governance\_project\_id | The ID of the project in which the data governance resources will be created. | `string` | n/a | yes |
| datalake\_project\_id | The ID of the project in which the BigQuery dataset will be created. | `string` | n/a | yes |
| location | The location of Artifact registry. Run `gcloud artifacts locations list` to list available locations. | `string` | `"us-central1"` | no |
| org\_id | GCP Organization ID. | `string` | n/a | yes |
| perimeter\_additional\_members | The list of additional members to be added on perimeter access. Prefix user: (user:email@email.com) or serviceAccount: (serviceAccount:my-service-account@email.com) is required. | `list(string)` | `[]` | no |
19 changes: 11 additions & 8 deletions examples/regional-dlp/main.tf
@@ -74,7 +74,8 @@ module "data_ingestion" {
dataset_id = "dlp_flex_ingest"
org_id = var.org_id
project_id = var.project_id
data_governance_project_id = var.project_id
data_governance_project_id = var.data_governance_project_id
datalake_project_id = var.datalake_project_id
terraform_service_account = var.terraform_service_account
access_context_manager_policy_id = var.access_context_manager_policy_id
perimeter_members = concat(["serviceAccount:${var.terraform_service_account}"], var.perimeter_additional_members)
@@ -94,7 +95,7 @@ module "data_ingestion" {
module "de_identification_template_example" {
source = "../..//modules/de_identification_template"

project_id = var.project_id
project_id = var.data_governance_project_id
terraform_service_account = var.terraform_service_account
dataflow_service_account = module.data_ingestion.dataflow_controller_service_account_email
crypto_key = var.crypto_key
@@ -122,7 +123,6 @@ module "flex_dlp_template" {
metadata_file = "${path.module}/files/metadata.json"
requirements_file = "${path.module}/files/requirements.txt"
}

}

module "python_module_repository" {
@@ -134,7 +134,6 @@ module "python_module_repository" {
terraform_service_account = var.terraform_service_account
requirements_filename = "${path.module}/files/requirements.txt"
read_access_members = ["serviceAccount:${module.data_ingestion.dataflow_controller_service_account_email}"]

}

module "dataflow_bucket" {
@@ -151,6 +150,9 @@ module "dataflow_bucket" {
default_kms_key_name = module.data_ingestion.cmek_ingestion_crypto_key
}

depends_on = [
module.data_ingestion.access_level_name
]
}

resource "google_dataflow_flex_template_job" "regional_dlp" {
@@ -163,11 +165,11 @@ resource "google_dataflow_flex_template_job" "regional_dlp" {

parameters = {
input_topic = "projects/${var.project_id}/topics/${module.data_ingestion.data_ingest_topic_name}"
deidentification_template_name = "projects/${var.project_id}/locations/${var.location}/deidentifyTemplates/${module.de_identification_template_example.template_id}"
deidentification_template_name = "projects/${var.data_governance_project_id}/locations/${var.location}/deidentifyTemplates/${module.de_identification_template_example.template_id}"
dlp_location = var.location
dlp_project = var.project_id
dlp_project = var.data_governance_project_id
bq_schema = local.bq_schema
output_table = "${var.project_id}:${module.data_ingestion.data_ingest_bigquery_dataset.dataset_id}.classical_books"
output_table = "${var.datalake_project_id}:${module.data_ingestion.data_ingest_bigquery_dataset.dataset_id}.classical_books"
service_account_email = module.data_ingestion.dataflow_controller_service_account_email
subnetwork = module.data_ingestion.subnets_self_links[0]
dataflow_kms_key = module.data_ingestion.cmek_ingestion_crypto_key
@@ -178,6 +180,7 @@ resource "google_dataflow_flex_template_job" "regional_dlp" {
depends_on = [
module.de_identification_template_example,
module.flex_dlp_template,
module.python_module_repository
module.python_module_repository,
module.data_ingestion.access_level_name
]
}
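With the new variables, each of the three projects contributes a distinct piece of the flex-template parameters: the ingestion project owns the input topic, the governance project owns DLP and the de-identification template, and the datalake project owns the output table. An illustrative expansion of those parameters, with hypothetical project IDs, region, and template ID (the dataset and table names come from this example):

```hcl
# Illustrative only; all project IDs, the region, and the template ID are placeholders.
locals {
  example_parameters = {
    input_topic                    = "projects/example-ingestion-prj/topics/ingest-topic"                            # data ingestion project
    deidentification_template_name = "projects/example-governance-prj/locations/us-east4/deidentifyTemplates/12345" # data governance project
    dlp_project                    = "example-governance-prj"
    output_table                   = "example-datalake-prj:dlp_flex_ingest.classical_books"                         # datalake (BigQuery) project
  }
}
```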
10 changes: 10 additions & 0 deletions examples/regional-dlp/variables.tf
@@ -29,6 +29,16 @@ variable "project_id" {
type = string
}

variable "data_governance_project_id" {
description = "The ID of the project in which the data governance resources will be created."
type = string
}

variable "datalake_project_id" {
description = "The ID of the project in which the Bigquery will be created."
type = string
}

variable "terraform_service_account" {
description = "The email address of the service account that will run the Terraform config."
type = string
1 change: 1 addition & 0 deletions examples/simple_example/README.md
@@ -9,6 +9,7 @@ This example illustrates how to use the `secured-data-warehouse` module.
|------|-------------|------|---------|:--------:|
| access\_context\_manager\_policy\_id | The id of the default Access Context Manager policy. Can be obtained by running `gcloud access-context-manager policies list --organization YOUR-ORGANIZATION_ID --format="value(name)"`. | `number` | n/a | yes |
| data\_governance\_project\_id | The ID of the project in which the data governance resources will be created. | `string` | n/a | yes |
| datalake\_project\_id | The ID of the project in which the BigQuery dataset will be created. | `string` | n/a | yes |
| org\_id | GCP Organization ID. | `string` | n/a | yes |
| project\_id | The ID of the project in which the service account will be created. | `string` | n/a | yes |
| terraform\_service\_account | The email address of the service account that will run the Terraform code. | `string` | n/a | yes |
1 change: 1 addition & 0 deletions examples/simple_example/main.tf
@@ -22,6 +22,7 @@ module "secured_data_warehouse" {
source = "../.."
org_id = var.org_id
data_governance_project_id = var.data_governance_project_id
datalake_project_id = var.datalake_project_id
project_id = var.project_id
terraform_service_account = var.terraform_service_account
access_context_manager_policy_id = var.access_context_manager_policy_id
5 changes: 5 additions & 0 deletions examples/simple_example/variables.tf
@@ -29,6 +29,11 @@ variable "project_id" {
type = string
}

variable "datalake_project_id" {
description = "The ID of the project in which the Bigquery will be created."
type = string
}

variable "terraform_service_account" {
description = "The email address of the service account that will run the Terraform code."
type = string
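For completeness, a hypothetical `terraform.tfvars` for `examples/simple_example` showing the two new project variables alongside the existing inputs; every value is a placeholder:

```hcl
# Hypothetical terraform.tfvars; all values are illustrative placeholders.
org_id                           = "123456789012"
project_id                       = "example-ingestion-prj"
data_governance_project_id       = "example-governance-prj"
datalake_project_id              = "example-datalake-prj"
terraform_service_account        = "terraform-sa@example-seed-prj.iam.gserviceaccount.com"
access_context_manager_policy_id = 987654321
```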
1 change: 1 addition & 0 deletions main.tf
@@ -33,6 +33,7 @@ module "data_ingestion" {
org_id = var.org_id
project_id = var.project_id
data_governance_project_id = var.data_governance_project_id
datalake_project_id = var.datalake_project_id
terraform_service_account = var.terraform_service_account
vpc_name = var.vpc_name
access_context_manager_policy_id = var.access_context_manager_policy_id
1 change: 1 addition & 0 deletions modules/base-data-ingestion/README.md
@@ -195,6 +195,7 @@ If your user does not have the necessary roles to run the commands above you can
| cmek\_keyring\_name | The Keyring name for the KMS Customer Managed Encryption Keys. | `string` | n/a | yes |
| cmek\_location | The location for the KMS Customer Managed Encryption Keys. | `string` | n/a | yes |
| data\_governance\_project\_id | The ID of the project in which the data governance resources will be created. | `string` | n/a | yes |
| datalake\_project\_id | The ID of the project in which the BigQuery dataset will be created. | `string` | n/a | yes |
| dataset\_default\_table\_expiration\_ms | TTL of tables using the dataset in MS. The default value is almost 12 months. | `number` | `31536000000` | no |
| dataset\_description | Dataset description. | `string` | `"Ingest dataset"` | no |
| dataset\_id | Unique ID for the dataset being provisioned. | `string` | n/a | yes |
10 changes: 9 additions & 1 deletion modules/base-data-ingestion/kms.tf
@@ -47,12 +47,20 @@ data "google_project" "ingestion_project" {
project_id = var.project_id
}

data "google_project" "governance_project" {
project_id = var.data_governance_project_id
}

data "google_project" "datalake_project" {
project_id = var.datalake_project_id
}

data "google_storage_project_service_account" "gcs_account" {
project = var.project_id
}

data "google_bigquery_default_service_account" "bigquery_sa" {
project = var.project_id
project = var.datalake_project_id
}

resource "google_project_service_identity" "pubsub_sa" {
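The two new `google_project` data sources and the relocated BigQuery default service account lookup are consumed further down in `kms.tf`, outside the rendered hunk. As a hedged sketch of the typical pattern, a grant like the following would let the datalake project's BigQuery service agent use an ingestion CMEK key; the key resource name `google_kms_crypto_key.example` is hypothetical, and only the data source comes from this diff:

```hcl
# Sketch only: the key name "example" is a hypothetical placeholder.
resource "google_kms_crypto_key_iam_member" "bigquery_sa_encrypter_decrypter" {
  crypto_key_id = google_kms_crypto_key.example.id
  role          = "roles/cloudkms.cryptoKeyEncrypterDecrypter"
  member        = "serviceAccount:${data.google_bigquery_default_service_account.bigquery_sa.email}"
}
```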
2 changes: 1 addition & 1 deletion modules/base-data-ingestion/main.tf
@@ -55,7 +55,7 @@ module "bigquery_dataset" {
source = "terraform-google-modules/bigquery/google"
version = "~> 5.1"

project_id = var.project_id
project_id = var.datalake_project_id
dataset_id = var.dataset_id
dataset_name = var.dataset_name
description = var.dataset_description
10 changes: 5 additions & 5 deletions modules/base-data-ingestion/service_accounts.tf
@@ -24,7 +24,7 @@ module "dataflow_controller_service_account" {
display_name = "Cloud Dataflow controller service account"
project_roles = [
"${var.project_id}=>roles/pubsub.subscriber",
"${var.project_id}=>roles/bigquery.admin",
"${var.datalake_project_id}=>roles/bigquery.admin",
"${var.project_id}=>roles/cloudkms.admin",
"${var.project_id}=>roles/cloudkms.cryptoKeyDecrypter",
"${var.project_id}=>roles/dlp.admin",
@@ -62,15 +62,15 @@ resource "google_service_account" "pubsub_writer_service_account" {
}

resource "google_pubsub_topic_iam_member" "publisher" {
project = var.data_governance_project_id
topic = module.data_ingest_topic.topic
project = var.project_id
topic = module.data_ingest_topic.id
role = "roles/pubsub.publisher"
member = "serviceAccount:${google_service_account.pubsub_writer_service_account.email}"
}

resource "google_pubsub_topic_iam_member" "subscriber" {
project = var.data_governance_project_id
topic = module.data_ingest_topic.topic
project = var.project_id
topic = module.data_ingest_topic.id
role = "roles/pubsub.subscriber"
member = "serviceAccount:${google_service_account.pubsub_writer_service_account.email}"
}
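The bindings now name the topic by its full resource ID (per the "Uses the topic's full ID to grant roles" commit above) and set `project` to the data ingestion project that actually owns it, instead of the governance project. For illustration, the two forms a Pub/Sub topic can be referenced by, with hypothetical names:

```hcl
# Illustrative only; names are placeholders.
locals {
  topic_short_name = "ingest-topic"                                       # name only; relies on a separate project argument
  topic_full_id    = "projects/example-ingestion-prj/topics/ingest-topic" # full resource ID, which already encodes the project
}
```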
5 changes: 5 additions & 0 deletions modules/base-data-ingestion/variables.tf
@@ -40,6 +40,11 @@ variable "data_governance_project_id" {
type = string
}

variable "datalake_project_id" {
description = "The ID of the project in which the Bigquery will be created."
type = string
}

variable "vpc_name" {
description = "The name of the network."
type = string