From d8188f2abdcf199db5185b6551f55a6d0dcae2b2 Mon Sep 17 00:00:00 2001
From: tanya-borisova
Date: Wed, 9 Aug 2023 14:51:25 +0100
Subject: [PATCH] Extract Spark configuration, secrets, init_scripts and libraries into the config (#318)

---
 config.infra-test.yaml                 |  7 ++
 config.sample.yaml                     | 30 ++++++++-
 infrastructure/transform/adf.tf        |  2 +-
 infrastructure/transform/databricks.tf | 91 +++++++++++++++++++-------
 infrastructure/transform/locals.tf     |  2 +
 infrastructure/transform/secrets.tf    |  9 +++
 infrastructure/transform/variables.tf  | 28 ++++++++
 7 files changed, 143 insertions(+), 26 deletions(-)

diff --git a/config.infra-test.yaml b/config.infra-test.yaml
index b8fa4c6b..ead14260 100644
--- a/config.infra-test.yaml
+++ b/config.infra-test.yaml
@@ -13,6 +13,13 @@ transform:
       - Bronze
       - Silver
       - Gold
+  spark_config:
+    spark.databricks.cluster.profile: singleNode
+    spark.master: local[*]
+  databricks_libraries:
+    pypi:
+      - package: opencensus-ext-azure==1.1.9
+      - package: opencensus-ext-logging==0.1.1
 
 serve:
   github_owner: UCLH-FlowEHR-TestBed
diff --git a/config.sample.yaml b/config.sample.yaml
index 44773665..b8259c88 100644
--- a/config.sample.yaml
+++ b/config.sample.yaml
@@ -24,11 +24,39 @@ transform:
     # Optional
     - url: https://github.com/MY_TRANSFORM_CODE_REPO.git
    - url: https://github.com/MY_OTHER_TRANSFORM_CODE_REPO.git
       sha: abcd01abcd01abcd01abcd01abcd01abcd01abcd
-  datalake:
+  datalake: # Optional
     zones:
       - Bronze
       - Silver
       - Gold
+  spark_config: # Optional
+    spark.databricks.cluster.profile: singleNode # Configuration for a Single Node cluster
+    spark.master: local[*]
+  databricks_secrets:
+    cog_services_key: my-super-secret-key # On GitHub, this will be a token replacement
+  databricks_libraries: # Optional
+    pypi:
+      - package: opencensus-ext-azure==1.1.9
+      - package: opencensus-ext-logging==0.1.1
+        repo: "custom-mirror"
+    maven:
+      - coordinates: "com.amazon.deequ:deequ:1.0.4"
+        repo: "custom-mirror"
+        exclusions: ["org.apache.avro:avro"]
+    jar:
+      - "dbfs:/FileStore/app-0.0.1.jar"
+  databricks_cluster:
+    node_type:
+      min_memory_gb: 128
+      min_cores: 16
+      local_disk_min_size: 600
+      category: "Memory Optimized"
+    autotermination_minutes: 120
+    autoscale:
+      min_workers: 1
+      max_workers: 3
+    init_scripts:
+      - /workspaces/FlowEHR/transform/sample_init_script.sh
 
 serve: # Optional
   github_owner: A-GitHub-Org
diff --git a/infrastructure/transform/adf.tf b/infrastructure/transform/adf.tf
index 0f16ebeb..347602e4 100644
--- a/infrastructure/transform/adf.tf
+++ b/infrastructure/transform/adf.tf
@@ -53,7 +53,7 @@ resource "azurerm_data_factory_linked_service_azure_databricks" "msi_linked" {
 
   msi_work_space_resource_id = azurerm_databricks_workspace.databricks.id
 
-  existing_cluster_id        = databricks_cluster.fixed_single_node.cluster_id
+  existing_cluster_id        = databricks_cluster.cluster.cluster_id
 }
 
 resource "azurerm_data_factory_linked_service_key_vault" "msi_linked" {
diff --git a/infrastructure/transform/databricks.tf b/infrastructure/transform/databricks.tf
index 20c54341..0d5339e2 100644
--- a/infrastructure/transform/databricks.tf
+++ b/infrastructure/transform/databricks.tf
@@ -56,30 +56,26 @@ data "databricks_spark_version" "latest" {
   ]
 }
 
-data "databricks_node_type" "smallest" {
-  # Providing no required configuration, Databricks will pick the smallest node possible
-  depends_on = [time_sleep.wait_for_databricks_network]
-}
+data "databricks_node_type" "node_type" {
+  min_memory_gb       = var.transform.databricks_cluster.node_type.min_memory_gb
+  min_cores           = var.transform.databricks_cluster.node_type.min_cores
+  local_disk_min_size = var.transform.databricks_cluster.node_type.local_disk_min_size
+  category            = var.transform.databricks_cluster.node_type.category
 
-# for prod - this will select something like E16ads v5 => ~$1.18ph whilst running
-data "databricks_node_type" "prod" {
-  min_memory_gb       = 128
-  min_cores           = 16
-  local_disk_min_size = 600
-  category            = "Memory Optimized"
+  depends_on = [time_sleep.wait_for_databricks_network]
 }
 
-resource "databricks_cluster" "fixed_single_node" {
+resource "databricks_cluster" "cluster" {
   cluster_name            = "Fixed Job Cluster"
   spark_version           = data.databricks_spark_version.latest.id
-  node_type_id            = var.accesses_real_data ? data.databricks_node_type.prod.id : data.databricks_node_type.smallest.id
-  autotermination_minutes = 10
+  node_type_id            = data.databricks_node_type.node_type.id
+  autotermination_minutes = var.transform.databricks_cluster.autotermination_minutes
+  autoscale {
+    min_workers = var.transform.databricks_cluster.autoscale.min_workers
+    max_workers = var.transform.databricks_cluster.autoscale.max_workers
+  }
 
   spark_conf = merge(
-    tomap({
-      "spark.databricks.cluster.profile" = "singleNode"
-      "spark.master"                     = "local[*]"
-    }),
     # Secrets for SQL Feature store
     # Formatted according to syntax for referencing secrets in Spark config:
     # https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secrets
@@ -110,18 +106,55 @@ resource "databricks_cluster" "fixed_single_node" {
     }),
     tomap({ for connection in var.data_source_connections :
       "spark.secret.${connection.name}-password" => "{{secrets/${databricks_secret_scope.secrets.name}/flowehr-dbks-${connection.name}-password}}"
-    })
+    }),
+    # Additional secrets from the config
+    tomap({ for secret_name, secret_value in var.transform.databricks_secrets :
+      "spark.secret.${secret_name}" => "{{secrets/${databricks_secret_scope.secrets.name}/${secret_name}}}"
+    }),
+    # Any values set in the config
+    var.transform.spark_config
   )
 
-  library {
-    pypi {
-      package = "opencensus-ext-azure==1.1.9"
+  dynamic "library" {
+    for_each = var.transform.databricks_libraries.pypi
+    content {
+      pypi {
+        package = library.value.package
+        repo    = library.value.repo
+      }
+    }
+  }
+
+  dynamic "library" {
+    for_each = var.transform.databricks_libraries.maven
+    content {
+      maven {
+        coordinates = library.value.coordinates
+        repo        = library.value.repo
+        exclusions  = library.value.exclusions
+      }
     }
   }
 
-  library {
-    pypi {
-      package = "opencensus-ext-logging==0.1.1"
+  dynamic "library" {
+    for_each = var.transform.databricks_libraries.jar
+    content {
+      jar = library.value
+    }
+  }
+
+  dynamic "init_scripts" {
+    for_each = var.transform.databricks_cluster.init_scripts
+    content {
+      dbfs {
+        destination = "dbfs:/${local.init_scripts_dir}/${basename(init_scripts.value)}"
+      }
+    }
+  }
+
+  cluster_log_conf {
+    dbfs {
+      destination = "dbfs:/${local.cluster_logs_dir}"
     }
   }
 
@@ -136,6 +169,16 @@
   depends_on = [time_sleep.wait_for_databricks_network]
 }
 
+resource "databricks_dbfs_file" "dbfs_init_script_upload" {
+  for_each = toset(var.transform.databricks_cluster.init_scripts)
+  # Source path on local filesystem
+  source = each.key
+  # Path on DBFS
+  path = "/${local.init_scripts_dir}/${basename(each.key)}"
+
+  depends_on = [time_sleep.wait_for_databricks_network]
+}
+
 # databricks secret scope, in-built. Not able to use key vault backed scope due to limitation in databricks:
 # https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope-using-the-databricks-cli
 resource "databricks_secret_scope" "secrets" {
diff --git a/infrastructure/transform/locals.tf b/infrastructure/transform/locals.tf
index 3e06109c..c69d1076 100644
--- a/infrastructure/transform/locals.tf
+++ b/infrastructure/transform/locals.tf
@@ -20,6 +20,8 @@ locals {
   pipeline_file             = "pipeline.json"
   trigger_file              = "trigger.json"
   artifacts_dir             = "artifacts"
+  init_scripts_dir          = "init_scripts"
+  cluster_logs_dir          = "cluster_logs"
   adb_linked_service_name   = "ADBLinkedServiceViaMSI"
   dbfs_storage_account_name = "dbfs${var.naming_suffix_truncated}"
   datalake_enabled          = try(var.transform.datalake, null) != null
diff --git a/infrastructure/transform/secrets.tf b/infrastructure/transform/secrets.tf
index b67c0a46..89e11a0a 100644
--- a/infrastructure/transform/secrets.tf
+++ b/infrastructure/transform/secrets.tf
@@ -70,3 +70,12 @@ resource "databricks_secret" "flowehr_databricks_sql_database" {
   string_value = azurerm_mssql_database.feature_database.name
   scope        = databricks_secret_scope.secrets.id
 }
+
+# Additional Databricks secrets passed in from the config
+resource "databricks_secret" "databricks_config_secret" {
+  for_each = var.transform.databricks_secrets
+
+  key          = each.key
+  string_value = each.value
+  scope        = databricks_secret_scope.secrets.id
+}
diff --git a/infrastructure/transform/variables.tf b/infrastructure/transform/variables.tf
index 73ecfb0b..dc27cd8f 100644
--- a/infrastructure/transform/variables.tf
+++ b/infrastructure/transform/variables.tf
@@ -111,6 +111,34 @@ variable "transform" {
     datalake = optional(object({
      zones = set(string)
     }))
+    spark_config       = optional(map(string), {})
+    databricks_secrets = optional(map(string), {})
+    databricks_libraries = optional(object({
+      jar = optional(list(string), []),
+      pypi = optional(list(object({
+        package = string,
+        repo    = optional(string)
+      })), []),
+      maven = optional(list(object({
+        coordinates = string,
+        repo        = optional(string),
+        exclusions  = optional(list(string), [])
+      })), [])
+    }), {}),
+    databricks_cluster = optional(object({
+      node_type = optional(object({
+        min_memory_gb       = optional(number, 0),
+        min_cores           = optional(number, 0),
+        local_disk_min_size = optional(number, 0),
+        category            = optional(string, "")
+      }), {}),
+      autotermination_minutes = optional(number, 0),
+      init_scripts            = optional(list(string), [])
+      autoscale = optional(object({
+        min_workers = optional(number, 0)
+        max_workers = optional(number, 0)
+      }), {})
+    }), {})
   })
   default = {
     spark_version = "3.3"
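
Note on consuming the new config-driven values from transform code: each key in `databricks_secrets` is stored in the in-built secret scope and surfaced to the cluster as a Spark conf entry `spark.secret.<name>` whose value is a `{{secrets/<scope>/<name>}}` reference, resolved by Databricks when the cluster starts, and entries from the free-form `spark_config` map become ordinary conf values. The sketch below is a minimal, hypothetical PySpark example and not part of this patch; it assumes transform jobs run as PySpark on this cluster and uses `cog_services_key` only because that is the placeholder name in config.sample.yaml.

```python
# Minimal sketch: reading config-driven values inside a transform job running
# on the cluster defined by this change. Assumes PySpark and the sample secret
# name `cog_services_key` from config.sample.yaml; adapt names to your config.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Entries from `databricks_secrets` arrive as `spark.secret.<name>` conf keys;
# the {{secrets/<scope>/<name>}} reference is resolved by Databricks, so this
# returns the secret value itself rather than the reference string.
cog_services_key = spark.conf.get("spark.secret.cog_services_key")

# Entries from the free-form `spark_config` map are plain conf values.
cluster_profile = spark.conf.get("spark.databricks.cluster.profile", "not set")

print(f"cluster profile: {cluster_profile}, secret loaded: {bool(cog_services_key)}")
```

Printing only a boolean here is a deliberate choice so the secret value itself is never echoed into job output or cluster logs.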