Extract Spark configuration, secrets, init_scripts and libraries into the config (#318)

tanya-borisova committed Aug 9, 2023
1 parent e51a793 commit d8188f2
Showing 7 changed files with 143 additions and 26 deletions.
7 changes: 7 additions & 0 deletions config.infra-test.yaml
@@ -13,6 +13,13 @@ transform:
- Bronze
- Silver
- Gold
spark_config:
spark.databricks.cluster.profile: singleNode
spark.master: local[*]
databricks_libraries:
pypi:
- package: opencensus-ext-azure==1.1.9
- package: opencensus-ext-logging==0.1.1

serve:
github_owner: UCLH-FlowEHR-TestBed
30 changes: 29 additions & 1 deletion config.sample.yaml
@@ -24,11 +24,39 @@ transform: # Optional
- url: https://github.com/MY_TRANSFORM_CODE_REPO.git
- url: https://github.com/MY_OTHER_TRANSFORM_CODE_REPO.git
sha: abcd01abcd01abcd01abcd01abcd01abcd01abcd
datalake:
datalake: # Optional
zones:
- Bronze
- Silver
- Gold
spark_config: # Optional
spark.databricks.cluster.profile: singleNode # Configuration for a Single Node cluster
spark.master: local[*]
databricks_secrets:
cog_services_key: my-super-secret-key # On GitHub, this will be a token replacement
databricks_libraries: # Optional
pypi:
- package: opencensus-ext-azure==1.1.9
- package: opencensus-ext-logging==0.1.1
repo: "custom-mirror"
maven:
- coordinates: "com.amazon.deequ:deequ:1.0.4"
repo: "custom-mirror"
exclusions: ["org.apache.avro:avro"]
jar:
- "dbfs:/FileStore/app-0.0.1.jar"
databricks_cluster:
node_type:
min_memory_gb: 128
min_cores: 16
local_disk_min_size: 600
category: "Memory Optimized"
autotermination_minutes: 120
autoscale:
min_workers: 1
max_workers: 3
init_scripts:
- /workspaces/FlowEHR/transform/sample_init_script.sh

serve: # Optional
github_owner: A-GitHub-Org
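As a rough illustration of how these new config keys surface at runtime: entries under databricks_secrets are written into the cluster's Spark config as spark.secret.<name> references (see databricks.tf below), and entries under spark_config are applied verbatim, so transform code can read both through the Spark session. A minimal sketch, assuming the cog_services_key secret from the sample above:

# A hypothetical transform snippet (Python), assuming it runs on the cluster
# provisioned by this Terraform.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
conf = spark.sparkContext.getConf()

# Entries under spark_config are applied verbatim to the cluster's Spark conf.
print(conf.get("spark.master"))  # "local[*]" with the single-node profile above

# Entries under databricks_secrets are exposed as spark.secret.<name>; the
# {{secrets/...}} reference is resolved by Databricks when the cluster starts.
cog_services_key = conf.get("spark.secret.cog_services_key")
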
2 changes: 1 addition & 1 deletion infrastructure/transform/adf.tf
@@ -53,7 +53,7 @@ resource "azurerm_data_factory_linked_service_azure_databricks" "msi_linked" {

msi_work_space_resource_id = azurerm_databricks_workspace.databricks.id

existing_cluster_id = databricks_cluster.fixed_single_node.cluster_id
existing_cluster_id = databricks_cluster.cluster.cluster_id
}

resource "azurerm_data_factory_linked_service_key_vault" "msi_linked" {
91 changes: 67 additions & 24 deletions infrastructure/transform/databricks.tf
@@ -56,30 +56,26 @@ data "databricks_spark_version" "latest" {
]
}

data "databricks_node_type" "smallest" {
# Providing no required configuration, Databricks will pick the smallest node possible
depends_on = [time_sleep.wait_for_databricks_network]
}
data "databricks_node_type" "node_type" {
min_memory_gb = var.transform.databricks_cluster.node_type.min_memory_gb
min_cores = var.transform.databricks_cluster.node_type.min_cores
local_disk_min_size = var.transform.databricks_cluster.node_type.local_disk_min_size
category = var.transform.databricks_cluster.node_type.category

# for prod - this will select something like E16ads v5 => ~$1.18ph whilst running
data "databricks_node_type" "prod" {
min_memory_gb = 128
min_cores = 16
local_disk_min_size = 600
category = "Memory Optimized"
depends_on = [time_sleep.wait_for_databricks_network]
}

resource "databricks_cluster" "fixed_single_node" {
resource "databricks_cluster" "cluster" {
cluster_name = "Fixed Job Cluster"
spark_version = data.databricks_spark_version.latest.id
node_type_id = var.accesses_real_data ? data.databricks_node_type.prod.id : data.databricks_node_type.smallest.id
autotermination_minutes = 10
node_type_id = data.databricks_node_type.node_type.id
autotermination_minutes = var.transform.databricks_cluster.autotermination_minutes
autoscale {
min_workers = var.transform.databricks_cluster.autoscale.min_workers
max_workers = var.transform.databricks_cluster.autoscale.max_workers
}

spark_conf = merge(
tomap({
"spark.databricks.cluster.profile" = "singleNode"
"spark.master" = "local[*]"
}),
# Secrets for SQL Feature store
# Formatted according to syntax for referencing secrets in Spark config:
# https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secrets
@@ -110,18 +106,55 @@ resource "databricks_cluster" "fixed_single_node" {
}),
tomap({ for connection in var.data_source_connections :
"spark.secret.${connection.name}-password" => "{{secrets/${databricks_secret_scope.secrets.name}/flowehr-dbks-${connection.name}-password}}"
})
}),
# Additional secrets from the config
tomap({ for secret_name, secret_value in var.transform.databricks_secrets :
"spark.secret.${secret_name}" => "{{secrets/${databricks_secret_scope.secrets.name}/${secret_name}}}"
}),
# Any values set in the config
var.transform.spark_config
)

library {
pypi {
package = "opencensus-ext-azure==1.1.9"
dynamic "library" {
for_each = var.transform.databricks_libraries.pypi
content {
pypi {
package = library.value.package
repo = library.value.repo
}
}
}

dynamic "library" {
for_each = var.transform.databricks_libraries.maven
content {
maven {
coordinates = library.value.coordinates
repo = library.value.repo
exclusions = library.value.exclusions
}
}
}

library {
pypi {
package = "opencensus-ext-logging==0.1.1"
dynamic "library" {
for_each = var.transform.databricks_libraries.jar
content {
jar = library.value
}
}

dynamic "init_scripts" {
for_each = var.transform.databricks_cluster.init_scripts
content {
dbfs {
destination = "dbfs:/${local.init_scripts_dir}/${basename(init_scripts.value)}"
}
}
}

cluster_log_conf {
dbfs {
destination = "dbfs:/${local.cluster_logs_dir}"
}
}

@@ -136,6 +169,16 @@ resource "databricks_cluster" "fixed_single_node" {
depends_on = [time_sleep.wait_for_databricks_network]
}

resource "databricks_dbfs_file" "dbfs_init_script_upload" {
for_each = toset(var.transform.databricks_cluster.init_scripts)
# Source path on local filesystem
source = each.key
# Path on DBFS
path = "/${local.init_scripts_dir}/${basename(each.key)}"

depends_on = [time_sleep.wait_for_databricks_network]
}

# databricks secret scope, in-built. Not able to use key vault backed scope due to limitation in databricks:
# https://learn.microsoft.com/en-us/azure/databricks/security/secrets/secret-scopes#--create-an-azure-key-vault-backed-secret-scope-using-the-databricks-cli
resource "databricks_secret_scope" "secrets" {
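The dynamic library blocks above install whatever is listed under databricks_libraries on the cluster; with the defaults in config.infra-test.yaml that includes opencensus-ext-azure and opencensus-ext-logging, and the dynamic init_scripts block wires up any scripts uploaded to DBFS by the new databricks_dbfs_file resource. A minimal, hypothetical sketch of transform code using those default libraries (the Application Insights connection string is a placeholder, not something this commit provides):

import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler

logger = logging.getLogger("transform")
# Placeholder connection string -- in practice this could be injected via
# databricks_secrets and read from the Spark conf as shown earlier.
logger.addHandler(AzureLogHandler(
    connection_string="InstrumentationKey=00000000-0000-0000-0000-000000000000"))
logger.warning("Transform pipeline step completed")
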
2 changes: 2 additions & 0 deletions infrastructure/transform/locals.tf
@@ -20,6 +20,8 @@ locals {
pipeline_file = "pipeline.json"
trigger_file = "trigger.json"
artifacts_dir = "artifacts"
init_scripts_dir = "init_scripts"
cluster_logs_dir = "cluster_logs"
adb_linked_service_name = "ADBLinkedServiceViaMSI"
dbfs_storage_account_name = "dbfs${var.naming_suffix_truncated}"
datalake_enabled = try(var.transform.datalake, null) != null
9 changes: 9 additions & 0 deletions infrastructure/transform/secrets.tf
@@ -70,3 +70,12 @@ resource "databricks_secret" "flowehr_databricks_sql_database" {
string_value = azurerm_mssql_database.feature_database.name
scope = databricks_secret_scope.secrets.id
}

# Additional Databricks secrets passed in from the config
resource "databricks_secret" "databricks_config_secret" {
for_each = var.transform.databricks_secrets

key = each.key
string_value = each.value
scope = databricks_secret_scope.secrets.id
}
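
Because each databricks_secrets entry is also materialised as a Databricks secret in the Terraform-managed scope, notebook code can read it directly with dbutils rather than via the Spark config. A sketch, assuming a Databricks notebook context; the scope name is generated by Terraform, so it is listed at runtime here rather than hard-coded:

# Hypothetical notebook cell (Python).
scope_names = [s.name for s in dbutils.secrets.listScopes()]
print(scope_names)  # pick the FlowEHR-managed scope for your deployment

cog_services_key = dbutils.secrets.get(scope=scope_names[0], key="cog_services_key")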
28 changes: 28 additions & 0 deletions infrastructure/transform/variables.tf
@@ -111,6 +111,34 @@ variable "transform" {
datalake = optional(object({
zones = set(string)
}))
spark_config = optional(map(string), {})
databricks_secrets = optional(map(string), {})
databricks_libraries = optional(object({
jar = optional(list(string), []),
pypi = optional(list(object({
package = string,
repo = optional(string)
})), []),
maven = optional(list(object({
coordinates = string,
repo = optional(string),
exclusions = optional(list(string), [])
})), [])
}), {}),
databricks_cluster = optional(object({
node_type = optional(object({
min_memory_gb = optional(number, 0),
min_cores = optional(number, 0),
local_disk_min_size = optional(number, 0),
category = optional(string, "")
}), {}),
autotermination_minutes = optional(number, 0),
init_scripts = optional(list(string), [])
autoscale = optional(object({
min_workers = optional(number, 0)
max_workers = optional(number, 0)
}), {})
}), {})
})
default = {
spark_version = "3.3"
