diff --git a/.codegen.json b/.codegen.json index da4f3dd610..1b29f979dc 100644 --- a/.codegen.json +++ b/.codegen.json @@ -8,6 +8,12 @@ ".codegen/cmds-account.go.tmpl": "cmd/account/cmd.go" }, "toolchain": { - "required": ["go"] + "required": ["go"], + "post_generate": [ + "go run ./bundle/internal/bundle/schema/main.go ./bundle/schema/docs/bundle_descriptions.json", + "echo 'bundle/internal/tf/schema/\\*.go linguist-generated=true' >> ./.gitattributes", + "echo 'go.sum linguist-generated=true' >> ./.gitattributes", + "echo 'bundle/schema/docs/bundle_descriptions.json linguist-generated=true' >> ./.gitattributes" + ] } } diff --git a/.gitattributes b/.gitattributes index ddd698a02e..f338932013 100755 --- a/.gitattributes +++ b/.gitattributes @@ -83,3 +83,6 @@ cmd/workspace/warehouses/warehouses.go linguist-generated=true cmd/workspace/workspace-bindings/workspace-bindings.go linguist-generated=true cmd/workspace/workspace-conf/workspace-conf.go linguist-generated=true cmd/workspace/workspace/workspace.go linguist-generated=true +bundle/internal/tf/schema/\*.go linguist-generated=true +go.sum linguist-generated=true +bundle/schema/docs/bundle_descriptions.json linguist-generated=true diff --git a/bundle/config/root.go b/bundle/config/root.go index 1fb5773b6b..32baa1a506 100644 --- a/bundle/config/root.go +++ b/bundle/config/root.go @@ -48,7 +48,7 @@ type Root struct { Targets map[string]*Target `json:"targets,omitempty"` // DEPRECATED. Left for backward compatibility with Targets - Environments map[string]*Target `json:"environments,omitempty"` + Environments map[string]*Target `json:"environments,omitempty" bundle:"deprecated"` // Sync section specifies options for files synchronization Sync Sync `json:"sync,omitempty"` diff --git a/bundle/internal/bundle/schema/main.go b/bundle/internal/bundle/schema/main.go new file mode 100644 index 0000000000..c9cc7cd4fb --- /dev/null +++ b/bundle/internal/bundle/schema/main.go @@ -0,0 +1,42 @@ +package main + +import ( + "encoding/json" + "fmt" + "log" + "os" + + "github.com/databricks/cli/bundle/schema" +) + +func main() { + if len(os.Args) != 2 { + fmt.Println("Usage: go run main.go ") + os.Exit(1) + } + + // Output file, to write the generated schema descriptions to. + outputFile := os.Args[1] + + // Input file, the databricks openapi spec. + inputFile := os.Getenv("DATABRICKS_OPENAPI_SPEC") + if inputFile == "" { + log.Fatal("DATABRICKS_OPENAPI_SPEC environment variable not set") + } + + // Generate the schema descriptions. + docs, err := schema.UpdateBundleDescriptions(inputFile) + if err != nil { + log.Fatal(err) + } + result, err := json.MarshalIndent(docs, "", " ") + if err != nil { + log.Fatal(err) + } + + // Write the schema descriptions to the output file. + err = os.WriteFile(outputFile, result, 0644) + if err != nil { + log.Fatal(err) + } +} diff --git a/bundle/schema/README.md b/bundle/schema/README.md index fe6b149c14..bf6b87df61 100644 --- a/bundle/schema/README.md +++ b/bundle/schema/README.md @@ -13,15 +13,6 @@ These descriptions are rendered in the inline documentation in an IDE ### SOP: Add schema descriptions for new fields in bundle config -1. You can autogenerate empty descriptions for the new fields by running -`databricks bundle schema --only-docs > ~/databricks/bundle/schema/docs/bundle_descriptions.json` -2. Manually edit bundle_descriptions.json to add your descriptions -3. Build again to embed the new `bundle_descriptions.json` into the binary (`go build`) -4. 
Again run `databricks bundle schema --only-docs > ~/databricks/bundle/schema/docs/bundle_descriptions.json` to copy over any applicable descriptions to `targets`
-5. push to repo
-
-
-### SOP: Update descriptions in resources from a newer openapi spec
-
-1. Run `databricks bundle schema --only-docs --openapi PATH_TO_SPEC > ~/databricks/bundle/schema/docs/bundle_descriptions.json`
-2. push to repo
+Manually edit bundle_descriptions.json to add your descriptions. Note that the
+descriptions in the `resources` block are generated from the OpenAPI spec, and thus
+any changes there will be overwritten.
diff --git a/bundle/schema/docs.go b/bundle/schema/docs.go
index 4b2fd36aef..fe63e43284 100644
--- a/bundle/schema/docs.go
+++ b/bundle/schema/docs.go
@@ -23,39 +23,6 @@ type Docs struct {
 //go:embed docs/bundle_descriptions.json
 var bundleDocs []byte
 
-func BundleDocs(openapiSpecPath string) (*Docs, error) {
-	docs, err := initializeBundleDocs()
-	if err != nil {
-		return nil, err
-	}
-	if openapiSpecPath != "" {
-		openapiSpec, err := os.ReadFile(openapiSpecPath)
-		if err != nil {
-			return nil, err
-		}
-		spec := &openapi.Specification{}
-		err = json.Unmarshal(openapiSpec, spec)
-		if err != nil {
-			return nil, err
-		}
-		openapiReader := &OpenapiReader{
-			OpenapiSpec: spec,
-			Memo:        make(map[string]*jsonschema.Schema),
-		}
-		resourcesDocs, err := openapiReader.ResourcesDocs()
-		if err != nil {
-			return nil, err
-		}
-		resourceSchema, err := New(reflect.TypeOf(config.Resources{}), resourcesDocs)
-		if err != nil {
-			return nil, err
-		}
-		docs.Properties["resources"] = schemaToDocs(resourceSchema)
-	}
-	docs.refreshTargetsDocs()
-	return docs, nil
-}
-
 func (docs *Docs) refreshTargetsDocs() error {
 	targetsDocs, ok := docs.Properties["targets"]
 	if !ok || targetsDocs.AdditionalProperties == nil ||
@@ -70,21 +37,53 @@ func (docs *Docs) refreshTargetsDocs() error {
 	return nil
 }
 
-func initializeBundleDocs() (*Docs, error) {
-	// load embedded descriptions
+func LoadBundleDescriptions() (*Docs, error) {
 	embedded := Docs{}
 	err := json.Unmarshal(bundleDocs, &embedded)
+	return &embedded, err
+}
+
+func UpdateBundleDescriptions(openapiSpecPath string) (*Docs, error) {
+	embedded, err := LoadBundleDescriptions()
 	if err != nil {
 		return nil, err
 	}
-	// generate schema with the embedded descriptions
-	schema, err := New(reflect.TypeOf(config.Root{}), &embedded)
+
+	// Generate schema from the embedded descriptions, and convert it back to docs.
+	// This creates empty descriptions for any properties that were missing in the
+	// embedded descriptions.
+	schema, err := New(reflect.TypeOf(config.Root{}), embedded)
 	if err != nil {
 		return nil, err
 	}
-	// converting the schema back to docs. 
This creates empty descriptions - // for any properties that were missing in the embedded descriptions docs := schemaToDocs(schema) + + // Load the Databricks OpenAPI spec + openapiSpec, err := os.ReadFile(openapiSpecPath) + if err != nil { + return nil, err + } + spec := &openapi.Specification{} + err = json.Unmarshal(openapiSpec, spec) + if err != nil { + return nil, err + } + openapiReader := &OpenapiReader{ + OpenapiSpec: spec, + Memo: make(map[string]*jsonschema.Schema), + } + + // Generate descriptions for the "resources" field + resourcesDocs, err := openapiReader.ResourcesDocs() + if err != nil { + return nil, err + } + resourceSchema, err := New(reflect.TypeOf(config.Resources{}), resourcesDocs) + if err != nil { + return nil, err + } + docs.Properties["resources"] = schemaToDocs(resourceSchema) + docs.refreshTargetsDocs() return docs, nil } diff --git a/bundle/schema/docs/bundle_descriptions.json b/bundle/schema/docs/bundle_descriptions.json index 98f3cf8d0a..09462fb0ae 100644 --- a/bundle/schema/docs/bundle_descriptions.json +++ b/bundle/schema/docs/bundle_descriptions.json @@ -6,13 +6,25 @@ "additionalproperties": { "description": "", "properties": { - "notebook": { + "build": { + "description": "" + }, + "files": { "description": "", - "properties": { - "path": { - "description": "" + "items": { + "description": "", + "properties": { + "source": { + "description": "" + } } } + }, + "path": { + "description": "" + }, + "type": { + "description": "" } } } @@ -20,6 +32,9 @@ "bundle": { "description": "The details for this bundle.", "properties": { + "compute_id": { + "description": "" + }, "git": { "description": "", "properties": { @@ -36,1370 +51,2256 @@ } } }, - "targets": { + "experimental": { "description": "", - "additionalproperties": { + "properties": { + "python_wheel_wrapper": { + "description": "" + }, + "scripts": { + "description": "", + "additionalproperties": { + "description": "" + } + } + } + }, + "include": { + "description": "A list of glob patterns of files to load and merge into the this configuration. Defaults to no files being included.", + "items": { + "description": "" + } + }, + "permissions": { + "description": "", + "items": { "description": "", "properties": { - "artifacts": { - "description": "A description of all code artifacts in this bundle.", - "additionalproperties": { - "description": "", - "properties": { - "notebook": { - "description": "", - "properties": { - "path": { - "description": "" - } - } - } - } - } + "group_name": { + "description": "" }, - "bundle": { - "description": "The details for this bundle.", - "properties": { - "git": { - "description": "", - "properties": { - "branch": { - "description": "" - }, - "origin_url": { - "description": "" - } - } - }, - "name": { - "description": "The name of the bundle." - } - } + "level": { + "description": "" }, - "default": { + "service_principal_name": { "description": "" }, - "resources": { - "description": "Collection of Databricks resources to deploy.", + "user_name": { + "description": "" + } + } + } + }, + "resources": { + "description": "Collection of Databricks resources to deploy.", + "properties": { + "experiments": { + "description": "List of MLflow experiments", + "additionalproperties": { + "description": "", "properties": { - "experiments": { - "description": "List of MLflow experiments", - "additionalproperties": { + "artifact_location": { + "description": "Location where artifacts for the experiment are stored." 
+ }, + "creation_time": { + "description": "Creation time" + }, + "experiment_id": { + "description": "Unique identifier for the experiment." + }, + "last_update_time": { + "description": "Last update time" + }, + "lifecycle_stage": { + "description": "Current life cycle stage of the experiment: \"active\" or \"deleted\".\nDeleted experiments are not returned by APIs." + }, + "name": { + "description": "Human readable name that identifies the experiment." + }, + "permissions": { + "description": "", + "items": { "description": "", "properties": { - "artifact_location": { - "description": "Location where artifacts for the experiment are stored." - }, - "creation_time": { - "description": "Creation time" - }, - "experiment_id": { - "description": "Unique identifier for the experiment." - }, - "last_update_time": { - "description": "Last update time" - }, - "lifecycle_stage": { - "description": "Current life cycle stage of the experiment: \"active\" or \"deleted\".\nDeleted experiments are not returned by APIs." + "group_name": { + "description": "" }, - "name": { - "description": "Human readable name that identifies the experiment." + "level": { + "description": "" }, - "permissions": { - "description": "", - "items": { - "description": "", - "properties": { - "group_name": { - "description": "" - }, - "level": { - "description": "" - }, - "service_principal_name": { - "description": "" - }, - "user_name": { - "description": "" - } - } - } + "service_principal_name": { + "description": "" }, - "tags": { - "description": "Tags: Additional metadata key-value pairs.", - "items": { - "description": "", - "properties": { - "key": { - "description": "The tag key." - }, - "value": { - "description": "The tag value." - } - } - } + "user_name": { + "description": "" } } } }, - "jobs": { - "description": "List of Databricks jobs", - "additionalproperties": { + "tags": { + "description": "Tags: Additional metadata key-value pairs.", + "items": { "description": "", "properties": { - "compute": { - "description": "A list of compute requirements that can be referenced by tasks of this job.", - "items": { - "description": "", - "properties": { - "compute_key": { - "description": "A unique name for the compute requirement. This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine the compute requirements for the task execution." - }, - "spec": { - "description": "", - "properties": { - "kind": { - "description": "The kind of compute described by this compute specification." - } - } - } - } - } + "key": { + "description": "The tag key." }, - "continuous": { - "description": "An optional continuous property for this job. The continuous property will ensure that there is always one run executing. Only one of `schedule` and `continuous` can be used.", - "properties": { - "pause_status": { - "description": "Whether this trigger is paused or not." - } - } + "value": { + "description": "The tag value." + } + } + } + } + } + } + }, + "jobs": { + "description": "List of Databricks jobs", + "additionalproperties": { + "description": "", + "properties": { + "compute": { + "description": "A list of compute requirements that can be referenced by tasks of this job.", + "items": { + "description": "", + "properties": { + "compute_key": { + "description": "A unique name for the compute requirement. This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine the compute requirements for the task execution." 
}, - "email_notifications": { - "description": "An optional set of email addresses that is notified when runs of this job begin or complete as well as when this job is deleted. The default behavior is to not send any emails.", + "spec": { + "description": "", "properties": { - "no_alert_for_skipped_runs": { - "description": "If true, do not send email to recipients specified in `on_failure` if the run is skipped." - }, - "on_failure": { - "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", - "items": { - "description": "" - } - }, - "on_start": { - "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", - "items": { - "description": "" - } - }, - "on_success": { - "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESS` result_state. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", - "items": { - "description": "" - } + "kind": { + "description": "The kind of compute described by this compute specification." } } - }, - "format": { - "description": "Used to tell what is the format of the job. This field is ignored in Create/Update/Reset calls. When using the Jobs API 2.1 this value is always set to `\"MULTI_TASK\"`." - }, - "git_source": { - "description": "An optional specification for a remote repository containing the notebooks used by this job's notebook tasks.", - "properties": { - "git_branch": { - "description": "Name of the branch to be checked out and used by this job.\nThis field cannot be specified in conjunction with git_tag or git_commit.\n\nThe maximum length is 255 characters.\n" - }, - "git_commit": { - "description": "Commit to be checked out and used by this job. This field cannot be specified in conjunction with git_branch or git_tag.\nThe maximum length is 64 characters." - }, - "git_provider": { - "description": "Unique identifier of the service used to host the Git repository. The value is case insensitive." - }, - "git_snapshot": { - "description": "", - "properties": { - "used_commit": { - "description": "Commit that was used to execute the run. If git_branch was specified, this points to the HEAD of the branch at the time of the run; if git_tag was specified, this points to the commit the tag points to." - } - } + } + } + } + }, + "continuous": { + "description": "An optional continuous property for this job. The continuous property will ensure that there is always one run executing. Only one of `schedule` and `continuous` can be used.", + "properties": { + "pause_status": { + "description": "Indicate whether this schedule is paused or not." + } + } + }, + "deployment": { + "description": "Deployment information for jobs managed by external sources.", + "properties": { + "kind": { + "description": "The kind of deployment that manages the job.\n\n* `BUNDLE`: The job is managed by Databricks Asset Bundle.\n" + }, + "metadata_file_path": { + "description": "Path of the file that contains deployment metadata." 
+ } + } + }, + "description": { + "description": "An optional description for the job. The maximum length is 1024 characters in UTF-8 encoding." + }, + "edit_mode": { + "description": "Edit mode of the job.\n\n* `UI_LOCKED`: The job is in a locked UI state and cannot be modified.\n* `EDITABLE`: The job is in an editable state and can be modified.\n" + }, + "email_notifications": { + "description": "An optional set of email addresses that is notified when runs of this job begin or complete as well as when this job is deleted.", + "properties": { + "no_alert_for_skipped_runs": { + "description": "If true, do not send email to recipients specified in `on_failure` if the run is skipped." + }, + "on_duration_warning_threshold_exceeded": { + "description": "A list of email addresses to be notified when the duration of a run exceeds the threshold specified for the `RUN_DURATION_SECONDS` metric in the `health` field. If no rule for the `RUN_DURATION_SECONDS` metric is specified in the `health` field for the job, notifications are not sent.", + "items": { + "description": "" + } + }, + "on_failure": { + "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_start": { + "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_success": { + "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESS` result_state. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + } + } + }, + "format": { + "description": "Used to tell what is the format of the job. This field is ignored in Create/Update/Reset calls. When using the Jobs API 2.1 this value is always set to `\"MULTI_TASK\"`." + }, + "git_source": { + "description": "An optional specification for a remote Git repository containing the source code used by tasks. Version-controlled source code is supported by notebook, dbt, Python script, and SQL File tasks.\n\nIf `git_source` is set, these tasks retrieve the file from the remote repository by default. However, this behavior can be overridden by setting `source` to `WORKSPACE` on the task.\n\nNote: dbt and SQL File tasks support only version-controlled sources. If dbt or SQL File tasks are used, `git_source` must be defined on the job.", + "properties": { + "git_branch": { + "description": "Name of the branch to be checked out and used by this job. This field cannot be specified in conjunction with git_tag or git_commit." + }, + "git_commit": { + "description": "Commit to be checked out and used by this job. This field cannot be specified in conjunction with git_branch or git_tag." + }, + "git_provider": { + "description": "Unique identifier of the service used to host the Git repository. The value is case insensitive." 
+ }, + "git_snapshot": { + "description": "", + "properties": { + "used_commit": { + "description": "Commit that was used to execute the run. If git_branch was specified, this points to the HEAD of the branch at the time of the run; if git_tag was specified, this points to the commit the tag points to." + } + } + }, + "git_tag": { + "description": "Name of the tag to be checked out and used by this job. This field cannot be specified in conjunction with git_branch or git_commit." + }, + "git_url": { + "description": "URL of the repository to be cloned by this job." + }, + "job_source": { + "description": "The source of the job specification in the remote repository when the job is source controlled.", + "properties": { + "dirty_state": { + "description": "Dirty state indicates the job is not fully synced with the job specification in the remote repository.\n\nPossible values are:\n* `NOT_SYNCED`: The job is not yet synced with the remote job specification. Import the remote job specification from UI to make the job fully synced.\n* `DISCONNECTED`: The job is temporary disconnected from the remote job specification and is allowed for live edit. Import the remote job specification again from UI to make the job fully synced.\n" + }, + "import_from_git_branch": { + "description": "Name of the branch which the job is imported from." + }, + "job_config_path": { + "description": "Path of the job YAML file that contains the job specification." + } + } + } + } + }, + "health": { + "description": "", + "properties": { + "rules": { + "description": "", + "items": { + "description": "", + "properties": { + "metric": { + "description": "" }, - "git_tag": { - "description": "Name of the tag to be checked out and used by this job.\nThis field cannot be specified in conjunction with git_branch or git_commit.\n\nThe maximum length is 255 characters.\n" + "op": { + "description": "" }, - "git_url": { - "description": "URL of the repository to be cloned by this job.\nThe maximum length is 300 characters." + "value": { + "description": "Specifies the threshold value that the health metric should obey to satisfy the health rule." } } + } + } + } + }, + "job_clusters": { + "description": "A list of job cluster specifications that can be shared and reused by tasks of this job. Libraries cannot be declared in a shared job cluster. You must declare dependent libraries in task settings.", + "items": { + "description": "", + "properties": { + "job_cluster_key": { + "description": "A unique name for the job cluster. This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine which cluster to launch for the task execution." }, - "job_clusters": { - "description": "A list of job cluster specifications that can be shared and reused by tasks of this job. Libraries cannot be declared in a shared job cluster. You must declare dependent libraries in task settings.", - "items": { - "description": "", - "properties": { - "job_cluster_key": { - "description": "A unique name for the job cluster. This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine which cluster to launch for the task execution." 
- }, - "new_cluster": { - "description": "If new_cluster, a description of a cluster that is created for only for this task.", - "properties": { - "autoscale": { - "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", - "properties": { - "max_workers": { - "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." - }, - "min_workers": { - "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." - } + "new_cluster": { + "description": "If new_cluster, a description of a cluster that is created for each task.", + "properties": { + "apply_policy_default_values": { + "description": "" + }, + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + } + } + }, + "autotermination_minutes": { + "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." 
+ }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nThe list of available zones as well as the default value can be found by using the\n`List Zones` method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. 
If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" } - }, - "autotermination_minutes": { - "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." - }, - "aws_attributes": { - "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "ebs_volume_count": { - "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." - }, - "ebs_volume_iops": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_size": { - "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." - }, - "ebs_volume_throughput": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_type": { - "description": "" - }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." - }, - "instance_profile_arn": { - "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. 
The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." - }, - "spot_bid_price_percent": { - "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." - }, - "zone_id": { - "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." - } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" } - }, - "azure_attributes": { - "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. 
If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." - }, - "log_analytics_info": { - "description": "Defines values necessary to configure and run Azure Log Analytics agent", - "properties": { - "log_analytics_primary_key": { - "description": "\u003cneeds content added\u003e" - }, - "log_analytics_workspace_id": { - "description": "\u003cneeds content added\u003e" - } - } - }, - "spot_bid_max_price": { - "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." - } + } + }, + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." } - }, - "cluster_log_conf": { - "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. 
If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + } + } + } + }, + "cluster_name": { + "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" + }, + "cluster_source": { + "description": "" + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "data_security_mode": { + "description": "" + }, + "docker_image": { + "description": "", + "properties": { + "basic_auth": { + "description": "", + "properties": { + "password": { + "description": "Password of the user" + }, + "username": { + "description": "Name of the user" + } + } + }, + "url": { + "description": "URL of the docker image." + } + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" + }, + "enable_elastic_disk": { + "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." + }, + "enable_local_disk_encryption": { + "description": "Whether to enable LUKS on cluster VMs' local disks" + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + }, + "local_ssd_count": { + "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." + } + } + }, + "init_scripts": { + "description": "The configuration for storing init scripts. Any number of destinations can be specified. The scripts are executed sequentially in the order provided. 
If `cluster_log_conf` is specified, init script logs are sent to `\u003cdestination\u003e/\u003ccluster-ID\u003e/init_scripts`.", + "items": { + "description": "", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", "properties": { - "dbfs": { - "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", - "properties": { - "destination": { - "description": "dbfs destination, e.g. `dbfs:/my/path`" - } - } - }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", - "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." - }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." - }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." - }, - "endpoint": { - "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." - }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." - }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." - } - } + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" } } }, - "cluster_name": { - "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" - }, - "cluster_source": { - "description": "" - }, - "custom_tags": { - "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. 
Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", - "additionalproperties": { - "description": "" - } - }, - "data_security_mode": { - "description": "" - }, - "docker_image": { - "description": "", + "file": { + "description": "destination needs to be provided. e.g.\n`{ \"file\" : { \"destination\" : \"file:/my/local/file.sh\" } }`", "properties": { - "basic_auth": { - "description": "", - "properties": { - "password": { - "description": "Password of the user" - }, - "username": { - "description": "Name of the user" - } - } - }, - "url": { - "description": "URL of the docker image." + "destination": { + "description": "local file destination, e.g. `file:/my/local/file.sh`" } } }, - "driver_instance_pool_id": { - "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." - }, - "driver_node_type_id": { - "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" - }, - "enable_elastic_disk": { - "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." - }, - "enable_local_disk_encryption": { - "description": "Whether to enable LUKS on cluster VMs' local disks" - }, - "gcp_attributes": { - "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", "properties": { - "availability": { - "description": "" + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." }, - "boot_disk_size": { - "description": "boot disk size in GB" + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." }, - "google_service_account": { - "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). 
The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." }, - "local_ssd_count": { - "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." } } }, - "init_scripts": { - "description": "The configuration for storing init scripts. Any number of destinations can be specified. The scripts are executed sequentially in the order provided. If `cluster_log_conf` is specified, init script logs are sent to `\u003cdestination\u003e/\u003ccluster-ID\u003e/init_scripts`.", - "items": { - "description": "", - "properties": { - "dbfs": { - "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", - "properties": { - "destination": { - "description": "dbfs destination, e.g. `dbfs:/my/path`" - } - } - }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", - "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." - }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." - }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." - }, - "endpoint": { - "description": "S3 endpoint, e.g. 
`https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." - }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." - }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." - } - } - }, - "workspace": { - "description": "destination needs to be provided. e.g.\n`{ \"workspace\" : { \"destination\" : \"/Users/user1@databricks.com/my-init.sh\" } }`", - "properties": { - "destination": { - "description": "workspace files destination, e.g. `/Users/user1@databricks.com/my-init.sh`" - } - } - } + "volumes": { + "description": "destination needs to be provided. e.g.\n`{ \"volumes\" : { \"destination\" : \"/Volumes/my-init.sh\" } }`", + "properties": { + "destination": { + "description": "Unity Catalog Volumes file destination, e.g. `/Volumes/my-init.sh`" } } }, - "instance_pool_id": { - "description": "The optional ID of the instance pool to which the cluster belongs." - }, - "node_type_id": { - "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" - }, - "num_workers": { - "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." - }, - "policy_id": { - "description": "The ID of the cluster policy used to create the cluster if applicable." - }, - "runtime_engine": { - "description": "" - }, - "single_user_name": { - "description": "Single user name if data_security_mode is `SINGLE_USER`" - }, - "spark_conf": { - "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", - "additionalproperties": { - "description": "" - } - }, - "spark_env_vars": { - "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. 
This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", - "additionalproperties": { - "description": "" - } - }, - "spark_version": { - "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" - }, - "ssh_public_keys": { - "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", - "items": { - "description": "" - } - }, - "workload_type": { - "description": "", + "workspace": { + "description": "destination needs to be provided. e.g.\n`{ \"workspace\" : { \"destination\" : \"/Users/user1@databricks.com/my-init.sh\" } }`", "properties": { - "clients": { - "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", - "properties": { - "jobs": { - "description": "With jobs set, the cluster can be used for jobs" - }, - "notebooks": { - "description": "With notebooks set, this cluster can be used for notebooks" - } - } + "destination": { + "description": "workspace files destination, e.g. `/Users/user1@databricks.com/my-init.sh`" } } } } } - } - } - }, - "max_concurrent_runs": { - "description": "An optional maximum allowed number of concurrent runs of the job.\n\nSet this value if you want to be able to execute multiple runs of the same job concurrently. This is useful for example if you trigger your job on a frequent schedule and want to allow consecutive runs to overlap with each other, or if you want to trigger multiple runs which differ by their input parameters.\n\nThis setting affects only new runs. For example, suppose the job’s concurrency is 4 and there are 4 concurrent active runs. Then setting the concurrency to 3 won’t kill any of the active runs. However, from then on, new runs are skipped unless there are fewer than 3 active runs.\n\nThis value cannot exceed 1000\\. Setting this value to 0 causes all new runs to be skipped. The default behavior is to allow only 1 concurrent run." - }, - "name": { - "description": "An optional name for the job." - }, - "notification_settings": { - "description": "Optional notification settings that are used when sending notifications to each of the `email_notifications` and `webhook_notifications` for this job.", - "properties": { - "no_alert_for_canceled_runs": { - "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is canceled." }, - "no_alert_for_skipped_runs": { - "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is skipped." - } - } - }, - "parameters": { - "description": "Job-level parameter definitions", - "items": { - "description": "", - "properties": { - "default": { - "description": "Default value of the parameter." - }, - "name": { - "description": "The name of the defined parameter. 
May only contain alphanumeric characters, `_`, `-`, and `.`" - } - } - } - }, - "permissions": { - "description": "", - "items": { - "description": "", - "properties": { - "group_name": { - "description": "" - }, - "level": { - "description": "" - }, - "service_principal_name": { + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." + }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." + }, + "runtime_engine": { + "description": "" + }, + "single_user_name": { + "description": "Single user name if data_security_mode is `SINGLE_USER`" + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", + "additionalproperties": { "description": "" - }, - "user_name": { + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { "description": "" } - } - } - }, - "run_as": { - "description": "", - "properties": { - "service_principal_name": { - "description": "Application ID of an active service principal. Setting this field requires the `servicePrincipal/user` role." }, - "user_name": { - "description": "The email of an active workspace user. Non-admin users can only set this field to their own email." - } - } - }, - "schedule": { - "description": "An optional periodic schedule for this job. The default behavior is that the job only runs when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", - "properties": { - "pause_status": { - "description": "Whether this trigger is paused or not." 
+ "spark_version": { + "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" }, - "quartz_cron_expression": { - "description": "A Cron expression using Quartz syntax that describes the schedule for a job.\nSee [Cron Trigger](http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html)\nfor details. This field is required.\"\n" + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } }, - "timezone_id": { - "description": "A Java timezone ID. The schedule for a job is resolved with respect to this timezone.\nSee [Java TimeZone](https://docs.oracle.com/javase/7/docs/api/java/util/TimeZone.html) for details.\nThis field is required.\n" - } - } - }, - "tags": { - "description": "A map of tags associated with the job. These are forwarded to the cluster as cluster tags for jobs clusters, and are subject to the same limitations as cluster tags. A maximum of 25 tags can be added to the job.", - "additionalproperties": { - "description": "" - } - }, - "tasks": { - "description": "A list of task specifications to be executed by this job.", - "items": { - "description": "", - "properties": { - "compute_key": { - "description": "The key of the compute requirement, specified in `job.settings.compute`, to use for execution of this task." - }, - "condition_task": { - "description": "If condition_task, specifies a condition with an outcome that can be used to control the execution of other tasks. Does not require a cluster to execute and does not support retries or notifications.", - "properties": { - "left": { - "description": "The left operand of the condition task. Can be either a string value or a job state or parameter reference." - }, - "op": { - "description": "* `EQUAL_TO`, `NOT_EQUAL` operators perform string comparison of their operands. This means that `“12.0” == “12”` will evaluate to `false`.\n* `GREATER_THAN`, `GREATER_THAN_OR_EQUAL`, `LESS_THAN`, `LESS_THAN_OR_EQUAL` operators perform numeric comparison of their operands. `“12.0” \u003e= “12”` will evaluate to `true`, `“10.0” \u003e= “12”` will evaluate to `false`.\n\nThe boolean comparison to task values can be implemented with operators `EQUAL_TO`, `NOT_EQUAL`. If a task value was set to a boolean value, it will be serialized to `“true”` or `“false”` for the comparison.\n" - }, - "right": { - "description": "The right operand of the condition task. Can be either a string value or a job state or parameter reference." - } - } - }, - "dbt_task": { - "description": "If dbt_task, indicates that this must execute a dbt task. It requires both Databricks SQL and the ability to use a serverless or a pro SQL warehouse.", - "properties": { - "catalog": { - "description": "Optional name of the catalog to use. The value is the top level in the 3-level namespace of Unity Catalog (catalog / schema / relation). The catalog value can only be specified if a warehouse_id is specified. Requires dbt-databricks \u003e= 1.1.1." - }, - "commands": { - "description": "A list of dbt commands to execute. All commands must start with `dbt`. This parameter must not be empty. 
A maximum of up to 10 commands can be provided.", - "items": { - "description": "" - } - }, - "profiles_directory": { - "description": "Optional (relative) path to the profiles directory. Can only be specified if no warehouse_id is specified. If no warehouse_id is specified and this folder is unset, the root directory is used." - }, - "project_directory": { - "description": "Optional (relative) path to the project directory, if no value is provided, the root of the git repository is used." - }, - "schema": { - "description": "Optional schema to write to. This parameter is only used when a warehouse_id is also provided. If not provided, the `default` schema is used." - }, - "warehouse_id": { - "description": "ID of the SQL warehouse to connect to. If provided, we automatically generate and provide the profile and connection details to dbt. It can be overridden on a per-command basis by using the `--profiles-dir` command line argument." - } - } - }, - "depends_on": { - "description": "An optional array of objects specifying the dependency graph of the task. All tasks specified in this field must complete successfully before executing this task.\nThe key is `task_key`, and the value is the name assigned to the dependent task.\n", - "items": { - "description": "", + "workload_type": { + "description": "", + "properties": { + "clients": { + "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", "properties": { - "outcome": { - "description": "Can only be specified on condition task dependencies. The outcome of the dependent task that must be met for this task to run." + "jobs": { + "description": "With jobs set, the cluster can be used for jobs" }, - "task_key": { - "description": "The name of the task this task depends on." + "notebooks": { + "description": "With notebooks set, this cluster can be used for notebooks" } } } - }, - "description": { - "description": "An optional description for this task.\nThe maximum length is 4096 bytes." - }, - "email_notifications": { - "description": "An optional set of email addresses that is notified when runs of this task begin or complete as well as when this task is deleted. The default behavior is to not send any emails.", - "properties": { - "on_failure": { - "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", - "items": { - "description": "" - } - }, - "on_start": { - "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", - "items": { - "description": "" - } + } + } + } + } + } + } + }, + "max_concurrent_runs": { + "description": "An optional maximum allowed number of concurrent runs of the job.\n\nSet this value if you want to be able to execute multiple runs of the same job concurrently. This is useful for example if you trigger your job on a frequent schedule and want to allow consecutive runs to overlap with each other, or if you want to trigger multiple runs which differ by their input parameters.\n\nThis setting affects only new runs. For example, suppose the job’s concurrency is 4 and there are 4 concurrent active runs. 
Then setting the concurrency to 3 won’t kill any of the active runs. However, from then on, new runs are skipped unless there are fewer than 3 active runs.\n\nThis value cannot exceed 1000. Setting this value to `0` causes all new runs to be skipped." + }, + "name": { + "description": "An optional name for the job. The maximum length is 4096 bytes in UTF-8 encoding." + }, + "notification_settings": { + "description": "Optional notification settings that are used when sending notifications to each of the `email_notifications` and `webhook_notifications` for this job.", + "properties": { + "no_alert_for_canceled_runs": { + "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is canceled." + }, + "no_alert_for_skipped_runs": { + "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is skipped." + } + } + }, + "parameters": { + "description": "Job-level parameter definitions", + "items": { + "description": "", + "properties": { + "default": { + "description": "Default value of the parameter." + }, + "name": { + "description": "The name of the defined parameter. May only contain alphanumeric characters, `_`, `-`, and `.`" + } + } + } + }, + "permissions": { + "description": "", + "items": { + "description": "", + "properties": { + "group_name": { + "description": "" + }, + "level": { + "description": "" + }, + "service_principal_name": { + "description": "" + }, + "user_name": { + "description": "" + } + } + } + }, + "queue": { + "description": "The queue settings of the job.", + "properties": { + "enabled": { + "description": "If true, enable queueing for the job. This is a required field." + } + } + }, + "run_as": { + "description": "", + "properties": { + "service_principal_name": { + "description": "Application ID of an active service principal. Setting this field requires the `servicePrincipal/user` role." + }, + "user_name": { + "description": "The email of an active workspace user. Non-admin users can only set this field to their own email." + } + } + }, + "schedule": { + "description": "An optional periodic schedule for this job. The default behavior is that the job only runs when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", + "properties": { + "pause_status": { + "description": "Indicate whether this schedule is paused or not." + }, + "quartz_cron_expression": { + "description": "A Cron expression using Quartz syntax that describes the schedule for a job.\nSee [Cron Trigger](http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html)\nfor details. This field is required.\"\n" + }, + "timezone_id": { + "description": "A Java timezone ID. The schedule for a job is resolved with respect to this timezone.\nSee [Java TimeZone](https://docs.oracle.com/javase/7/docs/api/java/util/TimeZone.html) for details.\nThis field is required.\n" + } + } + }, + "tags": { + "description": "A map of tags associated with the job. These are forwarded to the cluster as cluster tags for jobs clusters, and are subject to the same limitations as cluster tags. A maximum of 25 tags can be added to the job.", + "additionalproperties": { + "description": "" + } + }, + "tasks": { + "description": "A list of task specifications to be executed by this job.", + "items": { + "description": "", + "properties": { + "compute_key": { + "description": "The key of the compute requirement, specified in `job.settings.compute`, to use for execution of this task." 
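To make the job-level fields above concrete, here is a minimal JSON sketch of a job settings fragment combining the schedule, queue, tag, and parameter fields described in this block; all values (job name, cron expression, tags, parameter defaults, pause status) are illustrative assumptions, not values from the spec:

{
  "name": "nightly-etl",
  "max_concurrent_runs": 1,
  "queue": { "enabled": true },
  "schedule": {
    "quartz_cron_expression": "0 0 2 * * ?",
    "timezone_id": "UTC",
    "pause_status": "UNPAUSED"
  },
  "tags": { "team": "data-eng" },
  "parameters": [ { "name": "env", "default": "dev" } ]
}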
+ }, + "condition_task": { + "description": "If condition_task, specifies a condition with an outcome that can be used to control the execution of other tasks. Does not require a cluster to execute and does not support retries or notifications.", + "properties": { + "left": { + "description": "The left operand of the condition task. Can be either a string value or a job state or parameter reference." + }, + "op": { + "description": "* `EQUAL_TO`, `NOT_EQUAL` operators perform string comparison of their operands. This means that `“12.0” == “12”` will evaluate to `false`.\n* `GREATER_THAN`, `GREATER_THAN_OR_EQUAL`, `LESS_THAN`, `LESS_THAN_OR_EQUAL` operators perform numeric comparison of their operands. `“12.0” \u003e= “12”` will evaluate to `true`, `“10.0” \u003e= “12”` will evaluate to `false`.\n\nThe boolean comparison to task values can be implemented with operators `EQUAL_TO`, `NOT_EQUAL`. If a task value was set to a boolean value, it will be serialized to `“true”` or `“false”` for the comparison.\n" + }, + "right": { + "description": "The right operand of the condition task. Can be either a string value or a job state or parameter reference." + } + } + }, + "dbt_task": { + "description": "If dbt_task, indicates that this must execute a dbt task. It requires both Databricks SQL and the ability to use a serverless or a pro SQL warehouse.", + "properties": { + "catalog": { + "description": "Optional name of the catalog to use. The value is the top level in the 3-level namespace of Unity Catalog (catalog / schema / relation). The catalog value can only be specified if a warehouse_id is specified. Requires dbt-databricks \u003e= 1.1.1." + }, + "commands": { + "description": "A list of dbt commands to execute. All commands must start with `dbt`. This parameter must not be empty. A maximum of up to 10 commands can be provided.", + "items": { + "description": "" + } + }, + "profiles_directory": { + "description": "Optional (relative) path to the profiles directory. Can only be specified if no warehouse_id is specified. If no warehouse_id is specified and this folder is unset, the root directory is used." + }, + "project_directory": { + "description": "Optional (relative) path to the project directory, if no value is provided, the root of the git repository is used." + }, + "schema": { + "description": "Optional schema to write to. This parameter is only used when a warehouse_id is also provided. If not provided, the `default` schema is used." + }, + "warehouse_id": { + "description": "ID of the SQL warehouse to connect to. If provided, we automatically generate and provide the profile and connection details to dbt. It can be overridden on a per-command basis by using the `--profiles-dir` command line argument." + } + } + }, + "depends_on": { + "description": "An optional array of objects specifying the dependency graph of the task. All tasks specified in this field must complete before executing this task. The task will run only if the `run_if` condition is true.\nThe key is `task_key`, and the value is the name assigned to the dependent task.\n", + "items": { + "description": "", + "properties": { + "outcome": { + "description": "Can only be specified on condition task dependencies. The outcome of the dependent task that must be met for this task to run." + }, + "task_key": { + "description": "The name of the task this task depends on." + } + } + } + }, + "description": { + "description": "An optional description for this task." 
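As a hedged sketch of the task-dependency fields described above, the following JSON shows one task gated on the outcome of a condition task and a dbt task that depends on it. The task keys, operand strings, outcome value, and warehouse ID are placeholders; per the condition_task description, either operand could instead be a job state or parameter reference:

{
  "tasks": [
    {
      "task_key": "is_prod",
      "condition_task": { "left": "prod", "op": "EQUAL_TO", "right": "prod" }
    },
    {
      "task_key": "dbt_run",
      "depends_on": [ { "task_key": "is_prod", "outcome": "true" } ],
      "dbt_task": { "commands": ["dbt deps", "dbt run"], "warehouse_id": "<warehouse-id>" }
    }
  ]
}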
+ }, + "email_notifications": { + "description": "An optional set of email addresses that is notified when runs of this task begin or complete as well as when this task is deleted. The default behavior is to not send any emails.", + "properties": { + "on_duration_warning_threshold_exceeded": { + "description": "A list of email addresses to be notified when the duration of a run exceeds the threshold specified for the `RUN_DURATION_SECONDS` metric in the `health` field. If no rule for the `RUN_DURATION_SECONDS` metric is specified in the `health` field for the job, notifications are not sent.", + "items": { + "description": "" + } + }, + "on_failure": { + "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_start": { + "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_success": { + "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESS` result_state. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + } + } + }, + "existing_cluster_id": { + "description": "If existing_cluster_id, the ID of an existing cluster that is used for all runs of this task. When running tasks on an existing cluster, you may need to manually restart the cluster if it stops responding. We suggest running jobs on new clusters for greater reliability." + }, + "health": { + "description": "", + "properties": { + "rules": { + "description": "", + "items": { + "description": "", + "properties": { + "metric": { + "description": "" }, - "on_success": { - "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESS` result_state. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "op": { + "description": "" + }, + "value": { + "description": "Specifies the threshold value that the health metric should obey to satisfy the health rule." + } + } + } + } + } + }, + "job_cluster_key": { + "description": "If job_cluster_key, this task is executed reusing the cluster specified in `job.settings.job_clusters`." + }, + "libraries": { + "description": "An optional list of libraries to be installed on the cluster that executes the task. The default value is an empty list.", + "items": { + "description": "", + "properties": { + "cran": { + "description": "Specification of a CRAN library to be installed as part of the library", + "properties": { + "package": { + "description": "The name of the CRAN package to install." + }, + "repo": { + "description": "The repository where the package can be found. If not specified, the default CRAN repo is used." + } + } + }, + "egg": { + "description": "URI of the egg to be installed. 
Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"egg\": \"dbfs:/my/egg\" }` or\n`{ \"egg\": \"s3://my-bucket/egg\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + }, + "jar": { + "description": "URI of the jar to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"jar\": \"dbfs:/mnt/databricks/library.jar\" }` or\n`{ \"jar\": \"s3://my-bucket/library.jar\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + }, + "maven": { + "description": "Specification of a maven library to be installed. For example:\n`{ \"coordinates\": \"org.jsoup:jsoup:1.7.2\" }`", + "properties": { + "coordinates": { + "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." + }, + "exclusions": { + "description": "List of dependences to exclude. For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", "items": { "description": "" } + }, + "repo": { + "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." } } }, - "existing_cluster_id": { - "description": "If existing_cluster_id, the ID of an existing cluster that is used for all runs of this task. When running tasks on an existing cluster, you may need to manually restart the cluster if it stops responding. We suggest running jobs on new clusters for greater reliability." - }, - "job_cluster_key": { - "description": "If job_cluster_key, this task is executed reusing the cluster specified in `job.settings.job_clusters`." + "pypi": { + "description": "Specification of a PyPi library to be installed. For example:\n`{ \"package\": \"simplejson\" }`", + "properties": { + "package": { + "description": "The name of the pypi package to install. An optional exact version specification is also\nsupported. Examples: \"simplejson\" and \"simplejson==3.8.0\"." + }, + "repo": { + "description": "The repository where the package can be found. If not specified, the default pip index is\nused." + } + } }, - "libraries": { - "description": "An optional list of libraries to be installed on the cluster that executes the task. The default value is an empty list.", - "items": { - "description": "", + "whl": { + "description": "URI of the wheel to be installed.\nFor example: `{ \"whl\": \"dbfs:/my/whl\" }` or `{ \"whl\": \"s3://my-bucket/whl\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + } + } + } + }, + "max_retries": { + "description": "An optional maximum number of times to retry an unsuccessful run. A run is considered to be unsuccessful if it completes with the `FAILED` result_state or `INTERNAL_ERROR` `life_cycle_state`. The value `-1` means to retry indefinitely and the value `0` means to never retry." + }, + "min_retry_interval_millis": { + "description": "An optional minimal interval in milliseconds between the start of the failed run and the subsequent retry run. The default behavior is that unsuccessful runs are immediately retried." 
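The library and retry fields above might be combined on a single task as in this short, assumed JSON fragment; the package coordinates and wheel path are taken from the examples in the descriptions themselves, while the job cluster key and retry numbers are arbitrary:

{
  "job_cluster_key": "main",
  "libraries": [
    { "pypi": { "package": "simplejson==3.8.0" } },
    { "maven": { "coordinates": "org.jsoup:jsoup:1.7.2" } },
    { "whl": "dbfs:/my/whl" }
  ],
  "max_retries": 3,
  "min_retry_interval_millis": 60000
}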
+ }, + "new_cluster": { + "description": "If new_cluster, a description of a cluster that is created for each task.", + "properties": { + "apply_policy_default_values": { + "description": "" + }, + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + } + } + }, + "autotermination_minutes": { + "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
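Below is a minimal, assumed JSON sketch of the new_cluster block described above, using the autoscale and aws_attributes fields. The Spark version string mirrors the example given in the spark_version description and the "auto" zone follows the behavior noted in the zone_id description; the node type and worker counts are placeholders:

{
  "new_cluster": {
    "spark_version": "3.3.x-scala2.11",
    "node_type_id": "i3.xlarge",
    "autoscale": { "min_workers": 2, "max_workers": 8 },
    "aws_attributes": { "first_on_demand": 1, "zone_id": "auto" }
  }
}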
+ }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nThe list of available zones as well as the default value can be found by using the\n`List Zones` method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", "properties": { - "cran": { - "description": "Specification of a CRAN library to be installed as part of the library", - "properties": { - "package": { - "description": "The name of the CRAN package to install." - }, - "repo": { - "description": "The repository where the package can be found. If not specified, the default CRAN repo is used." 
- } - } + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" }, - "egg": { - "description": "URI of the egg to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"egg\": \"dbfs:/my/egg\" }` or\n`{ \"egg\": \"s3://my-bucket/egg\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." }, - "jar": { - "description": "URI of the jar to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"jar\": \"dbfs:/mnt/databricks/library.jar\" }` or\n`{ \"jar\": \"s3://my-bucket/library.jar\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." }, - "maven": { - "description": "Specification of a maven library to be installed. 
For example:\n`{ \"coordinates\": \"org.jsoup:jsoup:1.7.2\" }`", - "properties": { - "coordinates": { - "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." - }, - "exclusions": { - "description": "List of dependences to exclude. For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", - "items": { - "description": "" - } - }, - "repo": { - "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." - } - } + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." }, - "pypi": { - "description": "Specification of a PyPi library to be installed. For example:\n`{ \"package\": \"simplejson\" }`", - "properties": { - "package": { - "description": "The name of the pypi package to install. An optional exact version specification is also\nsupported. Examples: \"simplejson\" and \"simplejson==3.8.0\"." - }, - "repo": { - "description": "The repository where the package can be found. If not specified, the default pip index is\nused." - } - } + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." }, - "whl": { - "description": "URI of the wheel to be installed.\nFor example: `{ \"whl\": \"dbfs:/my/whl\" }` or `{ \"whl\": \"s3://my-bucket/whl\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." } } } - }, - "max_retries": { - "description": "An optional maximum number of times to retry an unsuccessful run. A run is considered to be unsuccessful if it completes with the `FAILED` result_state or `INTERNAL_ERROR` `life_cycle_state`. The value -1 means to retry indefinitely and the value 0 means to never retry. The default behavior is to never retry." - }, - "min_retry_interval_millis": { - "description": "An optional minimal interval in milliseconds between the start of the failed run and the subsequent retry run. The default behavior is that unsuccessful runs are immediately retried." - }, - "new_cluster": { - "description": "If new_cluster, a description of a cluster that is created for only for this task.", + } + }, + "cluster_name": { + "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" + }, + "cluster_source": { + "description": "" + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. 
Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "data_security_mode": { + "description": "" + }, + "docker_image": { + "description": "", + "properties": { + "basic_auth": { + "description": "", + "properties": { + "password": { + "description": "Password of the user" + }, + "username": { + "description": "Name of the user" + } + } + }, + "url": { + "description": "URL of the docker image." + } + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" + }, + "enable_elastic_disk": { + "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." + }, + "enable_local_disk_encryption": { + "description": "Whether to enable LUKS on cluster VMs' local disks" + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + }, + "local_ssd_count": { + "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." + } + } + }, + "init_scripts": { + "description": "The configuration for storing init scripts. Any number of destinations can be specified. The scripts are executed sequentially in the order provided. If `cluster_log_conf` is specified, init script logs are sent to `\u003cdestination\u003e/\u003ccluster-ID\u003e/init_scripts`.", + "items": { + "description": "", "properties": { - "autoscale": { - "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", "properties": { - "max_workers": { - "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." - }, - "min_workers": { - "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." 
+ "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" } } }, - "autotermination_minutes": { - "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." - }, - "aws_attributes": { - "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "file": { + "description": "destination needs to be provided. e.g.\n`{ \"file\" : { \"destination\" : \"file:/my/local/file.sh\" } }`", "properties": { - "availability": { - "description": "" - }, - "ebs_volume_count": { - "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." - }, - "ebs_volume_iops": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_size": { - "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." - }, - "ebs_volume_throughput": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_type": { - "description": "" - }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." - }, - "instance_profile_arn": { - "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." 
- }, - "spot_bid_price_percent": { - "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." - }, - "zone_id": { - "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." + "destination": { + "description": "local file destination, e.g. `file:/my/local/file.sh`" } } }, - "azure_attributes": { - "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", "properties": { - "availability": { - "description": "" + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. 
Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." }, - "log_analytics_info": { - "description": "Defines values necessary to configure and run Azure Log Analytics agent", - "properties": { - "log_analytics_primary_key": { - "description": "\u003cneeds content added\u003e" - }, - "log_analytics_workspace_id": { - "description": "\u003cneeds content added\u003e" - } - } + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." }, - "spot_bid_max_price": { - "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." - } - } - }, - "cluster_log_conf": { - "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", - "properties": { - "dbfs": { - "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", - "properties": { - "destination": { - "description": "dbfs destination, e.g. `dbfs:/my/path`" - } - } + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", - "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." 
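A hedged JSON sketch of the cluster_log_conf block described above, delivering logs to S3; the bucket, region, and canned ACL values are copied from the examples in the descriptions and are placeholders, not recommendations:

{
  "cluster_log_conf": {
    "s3": {
      "destination": "s3://cluster_log_bucket/prefix",
      "region": "us-west-2",
      "canned_acl": "bucket-owner-full-control"
    }
  }
}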
- }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." - }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." - }, - "endpoint": { - "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." - }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." - }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." - } - } - } - } - }, - "cluster_name": { - "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" - }, - "cluster_source": { - "description": "" - }, - "custom_tags": { - "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", - "additionalproperties": { - "description": "" - } - }, - "data_security_mode": { - "description": "" - }, - "docker_image": { - "description": "", - "properties": { - "basic_auth": { - "description": "", - "properties": { - "password": { - "description": "Password of the user" - }, - "username": { - "description": "Name of the user" - } - } + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." }, - "url": { - "description": "URL of the docker image." + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." } } }, - "driver_instance_pool_id": { - "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." - }, - "driver_node_type_id": { - "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" - }, - "enable_elastic_disk": { - "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." - }, - "enable_local_disk_encryption": { - "description": "Whether to enable LUKS on cluster VMs' local disks" - }, - "gcp_attributes": { - "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "volumes": { + "description": "destination needs to be provided. 
e.g.\n`{ \"volumes\" : { \"destination\" : \"/Volumes/my-init.sh\" } }`", "properties": { - "availability": { - "description": "" - }, - "boot_disk_size": { - "description": "boot disk size in GB" - }, - "google_service_account": { - "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." - }, - "local_ssd_count": { - "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." + "destination": { + "description": "Unity Catalog Volumes file destination, e.g. `/Volumes/my-init.sh`" } } }, - "init_scripts": { - "description": "The configuration for storing init scripts. Any number of destinations can be specified. The scripts are executed sequentially in the order provided. If `cluster_log_conf` is specified, init script logs are sent to `\u003cdestination\u003e/\u003ccluster-ID\u003e/init_scripts`.", - "items": { - "description": "", - "properties": { - "dbfs": { - "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", - "properties": { - "destination": { - "description": "dbfs destination, e.g. `dbfs:/my/path`" - } - } - }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", - "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." - }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." - }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." - }, - "endpoint": { - "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." - }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." 
- }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." - } - } - }, - "workspace": { - "description": "destination needs to be provided. e.g.\n`{ \"workspace\" : { \"destination\" : \"/Users/user1@databricks.com/my-init.sh\" } }`", - "properties": { - "destination": { - "description": "workspace files destination, e.g. `/Users/user1@databricks.com/my-init.sh`" - } - } - } + "workspace": { + "description": "destination needs to be provided. e.g.\n`{ \"workspace\" : { \"destination\" : \"/Users/user1@databricks.com/my-init.sh\" } }`", + "properties": { + "destination": { + "description": "workspace files destination, e.g. `/Users/user1@databricks.com/my-init.sh`" } } - }, - "instance_pool_id": { - "description": "The optional ID of the instance pool to which the cluster belongs." - }, - "node_type_id": { - "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" - }, - "num_workers": { - "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." - }, - "policy_id": { - "description": "The ID of the cluster policy used to create the cluster if applicable." - }, - "runtime_engine": { - "description": "" - }, - "single_user_name": { - "description": "Single user name if data_security_mode is `SINGLE_USER`" - }, - "spark_conf": { - "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", - "additionalproperties": { - "description": "" - } - }, - "spark_env_vars": { - "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", - "additionalproperties": { - "description": "" - } - }, - "spark_version": { - "description": "The Spark version of the cluster, e.g. 
`3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" - }, - "ssh_public_keys": { - "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", - "items": { - "description": "" - } - }, - "workload_type": { - "description": "", - "properties": { - "clients": { - "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", - "properties": { - "jobs": { - "description": "With jobs set, the cluster can be used for jobs" - }, - "notebooks": { - "description": "With notebooks set, this cluster can be used for notebooks" - } - } - } + } + } + } + }, + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." + }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." + }, + "runtime_engine": { + "description": "" + }, + "single_user_name": { + "description": "Single user name if data_security_mode is `SINGLE_USER`" + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "spark_version": { + "description": "The Spark version of the cluster, e.g. 
`3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + }, + "workload_type": { + "description": "", + "properties": { + "clients": { + "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", + "properties": { + "jobs": { + "description": "With jobs set, the cluster can be used for jobs" + }, + "notebooks": { + "description": "With notebooks set, this cluster can be used for notebooks" } } } - }, - "notebook_task": { - "description": "If notebook_task, indicates that this task must run a notebook. This field may not be specified in conjunction with spark_jar_task.", - "properties": { - "base_parameters": { - "description": "Base parameters to be used for each run of this job. If the run is initiated by a call to\n:method:jobs/runNow with parameters specified, the two parameters maps are merged. If the same key is specified in\n`base_parameters` and in `run-now`, the value from `run-now` is used.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n\nIf the notebook takes a parameter that is not specified in the job’s `base_parameters` or the `run-now` override parameters,\nthe default value from the notebook is used.\n\nRetrieve these parameters in a notebook using [dbutils.widgets.get](https://docs.databricks.com/dev-tools/databricks-utils.html#dbutils-widgets).\n", - "additionalproperties": { - "description": "" + } + } + } + }, + "notebook_task": { + "description": "If notebook_task, indicates that this task must run a notebook. This field may not be specified in conjunction with spark_jar_task.", + "properties": { + "base_parameters": { + "description": "Base parameters to be used for each run of this job. If the run is initiated by a call to\n:method:jobs/runNow with parameters specified, the two parameters maps are merged. If the same key is specified in\n`base_parameters` and in `run-now`, the value from `run-now` is used.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n\nIf the notebook takes a parameter that is not specified in the job’s `base_parameters` or the `run-now` override parameters,\nthe default value from the notebook is used.\n\nRetrieve these parameters in a notebook using [dbutils.widgets.get](https://docs.databricks.com/dev-tools/databricks-utils.html#dbutils-widgets).\n\nThe JSON representation of this field cannot exceed 1MB.\n", + "additionalproperties": { + "description": "" + } + }, + "notebook_path": { + "description": "The path of the notebook to be run in the Databricks workspace or remote repository.\nFor notebooks stored in the Databricks workspace, the path must be absolute and begin with a slash.\nFor notebooks stored in a remote repository, the path must be relative. This field is required.\n" + }, + "source": { + "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved\nfrom the local \u003cDatabricks\u003e workspace or cloud location (if the `python_file` has a URI format). 
When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a \u003cDatabricks\u003e workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository.\n" + } + } + }, + "notification_settings": { + "description": "Optional notification settings that are used when sending notifications to each of the `email_notifications` and `webhook_notifications` for this task.", + "properties": { + "alert_on_last_attempt": { + "description": "If true, do not send notifications to recipients specified in `on_start` for the retried runs and do not send notifications to recipients specified in `on_failure` until the last retry of the run." + }, + "no_alert_for_canceled_runs": { + "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is canceled." + }, + "no_alert_for_skipped_runs": { + "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is skipped." + } + } + }, + "pipeline_task": { + "description": "If pipeline_task, indicates that this task must execute a Pipeline.", + "properties": { + "full_refresh": { + "description": "If true, a full refresh will be triggered on the delta live table." + }, + "pipeline_id": { + "description": "The full name of the pipeline task to execute." + } + } + }, + "python_wheel_task": { + "description": "If python_wheel_task, indicates that this job must execute a PythonWheel.", + "properties": { + "entry_point": { + "description": "Named entry point to use, if it does not exist in the metadata of the package it executes the function from the package directly using `$packageName.$entryPoint()`" + }, + "named_parameters": { + "description": "Command-line parameters passed to Python wheel task in the form of `[\"--name=task\", \"--data=dbfs:/path/to/data.json\"]`. Leave it empty if `parameters` is not null.", + "additionalproperties": { + "description": "" + } + }, + "package_name": { + "description": "Name of the package to execute" + }, + "parameters": { + "description": "Command-line parameters passed to Python wheel task. Leave it empty if `named_parameters` is not null.", + "items": { + "description": "" + } + } + } + }, + "retry_on_timeout": { + "description": "An optional policy to specify whether to retry a task when it times out." + }, + "run_if": { + "description": "An optional value specifying the condition determining whether the task is run once its dependencies have been completed.\n\n* `ALL_SUCCESS`: All dependencies have executed and succeeded\n* `AT_LEAST_ONE_SUCCESS`: At least one dependency has succeeded\n* `NONE_FAILED`: None of the dependencies have failed and at least one was executed\n* `ALL_DONE`: All dependencies have been completed\n* `AT_LEAST_ONE_FAILED`: At least one dependency failed\n* `ALL_FAILED`: ALl dependencies have failed\n" + }, + "run_job_task": { + "description": "If run_job_task, indicates that this task must execute another job.", + "properties": { + "job_id": { + "description": "ID of the job to trigger." + }, + "job_parameters": { + "description": "" + } + } + }, + "spark_jar_task": { + "description": "If spark_jar_task, indicates that this task must run a JAR.", + "properties": { + "jar_uri": { + "description": "Deprecated since 04/2016. Provide a `jar` through the `libraries` field instead. 
For an example, see :method:jobs/create.\n" + }, + "main_class_name": { + "description": "The full name of the class containing the main method to be executed. This class must be contained in a JAR provided as a library.\n\nThe code must use `SparkContext.getOrCreate` to obtain a Spark context; otherwise, runs of the job fail." + }, + "parameters": { + "description": "Parameters passed to the main method.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } + } + } + }, + "spark_python_task": { + "description": "If spark_python_task, indicates that this task must run a Python file.", + "properties": { + "parameters": { + "description": "Command line parameters passed to the Python file.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } + }, + "python_file": { + "description": "The Python file to be executed. Cloud file URIs (such as dbfs:/, s3:/, adls:/, gcs:/) and workspace paths are supported. For python files stored in the Databricks workspace, the path must be absolute and begin with `/`. For files stored in a remote repository, the path must be relative. This field is required." + }, + "source": { + "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved\nfrom the local \u003cDatabricks\u003e workspace or cloud location (if the `python_file` has a URI format). When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a \u003cDatabricks\u003e workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository.\n" + } + } + }, + "spark_submit_task": { + "description": "If `spark_submit_task`, indicates that this task must be launched by the spark submit script. This task can run only on new clusters.\n\nIn the `new_cluster` specification, `libraries` and `spark_conf` are not supported. Instead, use `--jars` and `--py-files` to add Java and Python libraries and `--conf` to set the Spark configurations. \n\n`master`, `deploy-mode`, and `executor-cores` are automatically configured by Databricks; you _cannot_ specify them in parameters.\n\nBy default, the Spark submit job uses all available memory (excluding reserved memory for Databricks services). You can set `--driver-memory`, and `--executor-memory` to a smaller value to leave some room for off-heap usage.\n\nThe `--jars`, `--py-files`, `--files` arguments support DBFS and S3 paths.\n", + "properties": { + "parameters": { + "description": "Command-line parameters passed to spark submit.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } + } + } + }, + "sql_task": { + "description": "If sql_task, indicates that this job must execute a SQL task.", + "properties": { + "alert": { + "description": "If alert, indicates that this job must refresh a SQL alert.", + "properties": { + "alert_id": { + "description": "The canonical identifier of the SQL alert." + }, + "pause_subscriptions": { + "description": "If true, the alert notifications are not sent to subscribers." 
+ }, + "subscriptions": { + "description": "If specified, alert notifications are sent to subscribers.", + "items": { + "description": "", + "properties": { + "destination_id": { + "description": "The canonical identifier of the destination to receive email notification. This parameter is mutually exclusive with user_name. You cannot set both destination_id and user_name for subscription notifications." + }, + "user_name": { + "description": "The user name to receive the subscription email. This parameter is mutually exclusive with destination_id. You cannot set both destination_id and user_name for subscription notifications." + } } - }, - "notebook_path": { - "description": "The path of the notebook to be run in the Databricks workspace or remote repository.\nFor notebooks stored in the Databricks workspace, the path must be absolute and begin with a slash.\nFor notebooks stored in a remote repository, the path must be relative. This field is required.\n" - }, - "source": { - "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved\nfrom the local \u003cDatabricks\u003e workspace or cloud location (if the `python_file` has a URI format). When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a \u003cDatabricks\u003e workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository.\n" } } - }, - "notification_settings": { - "description": "Optional notification settings that are used when sending notifications to each of the `email_notifications` for this task.", - "properties": { - "alert_on_last_attempt": { - "description": "If true, do not send notifications to recipients specified in `on_start` for the retried runs and do not send notifications to recipients specified in `on_failure` until the last retry of the run." - }, - "no_alert_for_canceled_runs": { - "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is canceled." - }, - "no_alert_for_skipped_runs": { - "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is skipped." + } + }, + "dashboard": { + "description": "If dashboard, indicates that this job must refresh a SQL dashboard.", + "properties": { + "custom_subject": { + "description": "Subject of the email sent to subscribers of this task." + }, + "dashboard_id": { + "description": "The canonical identifier of the SQL dashboard." + }, + "pause_subscriptions": { + "description": "If true, the dashboard snapshot is not taken, and emails are not sent to subscribers." + }, + "subscriptions": { + "description": "If specified, dashboard snapshots are sent to subscriptions.", + "items": { + "description": "", + "properties": { + "destination_id": { + "description": "The canonical identifier of the destination to receive email notification. This parameter is mutually exclusive with user_name. You cannot set both destination_id and user_name for subscription notifications." + }, + "user_name": { + "description": "The user name to receive the subscription email. This parameter is mutually exclusive with destination_id. You cannot set both destination_id and user_name for subscription notifications." 
+ } + } } } - }, - "pipeline_task": { - "description": "If pipeline_task, indicates that this task must execute a Pipeline.", - "properties": { - "full_refresh": { - "description": "If true, a full refresh will be triggered on the delta live table." - }, - "pipeline_id": { - "description": "The full name of the pipeline task to execute." - } + } + }, + "file": { + "description": "If file, indicates that this job runs a SQL file in a remote Git repository. Only one SQL statement is supported in a file. Multiple SQL statements separated by semicolons (;) are not permitted.", + "properties": { + "path": { + "description": "Relative path of the SQL file in the remote Git repository." } - }, - "python_wheel_task": { - "description": "If python_wheel_task, indicates that this job must execute a PythonWheel.", - "properties": { - "entry_point": { - "description": "Named entry point to use, if it does not exist in the metadata of the package it executes the function from the package directly using `$packageName.$entryPoint()`" - }, - "named_parameters": { - "description": "Command-line parameters passed to Python wheel task in the form of `[\"--name=task\", \"--data=dbfs:/path/to/data.json\"]`. Leave it empty if `parameters` is not null.", - "additionalproperties": { - "description": "" - } - }, - "package_name": { - "description": "Name of the package to execute" - }, - "parameters": { - "description": "Command-line parameters passed to Python wheel task. Leave it empty if `named_parameters` is not null.", - "items": { - "description": "" - } - } + } + }, + "parameters": { + "description": "Parameters to be used for each run of this job. The SQL alert task does not support custom parameters.", + "additionalproperties": { + "description": "" + } + }, + "query": { + "description": "If query, indicates that this job must execute a SQL query.", + "properties": { + "query_id": { + "description": "The canonical identifier of the SQL query." } - }, - "retry_on_timeout": { - "description": "An optional policy to specify whether to retry a task when it times out. The default behavior is to not retry on timeout." - }, - "run_if": { - "description": "An optional value specifying the condition determining whether the task is run once its dependencies have been completed. When omitted, defaults to `ALL_SUCCESS`.\n\n* `ALL_SUCCESS`: All dependencies have executed and succeeded\n* `AT_LEAST_ONE_SUCCESS`: At least one dependency has succeeded\n* `NONE_FAILED`: None of the dependencies have failed and at least one was executed\n* `ALL_DONE`: All dependencies completed and at least one was executed\n* `AT_LEAST_ONE_FAILED`: At least one dependency failed\n* `ALL_FAILED`: ALl dependencies have failed\n" - }, - "spark_jar_task": { - "description": "If spark_jar_task, indicates that this task must run a JAR.", + } + }, + "warehouse_id": { + "description": "The canonical identifier of the SQL warehouse. Recommended to use with serverless or pro SQL warehouses. Classic SQL warehouses are only supported for SQL alert, dashboard and query tasks and are limited to scheduled single-task jobs." + } + } + }, + "task_key": { + "description": "A unique name for the task. This field is used to refer to this task from other tasks.\nThis field is required and must be unique within its parent job.\nOn Update or Reset, this field is used to reference the tasks to be updated or reset." + }, + "timeout_seconds": { + "description": "An optional timeout applied to each run of this job task. A value of `0` means no timeout." 
+ }, + "webhook_notifications": { + "description": "A collection of system notification IDs to notify when runs of this job begin or complete.", + "properties": { + "on_duration_warning_threshold_exceeded": { + "description": "An optional list of system notification IDs to call when the duration of a run exceeds the threshold specified for the `RUN_DURATION_SECONDS` metric in the `health` field. A maximum of 3 destinations can be specified for the `on_duration_warning_threshold_exceeded` property.", + "items": { + "description": "", "properties": { - "jar_uri": { - "description": "Deprecated since 04/2016. Provide a `jar` through the `libraries` field instead. For an example, see :method:jobs/create.\n" - }, - "main_class_name": { - "description": "The full name of the class containing the main method to be executed. This class must be contained in a JAR provided as a library.\n\nThe code must use `SparkContext.getOrCreate` to obtain a Spark context; otherwise, runs of the job fail." - }, - "parameters": { - "description": "Parameters passed to the main method.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", - "items": { - "description": "" - } + "id": { + "description": "" } } - }, - "spark_python_task": { - "description": "If spark_python_task, indicates that this task must run a Python file.", + } + }, + "on_failure": { + "description": "An optional list of system notification IDs to call when the run fails. A maximum of 3 destinations can be specified for the `on_failure` property.", + "items": { + "description": "", "properties": { - "parameters": { - "description": "Command line parameters passed to the Python file.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", - "items": { - "description": "" - } - }, - "python_file": { - "description": "The Python file to be executed. Cloud file URIs (such as dbfs:/, s3:/, adls:/, gcs:/) and workspace paths are supported. For python files stored in the Databricks workspace, the path must be absolute and begin with `/`. For files stored in a remote repository, the path must be relative. This field is required." - }, - "source": { - "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved\nfrom the local \u003cDatabricks\u003e workspace or cloud location (if the `python_file` has a URI format). When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a \u003cDatabricks\u003e workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository.\n" + "id": { + "description": "" } } - }, - "spark_submit_task": { - "description": "If spark_submit_task, indicates that this task must be launched by the spark submit script. This task can run only on new clusters.", + } + }, + "on_start": { + "description": "An optional list of system notification IDs to call when the run starts. 
A maximum of 3 destinations can be specified for the `on_start` property.", + "items": { + "description": "", "properties": { - "parameters": { - "description": "Command-line parameters passed to spark submit.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", - "items": { - "description": "" - } + "id": { + "description": "" } } - }, - "sql_task": { - "description": "If sql_task, indicates that this job must execute a SQL task.", + } + }, + "on_success": { + "description": "An optional list of system notification IDs to call when the run completes successfully. A maximum of 3 destinations can be specified for the `on_success` property.", + "items": { + "description": "", "properties": { - "alert": { - "description": "If alert, indicates that this job must refresh a SQL alert.", - "properties": { - "alert_id": { - "description": "The canonical identifier of the SQL alert." - }, - "pause_subscriptions": { - "description": "If true, the alert notifications are not sent to subscribers." - }, - "subscriptions": { - "description": "If specified, alert notifications are sent to subscribers.", - "items": { - "description": "", - "properties": { - "destination_id": { - "description": "The canonical identifier of the destination to receive email notification." - }, - "user_name": { - "description": "The user name to receive the subscription email." - } - } - } - } - } - }, - "dashboard": { - "description": "If dashboard, indicates that this job must refresh a SQL dashboard.", - "properties": { - "custom_subject": { - "description": "Subject of the email sent to subscribers of this task." - }, - "dashboard_id": { - "description": "The canonical identifier of the SQL dashboard." - }, - "pause_subscriptions": { - "description": "If true, the dashboard snapshot is not taken, and emails are not sent to subscribers." - }, - "subscriptions": { - "description": "If specified, dashboard snapshots are sent to subscriptions.", - "items": { - "description": "", - "properties": { - "destination_id": { - "description": "The canonical identifier of the destination to receive email notification." - }, - "user_name": { - "description": "The user name to receive the subscription email." - } - } - } - } - } - }, - "file": { - "description": "If file, indicates that this job runs a SQL file in a remote Git repository. Only one SQL statement is supported in a file. Multiple SQL statements separated by semicolons (;) are not permitted.", - "properties": { - "path": { - "description": "Relative path of the SQL file in the remote Git repository." - } - } - }, - "parameters": { - "description": "Parameters to be used for each run of this job. The SQL alert task does not support custom parameters.", - "additionalproperties": { - "description": "" - } - }, - "query": { - "description": "If query, indicates that this job must execute a SQL query.", - "properties": { - "query_id": { - "description": "The canonical identifier of the SQL query." - } - } - }, - "warehouse_id": { - "description": "The canonical identifier of the SQL warehouse. Only serverless and pro SQL warehouses are supported." + "id": { + "description": "" } } - }, - "task_key": { - "description": "A unique name for the task. 
This field is used to refer to this task from other tasks.\nThis field is required and must be unique within its parent job.\nOn Update or Reset, this field is used to reference the tasks to be updated or reset.\nThe maximum length is 100 characters." - }, - "timeout_seconds": { - "description": "An optional timeout applied to each run of this job task. The default behavior is to have no timeout." } } } + } + } + } + }, + "timeout_seconds": { + "description": "An optional timeout applied to each run of this job. A value of `0` means no timeout." + }, + "trigger": { + "description": "Trigger settings for the job. Can be used to trigger a run when new files arrive in an external location. The default behavior is that the job runs only when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", + "properties": { + "file_arrival": { + "description": "File arrival trigger settings.", + "properties": { + "min_time_between_triggers_seconds": { + "description": "If set, the trigger starts a run only after the specified amount of time passed since\nthe last time the trigger fired. The minimum allowed value is 60 seconds\n" + }, + "url": { + "description": "URL to be monitored for file arrivals. The path must point to the root or a subpath of the external location." + }, + "wait_after_last_change_seconds": { + "description": "If set, the trigger starts a run only after no file activity has occurred for the specified amount of time.\nThis makes it possible to wait for a batch of incoming files to arrive before triggering a run. The\nminimum allowed value is 60 seconds.\n" + } + } + }, + "pause_status": { + "description": "Indicate whether this schedule is paused or not." + } + } + }, + "webhook_notifications": { + "description": "A collection of system notification IDs to notify when runs of this job begin or complete.", + "properties": { + "on_duration_warning_threshold_exceeded": { + "description": "An optional list of system notification IDs to call when the duration of a run exceeds the threshold specified for the `RUN_DURATION_SECONDS` metric in the `health` field. A maximum of 3 destinations can be specified for the `on_duration_warning_threshold_exceeded` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_failure": { + "description": "An optional list of system notification IDs to call when the run fails. A maximum of 3 destinations can be specified for the `on_failure` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_start": { + "description": "An optional list of system notification IDs to call when the run starts. A maximum of 3 destinations can be specified for the `on_start` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_success": { + "description": "An optional list of system notification IDs to call when the run completes successfully. 
A maximum of 3 destinations can be specified for the `on_success` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + } + } + } + } + } + }, + "model_serving_endpoints": { + "description": "List of Model Serving Endpoints", + "additionalproperties": { + "description": "", + "properties": { + "config": { + "description": "The core config of the serving endpoint.", + "properties": { + "served_models": { + "description": "A list of served models for the endpoint to serve. A serving endpoint can have up to 10 served models.", + "items": { + "description": "", + "properties": { + "environment_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs used for serving this model.\nNote: this is an experimental feature and subject to change. \nExample model environment variables that refer to Databricks secrets: `{\"OPENAI_API_KEY\": \"{{secrets/my_scope/my_key}}\", \"DATABRICKS_TOKEN\": \"{{secrets/my_scope2/my_key2}}\"}`", + "additionalproperties": { + "description": "" + } + }, + "instance_profile_arn": { + "description": "ARN of the instance profile that the served model will use to access AWS resources." + }, + "model_name": { + "description": "The name of the model in Databricks Model Registry to be served or if the model resides in Unity Catalog, the full name of model, \nin the form of __catalog_name__.__schema_name__.__model_name__.\n" + }, + "model_version": { + "description": "The version of the model in Databricks Model Registry or Unity Catalog to be served." + }, + "name": { + "description": "The name of a served model. It must be unique across an endpoint. If not specified, this field will default to \u003cmodel-name\u003e-\u003cmodel-version\u003e.\nA served model name can consist of alphanumeric characters, dashes, and underscores.\n" + }, + "scale_to_zero_enabled": { + "description": "Whether the compute resources for the served model should scale down to zero." + }, + "workload_size": { + "description": "The workload size of the served model. The workload size corresponds to a range of provisioned concurrency that the compute will autoscale between.\nA single unit of provisioned concurrency can process one request at a time.\nValid workload sizes are \"Small\" (4 - 4 provisioned concurrency), \"Medium\" (8 - 16 provisioned concurrency), and \"Large\" (16 - 64 provisioned concurrency).\nIf scale-to-zero is enabled, the lower bound of the provisioned concurrency for each workload size will be 0.\n" + }, + "workload_type": { + "description": "The workload type of the served model. The workload type selects which type of compute to use in the endpoint. The default value for this parameter is\n\"CPU\". For deep learning workloads, GPU acceleration is available by selecting workload types like GPU_SMALL and others. See documentation for all\noptions.\n" + } + } + } + }, + "traffic_config": { + "description": "The traffic config defining how invocations to the serving endpoint should be routed.", + "properties": { + "routes": { + "description": "The list of routes that define traffic to each served model.", + "items": { + "description": "", + "properties": { + "served_model_name": { + "description": "The name of the served model this route configures traffic for." + }, + "traffic_percentage": { + "description": "The percentage of endpoint traffic to send to this route. It must be an integer between 0 and 100 inclusive." 
+ } + } + } + } + } + } + } + }, + "name": { + "description": "The name of the serving endpoint. This field is required and must be unique across a Databricks workspace.\nAn endpoint name can consist of alphanumeric characters, dashes, and underscores.\n" + }, + "permissions": { + "description": "", + "items": { + "description": "", + "properties": { + "group_name": { + "description": "" + }, + "level": { + "description": "" + }, + "service_principal_name": { + "description": "" + }, + "user_name": { + "description": "" + } + } + } + }, + "tags": { + "description": "Tags to be attached to the serving endpoint and automatically propagated to billing logs.", + "items": { + "description": "", + "properties": { + "key": { + "description": "Key field for a serving endpoint tag." + }, + "value": { + "description": "Optional value field for a serving endpoint tag." + } + } + } + } + } + } + }, + "models": { + "description": "List of MLflow models", + "additionalproperties": { + "description": "", + "properties": { + "creation_timestamp": { + "description": "Timestamp recorded when this `registered_model` was created." + }, + "description": { + "description": "Description of this `registered_model`." + }, + "last_updated_timestamp": { + "description": "Timestamp recorded when metadata for this `registered_model` was last updated." + }, + "latest_versions": { + "description": "Collection of latest model versions for each stage.\nOnly contains models with current `READY` status.", + "items": { + "description": "", + "properties": { + "creation_timestamp": { + "description": "Timestamp recorded when this `model_version` was created." + }, + "current_stage": { + "description": "Current stage for this `model_version`." + }, + "description": { + "description": "Description of this `model_version`." + }, + "last_updated_timestamp": { + "description": "Timestamp recorded when metadata for this `model_version` was last updated." + }, + "name": { + "description": "Unique name of the model" + }, + "run_id": { + "description": "MLflow run ID used when creating `model_version`, if `source` was generated by an\nexperiment run stored in MLflow tracking server." + }, + "run_link": { + "description": "Run Link: Direct link to the run that generated this version" + }, + "source": { + "description": "URI indicating the location of the source model artifacts, used when creating `model_version`" + }, + "status": { + "description": "Current status of `model_version`" + }, + "status_message": { + "description": "Details on current `status`, if it is pending or failed." + }, + "tags": { + "description": "Tags: Additional metadata key-value pairs for this `model_version`.", + "items": { + "description": "", + "properties": { + "key": { + "description": "The tag key." + }, + "value": { + "description": "The tag value." + } + } + } + }, + "user_id": { + "description": "User that created this `model_version`." + }, + "version": { + "description": "Model's version number." + } + } + } + }, + "name": { + "description": "Unique name for the model." + }, + "permissions": { + "description": "", + "items": { + "description": "", + "properties": { + "group_name": { + "description": "" + }, + "level": { + "description": "" + }, + "service_principal_name": { + "description": "" + }, + "user_name": { + "description": "" + } + } + } + }, + "tags": { + "description": "Tags: Additional metadata key-value pairs for this `registered_model`.", + "items": { + "description": "", + "properties": { + "key": { + "description": "The tag key." 
+ }, + "value": { + "description": "The tag value." + } + } + } + }, + "user_id": { + "description": "User that created this `registered_model`" + } + } + } + }, + "pipelines": { + "description": "List of DLT pipelines", + "additionalproperties": { + "description": "", + "properties": { + "catalog": { + "description": "A catalog in Unity Catalog to publish data from this pipeline to. If `target` is specified, tables in this pipeline are published to a `target` schema inside `catalog` (for example, `catalog`.`target`.`table`). If `target` is not specified, no data is published to Unity Catalog." + }, + "channel": { + "description": "DLT Release Channel that specifies which version to use." + }, + "clusters": { + "description": "Cluster settings for this pipeline deployment.", + "items": { + "description": "", + "properties": { + "apply_policy_default_values": { + "description": "Note: This field won't be persisted. Only API users will check this field." + }, + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + } + } + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. 
If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nThe list of available zones as well as the default value can be found by using the\n`List Zones` method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
+ }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nOnly dbfs destinations are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.\n", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." 
+ } + } + } + } + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver.\nNote that this field is optional; if unset, the driver node type will be set as the same value\nas `node_type_id` defined above." + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + }, + "local_ssd_count": { + "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." + } + } }, - "timeout_seconds": { - "description": "An optional timeout applied to each run of this job. The default behavior is to have no timeout." + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." }, - "trigger": { - "description": "Trigger settings for the job. Can be used to trigger a run when new files arrive in an external location. The default behavior is that the job runs only when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", + "label": { + "description": "A label for the cluster specification, either `default` to configure the default cluster, or `maintenance` to configure the maintenance cluster. This field is optional. The default value is `default`." + }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. 
For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nSee :method:clusters/create for more details.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + } + } + } + }, + "configuration": { + "description": "String-String configuration for this pipeline execution.", + "additionalproperties": { + "description": "" + } + }, + "continuous": { + "description": "Whether the pipeline is continuous or triggered. This replaces `trigger`." + }, + "development": { + "description": "Whether the pipeline is in Development mode. Defaults to false." + }, + "edition": { + "description": "Pipeline product edition." + }, + "filters": { + "description": "Filters on which Pipeline packages to include in the deployed graph.", + "properties": { + "exclude": { + "description": "Paths to exclude.", + "items": { + "description": "" + } + }, + "include": { + "description": "Paths to include.", + "items": { + "description": "" + } + } + } + }, + "id": { + "description": "Unique identifier for this pipeline." + }, + "libraries": { + "description": "Libraries or code needed by this deployment.", + "items": { + "description": "", + "properties": { + "file": { + "description": "The path to a file that defines a pipeline and is stored in the Databricks Repos.\n", "properties": { - "file_arrival": { - "description": "File arrival trigger settings.", - "properties": { - "min_time_between_triggers_seconds": { - "description": "If set, the trigger starts a run only after the specified amount of time passed since\nthe last time the trigger fired. The minimum allowed value is 60 seconds\n" - }, - "url": { - "description": "URL to be monitored for file arrivals. The path must point to the root or a subpath of the external location." - }, - "wait_after_last_change_seconds": { - "description": "If set, the trigger starts a run only after no file activity has occurred for the specified amount of time.\nThis makes it possible to wait for a batch of incoming files to arrive before triggering a run. 
The\nminimum allowed value is 60 seconds.\n" - } - } - }, - "pause_status": { - "description": "Whether this trigger is paused or not." + "path": { + "description": "The absolute path of the file." } } }, - "webhook_notifications": { - "description": "A collection of system notification IDs to notify when the run begins or completes. The default behavior is to not send any system notifications.", + "jar": { + "description": "URI of the jar to be installed. Currently only DBFS is supported.\n" + }, + "maven": { + "description": "Specification of a maven library to be installed.\n", "properties": { - "on_failure": { - "description": "An optional list of system notification IDs to call when the run fails. A maximum of 3 destinations can be specified for the `on_failure` property.", - "items": { - "description": "", - "properties": { - "id": { - "description": "" - } - } - } + "coordinates": { + "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." }, - "on_start": { - "description": "An optional list of system notification IDs to call when the run starts. A maximum of 3 destinations can be specified for the `on_start` property.", + "exclusions": { + "description": "List of dependences to exclude. For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", "items": { - "description": "", - "properties": { - "id": { - "description": "" - } - } + "description": "" } }, - "on_success": { - "description": "An optional list of system notification IDs to call when the run completes successfully. A maximum of 3 destinations can be specified for the `on_success` property.", - "items": { - "description": "", - "properties": { - "id": { - "description": "" - } - } - } + "repo": { + "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." + } + } + }, + "notebook": { + "description": "The path to a notebook that defines a pipeline and is stored in the \u003cDatabricks\u003e workspace.\n", + "properties": { + "path": { + "description": "The absolute path of the notebook." } } } } } }, - "models": { - "description": "List of MLflow models", + "name": { + "description": "Friendly identifier for this pipeline." + }, + "notifications": { + "description": "List of notification settings for this pipeline.", + "items": { + "description": "", + "properties": { + "alerts": { + "description": "A list of alerts that trigger the sending of notifications to the configured\ndestinations. The supported alerts are:\n\n* `on-update-success`: A pipeline update completes successfully.\n* `on-update-failure`: Each time a pipeline update fails.\n* `on-update-fatal-failure`: A pipeline update fails with a non-retryable (fatal) error.\n* `on-flow-failure`: A single data flow fails.\n", + "items": { + "description": "" + } + }, + "email_recipients": { + "description": "A list of email addresses notified when a configured alert is triggered.\n", + "items": { + "description": "" + } + } + } + } + }, + "permissions": { + "description": "", + "items": { + "description": "", + "properties": { + "group_name": { + "description": "" + }, + "level": { + "description": "" + }, + "service_principal_name": { + "description": "" + }, + "user_name": { + "description": "" + } + } + } + }, + "photon": { + "description": "Whether Photon is enabled for this pipeline." 
+ }, + "serverless": { + "description": "Whether serverless compute is enabled for this pipeline." + }, + "storage": { + "description": "DBFS root directory for storing checkpoints and tables." + }, + "target": { + "description": "Target schema (database) to add tables in this pipeline to. If not specified, no data is published to the Hive metastore or Unity Catalog. To publish to Unity Catalog, also specify `catalog`." + }, + "trigger": { + "description": "Which pipeline trigger to use. Deprecated: Use `continuous` instead.", + "properties": { + "cron": { + "description": "", + "properties": { + "quartz_cron_schedule": { + "description": "" + }, + "timezone_id": { + "description": "" + } + } + }, + "manual": { + "description": "" + } + } + } + } + } + }, + "registered_models": { + "description": "List of Registered Models", + "additionalproperties": { + "description": "", + "properties": { + "catalog_name": { + "description": "The name of the catalog where the schema and the registered model reside" + }, + "comment": { + "description": "The comment attached to the registered model" + }, + "grants": { + "description": "", + "items": { + "description": "", + "properties": { + "principal": { + "description": "" + }, + "privileges": { + "description": "", + "items": { + "description": "" + } + } + } + } + }, + "name": { + "description": "The name of the registered model" + }, + "schema_name": { + "description": "The name of the schema where the registered model resides" + }, + "storage_location": { + "description": "The storage location on the cloud under which model version data files are stored" + } + } + } + } + } + }, + "run_as": { + "description": "", + "properties": { + "service_principal_name": { + "description": "" + }, + "user_name": { + "description": "" + } + } + }, + "sync": { + "description": "", + "properties": { + "exclude": { + "description": "", + "items": { + "description": "" + } + }, + "include": { + "description": "", + "items": { + "description": "" + } + } + } + }, + "targets": { + "description": "", + "additionalproperties": { + "description": "", + "properties": { + "artifacts": { + "description": "A description of all code artifacts in this bundle.", + "additionalproperties": { + "description": "", + "properties": { + "build": { + "description": "" + }, + "files": { + "description": "", + "items": { + "description": "", + "properties": { + "source": { + "description": "" + } + } + } + }, + "path": { + "description": "" + }, + "type": { + "description": "" + } + } + } + }, + "bundle": { + "description": "The details for this bundle.", + "properties": { + "compute_id": { + "description": "" + }, + "git": { + "description": "", + "properties": { + "branch": { + "description": "" + }, + "origin_url": { + "description": "" + } + } + }, + "name": { + "description": "The name of the bundle." 
+ } + } + }, + "compute_id": { + "description": "" + }, + "default": { + "description": "" + }, + "git": { + "description": "", + "properties": { + "branch": { + "description": "" + }, + "origin_url": { + "description": "" + } + } + }, + "mode": { + "description": "" + }, + "permissions": { + "description": "", + "items": { + "description": "", + "properties": { + "group_name": { + "description": "" + }, + "level": { + "description": "" + }, + "service_principal_name": { + "description": "" + }, + "user_name": { + "description": "" + } + } + } + }, + "resources": { + "description": "Collection of Databricks resources to deploy.", + "properties": { + "experiments": { + "description": "List of MLflow experiments", "additionalproperties": { "description": "", "properties": { - "creation_timestamp": { - "description": "Timestamp recorded when this `registered_model` was created." + "artifact_location": { + "description": "Location where artifacts for the experiment are stored." }, - "description": { - "description": "Description of this `registered_model`." + "creation_time": { + "description": "Creation time" }, - "last_updated_timestamp": { - "description": "Timestamp recorded when metadata for this `registered_model` was last updated." + "experiment_id": { + "description": "Unique identifier for the experiment." }, - "latest_versions": { - "description": "Collection of latest model versions for each stage.\nOnly contains models with current `READY` status.", - "items": { - "description": "", - "properties": { - "creation_timestamp": { - "description": "Timestamp recorded when this `model_version` was created." - }, - "current_stage": { - "description": "Current stage for this `model_version`." - }, - "description": { - "description": "Description of this `model_version`." - }, - "last_updated_timestamp": { - "description": "Timestamp recorded when metadata for this `model_version` was last updated." - }, - "name": { - "description": "Unique name of the model" - }, - "run_id": { - "description": "MLflow run ID used when creating `model_version`, if `source` was generated by an\nexperiment run stored in MLflow tracking server." - }, - "run_link": { - "description": "Run Link: Direct link to the run that generated this version" - }, - "source": { - "description": "URI indicating the location of the source model artifacts, used when creating `model_version`" - }, - "status": { - "description": "Current status of `model_version`" - }, - "status_message": { - "description": "Details on current `status`, if it is pending or failed." - }, - "tags": { - "description": "Tags: Additional metadata key-value pairs for this `model_version`.", - "items": { - "description": "", - "properties": { - "key": { - "description": "The tag key." - }, - "value": { - "description": "The tag value." - } - } - } - }, - "user_id": { - "description": "User that created this `model_version`." - }, - "version": { - "description": "Model's version number." - } - } - } + "last_update_time": { + "description": "Last update time" + }, + "lifecycle_stage": { + "description": "Current life cycle stage of the experiment: \"active\" or \"deleted\".\nDeleted experiments are not returned by APIs." }, "name": { - "description": "Unique name for the model." + "description": "Human readable name that identifies the experiment." 
}, "permissions": { "description": "", @@ -1422,7 +2323,7 @@ } }, "tags": { - "description": "Tags: Additional metadata key-value pairs for this `registered_model`.", + "description": "Tags: Additional metadata key-value pairs.", "items": { "description": "", "properties": { @@ -1434,372 +2335,505 @@ } } } - }, - "user_id": { - "description": "User that created this `registered_model`" } } } }, - "model_serving_endpoints": { - "description": "List of Model Serving Endpoints", + "jobs": { + "description": "List of Databricks jobs", "additionalproperties": { "description": "", "properties": { - "name": { - "description": "The name of the model serving endpoint. This field is required and must be unique across a workspace. An endpoint name can consist of alphanumeric characters, dashes, and underscores. NOTE: Changing this name will delete the existing endpoint and create a new endpoint with the update name." - }, - "permissions": { - "description": "", + "compute": { + "description": "A list of compute requirements that can be referenced by tasks of this job.", "items": { "description": "", "properties": { - "group_name": { - "description": "" + "compute_key": { + "description": "A unique name for the compute requirement. This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine the compute requirements for the task execution." }, - "level": { + "spec": { + "description": "", + "properties": { + "kind": { + "description": "The kind of compute described by this compute specification." + } + } + } + } + } + }, + "continuous": { + "description": "An optional continuous property for this job. The continuous property will ensure that there is always one run executing. Only one of `schedule` and `continuous` can be used.", + "properties": { + "pause_status": { + "description": "Indicate whether this schedule is paused or not." + } + } + }, + "deployment": { + "description": "Deployment information for jobs managed by external sources.", + "properties": { + "kind": { + "description": "The kind of deployment that manages the job.\n\n* `BUNDLE`: The job is managed by Databricks Asset Bundle.\n" + }, + "metadata_file_path": { + "description": "Path of the file that contains deployment metadata." + } + } + }, + "description": { + "description": "An optional description for the job. The maximum length is 1024 characters in UTF-8 encoding." + }, + "edit_mode": { + "description": "Edit mode of the job.\n\n* `UI_LOCKED`: The job is in a locked UI state and cannot be modified.\n* `EDITABLE`: The job is in an editable state and can be modified.\n" + }, + "email_notifications": { + "description": "An optional set of email addresses that is notified when runs of this job begin or complete as well as when this job is deleted.", + "properties": { + "no_alert_for_skipped_runs": { + "description": "If true, do not send email to recipients specified in `on_failure` if the run is skipped." + }, + "on_duration_warning_threshold_exceeded": { + "description": "A list of email addresses to be notified when the duration of a run exceeds the threshold specified for the `RUN_DURATION_SECONDS` metric in the `health` field. If no rule for the `RUN_DURATION_SECONDS` metric is specified in the `health` field for the job, notifications are not sent.", + "items": { "description": "" - }, - "service_principal_name": { + } + }, + "on_failure": { + "description": "A list of email addresses to be notified when a run unsuccessfully completes. 
A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", + "items": { "description": "" - }, - "user_name": { + } + }, + "on_start": { + "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_success": { + "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESS` result_state. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { "description": "" } } } }, - "config": { - "description": "The model serving endpoint configuration.", + "format": { + "description": "Used to tell what is the format of the job. This field is ignored in Create/Update/Reset calls. When using the Jobs API 2.1 this value is always set to `\"MULTI_TASK\"`." + }, + "git_source": { + "description": "An optional specification for a remote Git repository containing the source code used by tasks. Version-controlled source code is supported by notebook, dbt, Python script, and SQL File tasks.\n\nIf `git_source` is set, these tasks retrieve the file from the remote repository by default. However, this behavior can be overridden by setting `source` to `WORKSPACE` on the task.\n\nNote: dbt and SQL File tasks support only version-controlled sources. If dbt or SQL File tasks are used, `git_source` must be defined on the job.", + "properties": { + "git_branch": { + "description": "Name of the branch to be checked out and used by this job. This field cannot be specified in conjunction with git_tag or git_commit." + }, + "git_commit": { + "description": "Commit to be checked out and used by this job. This field cannot be specified in conjunction with git_branch or git_tag." + }, + "git_provider": { + "description": "Unique identifier of the service used to host the Git repository. The value is case insensitive." + }, + "git_snapshot": { + "description": "", + "properties": { + "used_commit": { + "description": "Commit that was used to execute the run. If git_branch was specified, this points to the HEAD of the branch at the time of the run; if git_tag was specified, this points to the commit the tag points to." + } + } + }, + "git_tag": { + "description": "Name of the tag to be checked out and used by this job. This field cannot be specified in conjunction with git_branch or git_commit." + }, + "git_url": { + "description": "URL of the repository to be cloned by this job." + }, + "job_source": { + "description": "The source of the job specification in the remote repository when the job is source controlled.", + "properties": { + "dirty_state": { + "description": "Dirty state indicates the job is not fully synced with the job specification in the remote repository.\n\nPossible values are:\n* `NOT_SYNCED`: The job is not yet synced with the remote job specification. Import the remote job specification from UI to make the job fully synced.\n* `DISCONNECTED`: The job is temporary disconnected from the remote job specification and is allowed for live edit. 
Import the remote job specification again from UI to make the job fully synced.\n" + }, + "import_from_git_branch": { + "description": "Name of the branch which the job is imported from." + }, + "job_config_path": { + "description": "Path of the job YAML file that contains the job specification." + } + } + } + } + }, + "health": { + "description": "", "properties": { - "properties": { - "served_models": { - "description": "Each block represents a served model for the endpoint to serve. A model serving endpoint can have up to 10 served models.", - "items": { - "description": "", - "properties" : { - "name": { - "description": "The name of a served model. It must be unique across an endpoint. If not specified, this field will default to modelname-modelversion. A served model name can consist of alphanumeric characters, dashes, and underscores." - }, - "model_name": { - "description": "The name of the model in Databricks Model Registry to be served." - }, - "model_version": { - "description": "The version of the model in Databricks Model Registry to be served." - }, - "workload_size": { - "description": "The workload size of the served model. The workload size corresponds to a range of provisioned concurrency that the compute will autoscale between. A single unit of provisioned concurrency can process one request at a time. Valid workload sizes are \"Small\" (4 - 4 provisioned concurrency), \"Medium\" (8 - 16 provisioned concurrency), and \"Large\" (16 - 64 provisioned concurrency)." - }, - "scale_to_zero_enabled": { - "description": "Whether the compute resources for the served model should scale down to zero. If scale-to-zero is enabled, the lower bound of the provisioned concurrency for each workload size will be 0." - } + "rules": { + "description": "", + "items": { + "description": "", + "properties": { + "metric": { + "description": "" + }, + "op": { + "description": "" + }, + "value": { + "description": "Specifies the threshold value that the health metric should obey to satisfy the health rule." } } + } + } + } + }, + "job_clusters": { + "description": "A list of job cluster specifications that can be shared and reused by tasks of this job. Libraries cannot be declared in a shared job cluster. You must declare dependent libraries in task settings.", + "items": { + "description": "", + "properties": { + "job_cluster_key": { + "description": "A unique name for the job cluster. This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine which cluster to launch for the task execution." }, - "traffic_config": { - "description": "A single block represents the traffic split configuration amongst the served models.", + "new_cluster": { + "description": "If new_cluster, a description of a cluster that is created for each task.", "properties": { - "routes": { - "description": "Each block represents a route that defines traffic to each served model. Each served_models block needs to have a corresponding routes block.", + "apply_policy_default_values": { + "description": "" + }, + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." 
+ }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + } + } + }, + "autotermination_minutes": { + "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." 
+ }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nThe list of available zones as well as the default value can be found by using the\n`List Zones` method." + } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. 
If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + } + } + } + } + }, + "cluster_name": { + "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" + }, + "cluster_source": { + "description": "" + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. 
Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "data_security_mode": { + "description": "" + }, + "docker_image": { + "description": "", + "properties": { + "basic_auth": { + "description": "", + "properties": { + "password": { + "description": "Password of the user" + }, + "username": { + "description": "Name of the user" + } + } + }, + "url": { + "description": "URL of the docker image." + } + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" + }, + "enable_elastic_disk": { + "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." + }, + "enable_local_disk_encryption": { + "description": "Whether to enable LUKS on cluster VMs' local disks" + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + }, + "local_ssd_count": { + "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." + } + } + }, + "init_scripts": { + "description": "The configuration for storing init scripts. Any number of destinations can be specified. The scripts are executed sequentially in the order provided. If `cluster_log_conf` is specified, init script logs are sent to `\u003cdestination\u003e/\u003ccluster-ID\u003e/init_scripts`.", "items": { "description": "", "properties": { - "served_model_name": { - "description": "The name of the served model this route configures traffic for. This needs to match the name of a served_models block." + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "file": { + "description": "destination needs to be provided. e.g.\n`{ \"file\" : { \"destination\" : \"file:/my/local/file.sh\" } }`", + "properties": { + "destination": { + "description": "local file destination, e.g. 
`file:/my/local/file.sh`" + } + } + }, + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + } + } + }, + "volumes": { + "description": "destination needs to be provided. e.g.\n`{ \"volumes\" : { \"destination\" : \"/Volumes/my-init.sh\" } }`", + "properties": { + "destination": { + "description": "Unity Catalog Volumes file destination, e.g. `/Volumes/my-init.sh`" + } + } }, - "traffic_percentage": { - "description": "The percentage of endpoint traffic to send to this route. It must be an integer between 0 and 100 inclusive." + "workspace": { + "description": "destination needs to be provided. e.g.\n`{ \"workspace\" : { \"destination\" : \"/Users/user1@databricks.com/my-init.sh\" } }`", + "properties": { + "destination": { + "description": "workspace files destination, e.g. `/Users/user1@databricks.com/my-init.sh`" + } + } } } } - } - } - } - } - } - } - } - } - }, - "pipelines": { - "description": "List of DLT pipelines", - "additionalproperties": { - "description": "", - "properties": { - "catalog": { - "description": "A catalog in Unity Catalog to publish data from this pipeline to. If `target` is specified, tables in this pipeline are published to a `target` schema inside `catalog` (for example, `catalog`.`target`.`table`). If `target` is not specified, no data is published to Unity Catalog." - }, - "channel": { - "description": "DLT Release Channel that specifies which version to use." 
- }, - "clusters": { - "description": "Cluster settings for this pipeline deployment.", - "items": { - "description": "", - "properties": { - "apply_policy_default_values": { - "description": "Note: This field won't be persisted. Only API users will check this field." - }, - "autoscale": { - "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", - "properties": { - "max_workers": { - "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." - }, - "min_workers": { - "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." - } - } - }, - "aws_attributes": { - "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" }, - "ebs_volume_count": { - "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." }, - "ebs_volume_iops": { - "description": "\u003cneeds content added\u003e" + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" }, - "ebs_volume_size": { - "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + "num_workers": { + "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." 
}, - "ebs_volume_throughput": { - "description": "\u003cneeds content added\u003e" + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." }, - "ebs_volume_type": { + "runtime_engine": { "description": "" }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." - }, - "instance_profile_arn": { - "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." - }, - "spot_bid_price_percent": { - "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." - }, - "zone_id": { - "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." 
- } - } - }, - "azure_attributes": { - "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" + "single_user_name": { + "description": "Single user name if data_security_mode is `SINGLE_USER`" }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", + "additionalproperties": { + "description": "" + } }, - "log_analytics_info": { - "description": "Defines values necessary to configure and run Azure Log Analytics agent", - "properties": { - "log_analytics_primary_key": { - "description": "\u003cneeds content added\u003e" - }, - "log_analytics_workspace_id": { - "description": "\u003cneeds content added\u003e" - } + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" } }, - "spot_bid_max_price": { - "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." - } - } - }, - "cluster_log_conf": { - "description": "The configuration for delivering spark logs to a long-term storage destination.\nOnly dbfs destinations are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.\n", - "properties": { - "dbfs": { - "description": "destination needs to be provided. 
e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", - "properties": { - "destination": { - "description": "dbfs destination, e.g. `dbfs:/my/path`" - } + "spark_version": { + "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" } }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "workload_type": { + "description": "", "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." - }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." - }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." - }, - "endpoint": { - "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." - }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." - }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + "clients": { + "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", + "properties": { + "jobs": { + "description": "With jobs set, the cluster can be used for jobs" + }, + "notebooks": { + "description": "With notebooks set, this cluster can be used for notebooks" + } + } } } } } - }, - "custom_tags": { - "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. 
Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", - "additionalproperties": { - "description": "" - } - }, - "driver_instance_pool_id": { - "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." - }, - "driver_node_type_id": { - "description": "The node type of the Spark driver.\nNote that this field is optional; if unset, the driver node type will be set as the same value\nas `node_type_id` defined above." - }, - "gcp_attributes": { - "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "boot_disk_size": { - "description": "boot disk size in GB" - }, - "google_service_account": { - "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." - }, - "local_ssd_count": { - "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." - } - } - }, - "instance_pool_id": { - "description": "The optional ID of the instance pool to which the cluster belongs." - }, - "label": { - "description": "A label for the cluster specification, either `default` to configure the default cluster, or `maintenance` to configure the maintenance cluster. This field is optional. The default value is `default`." - }, - "node_type_id": { - "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" - }, - "num_workers": { - "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." - }, - "policy_id": { - "description": "The ID of the cluster policy used to create the cluster if applicable." 
- }, - "spark_conf": { - "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nSee :method:clusters/create for more details.\n", - "additionalproperties": { - "description": "" - } - }, - "spark_env_vars": { - "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", - "additionalproperties": { - "description": "" - } - }, - "ssh_public_keys": { - "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", - "items": { - "description": "" - } - } - } - } - }, - "configuration": { - "description": "String-String configuration for this pipeline execution.", - "additionalproperties": { - "description": "" + } + } } }, - "continuous": { - "description": "Whether the pipeline is continuous or triggered. This replaces `trigger`." - }, - "development": { - "description": "Whether the pipeline is in Development mode. Defaults to false." + "max_concurrent_runs": { + "description": "An optional maximum allowed number of concurrent runs of the job.\n\nSet this value if you want to be able to execute multiple runs of the same job concurrently. This is useful for example if you trigger your job on a frequent schedule and want to allow consecutive runs to overlap with each other, or if you want to trigger multiple runs which differ by their input parameters.\n\nThis setting affects only new runs. For example, suppose the job’s concurrency is 4 and there are 4 concurrent active runs. Then setting the concurrency to 3 won’t kill any of the active runs. However, from then on, new runs are skipped unless there are fewer than 3 active runs.\n\nThis value cannot exceed 1000. Setting this value to `0` causes all new runs to be skipped." }, - "edition": { - "description": "Pipeline product edition." + "name": { + "description": "An optional name for the job. The maximum length is 4096 bytes in UTF-8 encoding." }, - "filters": { - "description": "Filters on which Pipeline packages to include in the deployed graph.", + "notification_settings": { + "description": "Optional notification settings that are used when sending notifications to each of the `email_notifications` and `webhook_notifications` for this job.", "properties": { - "exclude": { - "description": "Paths to exclude.", - "items": { - "description": "" - } + "no_alert_for_canceled_runs": { + "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is canceled." }, - "include": { - "description": "Paths to include.", - "items": { - "description": "" - } + "no_alert_for_skipped_runs": { + "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is skipped." 
} } }, - "id": { - "description": "Unique identifier for this pipeline." - }, - "libraries": { - "description": "Libraries or code needed by this deployment.", + "parameters": { + "description": "Job-level parameter definitions", "items": { "description": "", "properties": { - "file": { - "description": "The path to a file that defines a pipeline and is stored in the Databricks Repos.\n", - "properties": { - "path": { - "description": "The absolute path of the file." - } - } - }, - "jar": { - "description": "URI of the jar to be installed. Currently only DBFS is supported.\n" - }, - "maven": { - "description": "Specification of a maven library to be installed.\n", - "properties": { - "coordinates": { - "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." - }, - "exclusions": { - "description": "List of dependences to exclude. For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", - "items": { - "description": "" - } - }, - "repo": { - "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." - } - } - }, - "notebook": { - "description": "The path to a notebook that defines a pipeline and is stored in the \u003cDatabricks\u003e workspace.\n", - "properties": { - "path": { - "description": "The absolute path of the notebook." - } - } + "default": { + "description": "Default value of the parameter." }, - "whl": { - "description": "URI of the wheel to be installed.\n" + "name": { + "description": "The name of the defined parameter. May only contain alphanumeric characters, `_`, `-`, and `.`" } } } }, - "name": { - "description": "Friendly identifier for this pipeline." - }, "permissions": { "description": "", "items": { @@ -1820,1075 +2854,795 @@ } } }, - "photon": { - "description": "Whether Photon is enabled for this pipeline." - }, - "serverless": { - "description": "Whether serverless compute is enabled for this pipeline." - }, - "storage": { - "description": "DBFS root directory for storing checkpoints and tables." - }, - "target": { - "description": "Target schema (database) to add tables in this pipeline to. If not specified, no data is published to the Hive metastore or Unity Catalog. To publish to Unity Catalog, also specify `catalog`." - }, - "trigger": { - "description": "Which pipeline trigger to use. Deprecated: Use `continuous` instead.", + "queue": { + "description": "The queue settings of the job.", "properties": { - "cron": { - "description": "", - "properties": { - "quartz_cron_schedule": { - "description": "" - }, - "timezone_id": { - "description": "" - } - } - }, - "manual": { - "description": "" + "enabled": { + "description": "If true, enable queueing for the job. This is a required field." } } - } - } - } - } - } - }, - "variables": { - "description": "", - "additionalproperties": { - "description": "" - } - }, - "workspace": { - "description": "Configures which workspace to connect to and locations for files, state, and similar locations within the workspace file tree.", - "properties": { - "artifact_path": { - "description": "The remote path to synchronize build artifacts to. 
This defaults to `${workspace.root}/artifacts`" - }, - "auth_type": { - "description": "When multiple auth attributes are available in the environment, use the auth type specified by this argument" - }, - "azure_client_id": { - "description": "Azure Client ID." - }, - "azure_environment": { - "description": "Azure environment, one of (Public, UsGov, China, Germany)." - }, - "azure_login_app_id": { - "description": "Azure Login Application ID." - }, - "azure_tenant_id": { - "description": "Azure Tenant ID." - }, - "azure_use_msi": { - "description": "Whether to use Managed Service Identity (MSI) to authenticate to Azure Databricks." - }, - "azure_workspace_resource_id": { - "description": "Azure Resource Manager ID for Azure Databricks workspace." - }, - "client_id": { - "description": "OAath client ID for the Databricks workspace." - }, - "file_path": { - "description": "The remote path to synchronize local files artifacts to. This defaults to `${workspace.root}/files`" - }, - "google_service_account": { - }, - "host": { - "description": "Host url of the workspace." - }, - "metadata_service_url": { - "description": "The URL of the metadata service to use for authentication." - }, - "profile": { - "description": "Connection profile to use. By default profiles are specified in ~/.databrickscfg." - }, - "root_path": { - "description": "The base location for synchronizing files, artifacts and state. Defaults to `/Users/jane@doe.com/.bundle/${bundle.name}/${bundle.target}`" - }, - "state_path": { - "description": "The remote path to synchronize bundle state to. This defaults to `${workspace.root}/state`" - } - } - } - } - } - }, - "include": { - "description": "A list of glob patterns of files to load and merge into the this configuration. Defaults to no files being included.", - "items": { - "description": "" - } - }, - "resources": { - "description": "Collection of Databricks resources to deploy.", - "properties": { - "experiments": { - "description": "List of MLflow experiments", - "additionalproperties": { - "description": "", - "properties": { - "artifact_location": { - "description": "Location where artifacts for the experiment are stored." - }, - "creation_time": { - "description": "Creation time" - }, - "experiment_id": { - "description": "Unique identifier for the experiment." - }, - "last_update_time": { - "description": "Last update time" - }, - "lifecycle_stage": { - "description": "Current life cycle stage of the experiment: \"active\" or \"deleted\".\nDeleted experiments are not returned by APIs." - }, - "name": { - "description": "Human readable name that identifies the experiment." - }, - "permissions": { - "description": "", - "items": { - "description": "", - "properties": { - "group_name": { - "description": "" - }, - "level": { - "description": "" - }, - "service_principal_name": { - "description": "" - }, - "user_name": { - "description": "" - } - } - } - }, - "tags": { - "description": "Tags: Additional metadata key-value pairs.", - "items": { - "description": "", - "properties": { - "key": { - "description": "The tag key." - }, - "value": { - "description": "The tag value." - } - } - } - } - } - } - }, - "jobs": { - "description": "List of Databricks jobs", - "additionalproperties": { - "description": "", - "properties": { - "compute": { - "description": "A list of compute requirements that can be referenced by tasks of this job.", - "items": { - "description": "", - "properties": { - "compute_key": { - "description": "A unique name for the compute requirement. 
This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine the compute requirements for the task execution." }, - "spec": { + "run_as": { "description": "", "properties": { - "kind": { - "description": "The kind of compute described by this compute specification." + "service_principal_name": { + "description": "Application ID of an active service principal. Setting this field requires the `servicePrincipal/user` role." + }, + "user_name": { + "description": "The email of an active workspace user. Non-admin users can only set this field to their own email." } } - } - } - } - }, - "continuous": { - "description": "An optional continuous property for this job. The continuous property will ensure that there is always one run executing. Only one of `schedule` and `continuous` can be used.", - "properties": { - "pause_status": { - "description": "Whether this trigger is paused or not." - } - } - }, - "email_notifications": { - "description": "An optional set of email addresses that is notified when runs of this job begin or complete as well as when this job is deleted. The default behavior is to not send any emails.", - "properties": { - "no_alert_for_skipped_runs": { - "description": "If true, do not send email to recipients specified in `on_failure` if the run is skipped." - }, - "on_failure": { - "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", - "items": { - "description": "" - } - }, - "on_start": { - "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", - "items": { - "description": "" - } - }, - "on_success": { - "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESS` result_state. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", - "items": { - "description": "" - } - } - } - }, - "format": { - "description": "Used to tell what is the format of the job. This field is ignored in Create/Update/Reset calls. When using the Jobs API 2.1 this value is always set to `\"MULTI_TASK\"`." - }, - "git_source": { - "description": "An optional specification for a remote repository containing the notebooks used by this job's notebook tasks.", - "properties": { - "git_branch": { - "description": "Name of the branch to be checked out and used by this job.\nThis field cannot be specified in conjunction with git_tag or git_commit.\n\nThe maximum length is 255 characters.\n" - }, - "git_commit": { - "description": "Commit to be checked out and used by this job. This field cannot be specified in conjunction with git_branch or git_tag.\nThe maximum length is 64 characters." - }, - "git_provider": { - "description": "Unique identifier of the service used to host the Git repository. The value is case insensitive." - }, - "git_snapshot": { - "description": "", - "properties": { - "used_commit": { - "description": "Commit that was used to execute the run. 
If git_branch was specified, this points to the HEAD of the branch at the time of the run; if git_tag was specified, this points to the commit the tag points to." - } - } - }, - "git_tag": { - "description": "Name of the tag to be checked out and used by this job.\nThis field cannot be specified in conjunction with git_branch or git_commit.\n\nThe maximum length is 255 characters.\n" - }, - "git_url": { - "description": "URL of the repository to be cloned by this job.\nThe maximum length is 300 characters." - } - } - }, - "job_clusters": { - "description": "A list of job cluster specifications that can be shared and reused by tasks of this job. Libraries cannot be declared in a shared job cluster. You must declare dependent libraries in task settings.", - "items": { - "description": "", - "properties": { - "job_cluster_key": { - "description": "A unique name for the job cluster. This field is required and must be unique within the job.\n`JobTaskSettings` may refer to this field to determine which cluster to launch for the task execution." }, - "new_cluster": { - "description": "If new_cluster, a description of a cluster that is created for only for this task.", + "schedule": { + "description": "An optional periodic schedule for this job. The default behavior is that the job only runs when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", "properties": { - "autoscale": { - "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", - "properties": { - "max_workers": { - "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." - }, - "min_workers": { - "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." - } - } + "pause_status": { + "description": "Indicate whether this schedule is paused or not." }, - "autotermination_minutes": { - "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." + "quartz_cron_expression": { + "description": "A Cron expression using Quartz syntax that describes the schedule for a job.\nSee [Cron Trigger](http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html)\nfor details. This field is required.\"\n" }, - "aws_attributes": { - "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "ebs_volume_count": { - "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. 
Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." - }, - "ebs_volume_iops": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_size": { - "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." - }, - "ebs_volume_throughput": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_type": { - "description": "" - }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." - }, - "instance_profile_arn": { - "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." - }, - "spot_bid_price_percent": { - "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." - }, - "zone_id": { - "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. 
For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." + "timezone_id": { + "description": "A Java timezone ID. The schedule for a job is resolved with respect to this timezone.\nSee [Java TimeZone](https://docs.oracle.com/javase/7/docs/api/java/util/TimeZone.html) for details.\nThis field is required.\n" + } + } + }, + "tags": { + "description": "A map of tags associated with the job. These are forwarded to the cluster as cluster tags for jobs clusters, and are subject to the same limitations as cluster tags. A maximum of 25 tags can be added to the job.", + "additionalproperties": { + "description": "" + } + }, + "tasks": { + "description": "A list of task specifications to be executed by this job.", + "items": { + "description": "", + "properties": { + "compute_key": { + "description": "The key of the compute requirement, specified in `job.settings.compute`, to use for execution of this task." + }, + "condition_task": { + "description": "If condition_task, specifies a condition with an outcome that can be used to control the execution of other tasks. Does not require a cluster to execute and does not support retries or notifications.", + "properties": { + "left": { + "description": "The left operand of the condition task. Can be either a string value or a job state or parameter reference." + }, + "op": { + "description": "* `EQUAL_TO`, `NOT_EQUAL` operators perform string comparison of their operands. This means that `“12.0” == “12”` will evaluate to `false`.\n* `GREATER_THAN`, `GREATER_THAN_OR_EQUAL`, `LESS_THAN`, `LESS_THAN_OR_EQUAL` operators perform numeric comparison of their operands. `“12.0” \u003e= “12”` will evaluate to `true`, `“10.0” \u003e= “12”` will evaluate to `false`.\n\nThe boolean comparison to task values can be implemented with operators `EQUAL_TO`, `NOT_EQUAL`. If a task value was set to a boolean value, it will be serialized to `“true”` or `“false”` for the comparison.\n" + }, + "right": { + "description": "The right operand of the condition task. Can be either a string value or a job state or parameter reference." + } } - } - }, - "azure_attributes": { - "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
- }, - "log_analytics_info": { - "description": "Defines values necessary to configure and run Azure Log Analytics agent", + }, + "dbt_task": { + "description": "If dbt_task, indicates that this must execute a dbt task. It requires both Databricks SQL and the ability to use a serverless or a pro SQL warehouse.", + "properties": { + "catalog": { + "description": "Optional name of the catalog to use. The value is the top level in the 3-level namespace of Unity Catalog (catalog / schema / relation). The catalog value can only be specified if a warehouse_id is specified. Requires dbt-databricks \u003e= 1.1.1." + }, + "commands": { + "description": "A list of dbt commands to execute. All commands must start with `dbt`. This parameter must not be empty. A maximum of up to 10 commands can be provided.", + "items": { + "description": "" + } + }, + "profiles_directory": { + "description": "Optional (relative) path to the profiles directory. Can only be specified if no warehouse_id is specified. If no warehouse_id is specified and this folder is unset, the root directory is used." + }, + "project_directory": { + "description": "Optional (relative) path to the project directory, if no value is provided, the root of the git repository is used." + }, + "schema": { + "description": "Optional schema to write to. This parameter is only used when a warehouse_id is also provided. If not provided, the `default` schema is used." + }, + "warehouse_id": { + "description": "ID of the SQL warehouse to connect to. If provided, we automatically generate and provide the profile and connection details to dbt. It can be overridden on a per-command basis by using the `--profiles-dir` command line argument." + } + } + }, + "depends_on": { + "description": "An optional array of objects specifying the dependency graph of the task. All tasks specified in this field must complete before executing this task. The task will run only if the `run_if` condition is true.\nThe key is `task_key`, and the value is the name assigned to the dependent task.\n", + "items": { + "description": "", "properties": { - "log_analytics_primary_key": { - "description": "\u003cneeds content added\u003e" + "outcome": { + "description": "Can only be specified on condition task dependencies. The outcome of the dependent task that must be met for this task to run." }, - "log_analytics_workspace_id": { - "description": "\u003cneeds content added\u003e" + "task_key": { + "description": "The name of the task this task depends on." } } - }, - "spot_bid_max_price": { - "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." } - } - }, - "cluster_log_conf": { - "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", - "properties": { - "dbfs": { - "description": "destination needs to be provided. 
e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", - "properties": { - "destination": { - "description": "dbfs destination, e.g. `dbfs:/my/path`" + }, + "description": { + "description": "An optional description for this task." + }, + "email_notifications": { + "description": "An optional set of email addresses that is notified when runs of this task begin or complete as well as when this task is deleted. The default behavior is to not send any emails.", + "properties": { + "on_duration_warning_threshold_exceeded": { + "description": "A list of email addresses to be notified when the duration of a run exceeds the threshold specified for the `RUN_DURATION_SECONDS` metric in the `health` field. If no rule for the `RUN_DURATION_SECONDS` metric is specified in the `health` field for the job, notifications are not sent.", + "items": { + "description": "" + } + }, + "on_failure": { + "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_start": { + "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + }, + "on_success": { + "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESS` result_state. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", + "items": { + "description": "" + } + } + } + }, + "existing_cluster_id": { + "description": "If existing_cluster_id, the ID of an existing cluster that is used for all runs of this task. When running tasks on an existing cluster, you may need to manually restart the cluster if it stops responding. We suggest running jobs on new clusters for greater reliability." + }, + "health": { + "description": "", + "properties": { + "rules": { + "description": "", + "items": { + "description": "", + "properties": { + "metric": { + "description": "" + }, + "op": { + "description": "" + }, + "value": { + "description": "Specifies the threshold value that the health metric should obey to satisfy the health rule." + } + } } } - }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + } + }, + "job_cluster_key": { + "description": "If job_cluster_key, this task is executed reusing the cluster specified in `job.settings.job_clusters`." + }, + "libraries": { + "description": "An optional list of libraries to be installed on the cluster that executes the task. The default value is an empty list.", + "items": { + "description": "", "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. 
`bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." - }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." + "cran": { + "description": "Specification of a CRAN library to be installed as part of the library", + "properties": { + "package": { + "description": "The name of the CRAN package to install." + }, + "repo": { + "description": "The repository where the package can be found. If not specified, the default CRAN repo is used." + } + } }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + "egg": { + "description": "URI of the egg to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"egg\": \"dbfs:/my/egg\" }` or\n`{ \"egg\": \"s3://my-bucket/egg\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." }, - "endpoint": { - "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + "jar": { + "description": "URI of the jar to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"jar\": \"dbfs:/mnt/databricks/library.jar\" }` or\n`{ \"jar\": \"s3://my-bucket/library.jar\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + "maven": { + "description": "Specification of a maven library to be installed. For example:\n`{ \"coordinates\": \"org.jsoup:jsoup:1.7.2\" }`", + "properties": { + "coordinates": { + "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." + }, + "exclusions": { + "description": "List of dependences to exclude. For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", + "items": { + "description": "" + } + }, + "repo": { + "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." + } + } }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." - } - } - } - } - }, - "cluster_name": { - "description": "Cluster name requested by the user. 
This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" - }, - "cluster_source": { - "description": "" - }, - "custom_tags": { - "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", - "additionalproperties": { - "description": "" - } - }, - "data_security_mode": { - "description": "" - }, - "docker_image": { - "description": "", - "properties": { - "basic_auth": { - "description": "", - "properties": { - "password": { - "description": "Password of the user" + "pypi": { + "description": "Specification of a PyPi library to be installed. For example:\n`{ \"package\": \"simplejson\" }`", + "properties": { + "package": { + "description": "The name of the pypi package to install. An optional exact version specification is also\nsupported. Examples: \"simplejson\" and \"simplejson==3.8.0\"." + }, + "repo": { + "description": "The repository where the package can be found. If not specified, the default pip index is\nused." + } + } }, - "username": { - "description": "Name of the user" + "whl": { + "description": "URI of the wheel to be installed.\nFor example: `{ \"whl\": \"dbfs:/my/whl\" }` or `{ \"whl\": \"s3://my-bucket/whl\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." } } - }, - "url": { - "description": "URL of the docker image." - } - } - }, - "driver_instance_pool_id": { - "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." - }, - "driver_node_type_id": { - "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" - }, - "enable_elastic_disk": { - "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." - }, - "enable_local_disk_encryption": { - "description": "Whether to enable LUKS on cluster VMs' local disks" - }, - "gcp_attributes": { - "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "boot_disk_size": { - "description": "boot disk size in GB" - }, - "google_service_account": { - "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." - }, - "local_ssd_count": { - "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. 
Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." } - } - }, - "init_scripts": { - "description": "The configuration for storing init scripts. Any number of destinations can be specified. The scripts are executed sequentially in the order provided. If `cluster_log_conf` is specified, init script logs are sent to `\u003cdestination\u003e/\u003ccluster-ID\u003e/init_scripts`.", - "items": { - "description": "", + }, + "max_retries": { + "description": "An optional maximum number of times to retry an unsuccessful run. A run is considered to be unsuccessful if it completes with the `FAILED` result_state or `INTERNAL_ERROR` `life_cycle_state`. The value `-1` means to retry indefinitely and the value `0` means to never retry." + }, + "min_retry_interval_millis": { + "description": "An optional minimal interval in milliseconds between the start of the failed run and the subsequent retry run. The default behavior is that unsuccessful runs are immediately retried." + }, + "new_cluster": { + "description": "If new_cluster, a description of a cluster that is created for each task.", "properties": { - "dbfs": { - "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "apply_policy_default_values": { + "description": "" + }, + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", "properties": { - "destination": { - "description": "dbfs destination, e.g. `dbfs:/my/path`" + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + } + } + }, + "autotermination_minutes": { + "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. 
If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." + }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nThe list of available zones as well as the default value can be found by using the\n`List Zones` method." 
+ } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. 
`s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + } + } } } }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "cluster_name": { + "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" + }, + "cluster_source": { + "description": "" + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "data_security_mode": { + "description": "" + }, + "docker_image": { + "description": "", "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." - }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." + "basic_auth": { + "description": "", + "properties": { + "password": { + "description": "Password of the user" + }, + "username": { + "description": "Name of the user" + } + } }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. 
It will be used only when\nencryption is enabled and the default type is `sse-s3`." + "url": { + "description": "URL of the docker image." + } + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" + }, + "enable_elastic_disk": { + "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." + }, + "enable_local_disk_encryption": { + "description": "Whether to enable LUKS on cluster VMs' local disks" + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" }, - "endpoint": { - "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + "boot_disk_size": { + "description": "boot disk size in GB" }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + "local_ssd_count": { + "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." } } }, - "workspace": { - "description": "destination needs to be provided. e.g.\n`{ \"workspace\" : { \"destination\" : \"/Users/user1@databricks.com/my-init.sh\" } }`", + "init_scripts": { + "description": "The configuration for storing init scripts. Any number of destinations can be specified. The scripts are executed sequentially in the order provided. If `cluster_log_conf` is specified, init script logs are sent to `\u003cdestination\u003e/\u003ccluster-ID\u003e/init_scripts`.", + "items": { + "description": "", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "file": { + "description": "destination needs to be provided. e.g.\n`{ \"file\" : { \"destination\" : \"file:/my/local/file.sh\" } }`", + "properties": { + "destination": { + "description": "local file destination, e.g. 
`file:/my/local/file.sh`" + } + } + }, + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + } + } + }, + "volumes": { + "description": "destination needs to be provided. e.g.\n`{ \"volumes\" : { \"destination\" : \"/Volumes/my-init.sh\" } }`", + "properties": { + "destination": { + "description": "Unity Catalog Volumes file destination, e.g. `/Volumes/my-init.sh`" + } + } + }, + "workspace": { + "description": "destination needs to be provided. e.g.\n`{ \"workspace\" : { \"destination\" : \"/Users/user1@databricks.com/my-init.sh\" } }`", + "properties": { + "destination": { + "description": "workspace files destination, e.g. `/Users/user1@databricks.com/my-init.sh`" + } + } + } + } + } + }, + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." + }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. 
A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." + }, + "runtime_engine": { + "description": "" + }, + "single_user_name": { + "description": "Single user name if data_security_mode is `SINGLE_USER`" + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "spark_version": { + "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + }, + "workload_type": { + "description": "", "properties": { - "destination": { - "description": "workspace files destination, e.g. `/Users/user1@databricks.com/my-init.sh`" + "clients": { + "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", + "properties": { + "jobs": { + "description": "With jobs set, the cluster can be used for jobs" + }, + "notebooks": { + "description": "With notebooks set, this cluster can be used for notebooks" + } + } } } } } - } - }, - "instance_pool_id": { - "description": "The optional ID of the instance pool to which the cluster belongs." - }, - "node_type_id": { - "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" - }, - "num_workers": { - "description": "Number of worker nodes that this cluster should have. 
A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." - }, - "policy_id": { - "description": "The ID of the cluster policy used to create the cluster if applicable." - }, - "runtime_engine": { - "description": "" - }, - "single_user_name": { - "description": "Single user name if data_security_mode is `SINGLE_USER`" - }, - "spark_conf": { - "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", - "additionalproperties": { - "description": "" - } - }, - "spark_env_vars": { - "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", - "additionalproperties": { - "description": "" - } - }, - "spark_version": { - "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" - }, - "ssh_public_keys": { - "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", - "items": { - "description": "" - } - }, - "workload_type": { - "description": "", - "properties": { - "clients": { - "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", - "properties": { - "jobs": { - "description": "With jobs set, the cluster can be used for jobs" - }, - "notebooks": { - "description": "With notebooks set, this cluster can be used for notebooks" + }, + "notebook_task": { + "description": "If notebook_task, indicates that this task must run a notebook. This field may not be specified in conjunction with spark_jar_task.", + "properties": { + "base_parameters": { + "description": "Base parameters to be used for each run of this job. If the run is initiated by a call to\n:method:jobs/runNow with parameters specified, the two parameters maps are merged. 
If the same key is specified in\n`base_parameters` and in `run-now`, the value from `run-now` is used.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n\nIf the notebook takes a parameter that is not specified in the job’s `base_parameters` or the `run-now` override parameters,\nthe default value from the notebook is used.\n\nRetrieve these parameters in a notebook using [dbutils.widgets.get](https://docs.databricks.com/dev-tools/databricks-utils.html#dbutils-widgets).\n\nThe JSON representation of this field cannot exceed 1MB.\n", + "additionalproperties": { + "description": "" } + }, + "notebook_path": { + "description": "The path of the notebook to be run in the Databricks workspace or remote repository.\nFor notebooks stored in the Databricks workspace, the path must be absolute and begin with a slash.\nFor notebooks stored in a remote repository, the path must be relative. This field is required.\n" + }, + "source": { + "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved\nfrom the local \u003cDatabricks\u003e workspace or cloud location (if the `python_file` has a URI format). When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a \u003cDatabricks\u003e workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository.\n" } } - } - } - } - } - } - } - }, - "max_concurrent_runs": { - "description": "An optional maximum allowed number of concurrent runs of the job.\n\nSet this value if you want to be able to execute multiple runs of the same job concurrently. This is useful for example if you trigger your job on a frequent schedule and want to allow consecutive runs to overlap with each other, or if you want to trigger multiple runs which differ by their input parameters.\n\nThis setting affects only new runs. For example, suppose the job’s concurrency is 4 and there are 4 concurrent active runs. Then setting the concurrency to 3 won’t kill any of the active runs. However, from then on, new runs are skipped unless there are fewer than 3 active runs.\n\nThis value cannot exceed 1000\\. Setting this value to 0 causes all new runs to be skipped. The default behavior is to allow only 1 concurrent run." - }, - "name": { - "description": "An optional name for the job." - }, - "notification_settings": { - "description": "Optional notification settings that are used when sending notifications to each of the `email_notifications` and `webhook_notifications` for this job.", - "properties": { - "no_alert_for_canceled_runs": { - "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is canceled." - }, - "no_alert_for_skipped_runs": { - "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is skipped." - } - } - }, - "parameters": { - "description": "Job-level parameter definitions", - "items": { - "description": "", - "properties": { - "default": { - "description": "Default value of the parameter." - }, - "name": { - "description": "The name of the defined parameter. 
May only contain alphanumeric characters, `_`, `-`, and `.`" - } - } - } - }, - "permissions": { - "description": "", - "items": { - "description": "", - "properties": { - "group_name": { - "description": "" - }, - "level": { - "description": "" - }, - "service_principal_name": { - "description": "" - }, - "user_name": { - "description": "" - } - } - } - }, - "run_as": { - "description": "", - "properties": { - "service_principal_name": { - "description": "Application ID of an active service principal. Setting this field requires the `servicePrincipal/user` role." - }, - "user_name": { - "description": "The email of an active workspace user. Non-admin users can only set this field to their own email." - } - } - }, - "schedule": { - "description": "An optional periodic schedule for this job. The default behavior is that the job only runs when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", - "properties": { - "pause_status": { - "description": "Whether this trigger is paused or not." - }, - "quartz_cron_expression": { - "description": "A Cron expression using Quartz syntax that describes the schedule for a job.\nSee [Cron Trigger](http://www.quartz-scheduler.org/documentation/quartz-2.3.0/tutorials/crontrigger.html)\nfor details. This field is required.\"\n" - }, - "timezone_id": { - "description": "A Java timezone ID. The schedule for a job is resolved with respect to this timezone.\nSee [Java TimeZone](https://docs.oracle.com/javase/7/docs/api/java/util/TimeZone.html) for details.\nThis field is required.\n" - } - } - }, - "tags": { - "description": "A map of tags associated with the job. These are forwarded to the cluster as cluster tags for jobs clusters, and are subject to the same limitations as cluster tags. A maximum of 25 tags can be added to the job.", - "additionalproperties": { - "description": "" - } - }, - "tasks": { - "description": "A list of task specifications to be executed by this job.", - "items": { - "description": "", - "properties": { - "compute_key": { - "description": "The key of the compute requirement, specified in `job.settings.compute`, to use for execution of this task." - }, - "condition_task": { - "description": "If condition_task, specifies a condition with an outcome that can be used to control the execution of other tasks. Does not require a cluster to execute and does not support retries or notifications.", - "properties": { - "left": { - "description": "The left operand of the condition task. Can be either a string value or a job state or parameter reference." - }, - "op": { - "description": "* `EQUAL_TO`, `NOT_EQUAL` operators perform string comparison of their operands. This means that `“12.0” == “12”` will evaluate to `false`.\n* `GREATER_THAN`, `GREATER_THAN_OR_EQUAL`, `LESS_THAN`, `LESS_THAN_OR_EQUAL` operators perform numeric comparison of their operands. `“12.0” \u003e= “12”` will evaluate to `true`, `“10.0” \u003e= “12”` will evaluate to `false`.\n\nThe boolean comparison to task values can be implemented with operators `EQUAL_TO`, `NOT_EQUAL`. If a task value was set to a boolean value, it will be serialized to `“true”` or `“false”` for the comparison.\n" - }, - "right": { - "description": "The right operand of the condition task. Can be either a string value or a job state or parameter reference." - } - } - }, - "dbt_task": { - "description": "If dbt_task, indicates that this must execute a dbt task. 
It requires both Databricks SQL and the ability to use a serverless or a pro SQL warehouse.", - "properties": { - "catalog": { - "description": "Optional name of the catalog to use. The value is the top level in the 3-level namespace of Unity Catalog (catalog / schema / relation). The catalog value can only be specified if a warehouse_id is specified. Requires dbt-databricks \u003e= 1.1.1." - }, - "commands": { - "description": "A list of dbt commands to execute. All commands must start with `dbt`. This parameter must not be empty. A maximum of up to 10 commands can be provided.", - "items": { - "description": "" - } - }, - "profiles_directory": { - "description": "Optional (relative) path to the profiles directory. Can only be specified if no warehouse_id is specified. If no warehouse_id is specified and this folder is unset, the root directory is used." - }, - "project_directory": { - "description": "Optional (relative) path to the project directory, if no value is provided, the root of the git repository is used." - }, - "schema": { - "description": "Optional schema to write to. This parameter is only used when a warehouse_id is also provided. If not provided, the `default` schema is used." - }, - "warehouse_id": { - "description": "ID of the SQL warehouse to connect to. If provided, we automatically generate and provide the profile and connection details to dbt. It can be overridden on a per-command basis by using the `--profiles-dir` command line argument." - } - } - }, - "depends_on": { - "description": "An optional array of objects specifying the dependency graph of the task. All tasks specified in this field must complete successfully before executing this task.\nThe key is `task_key`, and the value is the name assigned to the dependent task.\n", - "items": { - "description": "", - "properties": { - "outcome": { - "description": "Can only be specified on condition task dependencies. The outcome of the dependent task that must be met for this task to run." }, - "task_key": { - "description": "The name of the task this task depends on." - } - } - } - }, - "description": { - "description": "An optional description for this task.\nThe maximum length is 4096 bytes." - }, - "email_notifications": { - "description": "An optional set of email addresses that is notified when runs of this task begin or complete as well as when this task is deleted. The default behavior is to not send any emails.", - "properties": { - "on_failure": { - "description": "A list of email addresses to be notified when a run unsuccessfully completes. A run is considered to have completed unsuccessfully if it ends with an `INTERNAL_ERROR` `life_cycle_state` or a `FAILED`, or `TIMED_OUT` result_state. If this is not specified on job creation, reset, or update the list is empty, and notifications are not sent.", - "items": { - "description": "" - } - }, - "on_start": { - "description": "A list of email addresses to be notified when a run begins. If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", - "items": { - "description": "" - } - }, - "on_success": { - "description": "A list of email addresses to be notified when a run successfully completes. A run is considered to have completed successfully if it ends with a `TERMINATED` `life_cycle_state` and a `SUCCESS` result_state. 
If not specified on job creation, reset, or update, the list is empty, and notifications are not sent.", - "items": { - "description": "" - } - } - } - }, - "existing_cluster_id": { - "description": "If existing_cluster_id, the ID of an existing cluster that is used for all runs of this task. When running tasks on an existing cluster, you may need to manually restart the cluster if it stops responding. We suggest running jobs on new clusters for greater reliability." - }, - "job_cluster_key": { - "description": "If job_cluster_key, this task is executed reusing the cluster specified in `job.settings.job_clusters`." - }, - "libraries": { - "description": "An optional list of libraries to be installed on the cluster that executes the task. The default value is an empty list.", - "items": { - "description": "", - "properties": { - "cran": { - "description": "Specification of a CRAN library to be installed as part of the library", + "notification_settings": { + "description": "Optional notification settings that are used when sending notifications to each of the `email_notifications` and `webhook_notifications` for this task.", "properties": { - "package": { - "description": "The name of the CRAN package to install." + "alert_on_last_attempt": { + "description": "If true, do not send notifications to recipients specified in `on_start` for the retried runs and do not send notifications to recipients specified in `on_failure` until the last retry of the run." }, - "repo": { - "description": "The repository where the package can be found. If not specified, the default CRAN repo is used." + "no_alert_for_canceled_runs": { + "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is canceled." + }, + "no_alert_for_skipped_runs": { + "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is skipped." } } }, - "egg": { - "description": "URI of the egg to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"egg\": \"dbfs:/my/egg\" }` or\n`{ \"egg\": \"s3://my-bucket/egg\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + "pipeline_task": { + "description": "If pipeline_task, indicates that this task must execute a Pipeline.", + "properties": { + "full_refresh": { + "description": "If true, a full refresh will be triggered on the delta live table." + }, + "pipeline_id": { + "description": "The full name of the pipeline task to execute." + } + } }, - "jar": { - "description": "URI of the jar to be installed. Currently only DBFS and S3 URIs are supported.\nFor example: `{ \"jar\": \"dbfs:/mnt/databricks/library.jar\" }` or\n`{ \"jar\": \"s3://my-bucket/library.jar\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." + "python_wheel_task": { + "description": "If python_wheel_task, indicates that this job must execute a PythonWheel.", + "properties": { + "entry_point": { + "description": "Named entry point to use, if it does not exist in the metadata of the package it executes the function from the package directly using `$packageName.$entryPoint()`" + }, + "named_parameters": { + "description": "Command-line parameters passed to Python wheel task in the form of `[\"--name=task\", \"--data=dbfs:/path/to/data.json\"]`. 
Leave it empty if `parameters` is not null.", + "additionalproperties": { + "description": "" + } + }, + "package_name": { + "description": "Name of the package to execute" + }, + "parameters": { + "description": "Command-line parameters passed to Python wheel task. Leave it empty if `named_parameters` is not null.", + "items": { + "description": "" + } + } + } }, - "maven": { - "description": "Specification of a maven library to be installed. For example:\n`{ \"coordinates\": \"org.jsoup:jsoup:1.7.2\" }`", + "retry_on_timeout": { + "description": "An optional policy to specify whether to retry a task when it times out." + }, + "run_if": { + "description": "An optional value specifying the condition determining whether the task is run once its dependencies have been completed.\n\n* `ALL_SUCCESS`: All dependencies have executed and succeeded\n* `AT_LEAST_ONE_SUCCESS`: At least one dependency has succeeded\n* `NONE_FAILED`: None of the dependencies have failed and at least one was executed\n* `ALL_DONE`: All dependencies have been completed\n* `AT_LEAST_ONE_FAILED`: At least one dependency failed\n* `ALL_FAILED`: ALl dependencies have failed\n" + }, + "run_job_task": { + "description": "If run_job_task, indicates that this task must execute another job.", "properties": { - "coordinates": { - "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." + "job_id": { + "description": "ID of the job to trigger." }, - "exclusions": { - "description": "List of dependences to exclude. For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", + "job_parameters": { + "description": "" + } + } + }, + "spark_jar_task": { + "description": "If spark_jar_task, indicates that this task must run a JAR.", + "properties": { + "jar_uri": { + "description": "Deprecated since 04/2016. Provide a `jar` through the `libraries` field instead. For an example, see :method:jobs/create.\n" + }, + "main_class_name": { + "description": "The full name of the class containing the main method to be executed. This class must be contained in a JAR provided as a library.\n\nThe code must use `SparkContext.getOrCreate` to obtain a Spark context; otherwise, runs of the job fail." + }, + "parameters": { + "description": "Parameters passed to the main method.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", "items": { "description": "" } - }, - "repo": { - "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." } } }, - "pypi": { - "description": "Specification of a PyPi library to be installed. For example:\n`{ \"package\": \"simplejson\" }`", + "spark_python_task": { + "description": "If spark_python_task, indicates that this task must run a Python file.", "properties": { - "package": { - "description": "The name of the pypi package to install. An optional exact version specification is also\nsupported. Examples: \"simplejson\" and \"simplejson==3.8.0\"." 
+ "parameters": { + "description": "Command line parameters passed to the Python file.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" + } }, - "repo": { - "description": "The repository where the package can be found. If not specified, the default pip index is\nused." + "python_file": { + "description": "The Python file to be executed. Cloud file URIs (such as dbfs:/, s3:/, adls:/, gcs:/) and workspace paths are supported. For python files stored in the Databricks workspace, the path must be absolute and begin with `/`. For files stored in a remote repository, the path must be relative. This field is required." + }, + "source": { + "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved\nfrom the local \u003cDatabricks\u003e workspace or cloud location (if the `python_file` has a URI format). When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a \u003cDatabricks\u003e workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository.\n" } } }, - "whl": { - "description": "URI of the wheel to be installed.\nFor example: `{ \"whl\": \"dbfs:/my/whl\" }` or `{ \"whl\": \"s3://my-bucket/whl\" }`.\nIf S3 is used, please make sure the cluster has read access on the library. You may need to\nlaunch the cluster with an IAM role to access the S3 URI." - } - } - } - }, - "max_retries": { - "description": "An optional maximum number of times to retry an unsuccessful run. A run is considered to be unsuccessful if it completes with the `FAILED` result_state or `INTERNAL_ERROR` `life_cycle_state`. The value -1 means to retry indefinitely and the value 0 means to never retry. The default behavior is to never retry." - }, - "min_retry_interval_millis": { - "description": "An optional minimal interval in milliseconds between the start of the failed run and the subsequent retry run. The default behavior is that unsuccessful runs are immediately retried." - }, - "new_cluster": { - "description": "If new_cluster, a description of a cluster that is created for only for this task.", - "properties": { - "autoscale": { - "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", - "properties": { - "max_workers": { - "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." - }, - "min_workers": { - "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." - } - } - }, - "autotermination_minutes": { - "description": "Automatically terminates the cluster after it is inactive for this time in minutes. If not set,\nthis cluster will not be automatically terminated. If specified, the threshold must be between\n10 and 10000 minutes.\nUsers can also set this value to 0 to explicitly disable automatic termination." 
- }, - "aws_attributes": { - "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "ebs_volume_count": { - "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." - }, - "ebs_volume_iops": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_size": { - "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." - }, - "ebs_volume_throughput": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_type": { - "description": "" - }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." - }, - "instance_profile_arn": { - "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." - }, - "spot_bid_price_percent": { - "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. 
If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." - }, - "zone_id": { - "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." - } - } - }, - "azure_attributes": { - "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." - }, - "log_analytics_info": { - "description": "Defines values necessary to configure and run Azure Log Analytics agent", - "properties": { - "log_analytics_primary_key": { - "description": "\u003cneeds content added\u003e" - }, - "log_analytics_workspace_id": { - "description": "\u003cneeds content added\u003e" - } - } - }, - "spot_bid_max_price": { - "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." - } - } - }, - "cluster_log_conf": { - "description": "The configuration for delivering spark logs to a long-term storage destination.\nTwo kinds of destinations (dbfs and s3) are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.", - "properties": { - "dbfs": { - "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", - "properties": { - "destination": { - "description": "dbfs destination, e.g. 
`dbfs:/my/path`" - } - } - }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", - "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." - }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." - }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." - }, - "endpoint": { - "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." - }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." - }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." - } - } - } - } - }, - "cluster_name": { - "description": "Cluster name requested by the user. This doesn't have to be unique.\nIf not specified at creation, the cluster name will be an empty string.\n" - }, - "cluster_source": { - "description": "" - }, - "custom_tags": { - "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", - "additionalproperties": { - "description": "" - } - }, - "data_security_mode": { - "description": "" - }, - "docker_image": { - "description": "", - "properties": { - "basic_auth": { - "description": "", - "properties": { - "password": { - "description": "Password of the user" - }, - "username": { - "description": "Name of the user" + "spark_submit_task": { + "description": "If `spark_submit_task`, indicates that this task must be launched by the spark submit script. This task can run only on new clusters.\n\nIn the `new_cluster` specification, `libraries` and `spark_conf` are not supported. Instead, use `--jars` and `--py-files` to add Java and Python libraries and `--conf` to set the Spark configurations. 
\n\n`master`, `deploy-mode`, and `executor-cores` are automatically configured by Databricks; you _cannot_ specify them in parameters.\n\nBy default, the Spark submit job uses all available memory (excluding reserved memory for Databricks services). You can set `--driver-memory`, and `--executor-memory` to a smaller value to leave some room for off-heap usage.\n\nThe `--jars`, `--py-files`, `--files` arguments support DBFS and S3 paths.\n", + "properties": { + "parameters": { + "description": "Command-line parameters passed to spark submit.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "items": { + "description": "" } } - }, - "url": { - "description": "URL of the docker image." - } - } - }, - "driver_instance_pool_id": { - "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." - }, - "driver_node_type_id": { - "description": "The node type of the Spark driver. Note that this field is optional;\nif unset, the driver node type will be set as the same value\nas `node_type_id` defined above.\n" - }, - "enable_elastic_disk": { - "description": "Autoscaling Local Storage: when enabled, this cluster will dynamically acquire additional disk\nspace when its Spark workers are running low on disk space. This feature requires specific AWS\npermissions to function correctly - refer to the User Guide for more details." - }, - "enable_local_disk_encryption": { - "description": "Whether to enable LUKS on cluster VMs' local disks" - }, - "gcp_attributes": { - "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "boot_disk_size": { - "description": "boot disk size in GB" - }, - "google_service_account": { - "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." - }, - "local_ssd_count": { - "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." } - } - }, - "init_scripts": { - "description": "The configuration for storing init scripts. Any number of destinations can be specified. The scripts are executed sequentially in the order provided. If `cluster_log_conf` is specified, init script logs are sent to `\u003cdestination\u003e/\u003ccluster-ID\u003e/init_scripts`.", - "items": { - "description": "", + }, + "sql_task": { + "description": "If sql_task, indicates that this job must execute a SQL task.", "properties": { - "dbfs": { - "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "alert": { + "description": "If alert, indicates that this job must refresh a SQL alert.", "properties": { - "destination": { - "description": "dbfs destination, e.g. `dbfs:/my/path`" + "alert_id": { + "description": "The canonical identifier of the SQL alert." 
+ }, + "pause_subscriptions": { + "description": "If true, the alert notifications are not sent to subscribers." + }, + "subscriptions": { + "description": "If specified, alert notifications are sent to subscribers.", + "items": { + "description": "", + "properties": { + "destination_id": { + "description": "The canonical identifier of the destination to receive email notification. This parameter is mutually exclusive with user_name. You cannot set both destination_id and user_name for subscription notifications." + }, + "user_name": { + "description": "The user name to receive the subscription email. This parameter is mutually exclusive with destination_id. You cannot set both destination_id and user_name for subscription notifications." + } + } + } + } + } + }, + "dashboard": { + "description": "If dashboard, indicates that this job must refresh a SQL dashboard.", + "properties": { + "custom_subject": { + "description": "Subject of the email sent to subscribers of this task." + }, + "dashboard_id": { + "description": "The canonical identifier of the SQL dashboard." + }, + "pause_subscriptions": { + "description": "If true, the dashboard snapshot is not taken, and emails are not sent to subscribers." + }, + "subscriptions": { + "description": "If specified, dashboard snapshots are sent to subscriptions.", + "items": { + "description": "", + "properties": { + "destination_id": { + "description": "The canonical identifier of the destination to receive email notification. This parameter is mutually exclusive with user_name. You cannot set both destination_id and user_name for subscription notifications." + }, + "user_name": { + "description": "The user name to receive the subscription email. This parameter is mutually exclusive with destination_id. You cannot set both destination_id and user_name for subscription notifications." + } + } + } } } }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "file": { + "description": "If file, indicates that this job runs a SQL file in a remote Git repository. Only one SQL statement is supported in a file. Multiple SQL statements separated by semicolons (;) are not permitted.", "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." - }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." 
- }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." - }, - "endpoint": { - "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." - }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." - }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + "path": { + "description": "Relative path of the SQL file in the remote Git repository." } } }, - "workspace": { - "description": "destination needs to be provided. e.g.\n`{ \"workspace\" : { \"destination\" : \"/Users/user1@databricks.com/my-init.sh\" } }`", + "parameters": { + "description": "Parameters to be used for each run of this job. The SQL alert task does not support custom parameters.", + "additionalproperties": { + "description": "" + } + }, + "query": { + "description": "If query, indicates that this job must execute a SQL query.", "properties": { - "destination": { - "description": "workspace files destination, e.g. `/Users/user1@databricks.com/my-init.sh`" + "query_id": { + "description": "The canonical identifier of the SQL query." } } + }, + "warehouse_id": { + "description": "The canonical identifier of the SQL warehouse. Recommended to use with serverless or pro SQL warehouses. Classic SQL warehouses are only supported for SQL alert, dashboard and query tasks and are limited to scheduled single-task jobs." } } - } - }, - "instance_pool_id": { - "description": "The optional ID of the instance pool to which the cluster belongs." - }, - "node_type_id": { - "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" - }, - "num_workers": { - "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." - }, - "policy_id": { - "description": "The ID of the cluster policy used to create the cluster if applicable." 
- }, - "runtime_engine": { - "description": "" - }, - "single_user_name": { - "description": "Single user name if data_security_mode is `SINGLE_USER`" - }, - "spark_conf": { - "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nUsers can also pass in a string of extra JVM options to the driver and the executors via\n`spark.driver.extraJavaOptions` and `spark.executor.extraJavaOptions` respectively.\n", - "additionalproperties": { - "description": "" - } - }, - "spark_env_vars": { - "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", - "additionalproperties": { - "description": "" - } - }, - "spark_version": { - "description": "The Spark version of the cluster, e.g. `3.3.x-scala2.11`.\nA list of available Spark versions can be retrieved by using\nthe :method:clusters/sparkVersions API call.\n" - }, - "ssh_public_keys": { - "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", - "items": { - "description": "" - } - }, - "workload_type": { - "description": "", - "properties": { - "clients": { - "description": " defined what type of clients can use the cluster. E.g. Notebooks, Jobs", - "properties": { - "jobs": { - "description": "With jobs set, the cluster can be used for jobs" - }, - "notebooks": { - "description": "With notebooks set, this cluster can be used for notebooks" + }, + "task_key": { + "description": "A unique name for the task. This field is used to refer to this task from other tasks.\nThis field is required and must be unique within its parent job.\nOn Update or Reset, this field is used to reference the tasks to be updated or reset." + }, + "timeout_seconds": { + "description": "An optional timeout applied to each run of this job task. A value of `0` means no timeout." + }, + "webhook_notifications": { + "description": "A collection of system notification IDs to notify when runs of this job begin or complete.", + "properties": { + "on_duration_warning_threshold_exceeded": { + "description": "An optional list of system notification IDs to call when the duration of a run exceeds the threshold specified for the `RUN_DURATION_SECONDS` metric in the `health` field. A maximum of 3 destinations can be specified for the `on_duration_warning_threshold_exceeded` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_failure": { + "description": "An optional list of system notification IDs to call when the run fails. 
A maximum of 3 destinations can be specified for the `on_failure` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_start": { + "description": "An optional list of system notification IDs to call when the run starts. A maximum of 3 destinations can be specified for the `on_start` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } + } + }, + "on_success": { + "description": "An optional list of system notification IDs to call when the run completes successfully. A maximum of 3 destinations can be specified for the `on_success` property.", + "items": { + "description": "", + "properties": { + "id": { + "description": "" + } + } } } } @@ -2896,331 +3650,284 @@ } } }, - "notebook_task": { - "description": "If notebook_task, indicates that this task must run a notebook. This field may not be specified in conjunction with spark_jar_task.", - "properties": { - "base_parameters": { - "description": "Base parameters to be used for each run of this job. If the run is initiated by a call to\n:method:jobs/runNow with parameters specified, the two parameters maps are merged. If the same key is specified in\n`base_parameters` and in `run-now`, the value from `run-now` is used.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n\nIf the notebook takes a parameter that is not specified in the job’s `base_parameters` or the `run-now` override parameters,\nthe default value from the notebook is used.\n\nRetrieve these parameters in a notebook using [dbutils.widgets.get](https://docs.databricks.com/dev-tools/databricks-utils.html#dbutils-widgets).\n", - "additionalproperties": { - "description": "" - } - }, - "notebook_path": { - "description": "The path of the notebook to be run in the Databricks workspace or remote repository.\nFor notebooks stored in the Databricks workspace, the path must be absolute and begin with a slash.\nFor notebooks stored in a remote repository, the path must be relative. This field is required.\n" - }, - "source": { - "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved\nfrom the local \u003cDatabricks\u003e workspace or cloud location (if the `python_file` has a URI format). When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a \u003cDatabricks\u003e workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository.\n" - } - } - }, - "notification_settings": { - "description": "Optional notification settings that are used when sending notifications to each of the `email_notifications` for this task.", - "properties": { - "alert_on_last_attempt": { - "description": "If true, do not send notifications to recipients specified in `on_start` for the retried runs and do not send notifications to recipients specified in `on_failure` until the last retry of the run." - }, - "no_alert_for_canceled_runs": { - "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is canceled." - }, - "no_alert_for_skipped_runs": { - "description": "If true, do not send notifications to recipients specified in `on_failure` if the run is skipped." 
- } - } + "timeout_seconds": { + "description": "An optional timeout applied to each run of this job. A value of `0` means no timeout." }, - "pipeline_task": { - "description": "If pipeline_task, indicates that this task must execute a Pipeline.", + "trigger": { + "description": "Trigger settings for the job. Can be used to trigger a run when new files arrive in an external location. The default behavior is that the job runs only when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", "properties": { - "full_refresh": { - "description": "If true, a full refresh will be triggered on the delta live table." + "file_arrival": { + "description": "File arrival trigger settings.", + "properties": { + "min_time_between_triggers_seconds": { + "description": "If set, the trigger starts a run only after the specified amount of time passed since\nthe last time the trigger fired. The minimum allowed value is 60 seconds\n" + }, + "url": { + "description": "URL to be monitored for file arrivals. The path must point to the root or a subpath of the external location." + }, + "wait_after_last_change_seconds": { + "description": "If set, the trigger starts a run only after no file activity has occurred for the specified amount of time.\nThis makes it possible to wait for a batch of incoming files to arrive before triggering a run. The\nminimum allowed value is 60 seconds.\n" + } + } }, - "pipeline_id": { - "description": "The full name of the pipeline task to execute." + "pause_status": { + "description": "Indicate whether this schedule is paused or not." } } }, - "python_wheel_task": { - "description": "If python_wheel_task, indicates that this job must execute a PythonWheel.", + "webhook_notifications": { + "description": "A collection of system notification IDs to notify when runs of this job begin or complete.", "properties": { - "entry_point": { - "description": "Named entry point to use, if it does not exist in the metadata of the package it executes the function from the package directly using `$packageName.$entryPoint()`" - }, - "named_parameters": { - "description": "Command-line parameters passed to Python wheel task in the form of `[\"--name=task\", \"--data=dbfs:/path/to/data.json\"]`. Leave it empty if `parameters` is not null.", - "additionalproperties": { - "description": "" - } - }, - "package_name": { - "description": "Name of the package to execute" - }, - "parameters": { - "description": "Command-line parameters passed to Python wheel task. Leave it empty if `named_parameters` is not null.", + "on_duration_warning_threshold_exceeded": { + "description": "An optional list of system notification IDs to call when the duration of a run exceeds the threshold specified for the `RUN_DURATION_SECONDS` metric in the `health` field. A maximum of 3 destinations can be specified for the `on_duration_warning_threshold_exceeded` property.", "items": { - "description": "" + "description": "", + "properties": { + "id": { + "description": "" + } + } } - } - } - }, - "retry_on_timeout": { - "description": "An optional policy to specify whether to retry a task when it times out. The default behavior is to not retry on timeout." - }, - "run_if": { - "description": "An optional value specifying the condition determining whether the task is run once its dependencies have been completed. 
When omitted, defaults to `ALL_SUCCESS`.\n\n* `ALL_SUCCESS`: All dependencies have executed and succeeded\n* `AT_LEAST_ONE_SUCCESS`: At least one dependency has succeeded\n* `NONE_FAILED`: None of the dependencies have failed and at least one was executed\n* `ALL_DONE`: All dependencies completed and at least one was executed\n* `AT_LEAST_ONE_FAILED`: At least one dependency failed\n* `ALL_FAILED`: ALl dependencies have failed\n" - }, - "spark_jar_task": { - "description": "If spark_jar_task, indicates that this task must run a JAR.", - "properties": { - "jar_uri": { - "description": "Deprecated since 04/2016. Provide a `jar` through the `libraries` field instead. For an example, see :method:jobs/create.\n" - }, - "main_class_name": { - "description": "The full name of the class containing the main method to be executed. This class must be contained in a JAR provided as a library.\n\nThe code must use `SparkContext.getOrCreate` to obtain a Spark context; otherwise, runs of the job fail." }, - "parameters": { - "description": "Parameters passed to the main method.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "on_failure": { + "description": "An optional list of system notification IDs to call when the run fails. A maximum of 3 destinations can be specified for the `on_failure` property.", "items": { - "description": "" + "description": "", + "properties": { + "id": { + "description": "" + } + } } - } - } - }, - "spark_python_task": { - "description": "If spark_python_task, indicates that this task must run a Python file.", - "properties": { - "parameters": { - "description": "Command line parameters passed to the Python file.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + }, + "on_start": { + "description": "An optional list of system notification IDs to call when the run starts. A maximum of 3 destinations can be specified for the `on_start` property.", "items": { - "description": "" + "description": "", + "properties": { + "id": { + "description": "" + } + } } }, - "python_file": { - "description": "The Python file to be executed. Cloud file URIs (such as dbfs:/, s3:/, adls:/, gcs:/) and workspace paths are supported. For python files stored in the Databricks workspace, the path must be absolute and begin with `/`. For files stored in a remote repository, the path must be relative. This field is required." - }, - "source": { - "description": "Optional location type of the Python file. When set to `WORKSPACE` or not specified, the file will be retrieved\nfrom the local \u003cDatabricks\u003e workspace or cloud location (if the `python_file` has a URI format). When set to `GIT`,\nthe Python file will be retrieved from a Git repository defined in `git_source`.\n\n* `WORKSPACE`: The Python file is located in a \u003cDatabricks\u003e workspace or at a cloud filesystem URI.\n* `GIT`: The Python file is located in a remote Git repository.\n" - } - } - }, - "spark_submit_task": { - "description": "If spark_submit_task, indicates that this task must be launched by the spark submit script. 
This task can run only on new clusters.", - "properties": { - "parameters": { - "description": "Command-line parameters passed to spark submit.\n\nUse [Task parameter variables](https://docs.databricks.com/jobs.html#parameter-variables) to set parameters containing information about job runs.\n", + "on_success": { + "description": "An optional list of system notification IDs to call when the run completes successfully. A maximum of 3 destinations can be specified for the `on_success` property.", "items": { - "description": "" + "description": "", + "properties": { + "id": { + "description": "" + } + } } } } - }, - "sql_task": { - "description": "If sql_task, indicates that this job must execute a SQL task.", + } + } + } + }, + "model_serving_endpoints": { + "description": "List of Model Serving Endpoints", + "additionalproperties": { + "description": "", + "properties": { + "config": { + "description": "The core config of the serving endpoint.", "properties": { - "alert": { - "description": "If alert, indicates that this job must refresh a SQL alert.", - "properties": { - "alert_id": { - "description": "The canonical identifier of the SQL alert." - }, - "pause_subscriptions": { - "description": "If true, the alert notifications are not sent to subscribers." - }, - "subscriptions": { - "description": "If specified, alert notifications are sent to subscribers.", - "items": { - "description": "", - "properties": { - "destination_id": { - "description": "The canonical identifier of the destination to receive email notification." - }, - "user_name": { - "description": "The user name to receive the subscription email." - } + "served_models": { + "description": "A list of served models for the endpoint to serve. A serving endpoint can have up to 10 served models.", + "items": { + "description": "", + "properties": { + "environment_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs used for serving this model.\nNote: this is an experimental feature and subject to change. \nExample model environment variables that refer to Databricks secrets: `{\"OPENAI_API_KEY\": \"{{secrets/my_scope/my_key}}\", \"DATABRICKS_TOKEN\": \"{{secrets/my_scope2/my_key2}}\"}`", + "additionalproperties": { + "description": "" } + }, + "instance_profile_arn": { + "description": "ARN of the instance profile that the served model will use to access AWS resources." + }, + "model_name": { + "description": "The name of the model in Databricks Model Registry to be served or if the model resides in Unity Catalog, the full name of model, \nin the form of __catalog_name__.__schema_name__.__model_name__.\n" + }, + "model_version": { + "description": "The version of the model in Databricks Model Registry or Unity Catalog to be served." + }, + "name": { + "description": "The name of a served model. It must be unique across an endpoint. If not specified, this field will default to \u003cmodel-name\u003e-\u003cmodel-version\u003e.\nA served model name can consist of alphanumeric characters, dashes, and underscores.\n" + }, + "scale_to_zero_enabled": { + "description": "Whether the compute resources for the served model should scale down to zero." + }, + "workload_size": { + "description": "The workload size of the served model. 
The workload size corresponds to a range of provisioned concurrency that the compute will autoscale between.\nA single unit of provisioned concurrency can process one request at a time.\nValid workload sizes are \"Small\" (4 - 4 provisioned concurrency), \"Medium\" (8 - 16 provisioned concurrency), and \"Large\" (16 - 64 provisioned concurrency).\nIf scale-to-zero is enabled, the lower bound of the provisioned concurrency for each workload size will be 0.\n" + }, + "workload_type": { + "description": "The workload type of the served model. The workload type selects which type of compute to use in the endpoint. The default value for this parameter is\n\"CPU\". For deep learning workloads, GPU acceleration is available by selecting workload types like GPU_SMALL and others. See documentation for all\noptions.\n" } } } }, - "dashboard": { - "description": "If dashboard, indicates that this job must refresh a SQL dashboard.", + "traffic_config": { + "description": "The traffic config defining how invocations to the serving endpoint should be routed.", "properties": { - "custom_subject": { - "description": "Subject of the email sent to subscribers of this task." - }, - "dashboard_id": { - "description": "The canonical identifier of the SQL dashboard." - }, - "pause_subscriptions": { - "description": "If true, the dashboard snapshot is not taken, and emails are not sent to subscribers." - }, - "subscriptions": { - "description": "If specified, dashboard snapshots are sent to subscriptions.", + "routes": { + "description": "The list of routes that define traffic to each served model.", "items": { "description": "", "properties": { - "destination_id": { - "description": "The canonical identifier of the destination to receive email notification." + "served_model_name": { + "description": "The name of the served model this route configures traffic for." }, - "user_name": { - "description": "The user name to receive the subscription email." + "traffic_percentage": { + "description": "The percentage of endpoint traffic to send to this route. It must be an integer between 0 and 100 inclusive." } } } } } - }, - "file": { - "description": "If file, indicates that this job runs a SQL file in a remote Git repository. Only one SQL statement is supported in a file. Multiple SQL statements separated by semicolons (;) are not permitted.", - "properties": { - "path": { - "description": "Relative path of the SQL file in the remote Git repository." - } - } - }, - "parameters": { - "description": "Parameters to be used for each run of this job. The SQL alert task does not support custom parameters.", - "additionalproperties": { - "description": "" - } - }, - "query": { - "description": "If query, indicates that this job must execute a SQL query.", - "properties": { - "query_id": { - "description": "The canonical identifier of the SQL query." - } - } - }, - "warehouse_id": { - "description": "The canonical identifier of the SQL warehouse. Only serverless and pro SQL warehouses are supported." } } }, - "task_key": { - "description": "A unique name for the task. This field is used to refer to this task from other tasks.\nThis field is required and must be unique within its parent job.\nOn Update or Reset, this field is used to reference the tasks to be updated or reset.\nThe maximum length is 100 characters." + "name": { + "description": "The name of the serving endpoint. 
This field is required and must be unique across a Databricks workspace.\nAn endpoint name can consist of alphanumeric characters, dashes, and underscores.\n" }, - "timeout_seconds": { - "description": "An optional timeout applied to each run of this job task. The default behavior is to have no timeout." - } - } - } - }, - "timeout_seconds": { - "description": "An optional timeout applied to each run of this job. The default behavior is to have no timeout." - }, - "trigger": { - "description": "Trigger settings for the job. Can be used to trigger a run when new files arrive in an external location. The default behavior is that the job runs only when triggered by clicking “Run Now” in the Jobs UI or sending an API request to `runNow`.", - "properties": { - "file_arrival": { - "description": "File arrival trigger settings.", - "properties": { - "min_time_between_triggers_seconds": { - "description": "If set, the trigger starts a run only after the specified amount of time passed since\nthe last time the trigger fired. The minimum allowed value is 60 seconds\n" - }, - "url": { - "description": "URL to be monitored for file arrivals. The path must point to the root or a subpath of the external location." - }, - "wait_after_last_change_seconds": { - "description": "If set, the trigger starts a run only after no file activity has occurred for the specified amount of time.\nThis makes it possible to wait for a batch of incoming files to arrive before triggering a run. The\nminimum allowed value is 60 seconds.\n" - } - } - }, - "pause_status": { - "description": "Whether this trigger is paused or not." - } - } - }, - "webhook_notifications": { - "description": "A collection of system notification IDs to notify when the run begins or completes. The default behavior is to not send any system notifications.", - "properties": { - "on_failure": { - "description": "An optional list of system notification IDs to call when the run fails. A maximum of 3 destinations can be specified for the `on_failure` property.", - "items": { - "description": "", - "properties": { - "id": { - "description": "" - } - } - } - }, - "on_start": { - "description": "An optional list of system notification IDs to call when the run starts. A maximum of 3 destinations can be specified for the `on_start` property.", - "items": { + "permissions": { "description": "", - "properties": { - "id": { - "description": "" + "items": { + "description": "", + "properties": { + "group_name": { + "description": "" + }, + "level": { + "description": "" + }, + "service_principal_name": { + "description": "" + }, + "user_name": { + "description": "" + } } } - } - }, - "on_success": { - "description": "An optional list of system notification IDs to call when the run completes successfully. A maximum of 3 destinations can be specified for the `on_success` property.", - "items": { - "description": "", - "properties": { - "id": { - "description": "" + }, + "tags": { + "description": "Tags to be attached to the serving endpoint and automatically propagated to billing logs.", + "items": { + "description": "", + "properties": { + "key": { + "description": "Key field for a serving endpoint tag." + }, + "value": { + "description": "Optional value field for a serving endpoint tag." + } } } } } } - } - } - } - }, - "models": { - "description": "List of MLflow models", - "additionalproperties": { - "description": "", - "properties": { - "creation_timestamp": { - "description": "Timestamp recorded when this `registered_model` was created." 
- }, - "description": { - "description": "Description of this `registered_model`." - }, - "last_updated_timestamp": { - "description": "Timestamp recorded when metadata for this `registered_model` was last updated." }, - "latest_versions": { - "description": "Collection of latest model versions for each stage.\nOnly contains models with current `READY` status.", - "items": { + "models": { + "description": "List of MLflow models", + "additionalproperties": { "description": "", "properties": { "creation_timestamp": { - "description": "Timestamp recorded when this `model_version` was created." - }, - "current_stage": { - "description": "Current stage for this `model_version`." + "description": "Timestamp recorded when this `registered_model` was created." }, "description": { - "description": "Description of this `model_version`." + "description": "Description of this `registered_model`." }, "last_updated_timestamp": { - "description": "Timestamp recorded when metadata for this `model_version` was last updated." + "description": "Timestamp recorded when metadata for this `registered_model` was last updated." + }, + "latest_versions": { + "description": "Collection of latest model versions for each stage.\nOnly contains models with current `READY` status.", + "items": { + "description": "", + "properties": { + "creation_timestamp": { + "description": "Timestamp recorded when this `model_version` was created." + }, + "current_stage": { + "description": "Current stage for this `model_version`." + }, + "description": { + "description": "Description of this `model_version`." + }, + "last_updated_timestamp": { + "description": "Timestamp recorded when metadata for this `model_version` was last updated." + }, + "name": { + "description": "Unique name of the model" + }, + "run_id": { + "description": "MLflow run ID used when creating `model_version`, if `source` was generated by an\nexperiment run stored in MLflow tracking server." + }, + "run_link": { + "description": "Run Link: Direct link to the run that generated this version" + }, + "source": { + "description": "URI indicating the location of the source model artifacts, used when creating `model_version`" + }, + "status": { + "description": "Current status of `model_version`" + }, + "status_message": { + "description": "Details on current `status`, if it is pending or failed." + }, + "tags": { + "description": "Tags: Additional metadata key-value pairs for this `model_version`.", + "items": { + "description": "", + "properties": { + "key": { + "description": "The tag key." + }, + "value": { + "description": "The tag value." + } + } + } + }, + "user_id": { + "description": "User that created this `model_version`." + }, + "version": { + "description": "Model's version number." + } + } + } }, "name": { - "description": "Unique name of the model" - }, - "run_id": { - "description": "MLflow run ID used when creating `model_version`, if `source` was generated by an\nexperiment run stored in MLflow tracking server." - }, - "run_link": { - "description": "Run Link: Direct link to the run that generated this version" - }, - "source": { - "description": "URI indicating the location of the source model artifacts, used when creating `model_version`" - }, - "status": { - "description": "Current status of `model_version`" + "description": "Unique name for the model." }, - "status_message": { - "description": "Details on current `status`, if it is pending or failed." 
+ "permissions": { + "description": "", + "items": { + "description": "", + "properties": { + "group_name": { + "description": "" + }, + "level": { + "description": "" + }, + "service_principal_name": { + "description": "" + }, + "user_name": { + "description": "" + } + } + } }, "tags": { - "description": "Tags: Additional metadata key-value pairs for this `model_version`.", + "description": "Tags: Additional metadata key-value pairs for this `registered_model`.", "items": { "description": "", "properties": { @@ -3234,386 +3941,484 @@ } }, "user_id": { - "description": "User that created this `model_version`." - }, - "version": { - "description": "Model's version number." + "description": "User that created this `registered_model`" } } } }, - "name": { - "description": "Unique name for the model." - }, - "permissions": { - "description": "", - "items": { + "pipelines": { + "description": "List of DLT pipelines", + "additionalproperties": { "description": "", "properties": { - "group_name": { - "description": "" + "catalog": { + "description": "A catalog in Unity Catalog to publish data from this pipeline to. If `target` is specified, tables in this pipeline are published to a `target` schema inside `catalog` (for example, `catalog`.`target`.`table`). If `target` is not specified, no data is published to Unity Catalog." + }, + "channel": { + "description": "DLT Release Channel that specifies which version to use." + }, + "clusters": { + "description": "Cluster settings for this pipeline deployment.", + "items": { + "description": "", + "properties": { + "apply_policy_default_values": { + "description": "Note: This field won't be persisted. Only API users will check this field." + }, + "autoscale": { + "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "properties": { + "max_workers": { + "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + }, + "min_workers": { + "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + } + } + }, + "aws_attributes": { + "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "ebs_volume_count": { + "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." 
+ }, + "ebs_volume_iops": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_size": { + "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." + }, + "ebs_volume_throughput": { + "description": "\u003cneeds content added\u003e" + }, + "ebs_volume_type": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "instance_profile_arn": { + "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." + }, + "spot_bid_price_percent": { + "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." + }, + "zone_id": { + "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nThe list of available zones as well as the default value can be found by using the\n`List Zones` method." 
+ } + } + }, + "azure_attributes": { + "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "first_on_demand": { + "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." + }, + "log_analytics_info": { + "description": "Defines values necessary to configure and run Azure Log Analytics agent", + "properties": { + "log_analytics_primary_key": { + "description": "\u003cneeds content added\u003e" + }, + "log_analytics_workspace_id": { + "description": "\u003cneeds content added\u003e" + } + } + }, + "spot_bid_max_price": { + "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." + } + } + }, + "cluster_log_conf": { + "description": "The configuration for delivering spark logs to a long-term storage destination.\nOnly dbfs destinations are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.\n", + "properties": { + "dbfs": { + "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", + "properties": { + "destination": { + "description": "dbfs destination, e.g. `dbfs:/my/path`" + } + } + }, + "s3": { + "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", + "properties": { + "canned_acl": { + "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." + }, + "destination": { + "description": "S3 destination, e.g. 
`s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." + }, + "enable_encryption": { + "description": "(Optional) Flag to enable server side encryption, `false` by default." + }, + "encryption_type": { + "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." + }, + "endpoint": { + "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." + }, + "kms_key": { + "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." + }, + "region": { + "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + } + } + } + } + }, + "custom_tags": { + "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", + "additionalproperties": { + "description": "" + } + }, + "driver_instance_pool_id": { + "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." + }, + "driver_node_type_id": { + "description": "The node type of the Spark driver.\nNote that this field is optional; if unset, the driver node type will be set as the same value\nas `node_type_id` defined above." + }, + "gcp_attributes": { + "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", + "properties": { + "availability": { + "description": "" + }, + "boot_disk_size": { + "description": "boot disk size in GB" + }, + "google_service_account": { + "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." + }, + "local_ssd_count": { + "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." + } + } + }, + "instance_pool_id": { + "description": "The optional ID of the instance pool to which the cluster belongs." + }, + "label": { + "description": "A label for the cluster specification, either `default` to configure the default cluster, or `maintenance` to configure the maintenance cluster. This field is optional. The default value is `default`." + }, + "node_type_id": { + "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. 
A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" + }, + "num_workers": { + "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + }, + "policy_id": { + "description": "The ID of the cluster policy used to create the cluster if applicable." + }, + "spark_conf": { + "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nSee :method:clusters/create for more details.\n", + "additionalproperties": { + "description": "" + } + }, + "spark_env_vars": { + "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", + "additionalproperties": { + "description": "" + } + }, + "ssh_public_keys": { + "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", + "items": { + "description": "" + } + } + } + } }, - "level": { - "description": "" + "configuration": { + "description": "String-String configuration for this pipeline execution.", + "additionalproperties": { + "description": "" + } }, - "service_principal_name": { - "description": "" + "continuous": { + "description": "Whether the pipeline is continuous or triggered. This replaces `trigger`." }, - "user_name": { - "description": "" - } - } - } - }, - "tags": { - "description": "Tags: Additional metadata key-value pairs for this `registered_model`.", - "items": { - "description": "", - "properties": { - "key": { - "description": "The tag key." + "development": { + "description": "Whether the pipeline is in Development mode. Defaults to false." }, - "value": { - "description": "The tag value." - } - } - } - }, - "user_id": { - "description": "User that created this `registered_model`" - } - } - } - }, - "pipelines": { - "description": "List of DLT pipelines", - "additionalproperties": { - "description": "", - "properties": { - "catalog": { - "description": "A catalog in Unity Catalog to publish data from this pipeline to. If `target` is specified, tables in this pipeline are published to a `target` schema inside `catalog` (for example, `catalog`.`target`.`table`). If `target` is not specified, no data is published to Unity Catalog." 
- }, - "channel": { - "description": "DLT Release Channel that specifies which version to use." - }, - "clusters": { - "description": "Cluster settings for this pipeline deployment.", - "items": { - "description": "", - "properties": { - "apply_policy_default_values": { - "description": "Note: This field won't be persisted. Only API users will check this field." + "edition": { + "description": "Pipeline product edition." }, - "autoscale": { - "description": "Parameters needed in order to automatically scale clusters up and down based on load.\nNote: autoscaling works best with DB runtime versions 3.0 or later.", + "filters": { + "description": "Filters on which Pipeline packages to include in the deployed graph.", "properties": { - "max_workers": { - "description": "The maximum number of workers to which the cluster can scale up when overloaded.\nNote that `max_workers` must be strictly greater than `min_workers`." + "exclude": { + "description": "Paths to exclude.", + "items": { + "description": "" + } }, - "min_workers": { - "description": "The minimum number of workers to which the cluster can scale down when underutilized.\nIt is also the initial number of workers the cluster will have after creation." + "include": { + "description": "Paths to include.", + "items": { + "description": "" + } } } }, - "aws_attributes": { - "description": "Attributes related to clusters running on Amazon Web Services.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "ebs_volume_count": { - "description": "The number of volumes launched for each instance. Users can choose up to 10 volumes.\nThis feature is only enabled for supported node types. Legacy node types cannot specify\ncustom EBS volumes.\nFor node types with no instance store, at least one EBS volume needs to be specified;\notherwise, cluster creation will fail.\n\nThese EBS volumes will be mounted at `/ebs0`, `/ebs1`, and etc.\nInstance store volumes will be mounted at `/local_disk0`, `/local_disk1`, and etc.\n\nIf EBS volumes are attached, Databricks will configure Spark to use only the EBS volumes for\nscratch storage because heterogenously sized scratch devices can lead to inefficient disk\nutilization. If no EBS volumes are attached, Databricks will configure Spark to use instance\nstore volumes.\n\nPlease note that if EBS volumes are specified, then the Spark configuration `spark.local.dir`\nwill be overridden." - }, - "ebs_volume_iops": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_size": { - "description": "The size of each EBS volume (in GiB) launched for each instance. For general purpose\nSSD, this value must be within the range 100 - 4096. For throughput optimized HDD,\nthis value must be within the range 500 - 4096." - }, - "ebs_volume_throughput": { - "description": "\u003cneeds content added\u003e" - }, - "ebs_volume_type": { - "description": "" - }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nIf this value is greater than 0, the cluster driver node in particular will be placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. 
Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." - }, - "instance_profile_arn": { - "description": "Nodes for this cluster will only be placed on AWS instances with this instance profile. If\nommitted, nodes will be placed on instances without an IAM instance profile. The instance\nprofile must have previously been added to the Databricks environment by an account\nadministrator.\n\nThis feature may only be available to certain customer plans.\n\nIf this field is ommitted, we will pull in the default from the conf if it exists." - }, - "spot_bid_price_percent": { - "description": "The bid price for AWS spot instances, as a percentage of the corresponding instance type's\non-demand price.\nFor example, if this field is set to 50, and the cluster needs a new `r3.xlarge` spot\ninstance, then the bid price is half of the price of\non-demand `r3.xlarge` instances. Similarly, if this field is set to 200, the bid price is twice\nthe price of on-demand `r3.xlarge` instances. If not specified, the default value is 100.\nWhen spot instances are requested for this cluster, only spot instances whose bid price\npercentage matches this field will be considered.\nNote that, for safety, we enforce this field to be no more than 10000.\n\nThe default value and documentation here should be kept consistent with\nCommonConf.defaultSpotBidPricePercent and CommonConf.maxSpotBidPricePercent." - }, - "zone_id": { - "description": "Identifier for the availability zone/datacenter in which the cluster resides.\nThis string will be of a form like \"us-west-2a\". The provided availability\nzone must be in the same region as the Databricks deployment. For example, \"us-west-2a\"\nis not a valid zone id if the Databricks deployment resides in the \"us-east-1\" region.\nThis is an optional field at cluster creation, and if not specified, a default zone will be used.\nIf the zone specified is \"auto\", will try to place cluster in a zone with high availability,\nand will retry placement in a different AZ if there is not enough capacity.\nSee [[AutoAZHelper.scala]] for more details.\nThe list of available zones as well as the default value can be found by using the\n`List Zones`_ method." - } - } + "id": { + "description": "Unique identifier for this pipeline." }, - "azure_attributes": { - "description": "Attributes related to clusters running on Microsoft Azure.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "first_on_demand": { - "description": "The first `first_on_demand` nodes of the cluster will be placed on on-demand instances.\nThis value should be greater than 0, to make sure the cluster driver node is placed on an\non-demand instance. If this value is greater than or equal to the current cluster size, all\nnodes will be placed on on-demand instances. If this value is less than the current cluster\nsize, `first_on_demand` nodes will be placed on on-demand instances and the remainder will\nbe placed on `availability` instances. Note that this value does not affect\ncluster size and cannot currently be mutated over the lifetime of a cluster." 
- }, - "log_analytics_info": { - "description": "Defines values necessary to configure and run Azure Log Analytics agent", - "properties": { - "log_analytics_primary_key": { - "description": "\u003cneeds content added\u003e" - }, - "log_analytics_workspace_id": { - "description": "\u003cneeds content added\u003e" + "libraries": { + "description": "Libraries or code needed by this deployment.", + "items": { + "description": "", + "properties": { + "file": { + "description": "The path to a file that defines a pipeline and is stored in the Databricks Repos.\n", + "properties": { + "path": { + "description": "The absolute path of the file." + } + } + }, + "jar": { + "description": "URI of the jar to be installed. Currently only DBFS is supported.\n" + }, + "maven": { + "description": "Specification of a maven library to be installed.\n", + "properties": { + "coordinates": { + "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." + }, + "exclusions": { + "description": "List of dependences to exclude. For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", + "items": { + "description": "" + } + }, + "repo": { + "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." + } + } + }, + "notebook": { + "description": "The path to a notebook that defines a pipeline and is stored in the \u003cDatabricks\u003e workspace.\n", + "properties": { + "path": { + "description": "The absolute path of the notebook." + } } } - }, - "spot_bid_max_price": { - "description": "The max bid price to be used for Azure spot instances.\nThe Max price for the bid cannot be higher than the on-demand price of the instance.\nIf not specified, the default value is -1, which specifies that the instance cannot be evicted\non the basis of price, and only on the basis of availability. Further, the value should \u003e 0 or -1." } } }, - "cluster_log_conf": { - "description": "The configuration for delivering spark logs to a long-term storage destination.\nOnly dbfs destinations are supported. Only one destination can be specified\nfor one cluster. If the conf is given, the logs will be delivered to the destination every\n`5 mins`. The destination of driver logs is `$destination/$clusterId/driver`, while\nthe destination of executor logs is `$destination/$clusterId/executor`.\n", - "properties": { - "dbfs": { - "description": "destination needs to be provided. e.g.\n`{ \"dbfs\" : { \"destination\" : \"dbfs:/home/cluster_log\" } }`", - "properties": { - "destination": { - "description": "dbfs destination, e.g. `dbfs:/my/path`" + "name": { + "description": "Friendly identifier for this pipeline." + }, + "notifications": { + "description": "List of notification settings for this pipeline.", + "items": { + "description": "", + "properties": { + "alerts": { + "description": "A list of alerts that trigger the sending of notifications to the configured\ndestinations. 
The supported alerts are:\n\n* `on-update-success`: A pipeline update completes successfully.\n* `on-update-failure`: Each time a pipeline update fails.\n* `on-update-fatal-failure`: A pipeline update fails with a non-retryable (fatal) error.\n* `on-flow-failure`: A single data flow fails.\n", + "items": { + "description": "" } - } - }, - "s3": { - "description": "destination and either the region or endpoint need to be provided. e.g.\n`{ \"s3\": { \"destination\" : \"s3://cluster_log_bucket/prefix\", \"region\" : \"us-west-2\" } }`\nCluster iam role is used to access s3, please make sure the cluster iam role in\n`instance_profile_arn` has permission to write data to the s3 destination.", - "properties": { - "canned_acl": { - "description": "(Optional) Set canned access control list for the logs, e.g. `bucket-owner-full-control`.\nIf `canned_cal` is set, please make sure the cluster iam role has `s3:PutObjectAcl` permission on\nthe destination bucket and prefix. The full list of possible canned acl can be found at\nhttp://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html#canned-acl.\nPlease also note that by default only the object owner gets full controls. If you are using cross account\nrole for writing data, you may want to set `bucket-owner-full-control` to make bucket owner able to\nread the logs." - }, - "destination": { - "description": "S3 destination, e.g. `s3://my-bucket/some-prefix` Note that logs will be delivered using\ncluster iam role, please make sure you set cluster iam role and the role has write access to the\ndestination. Please also note that you cannot use AWS keys to deliver logs." - }, - "enable_encryption": { - "description": "(Optional) Flag to enable server side encryption, `false` by default." - }, - "encryption_type": { - "description": "(Optional) The encryption type, it could be `sse-s3` or `sse-kms`. It will be used only when\nencryption is enabled and the default type is `sse-s3`." - }, - "endpoint": { - "description": "S3 endpoint, e.g. `https://s3-us-west-2.amazonaws.com`. Either region or endpoint needs to be set.\nIf both are set, endpoint will be used." - }, - "kms_key": { - "description": "(Optional) Kms key which will be used if encryption is enabled and encryption type is set to `sse-kms`." - }, - "region": { - "description": "S3 region, e.g. `us-west-2`. Either region or endpoint needs to be set. If both are set,\nendpoint will be used." + }, + "email_recipients": { + "description": "A list of email addresses notified when a configured alert is triggered.\n", + "items": { + "description": "" } } } } }, - "custom_tags": { - "description": "Additional tags for cluster resources. Databricks will tag all cluster resources (e.g., AWS\ninstances and EBS volumes) with these tags in addition to `default_tags`. Notes:\n\n- Currently, Databricks allows at most 45 custom tags\n\n- Clusters can only reuse cloud resources if the resources' tags are a subset of the cluster tags", - "additionalproperties": { - "description": "" - } - }, - "driver_instance_pool_id": { - "description": "The optional ID of the instance pool for the driver of the cluster belongs.\nThe pool cluster uses the instance pool with id (instance_pool_id) if the driver pool is not\nassigned." - }, - "driver_node_type_id": { - "description": "The node type of the Spark driver.\nNote that this field is optional; if unset, the driver node type will be set as the same value\nas `node_type_id` defined above." 
- }, - "gcp_attributes": { - "description": "Attributes related to clusters running on Google Cloud Platform.\nIf not specified at cluster creation, a set of default values will be used.", - "properties": { - "availability": { - "description": "" - }, - "boot_disk_size": { - "description": "boot disk size in GB" - }, - "google_service_account": { - "description": "If provided, the cluster will impersonate the google service account when accessing\ngcloud services (like GCS). The google service account\nmust have previously been added to the Databricks environment by an account\nadministrator." - }, - "local_ssd_count": { - "description": "If provided, each node (workers and driver) in the cluster will have this number of local SSDs attached. Each local SSD is 375GB in size. Refer to [GCP documentation](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) for the supported number of local SSDs for each instance type." + "permissions": { + "description": "", + "items": { + "description": "", + "properties": { + "group_name": { + "description": "" + }, + "level": { + "description": "" + }, + "service_principal_name": { + "description": "" + }, + "user_name": { + "description": "" + } } } }, - "instance_pool_id": { - "description": "The optional ID of the instance pool to which the cluster belongs." - }, - "label": { - "description": "A label for the cluster specification, either `default` to configure the default cluster, or `maintenance` to configure the maintenance cluster. This field is optional. The default value is `default`." - }, - "node_type_id": { - "description": "This field encodes, through a single value, the resources available to each of\nthe Spark nodes in this cluster. For example, the Spark nodes can be provisioned\nand optimized for memory or compute intensive workloads. A list of available node\ntypes can be retrieved by using the :method:clusters/listNodeTypes API call.\n" - }, - "num_workers": { - "description": "Number of worker nodes that this cluster should have. A cluster has one Spark Driver\nand `num_workers` Executors for a total of `num_workers` + 1 Spark nodes.\n\nNote: When reading the properties of a cluster, this field reflects the desired number\nof workers rather than the actual current number of workers. For instance, if a cluster\nis resized from 5 to 10 workers, this field will immediately be updated to reflect\nthe target size of 10 workers, whereas the workers listed in `spark_info` will gradually\nincrease from 5 to 10 as the new nodes are provisioned." + "photon": { + "description": "Whether Photon is enabled for this pipeline." }, - "policy_id": { - "description": "The ID of the cluster policy used to create the cluster if applicable." + "serverless": { + "description": "Whether serverless compute is enabled for this pipeline." }, - "spark_conf": { - "description": "An object containing a set of optional, user-specified Spark configuration key-value pairs.\nSee :method:clusters/create for more details.\n", - "additionalproperties": { - "description": "" - } + "storage": { + "description": "DBFS root directory for storing checkpoints and tables." 
}, - "spark_env_vars": { - "description": "An object containing a set of optional, user-specified environment variable key-value pairs.\nPlease note that key-value pair of the form (X,Y) will be exported as is (i.e.,\n`export X='Y'`) while launching the driver and workers.\n\nIn order to specify an additional set of `SPARK_DAEMON_JAVA_OPTS`, we recommend appending\nthem to `$SPARK_DAEMON_JAVA_OPTS` as shown in the example below. This ensures that all\ndefault databricks managed environmental variables are included as well.\n\nExample Spark environment variables:\n`{\"SPARK_WORKER_MEMORY\": \"28000m\", \"SPARK_LOCAL_DIRS\": \"/local_disk0\"}` or\n`{\"SPARK_DAEMON_JAVA_OPTS\": \"$SPARK_DAEMON_JAVA_OPTS -Dspark.shuffle.service.enabled=true\"}`", - "additionalproperties": { - "description": "" - } + "target": { + "description": "Target schema (database) to add tables in this pipeline to. If not specified, no data is published to the Hive metastore or Unity Catalog. To publish to Unity Catalog, also specify `catalog`." }, - "ssh_public_keys": { - "description": "SSH public key contents that will be added to each Spark node in this cluster. The\ncorresponding private keys can be used to login with the user name `ubuntu` on port `2200`.\nUp to 10 keys can be specified.", - "items": { - "description": "" + "trigger": { + "description": "Which pipeline trigger to use. Deprecated: Use `continuous` instead.", + "properties": { + "cron": { + "description": "", + "properties": { + "quartz_cron_schedule": { + "description": "" + }, + "timezone_id": { + "description": "" + } + } + }, + "manual": { + "description": "" + } } } } } }, - "configuration": { - "description": "String-String configuration for this pipeline execution.", + "registered_models": { + "description": "List of Registered Models", "additionalproperties": { - "description": "" - } - }, - "continuous": { - "description": "Whether the pipeline is continuous or triggered. This replaces `trigger`." - }, - "development": { - "description": "Whether the pipeline is in Development mode. Defaults to false." - }, - "edition": { - "description": "Pipeline product edition." - }, - "filters": { - "description": "Filters on which Pipeline packages to include in the deployed graph.", - "properties": { - "exclude": { - "description": "Paths to exclude.", - "items": { - "description": "" - } - }, - "include": { - "description": "Paths to include.", - "items": { - "description": "" - } - } - } - }, - "id": { - "description": "Unique identifier for this pipeline." - }, - "libraries": { - "description": "Libraries or code needed by this deployment.", - "items": { "description": "", "properties": { - "file": { - "description": "The path to a file that defines a pipeline and is stored in the Databricks Repos.\n", - "properties": { - "path": { - "description": "The absolute path of the file." - } - } + "catalog_name": { + "description": "The name of the catalog where the schema and the registered model reside" }, - "jar": { - "description": "URI of the jar to be installed. Currently only DBFS is supported.\n" + "comment": { + "description": "The comment attached to the registered model" }, - "maven": { - "description": "Specification of a maven library to be installed.\n", - "properties": { - "coordinates": { - "description": "Gradle-style maven coordinates. For example: \"org.jsoup:jsoup:1.7.2\"." - }, - "exclusions": { - "description": "List of dependences to exclude. 
For example: `[\"slf4j:slf4j\", \"*:hadoop-client\"]`.\n\nMaven dependency exclusions:\nhttps://maven.apache.org/guides/introduction/introduction-to-optional-and-excludes-dependencies.html.", - "items": { + "grants": { + "description": "", + "items": { + "description": "", + "properties": { + "principal": { "description": "" + }, + "privileges": { + "description": "", + "items": { + "description": "" + } } - }, - "repo": { - "description": "Maven repo to install the Maven package from. If omitted, both Maven Central Repository\nand Spark Packages are searched." } } }, - "notebook": { - "description": "The path to a notebook that defines a pipeline and is stored in the \u003cDatabricks\u003e workspace.\n", - "properties": { - "path": { - "description": "The absolute path of the notebook." - } - } + "name": { + "description": "The name of the registered model" + }, + "schema_name": { + "description": "The name of the schema where the registered model resides" }, - "whl": { - "description": "URI of the wheel to be installed.\n" + "storage_location": { + "description": "The storage location on the cloud under which model version data files are stored" } } } + } + } + }, + "run_as": { + "description": "", + "properties": { + "service_principal_name": { + "description": "" }, - "name": { - "description": "Friendly identifier for this pipeline." + "user_name": { + "description": "" + } + } + }, + "sync": { + "description": "", + "properties": { + "exclude": { + "description": "", + "items": { + "description": "" + } }, - "permissions": { + "include": { "description": "", "items": { - "description": "", - "properties": { - "group_name": { - "description": "" - }, - "level": { - "description": "" - }, - "service_principal_name": { - "description": "" - }, - "user_name": { - "description": "" - } - } + "description": "" } + } + } + }, + "variables": { + "description": "", + "additionalproperties": { + "description": "" + } + }, + "workspace": { + "description": "Configures which workspace to connect to and locations for files, state, and similar locations within the workspace file tree.", + "properties": { + "artifact_path": { + "description": "The remote path to synchronize build artifacts to. This defaults to `${workspace.root}/artifacts`" }, - "photon": { - "description": "Whether Photon is enabled for this pipeline." + "auth_type": { + "description": "" }, - "serverless": { - "description": "Whether serverless compute is enabled for this pipeline." + "azure_client_id": { + "description": "" }, - "storage": { - "description": "DBFS root directory for storing checkpoints and tables." + "azure_environment": { + "description": "Azure environment, one of (Public, UsGov, China, Germany)." }, - "target": { - "description": "Target schema (database) to add tables in this pipeline to. If not specified, no data is published to the Hive metastore or Unity Catalog. To publish to Unity Catalog, also specify `catalog`." + "azure_login_app_id": { + "description": "Azure Login Application ID." }, - "trigger": { - "description": "Which pipeline trigger to use. 
Deprecated: Use `continuous` instead.",
-                      "properties": {
-                        "cron": {
-                          "description": "",
-                          "properties": {
-                            "quartz_cron_schedule": {
-                              "description": ""
-                            },
-                            "timezone_id": {
-                              "description": ""
-                            }
-                          }
-                        },
-                        "manual": {
-                          "description": ""
-                        }
-                      }
+              "azure_tenant_id": {
+                "description": ""
+              },
+              "azure_use_msi": {
+                "description": ""
+              },
+              "azure_workspace_resource_id": {
+                "description": "Azure Resource Manager ID for Azure Databricks workspace."
+              },
+              "client_id": {
+                "description": ""
+              },
+              "file_path": {
+                "description": "The remote path to synchronize local files artifacts to. This defaults to `${workspace.root}/files`"
+              },
+              "google_service_account": {
+                "description": ""
+              },
+              "host": {
+                "description": "Host url of the workspace."
+              },
+              "profile": {
+                "description": "Connection profile to use. By default profiles are specified in ~/.databrickscfg."
+              },
+              "root_path": {
+                "description": "The base location for synchronizing files, artifacts and state. Defaults to `/Users/jane@doe.com/.bundle/${bundle.name}/${bundle.target}`"
+              },
+              "state_path": {
+                "description": "The remote path to synchronize bundle state to. This defaults to `${workspace.root}/state`"
+              }
             }
           }
         }
@@ -3640,6 +4445,9 @@
       "artifact_path": {
         "description": "The remote path to synchronize build artifacts to. This defaults to `${workspace.root}/artifacts`"
       },
+      "auth_type": {
+        "description": ""
+      },
       "azure_client_id": {
         "description": ""
       },
@@ -3658,6 +4466,9 @@
       "azure_workspace_resource_id": {
         "description": "Azure Resource Manager ID for Azure Databricks workspace."
       },
+      "client_id": {
+        "description": ""
+      },
       "file_path": {
         "description": "The remote path to synchronize local files artifacts to. This defaults to `${workspace.root}/files`"
       },
@@ -3679,4 +4490,4 @@
     }
   }
 }
-}
+}
\ No newline at end of file
diff --git a/bundle/schema/schema.go b/bundle/schema/schema.go
index 00dd271928..8b5c36d12c 100644
--- a/bundle/schema/schema.go
+++ b/bundle/schema/schema.go
@@ -17,6 +17,10 @@ const readonlyTag = "readonly"
 // Fields can be tagged as "internal" to remove them from the generated schema.
 const internalTag = "internal"
 
+// Annotation for bundle fields that have been deprecated.
+// Fields tagged as "deprecated" are removed/omitted from the generated schema.
+const deprecatedTag = "deprecated"
+
 // This function translates golang types into json schema. Here is the mapping
 // between json schema types and golang types
 //
@@ -205,7 +209,9 @@ func toSchema(golangType reflect.Type, docs *Docs, tracker *tracker) (*jsonschem
 	required := []string{}
 	for _, child := range children {
 		bundleTag := child.Tag.Get("bundle")
-		if bundleTag == readonlyTag || bundleTag == internalTag {
+		// Fields marked as "readonly", "internal" or "deprecated" are skipped
+		// while generating the schema
+		if bundleTag == readonlyTag || bundleTag == internalTag || bundleTag == deprecatedTag {
 			continue
 		}
 
diff --git a/cmd/bundle/schema.go b/cmd/bundle/schema.go
index ec817037cb..f516695c7a 100644
--- a/cmd/bundle/schema.go
+++ b/cmd/bundle/schema.go
@@ -2,7 +2,6 @@ package bundle
 
 import (
 	"encoding/json"
-	"os"
 	"reflect"
 
 	"github.com/databricks/cli/bundle/config"
@@ -16,47 +15,24 @@ func newSchemaCommand() *cobra.Command {
 		Use:   "schema",
 		Short: "Generate JSON Schema for bundle configuration",
 	}
-	var openapi string
-	var outputFile string
-	var onlyDocs bool
-	cmd.Flags().StringVar(&openapi, "openapi", "", "path to a databricks openapi spec")
-	cmd.Flags().BoolVar(&onlyDocs, "only-docs", false, "only generate descriptions for the schema")
-	cmd.Flags().StringVar(&outputFile, "output-file", "", "File path to write the schema to. If not specified, the schema will be written to stdout.")
-
 	cmd.RunE = func(cmd *cobra.Command, args []string) error {
-		// If no openapi spec is provided, try to use the environment variable.
-		// This environment variable is set during CLI code generation.
-		if openapi == "" {
-			openapi = os.Getenv("DATABRICKS_OPENAPI_SPEC")
-		}
-		docs, err := schema.BundleDocs(openapi)
+		// Load embedded schema descriptions.
+		docs, err := schema.LoadBundleDescriptions()
 		if err != nil {
 			return err
 		}
+
+		// Generate the JSON schema from the bundle configuration struct in Go.
 		schema, err := schema.New(reflect.TypeOf(config.Root{}), docs)
 		if err != nil {
 			return err
 		}
+
+		// Print the JSON schema to stdout.
 		result, err := json.MarshalIndent(schema, "", " ")
 		if err != nil {
 			return err
 		}
-		if onlyDocs {
-			result, err = json.MarshalIndent(docs, "", " ")
-			if err != nil {
-				return err
-			}
-		}
-
-		// If outputFile is provided, write to that file.
-		if outputFile != "" {
-			f, err := os.OpenFile(outputFile, os.O_CREATE|os.O_TRUNC|os.O_WRONLY, 0644)
-			if err != nil {
-				return err
-			}
-			defer f.Close()
-			cmd.SetOut(f)
-		}
 		cmd.OutOrStdout().Write(result)
 		return nil
 	}
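For context on the `deprecated` tag handling added in `bundle/schema/schema.go` above: the snippet below is a minimal, self-contained sketch (it uses a made-up `ExampleConfig` struct, not the CLI's own types) of how a `bundle:"deprecated"` struct tag can be read via reflection and the field skipped, mirroring the check that `toSchema` now applies to `readonly`, `internal` and `deprecated` fields.

```go
package main

import (
	"fmt"
	"reflect"
)

// Tags mirroring the annotations recognized by the schema generator.
const (
	readonlyTag   = "readonly"
	internalTag   = "internal"
	deprecatedTag = "deprecated"
)

// ExampleConfig is a hypothetical struct used only for illustration.
type ExampleConfig struct {
	Name         string `json:"name"`
	Workspace    string `json:"workspace,omitempty"`
	LegacyOption string `json:"legacy_option,omitempty" bundle:"deprecated"`
}

func main() {
	t := reflect.TypeOf(ExampleConfig{})
	for i := 0; i < t.NumField(); i++ {
		field := t.Field(i)
		// Fields tagged "readonly", "internal" or "deprecated" are skipped,
		// so they never show up in the generated JSON schema.
		tag := field.Tag.Get("bundle")
		if tag == readonlyTag || tag == internalTag || tag == deprecatedTag {
			continue
		}
		fmt.Printf("would include field %s in the schema\n", field.Name)
	}
}
```

Running the sketch prints only `Name` and `Workspace`; `LegacyOption` is dropped, which is the same effect the tag now has on deprecated fields in the generated bundle schema.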
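A practical note on the `cmd/bundle/schema.go` change: with the `--openapi`, `--only-docs` and `--output-file` flags removed, `databricks bundle schema` always writes the generated schema to stdout, so a shell redirect (for example `databricks bundle schema > bundle_schema.json`, with a file name of your choosing) takes the place of the old `--output-file` flag.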