Merge branch 'main' into python-texttospeech-migration
dandhlee authored Nov 16, 2022
2 parents f831b88 + 4827801 commit 3168389
Showing 23 changed files with 1,669 additions and 1 deletion.
3 changes: 2 additions & 1 deletion .github/CODEOWNERS
@@ -75,5 +75,6 @@
/talent/**/* @GoogleCloudPlatform/python-samples-reviewers
/vision/**/* @GoogleCloudPlatform/python-samples-reviewers
/workflows/**/* @GoogleCloudPlatform/python-samples-reviewers
/datacatalog/**/* @GoogleCloudPlatform/python-samples-reviewers
/datacatalog/**/* @GoogleCloudPlatform/python-samples-reviewers
/kms/**/** @GoogleCloudPlatform/dee-infra @GoogleCloudPlatform/python-samples-reviewers
/dataproc/**/** @GoogleCloudPlatform/cloud-dpes @GoogleCloudPlatform/python-samples-reviewers
4 changes: 4 additions & 0 deletions .github/blunderbuss.yml
@@ -176,6 +176,10 @@ assign_prs_by:
- 'api: cloudtasks'
to:
- GoogleCloudPlatform/infra-db-dpes
- labels:
- 'api: dataproc'
to:
- GoogleCloudPlatform/cloud-dpes

assign_issues:
- GoogleCloudPlatform/python-samples-owners
84 changes: 84 additions & 0 deletions dataproc/snippets/README.md
@@ -0,0 +1,84 @@
# Cloud Dataproc API Examples

[![Open in Cloud Shell][shell_img]][shell_link]

[shell_img]: http://gstatic.com/cloudssh/images/open-btn.png
[shell_link]: https://console.cloud.google.com/cloudshell/open?git_repo=https://github.com/GoogleCloudPlatform/python-docs-samples&page=editor&open_in_editor=dataproc/README.md

Sample command-line programs for interacting with the Cloud Dataproc API.

See [the tutorial on using the Dataproc API with the Python client library](https://cloud.google.com/dataproc/docs/tutorials/python-library-example) for a walkthrough you can run to try out the Cloud Dataproc API sample code.

Note that while these samples demonstrate interacting with Dataproc via the API, the same functionality could also be accomplished using the Cloud Console or the gcloud CLI.

`list_clusters.py` is a simple command-line program to demonstrate connecting to the Cloud Dataproc API and listing the clusters in a region.

`submit_job_to_cluster.py` demonstrates how to create a cluster, submit the
`pyspark_sort.py` job, download the job output from Google Cloud Storage, and print the result.

`single_job_workflow.py` uses the Cloud Dataproc InstantiateInlineWorkflowTemplate API to create an ephemeral cluster, run a job, then delete the cluster with one API request.

`pyspark_sort.py_gcs` is the same as `pyspark_sort.py` but demonstrates
reading from a GCS bucket.
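
For reference, a job along these lines needs only a few lines of PySpark. The following is an illustrative sketch, not necessarily the exact contents of `pyspark_sort.py`; the word list mirrors the output asserted in `dataproc_e2e_donttest.py` below:

```python
#!/usr/bin/env python
# Minimal PySpark sort job (illustrative sketch).
import pyspark

sc = pyspark.SparkContext()
rdd = sc.parallelize(["Hello,", "world!", "dog", "elephant", "panther"])
# Collect the words back to the driver and print them sorted.
print(sorted(rdd.collect()))
```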

## Prerequisites to run locally

* [pip](https://pypi.python.org/pypi/pip)

Go to the [Google Cloud Console](https://console.cloud.google.com).

Under API Manager, search for the Google Cloud Dataproc API and enable it.

## Set Up Your Local Dev Environment

To install the dependencies, run the following command. If you want to use [virtualenv](https://virtualenv.readthedocs.org/en/latest/)
(recommended), run it within a virtualenv.

* pip install -r requirements.txt

## Authentication

Please see the [Google Cloud authentication guide](https://cloud.google.com/docs/authentication/).
The recommended approach for running these samples is to use a service account with a JSON key.

## Environment Variables

Set the following environment variables:

GOOGLE_CLOUD_PROJECT=your-project-id
REGION=us-central1 # or your region
CLUSTER_NAME=waprin-spark7
ZONE=us-central1-b

## Running the samples

To run list_clusters.py:

python list_clusters.py $GOOGLE_CLOUD_PROJECT --region=$REGION
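
For reference, the core of a cluster-listing script like `list_clusters.py` looks roughly like the following sketch (assuming the `google-cloud-dataproc` client library; argument parsing omitted — see the file itself for details):

```python
from google.cloud import dataproc_v1 as dataproc


def list_clusters(project_id, region):
    # Create a client with the endpoint set to the desired cluster region.
    cluster_client = dataproc.ClusterControllerClient(
        client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
    )
    # List and print the clusters in the given project and region.
    for cluster in cluster_client.list_clusters(
        request={"project_id": project_id, "region": region}
    ):
        print(f"{cluster.cluster_name} - {cluster.status.state.name}")
```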

`submit_job_to_cluster.py` can create the Dataproc cluster or use an existing cluster. To create a cluster before running the code, you can use the [Cloud Console](https://console.cloud.google.com) or run:

gcloud dataproc clusters create your-cluster-name

To run submit_job_to_cluster.py, first create a GCS bucket (used by Cloud Dataproc to stage files) from the Cloud Console or with gsutil:

gsutil mb gs://<your-staging-bucket-name>

Next, set the following environment variables:

BUCKET=your-staging-bucket
CLUSTER=your-cluster-name

Then, if you want to use an existing cluster, run:

python submit_job_to_cluster.py --project_id=$GOOGLE_CLOUD_PROJECT --zone=us-central1-b --cluster_name=$CLUSTER --gcs_bucket=$BUCKET

Alternatively, to create a new cluster, which will be deleted at the end of the job, run:

python submit_job_to_cluster.py --project_id=$GOOGLE_CLOUD_PROJECT --zone=us-central1-b --cluster_name=$CLUSTER --gcs_bucket=$BUCKET --create_new_cluster

The script will set up a cluster, upload the PySpark file, submit the job, print the result, and then, if it created the cluster, delete the cluster.
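
The job-submission step at the heart of the script looks roughly like the following sketch (cluster creation, file upload, and output download omitted; the function name is illustrative):

```python
from google.cloud import dataproc_v1 as dataproc


def submit_pyspark_job(project_id, region, cluster_name, main_python_file_uri):
    # Create a job client with the endpoint set to the cluster's region.
    job_client = dataproc.JobControllerClient(
        client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
    )
    job = {
        "placement": {"cluster_name": cluster_name},
        "pyspark_job": {"main_python_file_uri": main_python_file_uri},
    }
    # Submit the job and block until it finishes.
    operation = job_client.submit_job_as_operation(
        request={"project_id": project_id, "region": region, "job": job}
    )
    response = operation.result()
    print(f"Job finished with state: {response.status.state.name}")
```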

Optionally, you can pass the `--pyspark_file` argument to run a script of your own instead of the default `pyspark_sort.py`, as shown in the example below.
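
For example (with a hypothetical script path):

    python submit_job_to_cluster.py --project_id=$GOOGLE_CLOUD_PROJECT --zone=us-central1-b --cluster_name=$CLUSTER --gcs_bucket=$BUCKET --pyspark_file=path/to/your_script.py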
73 changes: 73 additions & 0 deletions dataproc/snippets/create_cluster.py
@@ -0,0 +1,73 @@
#!/usr/bin/env python

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This sample walks a user through creating a Cloud Dataproc cluster using
# the Python client library.
#
# This script can be run on its own:
# python create_cluster.py ${PROJECT_ID} ${REGION} ${CLUSTER_NAME}


import sys

# [START dataproc_create_cluster]
from google.cloud import dataproc_v1 as dataproc


def create_cluster(project_id, region, cluster_name):
"""This sample walks a user through creating a Cloud Dataproc cluster
using the Python client library.
Args:
project_id (string): Project to use for creating resources.
region (string): Region where the resources should live.
cluster_name (string): Name to use for creating a cluster.
"""

# Create a client with the endpoint set to the desired cluster region.
cluster_client = dataproc.ClusterControllerClient(
client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)

# Create the cluster config.
cluster = {
"project_id": project_id,
"cluster_name": cluster_name,
"config": {
"master_config": {"num_instances": 1, "machine_type_uri": "n1-standard-2"},
"worker_config": {"num_instances": 2, "machine_type_uri": "n1-standard-2"},
},
}

# Create the cluster.
operation = cluster_client.create_cluster(
request={"project_id": project_id, "region": region, "cluster": cluster}
)
result = operation.result()

# Output a success message.
print(f"Cluster created successfully: {result.cluster_name}")
# [END dataproc_create_cluster]


if __name__ == "__main__":
if len(sys.argv) < 4:
sys.exit("python create_cluster.py project_id region cluster_name")

project_id = sys.argv[1]
region = sys.argv[2]
cluster_name = sys.argv[3]
create_cluster(project_id, region, cluster_name)
57 changes: 57 additions & 0 deletions dataproc/snippets/create_cluster_test.py
@@ -0,0 +1,57 @@
# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os
import uuid

from google.api_core.exceptions import NotFound
from google.cloud import dataproc_v1 as dataproc
import pytest

import create_cluster


PROJECT_ID = os.environ["GOOGLE_CLOUD_PROJECT"]
REGION = "us-central1"
CLUSTER_NAME = "py-cc-test-{}".format(str(uuid.uuid4()))


@pytest.fixture(autouse=True)
def teardown():
yield

cluster_client = dataproc.ClusterControllerClient(
client_options={"api_endpoint": f"{REGION}-dataproc.googleapis.com:443"}
)
# Client library function
try:
operation = cluster_client.delete_cluster(
request={
"project_id": PROJECT_ID,
"region": REGION,
"cluster_name": CLUSTER_NAME,
}
)
# Wait for cluster to delete
operation.result()
except NotFound:
print("Cluster already deleted")


def test_cluster_create(capsys):
# Wrapper function for client library function
create_cluster.create_cluster(PROJECT_ID, REGION, CLUSTER_NAME)

out, _ = capsys.readouterr()
assert CLUSTER_NAME in out
35 changes: 35 additions & 0 deletions dataproc/snippets/dataproc_e2e_donttest.py
@@ -0,0 +1,35 @@
#!/usr/bin/env python

# Copyright 2019 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

""" Integration tests for Dataproc samples.
Creates a Dataproc cluster, uploads a pyspark file to Google Cloud Storage,
submits a job to Dataproc that runs the pyspark file, then downloads
the output logs from Cloud Storage and verifies the expected output."""

import os

import submit_job_to_cluster

PROJECT = os.environ["GOOGLE_CLOUD_PROJECT"]
BUCKET = os.environ["CLOUD_STORAGE_BUCKET"]
CLUSTER_NAME = "testcluster3"
ZONE = "us-central1-b"


def test_e2e():
output = submit_job_to_cluster.main(PROJECT, ZONE, CLUSTER_NAME, BUCKET)
assert b"['Hello,', 'dog', 'elephant', 'panther', 'world!']" in output
97 changes: 97 additions & 0 deletions dataproc/snippets/instantiate_inline_workflow_template.py
@@ -0,0 +1,97 @@
# Copyright 2020 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# This sample walks a user through instantiating an inline
# workflow for Cloud Dataproc using the Python client library.
#
# This script can be run on its own:
# python instantiate_inline_workflow_template.py ${PROJECT_ID} ${REGION}


import sys

# [START dataproc_instantiate_inline_workflow_template]
from google.cloud import dataproc_v1 as dataproc


def instantiate_inline_workflow_template(project_id, region):
"""This sample walks a user through submitting a workflow
to Cloud Dataproc using the Python client library.
Args:
project_id (string): Project to use for running the workflow.
region (string): Region where the workflow resources should live.
"""

# Create a client with the endpoint set to the desired region.
workflow_template_client = dataproc.WorkflowTemplateServiceClient(
client_options={"api_endpoint": f"{region}-dataproc.googleapis.com:443"}
)

parent = "projects/{}/regions/{}".format(project_id, region)

template = {
"jobs": [
{
"hadoop_job": {
"main_jar_file_uri": "file:///usr/lib/hadoop-mapreduce/"
"hadoop-mapreduce-examples.jar",
"args": ["teragen", "1000", "hdfs:///gen/"],
},
"step_id": "teragen",
},
{
"hadoop_job": {
"main_jar_file_uri": "file:///usr/lib/hadoop-mapreduce/"
"hadoop-mapreduce-examples.jar",
"args": ["terasort", "hdfs:///gen/", "hdfs:///sort/"],
},
"step_id": "terasort",
"prerequisite_step_ids": ["teragen"],
},
],
"placement": {
"managed_cluster": {
"cluster_name": "my-managed-cluster",
"config": {
"gce_cluster_config": {
# Leave 'zone_uri' empty for 'Auto Zone Placement'
# 'zone_uri': ''
"zone_uri": "us-central1-a"
}
},
}
},
}

# Submit the request to instantiate the workflow from an inline template.
operation = workflow_template_client.instantiate_inline_workflow_template(
request={"parent": parent, "template": template}
)
operation.result()

# Output a success message.
print("Workflow ran successfully.")
# [END dataproc_instantiate_inline_workflow_template]


if __name__ == "__main__":
if len(sys.argv) < 3:
sys.exit(
"python instantiate_inline_workflow_template.py " + "project_id region"
)

project_id = sys.argv[1]
region = sys.argv[2]
instantiate_inline_workflow_template(project_id, region)