diff --git a/dataproc/dataproc_e2e_test.py b/dataproc/dataproc_e2e_test.py
index d7e9c522074e..0a45d080122f 100644
--- a/dataproc/dataproc_e2e_test.py
+++ b/dataproc/dataproc_e2e_test.py
@@ -18,17 +18,14 @@
 
 import os
 
-from gcp_devrel.testing.flaky import flaky
-
 import submit_job_to_cluster
 
 PROJECT = os.environ['GCLOUD_PROJECT']
 BUCKET = os.environ['CLOUD_STORAGE_BUCKET']
-CLUSTER_NAME = 'testcluster2'
+CLUSTER_NAME = 'testcluster3'
 ZONE = 'us-central1-b'
 
 
-@flaky
 def test_e2e():
     output = submit_job_to_cluster.main(
         PROJECT, ZONE, CLUSTER_NAME, BUCKET)
diff --git a/dataproc/submit_job_to_cluster.py b/dataproc/submit_job_to_cluster.py
index 3ffde240ce6e..1815078202f3 100644
--- a/dataproc/submit_job_to_cluster.py
+++ b/dataproc/submit_job_to_cluster.py
@@ -25,12 +25,12 @@
 def get_default_pyspark_file():
     """Gets the PySpark file from this directory"""
     current_dir = os.path.dirname(os.path.abspath(__file__))
-    f = open(os.path.join(current_dir, DEFAULT_FILENAME), 'r')
+    f = open(os.path.join(current_dir, DEFAULT_FILENAME), 'rb')
     return f, DEFAULT_FILENAME
 
 
 def get_pyspark_file(filename):
-    f = open(filename, 'r')
+    f = open(filename, 'rb')
     return f, os.path.basename(filename)
 
 
@@ -76,6 +76,14 @@ def create_cluster(dataproc, project, zone, region, cluster_name):
         'config': {
             'gceClusterConfig': {
                 'zoneUri': zone_uri
+            },
+            'masterConfig': {
+                'numInstances': 1,
+                'machineTypeUri': 'n1-standard-1'
+            },
+            'workerConfig': {
+                'numInstances': 2,
+                'machineTypeUri': 'n1-standard-1'
             }
         }
     }
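
Note (sketch, not part of the diff): the switch to 'rb' matters because the
PySpark file handle is handed to the Cloud Storage client, which streams raw
bytes; under Python 3 a text-mode handle fails the upload. The new
masterConfig/workerConfig blocks pin the cluster to one n1-standard-1 master
and two n1-standard-1 workers, keeping the e2e test small and predictable.
Below is a minimal sketch of how the expanded cluster_data payload is sent
through the discovery-based Dataproc v1 client that the sample already uses;
the project value and the 'global' region here are illustrative assumptions,
not taken from this diff.

    import googleapiclient.discovery

    project = 'my-project'  # example value, not from the diff
    zone = 'us-central1-b'
    zone_uri = ('https://www.googleapis.com/compute/v1/'
                'projects/{}/zones/{}'.format(project, zone))

    cluster_data = {
        'projectId': project,
        'clusterName': 'testcluster3',
        'config': {
            'gceClusterConfig': {'zoneUri': zone_uri},
            # The explicit one-master / two-worker shape added above.
            'masterConfig': {'numInstances': 1,
                             'machineTypeUri': 'n1-standard-1'},
            'workerConfig': {'numInstances': 2,
                             'machineTypeUri': 'n1-standard-1'},
        },
    }

    # Dataproc v1 REST: POST /v1/projects/{projectId}/regions/{region}/clusters
    dataproc = googleapiclient.discovery.build('dataproc', 'v1')
    result = dataproc.projects().regions().clusters().create(
        projectId=project,
        region='global',  # assumption; the sample passes its own region
        body=cluster_data).execute()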