diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
index 06e71daf2..f908f7d5b 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
@@ -165,6 +165,9 @@ def get_matching_executor_instance(self, cores_per_executor):
     def generate_cluster_configuration(self, render_args: dict):
         executor_names = ','.join([f'"test-node-e{i}"' for i in range(render_args['NUM_EXECUTOR_NODES'])])
         render_args['EXECUTOR_NAMES'] = f'[{executor_names}]'
+        image_version = self.configs.get_value('clusterInference', 'defaultImage')
+        render_args['IMAGE'] = f'"{image_version}"'
+        render_args['ZONE'] = f'"{self.cli.get_zone()}"'
         return super().generate_cluster_configuration(render_args)
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
index c3cacd489..2c82f82bc 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/emr.py
@@ -117,6 +117,12 @@ def create_local_submission_job(self, job_prop, ctxt) -> Any:
     def _get_prediction_model_name(self) -> str:
         return CspEnv.pretty_print(CspEnv.get_default())
 
+    def generate_cluster_configuration(self, render_args: dict):
+        image_version = self.configs.get_value('clusterInference', 'defaultImage')
+        render_args['IMAGE'] = f'"{image_version}"'
+        render_args['ZONE'] = f'"{self.cli.get_zone()}"'
+        return super().generate_cluster_configuration(render_args)
+
 
 @dataclass
 class EMRCMDDriver(CMDDriverBase):
@@ -199,6 +205,17 @@ def _build_platform_describe_node_instance(self, node: ClusterNode) -> list:
                        '--instance-types', f'{node.instance_type}']
         return cmd_params
 
+    def get_zone(self) -> str:
+        describe_cmd = ['aws ec2 describe-availability-zones',
+                        '--region', f'{self.get_region()}']
+        selected_zone = ''
+        try:
+            zones_list = json.loads(self.run_sys_cmd(describe_cmd))
+            selected_zone = zones_list['AvailabilityZones'][0]['ZoneName']
+        except Exception:  # pylint: disable=broad-except
+            self.logger.warning('Unable to extract zone from region %s', self.get_region())
+        return selected_zone
+
     def _build_platform_list_cluster(self, cluster, query_args: dict = None) -> list:
diff --git a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
index 0e0e2ecfa..d88e0c356 100644
--- a/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
+++ b/user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
@@ -299,6 +299,9 @@ def get_env_var(self, key: str):
     def get_region(self) -> str:
         return self.env_vars.get('region')
 
+    def get_zone(self) -> str:
+        return self.env_vars.get('zone')
+
     def get_cmd_run_configs(self) -> dict:
         return self.env_vars.get('cmdRunnerProperties')
diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
index b569fb220..e5c299bff 100644
--- a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
+++ b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
@@ -267,7 +267,8 @@
       "executor": {
         "n1-standard": {"vCPUs": [1, 2, 4, 8, 16, 32, 64, 96]}
       }
-    }
+    },
+    "defaultImage": "2.1.41-debian11"
   },
   "clusterSpecs": {
     "minWorkerNodes": 2,
diff --git a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json
index e703c4723..72e3387ee 100644
--- a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json
+++ b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json
@@ -391,7 +391,8 @@
         {"name": "m5d.12xlarge", "vCPUs": 48},
         {"name": "m5d.16xlarge", "vCPUs": 64}
       ]
-    }
+    },
+    "defaultImage": "emr-6.10.0"
   },
   "clusterSpecs": {
     "minWorkerNodes": 2
diff --git a/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/dataproc.ms b/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/dataproc.ms
index 77093242a..d4dc50215 100644
--- a/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/dataproc.ms
+++ b/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/dataproc.ms
@@ -3,7 +3,7 @@
   "clusterUuid": "1234-5678-1234567",
   "config": {
     "gceClusterConfig": {
-      "zoneUri": "us-central1-a"
+      "zoneUri": {{{ ZONE }}}
     },
     "masterConfig": {
       "instanceNames": [
@@ -16,6 +16,9 @@
       "instanceNames": {{{ EXECUTOR_NAMES }}},
       "machineTypeUri": {{{ EXECUTOR_INSTANCE }}},
       "numInstances": {{ NUM_EXECUTOR_NODES }}
+    },
+    "softwareConfig": {
+      "imageVersion": {{{ IMAGE }}}
     }
   },
   "status": {
diff --git a/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/emr.ms b/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/emr.ms
index 817691ddd..7a598c098 100644
--- a/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/emr.ms
+++ b/user_tools/src/spark_rapids_pytools/resources/templates/cluster_template/emr.ms
@@ -6,7 +6,7 @@
       "State": "TERMINATED"
     },
     "Ec2InstanceAttributes": {
-      "Ec2AvailabilityZone": "us-west-2a"
+      "Ec2AvailabilityZone": {{{ ZONE }}}
     },
     "InstanceGroups": [
       {
@@ -25,6 +25,7 @@
         "InstanceType": {{{ DRIVER_INSTANCE }}},
         "RequestedInstanceCount": {{ NUM_DRIVER_NODES }}
       }
-    ]
+    ],
+    "ReleaseLabel": {{{ IMAGE }}}
   }
 }
diff --git a/user_tools/src/spark_rapids_pytools/resources/templates/dataproc-create_gpu_cluster_script.ms b/user_tools/src/spark_rapids_pytools/resources/templates/dataproc-create_gpu_cluster_script.ms
index 114195dfb..114764fb6 100644
--- a/user_tools/src/spark_rapids_pytools/resources/templates/dataproc-create_gpu_cluster_script.ms
+++ b/user_tools/src/spark_rapids_pytools/resources/templates/dataproc-create_gpu_cluster_script.ms
@@ -5,7 +5,7 @@ export CLUSTER_NAME="{{ CLUSTER_NAME }}"
 gcloud dataproc clusters create $CLUSTER_NAME \
     --image-version={{ IMAGE }} \
     --region {{ REGION }} \
-    --zone {{ ZONE }}-a \
+    --zone {{ ZONE }} \
     --master-machine-type {{ MASTER_MACHINE }} \
     --num-workers {{ WORKERS_COUNT }} \
     --worker-machine-type {{ WORKERS_MACHINE }} \
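A minimal standalone sketch of the zone-selection logic added in EMRCMDDriver.get_zone(): the real method shells out to the AWS CLI through self.run_sys_cmd(), so the sample payload and the helper name below are hypothetical and only mirror the JSON parsing shown in the diff.

import json

# Hypothetical (abridged) response of `aws ec2 describe-availability-zones --region us-west-2`;
# only the fields read by the new get_zone() are included.
sample_output = json.dumps({
    'AvailabilityZones': [
        {'ZoneName': 'us-west-2a', 'State': 'available'},
        {'ZoneName': 'us-west-2b', 'State': 'available'},
    ]
})


def pick_first_zone(raw_json: str) -> str:
    # Mirrors EMRCMDDriver.get_zone(): take the first listed zone, fall back to '' on any error.
    selected_zone = ''
    try:
        zones_list = json.loads(raw_json)
        selected_zone = zones_list['AvailabilityZones'][0]['ZoneName']
    except Exception:  # pylint: disable=broad-except
        pass
    return selected_zone


print(pick_first_zone(sample_output))   # -> us-west-2a
print(pick_first_zone('not json'))      # -> ''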
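The IMAGE and ZONE values placed into render_args are pre-quoted (f'"{image_version}"', f'"{self.cli.get_zone()}"') because the cluster templates substitute them through unescaped triple-mustache placeholders directly into JSON. A small sketch, assuming the chevron package as a stand-in mustache renderer (the tool's own template engine may differ), shows why the quoting is needed; the fragment is modeled on the updated emr.ms template.

import json

import chevron  # assumed stand-in mustache renderer: pip install chevron

# Fragment modeled on templates/cluster_template/emr.ms after this change.
emr_fragment = '{"Ec2AvailabilityZone": {{{ ZONE }}}, "ReleaseLabel": {{{ IMAGE }}} }'

render_args = {
    'ZONE': '"us-west-2a"',    # pre-quoted, as generate_cluster_configuration does
    'IMAGE': '"emr-6.10.0"',   # defaultImage value from emr-configs.json
}

rendered = chevron.render(emr_fragment, render_args)
print(json.loads(rendered))
# {'Ec2AvailabilityZone': 'us-west-2a', 'ReleaseLabel': 'emr-6.10.0'}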