Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix invalid values in cluster creation script #935

Merged
merged 2 commits into from
Apr 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions user_tools/src/spark_rapids_pytools/cloud_api/dataproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,6 +165,9 @@ def get_matching_executor_instance(self, cores_per_executor):
def generate_cluster_configuration(self, render_args: dict):
executor_names = ','.join([f'"test-node-e{i}"' for i in range(render_args['NUM_EXECUTOR_NODES'])])
render_args['EXECUTOR_NAMES'] = f'[{executor_names}]'
image_version = self.configs.get_value('clusterInference', 'defaultImage')
render_args['IMAGE'] = f'"{image_version}"'
render_args['ZONE'] = f'"{self.cli.get_zone()}"'
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The zone should be picked up at the beginning of the execution.
I remember it was part of the getting the platform information during initialization.

My worry in this part that we are going to rely on the SDK CLI being installed which get us more restricted.

Copy link
Collaborator Author

@parthosa parthosa Apr 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, we set env_vars dict during platform setup from the SDK config file. However, zone is not available in AWS' config file. Only region is available.

Now that makes me think, if we need zone in the cluster creation script for EMR? Why dont we use region instead.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I guess it depends on what are the accepted keys under Ec2InstanceAttributes
If region is not one of the attributes, then we cannot use region there.
I don't remember exactly why did I have the availabilityZone part of the script. MAybe I tried to create a cluster and it required me to set the availability zone..I cannot really remember. It has been long time since I wrote that.

return super().generate_cluster_configuration(render_args)


Expand Down
17 changes: 17 additions & 0 deletions user_tools/src/spark_rapids_pytools/cloud_api/emr.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,6 +117,12 @@ def create_local_submission_job(self, job_prop, ctxt) -> Any:
def _get_prediction_model_name(self) -> str:
return CspEnv.pretty_print(CspEnv.get_default())

def generate_cluster_configuration(self, render_args: dict):
image_version = self.configs.get_value('clusterInference', 'defaultImage')
render_args['IMAGE'] = f'"{image_version}"'
render_args['ZONE'] = f'"{self.cli.get_zone()}"'
return super().generate_cluster_configuration(render_args)


@dataclass
class EMRCMDDriver(CMDDriverBase):
Expand Down Expand Up @@ -199,6 +205,17 @@ def _build_platform_describe_node_instance(self, node: ClusterNode) -> list:
'--instance-types', f'{node.instance_type}']
return cmd_params

def get_zone(self) -> str:
describe_cmd = ['aws ec2 describe-availability-zones',
'--region', f'{self.get_region()}']
selected_zone = ''
try:
zones_list = json.loads(self.run_sys_cmd(describe_cmd))
selected_zone = zones_list['AvailabilityZones'][0]['ZoneName']
except Exception: # pylint: disable=broad-except
self.logger.warning('Unable to extract zone from region %s', self.get_region())
return selected_zone

def _build_platform_list_cluster(self,
cluster,
query_args: dict = None) -> list:
Expand Down
3 changes: 3 additions & 0 deletions user_tools/src/spark_rapids_pytools/cloud_api/sp_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,6 +299,9 @@ def get_env_var(self, key: str):
def get_region(self) -> str:
return self.env_vars.get('region')

def get_zone(self) -> str:
return self.env_vars.get('zone')

def get_cmd_run_configs(self) -> dict:
return self.env_vars.get('cmdRunnerProperties')

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -267,7 +267,8 @@
"executor": {
"n1-standard": {"vCPUs": [1, 2, 4, 8, 16, 32, 64, 96]}
}
}
},
"defaultImage": "2.1.41-debian11"
},
"clusterSpecs": {
"minWorkerNodes": 2,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -391,7 +391,8 @@
{"name": "m5d.12xlarge", "vCPUs": 48},
{"name": "m5d.16xlarge", "vCPUs": 64}
]
}
},
"defaultImage": "emr-6.10.0"
},
"clusterSpecs": {
"minWorkerNodes": 2
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
"clusterUuid": "1234-5678-1234567",
"config": {
"gceClusterConfig": {
"zoneUri": "us-central1-a"
"zoneUri": {{{ ZONE }}}
},
"masterConfig": {
"instanceNames": [
Expand All @@ -16,6 +16,9 @@
"instanceNames": {{{ EXECUTOR_NAMES }}},
"machineTypeUri": {{{ EXECUTOR_INSTANCE }}},
"numInstances": {{ NUM_EXECUTOR_NODES }}
},
"softwareConfig": {
"imageVersion": {{{ IMAGE }}}
}
},
"status": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
"State": "TERMINATED"
},
"Ec2InstanceAttributes": {
"Ec2AvailabilityZone": "us-west-2a"
"Ec2AvailabilityZone": {{{ ZONE }}}
},
"InstanceGroups": [
{
Expand All @@ -25,6 +25,7 @@
"InstanceType": {{{ DRIVER_INSTANCE }}},
"RequestedInstanceCount": {{ NUM_DRIVER_NODES }}
}
]
],
"ReleaseLabel": {{{ IMAGE }}}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ export CLUSTER_NAME="{{ CLUSTER_NAME }}"
gcloud dataproc clusters create $CLUSTER_NAME \
--image-version={{ IMAGE }} \
--region {{ REGION }} \
--zone {{ ZONE }}-a \
--zone {{ ZONE }} \
--master-machine-type {{ MASTER_MACHINE }} \
--num-workers {{ WORKERS_COUNT }} \
--worker-machine-type {{ WORKERS_MACHINE }} \
Expand Down