From 30eb99772a52e057a5aeed03ebddd0c6e7c83465 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein Date: Tue, 3 Sep 2024 13:28:04 -0500 Subject: [PATCH 1/5] Allow spark dependency to be configured dynamically Signed-off-by: Ahmed Hussein Fixes #1316 Allow user-tools to pick the SPARK dependencies based on a runtime env_var. The value follows the same format as `buildver` in the Scala pom file. Currently 333 and 350 (default) are supported. If the user specifies an invalid value, a warning message is emitted and the process then fails while running the java cmd. **Changes** - Add a dependency key to the platform config-file - A platform can define its own default dependency versions using the `activeBuildVer` key - Add a default `RUNTIME_BUILDVER` in the `__init__.py` to allow upgrading the Spark release during official releases - Read an env_var `RAPIDS_USER_TOOLS_RUNTIME_BUILDVER` to pick the correct dependency. - Currently, only `333` and `350` are supported. The default is `350` --- .../src/spark_rapids_pytools/__init__.py | 5 +- user_tools/src/spark_rapids_pytools/build.py | 18 ++- .../rapids/rapids_tool.py | 22 +++- .../resources/databricks_aws-configs.json | 107 +++++++++--------- .../resources/databricks_azure-configs.json | 76 +++++++------ .../resources/dataproc-configs.json | 75 ++++++------ .../resources/dataproc_gke-configs.json | 75 ++++++------ .../resources/dev/prepackage_mgr.py | 6 +- .../resources/emr-configs.json | 107 +++++++++--------- .../resources/onprem-configs.json | 46 ++++---- 10 files changed, 293 insertions(+), 244 deletions(-) diff --git a/user_tools/src/spark_rapids_pytools/__init__.py b/user_tools/src/spark_rapids_pytools/__init__.py index d7ec40e64..62844909b 100644 --- a/user_tools/src/spark_rapids_pytools/__init__.py +++ b/user_tools/src/spark_rapids_pytools/__init__.py @@ -14,7 +14,10 @@ """init file of the spark_rapids_pytools package.""" -from spark_rapids_pytools.build import get_version +from spark_rapids_pytools.build import get_version, get_runtime_buildver VERSION = '24.08.1' +# defines the default runtime build version for the user tools environment +RUNTIME_BUILDVER = '350' __version__ = get_version(VERSION) +__runtime_buildver__ = get_runtime_buildver() diff --git a/user_tools/src/spark_rapids_pytools/build.py b/user_tools/src/spark_rapids_pytools/build.py index 5306a32f0..f8c70b3b3 100644 --- a/user_tools/src/spark_rapids_pytools/build.py +++ b/user_tools/src/spark_rapids_pytools/build.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -18,7 +18,7 @@ import os -def get_version(main=None): +def get_version(main: str = None) -> str: if main is None: # pylint: disable=import-outside-toplevel from spark_rapids_pytools import VERSION as main @@ -27,3 +27,17 @@ def get_version(main=None): if nightly == '1': suffix = '.dev' + datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S') return main + suffix + + +def get_runtime_buildver(buildver_arg: str = None) -> str: + """ + Get the runtime SPARK build_version for the user tools environment. + :param buildver_arg: optional argument to specify the build version + :return: returns the input argument if it is set. + Otherwise, it returns ${RAPIDS_USER_TOOLS_RUNTIME_BUILDVER:-RUNTIME_BUILDVER}.
+ """ + if buildver_arg is None: + # pylint: disable=import-outside-toplevel + from spark_rapids_pytools import RUNTIME_BUILDVER + return os.environ.get('RAPIDS_USER_TOOLS_RUNTIME_BUILDVER', RUNTIME_BUILDVER) + return buildver_arg diff --git a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py index 3270bed0f..372cd6670 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py +++ b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py @@ -25,14 +25,15 @@ from concurrent.futures import ThreadPoolExecutor from dataclasses import dataclass, field from logging import Logger -from typing import Any, Callable, Dict, List +from typing import Any, Callable, Dict, List, Optional import yaml import spark_rapids_pytools +from spark_rapids_pytools import get_runtime_buildver from spark_rapids_pytools.cloud_api.sp_types import get_platform, \ ClusterBase, DeployMode, NodeHWInfo -from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer +from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer, AbstractPropertiesContainer from spark_rapids_pytools.common.sys_storage import FSUtil, FileVerifier from spark_rapids_pytools.common.utilities import ToolLogging, Utils, ToolsSpinner from spark_rapids_pytools.rapids.rapids_job import RapidsJobPropContainer @@ -389,6 +390,17 @@ def _calculate_spark_settings(self, worker_info: NodeHWInfo) -> dict: } return res + @classmethod + def get_rapids_tools_dependencies(cls, deploy_mode: str, json_props: AbstractPropertiesContainer) -> Optional[list]: + """ + Get the tools dependencies from the platform configuration. + """ + # allow defining default buildver per platform + buildver_from_conf = json_props.get_value_silent('dependencies', 'deployMode', deploy_mode, 'activeBuildVer') + active_buildver = get_runtime_buildver(buildver_from_conf) + depend_arr = json_props.get_value_silent('dependencies', 'deployMode', deploy_mode, active_buildver) + return depend_arr + @dataclass class RapidsJarTool(RapidsTool): @@ -581,9 +593,7 @@ def cache_all_dependencies(dep_arr: List[dict]): # TODO: Verify the downloaded file by checking their MD5 deploy_mode = DeployMode.tostring(self.ctxt.get_deploy_mode()) - depend_arr = self.ctxt.platform.configs.get_value_silent('dependencies', - 'deployMode', - deploy_mode) + depend_arr = self.get_rapids_tools_dependencies(deploy_mode, self.ctxt.platform.configs) if depend_arr: dep_list = cache_all_dependencies(depend_arr) if any(dep_item is None for dep_item in dep_list): @@ -592,6 +602,8 @@ def cache_all_dependencies(dep_arr: List[dict]): Utils.gen_joined_str(join_elem='; ', items=dep_list)) self.ctxt.add_rapids_args('javaDependencies', dep_list) + else: + self.logger.warning('Dependencies were not found for the current deployment mode') def _process_rapids_args(self): # add a dictionary to hold the rapids arguments diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json index be8640013..20afcaa93 100644 --- a/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/databricks_aws-configs.json @@ -1,58 +1,61 @@ { "dependencies": { "deployMode": { - "LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": 
"8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", - "size": 400395283 - }, - { - "name": "Hadoop AWS", - "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", - "type": "jar", - "md5": "59907e790ce713441955015d79f670bc", - "sha1": "a65839fbf1869f81a1632e09f415e586922e4f80", - "size": 962685 - }, - { - "name": "AWS Java SDK Bundled", - "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", - "type": "jar", - "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", - "sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea", - "size": 280645251 - } - ], - "SPARK333-LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", - "size": 299426263 - }, - { - "name": "Hadoop AWS", - "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", - "type": "jar", - "md5": "59907e790ce713441955015d79f670bc", - "sha1": "a65839fbf1869f81a1632e09f415e586922e4f80", - "size": 962685 - }, - { - "name": "AWS Java SDK Bundled", - "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", - "type": "jar", - "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", - "sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea", - "size": 280645251 - } - ] + "LOCAL": { + "//activeBuildVer": "Define this key in order to set the default buildVer for that platform", + "350": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", + "size": 400395283 + }, + { + "name": "Hadoop AWS", + "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", + "type": "jar", + "md5": "59907e790ce713441955015d79f670bc", + "sha1": "a65839fbf1869f81a1632e09f415e586922e4f80", + "size": 962685 + }, + { + "name": "AWS Java SDK Bundled", + "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", + "type": "jar", + "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", + "sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea", + "size": 280645251 + } + ], + "333": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 + }, + { + "name": "Hadoop AWS", + "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", + "type": "jar", + "md5": "59907e790ce713441955015d79f670bc", + "sha1": "a65839fbf1869f81a1632e09f415e586922e4f80", + "size": 962685 + }, + { + "name": "AWS Java SDK Bundled", + "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", + "type": "jar", + "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", + "sha1": 
"02deec3a0ad83d13d032b1812421b23d7a961eea", + "size": 280645251 + } + ] + } } }, "environment": { diff --git a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json index 882dfc1b3..07040a0be 100644 --- a/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/databricks_azure-configs.json @@ -1,42 +1,45 @@ { "dependencies": { "deployMode": { - "LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", - "size": 400395283 - }, - { - "name": "Hadoop Azure", - "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", - "type": "jar", - "md5": "1ec4cbd59548412010fe1515070eef73", - "sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e", - "size": 574116 - } - ], - "SPARK333-LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", - "size": 299426263 - }, - { - "name": "Hadoop Azure", - "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", - "type": "jar", - "md5": "1ec4cbd59548412010fe1515070eef73", - "sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e", - "size": 574116 - } - ] + "LOCAL": { + "//activeBuildVer": "Define this key in order to set the default buildVer for that platform", + "350": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", + "size": 400395283 + }, + { + "name": "Hadoop Azure", + "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", + "type": "jar", + "md5": "1ec4cbd59548412010fe1515070eef73", + "sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e", + "size": 574116 + } + ], + "333": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 + }, + { + "name": "Hadoop Azure", + "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar", + "type": "jar", + "md5": "1ec4cbd59548412010fe1515070eef73", + "sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e", + "size": 574116 + } + ] + } } }, "environment": { @@ -370,4 +373,3 @@ "minWorkerNodes": 2 } } - \ No newline at end of file diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json index 8f26c8c14..d25daa2db 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json +++ 
b/user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json @@ -1,42 +1,45 @@ { "dependencies": { "deployMode": { - "LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", - "size": 400395283 - }, - { - "name": "GCS Connector Hadoop3", - "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar", - "type": "jar", - "md5": "2ee6ad7215304cf5da8e731afb36ad72", - "sha1": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170", - "size": 39359477 - } - ], - "SPARK333-LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", - "size": 299426263 - }, - { - "name": "GCS Connector Hadoop3", - "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar", - "type": "jar", - "md5": "41aea3add826dfbf3384a2c638148709", - "sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66", - "size": 38413466 - } - ] + "LOCAL": { + "//activeBuildVer": "Define this key in order to set the default buildVer for that platform", + "350": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", + "size": 400395283 + }, + { + "name": "GCS Connector Hadoop3", + "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar", + "type": "jar", + "md5": "2ee6ad7215304cf5da8e731afb36ad72", + "sha1": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170", + "size": 39359477 + } + ], + "333": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 + }, + { + "name": "GCS Connector Hadoop3", + "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar", + "type": "jar", + "md5": "41aea3add826dfbf3384a2c638148709", + "sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66", + "size": 38413466 + } + ] + } } }, "environment": { diff --git a/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json b/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json index 895afbdb7..dec1d05db 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/dataproc_gke-configs.json @@ -1,42 +1,45 @@ { "dependencies": { "deployMode": { - "LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": 
"8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", - "size": 400395283 - }, - { - "name": "GCS Connector Hadoop3", - "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar", - "type": "jar", - "md5": "2ee6ad7215304cf5da8e731afb36ad72", - "sha1": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170", - "size": 39359477 - } - ], - "SPARK333-LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", - "size": 299426263 - }, - { - "name": "GCS Connector Hadoop3", - "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar", - "type": "jar", - "md5": "41aea3add826dfbf3384a2c638148709", - "sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66", - "size": 38413466 - } - ] + "LOCAL": { + "//activeBuildVer": "Define this key in order to set the default buildVer for that platform", + "350": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", + "size": 400395283 + }, + { + "name": "GCS Connector Hadoop3", + "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar", + "type": "jar", + "md5": "2ee6ad7215304cf5da8e731afb36ad72", + "sha1": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170", + "size": 39359477 + } + ], + "333": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 + }, + { + "name": "GCS Connector Hadoop3", + "uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar", + "type": "jar", + "md5": "41aea3add826dfbf3384a2c638148709", + "sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66", + "size": 38413466 + } + ] + } } }, "environment": { diff --git a/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py b/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py index 59028689c..4c9971689 100644 --- a/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py +++ b/user_tools/src/spark_rapids_pytools/resources/dev/prepackage_mgr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. +# Copyright (c) 2023-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
@@ -27,6 +27,7 @@ from spark_rapids_pytools.common.prop_manager import JSONPropertiesContainer from spark_rapids_pytools.common.sys_storage import FSUtil +from spark_rapids_pytools.rapids.rapids_tool import RapidsTool from spark_rapids_tools import CspEnv from spark_rapids_tools.utils import Utilities @@ -114,7 +115,8 @@ def _fetch_resources(self) -> dict: config_file = FSUtil.build_full_path(self.resource_dir, f'{platform}{self._configs_suffix}') # pylint: disable=no-member platform_conf = JSONPropertiesContainer(config_file) - for dependency in platform_conf.get_value('dependencies', 'deployMode', 'LOCAL'): + dependency_list = RapidsTool.get_rapids_tools_dependencies('LOCAL', platform_conf) + for dependency in dependency_list: uri = dependency.get('uri') name = FSUtil.get_resource_name(uri) if uri: diff --git a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json index f059533b0..e273c224c 100644 --- a/user_tools/src/spark_rapids_pytools/resources/emr-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/emr-configs.json @@ -1,58 +1,61 @@ { "dependencies": { "deployMode": { - "LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", - "size": 400395283 - }, - { - "name": "Hadoop AWS", - "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", - "type": "jar", - "md5": "59907e790ce713441955015d79f670bc", - "sha1": "a65839fbf1869f81a1632e09f415e586922e4f80", - "size": 962685 - }, - { - "name": "AWS Java SDK Bundled", - "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", - "type": "jar", - "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", - "sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea", - "size": 280645251 - } - ], - "SPARK333-LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", - "size": 299426263 - }, - { - "name": "Hadoop AWS", - "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", - "type": "jar", - "md5": "59907e790ce713441955015d79f670bc", - "sha1": "a65839fbf1869f81a1632e09f415e586922e4f80", - "size": 962685 - }, - { - "name": "AWS Java SDK Bundled", - "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", - "type": "jar", - "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", - "sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea", - "size": 280645251 - } - ] + "LOCAL": { + "//activeBuildVer": "Define this key in order to set the default buildVer for that platform", + "350": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", + "size": 400395283 + }, + { + "name": "Hadoop AWS", + "uri": 
"https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", + "type": "jar", + "md5": "59907e790ce713441955015d79f670bc", + "sha1": "a65839fbf1869f81a1632e09f415e586922e4f80", + "size": 962685 + }, + { + "name": "AWS Java SDK Bundled", + "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", + "type": "jar", + "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", + "sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea", + "size": 280645251 + } + ], + "333": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 + }, + { + "name": "Hadoop AWS", + "uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar", + "type": "jar", + "md5": "59907e790ce713441955015d79f670bc", + "sha1": "a65839fbf1869f81a1632e09f415e586922e4f80", + "size": 962685 + }, + { + "name": "AWS Java SDK Bundled", + "uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar", + "type": "jar", + "md5": "8a22f2d30b7e8eee9ea44f04fb13b35a", + "sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea", + "size": 280645251 + } + ] + } } }, "environment": { diff --git a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json index 756ddc3fb..bacc86e64 100644 --- a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json @@ -1,26 +1,30 @@ { "dependencies": { "deployMode": { - "LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", - "size": 400395283 - } - ], - "SPARK333-LOCAL": [ - { - "name": "Apache Spark", - "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", - "type": "archive", - "relativePath": "jars/*", - "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", - "size": 299426263 - } - ] + "LOCAL": { + "//activeBuildVer": "Define this key in order to set the default buildVer for that platform", + "activeBuildVer": "331", + "350": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319", + "size": 400395283 + } + ], + "333": [ + { + "name": "Apache Spark", + "uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz", + "type": "archive", + "relativePath": "jars/*", + "sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9", + "size": 299426263 + } + ] + } } }, "csp_pricing": { @@ -165,4 +169,4 @@ } } } -} \ No newline at end of file +} From 19ad4e93c18228d68e37974ade10d1618d53087a Mon Sep 17 
00:00:00 2001 From: Ahmed Hussein Date: Tue, 3 Sep 2024 16:32:13 -0500 Subject: [PATCH 2/5] remove value used to test Signed-off-by: Ahmed Hussein --- .../src/spark_rapids_pytools/resources/onprem-configs.json | 1 - 1 file changed, 1 deletion(-) diff --git a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json index bacc86e64..b731c1629 100644 --- a/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json +++ b/user_tools/src/spark_rapids_pytools/resources/onprem-configs.json @@ -3,7 +3,6 @@ "deployMode": { "LOCAL": { "//activeBuildVer": "Define this key in order to set the default buildVer for that platform", - "activeBuildVer": "331", "350": [ { "name": "Apache Spark", From a4c5dee8c5acf62dc176bb9638177543af426765 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein Date: Tue, 3 Sep 2024 17:03:41 -0500 Subject: [PATCH 3/5] Change behavior to give precedence to the env_var Signed-off-by: Ahmed Hussein --- user_tools/src/spark_rapids_pytools/build.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/user_tools/src/spark_rapids_pytools/build.py b/user_tools/src/spark_rapids_pytools/build.py index f8c70b3b3..194ba7cba 100644 --- a/user_tools/src/spark_rapids_pytools/build.py +++ b/user_tools/src/spark_rapids_pytools/build.py @@ -32,12 +32,16 @@ def get_version(main: str = None) -> str: def get_runtime_buildver(buildver_arg: str = None) -> str: """ Get the runtime SPARK build_version for the user tools environment. + Note that the env_var always has precedence over the input argument and the default values :param buildver_arg: optional argument to specify the build version - :return: returns the input argument if it is set. - Otherwise, it returns ${RAPIDS_USER_TOOLS_RUNTIME_BUILDVER:-RUNTIME_BUILDVER}.
+ :return: the first value set in the following order: + 1- env_var RAPIDS_USER_TOOLS_RUNTIME_BUILDVER + 2- the input buildver_arg + 3- default value RUNTIME_BUILDVER """ if buildver_arg is None: # pylint: disable=import-outside-toplevel from spark_rapids_pytools import RUNTIME_BUILDVER - return os.environ.get('RAPIDS_USER_TOOLS_RUNTIME_BUILDVER', RUNTIME_BUILDVER) - return buildver_arg + buildver_arg = RUNTIME_BUILDVER + # the env_var should have precedence because this is the way the user can override the default configs + return os.environ.get('RAPIDS_USER_TOOLS_RUNTIME_BUILDVER', buildver_arg) From d25b2213606efc4ea3fec14cedb027681bc720f3 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein Date: Wed, 4 Sep 2024 10:55:45 -0500 Subject: [PATCH 4/5] rename env_var and raise error for invalid dep version Signed-off-by: Ahmed Hussein --- user_tools/src/spark_rapids_pytools/build.py | 4 ++-- user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/user_tools/src/spark_rapids_pytools/build.py b/user_tools/src/spark_rapids_pytools/build.py index 194ba7cba..a0b546151 100644 --- a/user_tools/src/spark_rapids_pytools/build.py +++ b/user_tools/src/spark_rapids_pytools/build.py @@ -35,7 +35,7 @@ def get_runtime_buildver(buildver_arg: str = None) -> str: Note that the env_var always has precedence over the input argument and the default values :param buildver_arg: optional argument to specify the build version :return: the first value set in the following order: - 1- env_var RAPIDS_USER_TOOLS_RUNTIME_BUILDVER + 1- env_var RAPIDS_USER_TOOLS_SPARK_DEP_VERSION 2- the input buildver_arg 3- default value RUNTIME_BUILDVER """ @@ -44,4 +44,4 @@ def get_runtime_buildver(buildver_arg: str = None) -> str: from spark_rapids_pytools import RUNTIME_BUILDVER buildver_arg = RUNTIME_BUILDVER # the env_var should have precedence because this is the way the user can override the default configs - return os.environ.get('RAPIDS_USER_TOOLS_RUNTIME_BUILDVER', buildver_arg) + return os.environ.get('RAPIDS_USER_TOOLS_SPARK_DEP_VERSION', buildver_arg) diff --git a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py index 372cd6670..20f374d73 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py +++ b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py @@ -399,6 +399,8 @@ def get_rapids_tools_dependencies(cls, deploy_mode: str, json_props: AbstractPro buildver_from_conf = json_props.get_value_silent('dependencies', 'deployMode', deploy_mode, 'activeBuildVer') active_buildver = get_runtime_buildver(buildver_from_conf) depend_arr = json_props.get_value_silent('dependencies', 'deployMode', deploy_mode, active_buildver) + if depend_arr is None: + raise ValueError(f'Invalid SPARK dependency version [{active_buildver}]') return depend_arr @@ -602,8 +604,6 @@ def cache_all_dependencies(dep_arr: List[dict]): Utils.gen_joined_str(join_elem='; ', items=dep_list)) self.ctxt.add_rapids_args('javaDependencies', dep_list) - else: - self.logger.warning('Dependencies were not found for the current deployment mode') def _process_rapids_args(self): # add a dictionary to hold the rapids arguments From 79ee229e59f6e23f98c4aeb8a4a5f02d0b33eb31 Mon Sep 17 00:00:00 2001 From: Ahmed Hussein Date: Fri, 6 Sep 2024 09:29:48 -0500 Subject: [PATCH 5/5] Rename buildver to spark_dep Signed-off-by: Ahmed Hussein --- user_tools/src/spark_rapids_pytools/__init__.py | 6 +++---
user_tools/src/spark_rapids_pytools/build.py | 14 +++++++------- .../src/spark_rapids_pytools/rapids/rapids_tool.py | 4 ++-- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/user_tools/src/spark_rapids_pytools/__init__.py b/user_tools/src/spark_rapids_pytools/__init__.py index c26ad82e9..a5b3c2314 100644 --- a/user_tools/src/spark_rapids_pytools/__init__.py +++ b/user_tools/src/spark_rapids_pytools/__init__.py @@ -14,10 +14,10 @@ """init file of the spark_rapids_pytools package.""" -from spark_rapids_pytools.build import get_version, get_runtime_buildver +from spark_rapids_pytools.build import get_version, get_spark_dep_version VERSION = '24.08.2' # defines the default runtime build version for the user tools environment -RUNTIME_BUILDVER = '350' +SPARK_DEP_VERSION = '350' __version__ = get_version(VERSION) -__runtime_buildver__ = get_runtime_buildver() +__spark_dep_version__ = get_spark_dep_version() diff --git a/user_tools/src/spark_rapids_pytools/build.py b/user_tools/src/spark_rapids_pytools/build.py index a0b546151..9f26b30fc 100644 --- a/user_tools/src/spark_rapids_pytools/build.py +++ b/user_tools/src/spark_rapids_pytools/build.py @@ -29,19 +29,19 @@ def get_version(main: str = None) -> str: return main + suffix -def get_runtime_buildver(buildver_arg: str = None) -> str: +def get_spark_dep_version(spark_dep_arg: str = None) -> str: """ Get the runtime SPARK build_version for the user tools environment. Note that the env_var always has precedence over the input argument and the default values - :param buildver_arg: optional argument to specify the build version + :param spark_dep_arg: optional argument to specify the build version :return: the first value set in the following order: 1- env_var RAPIDS_USER_TOOLS_SPARK_DEP_VERSION 2- the input buildver_arg - 3- default value RUNTIME_BUILDVER + 3- default value SPARK_DEP_VERSION """ - if buildver_arg is None: + if spark_dep_arg is None: # pylint: disable=import-outside-toplevel - from spark_rapids_pytools import RUNTIME_BUILDVER - buildver_arg = RUNTIME_BUILDVER + from spark_rapids_pytools import SPARK_DEP_VERSION + spark_dep_arg = SPARK_DEP_VERSION # the env_var should have precedence because this is the way the user can override the default configs - return os.environ.get('RAPIDS_USER_TOOLS_SPARK_DEP_VERSION', buildver_arg) + return os.environ.get('RAPIDS_USER_TOOLS_SPARK_DEP_VERSION', spark_dep_arg) diff --git a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py index 20f374d73..e2d4c226e 100644 --- a/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py +++ b/user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py @@ -30,7 +30,7 @@ import yaml import spark_rapids_pytools -from spark_rapids_pytools import get_runtime_buildver +from spark_rapids_pytools import get_spark_dep_version from spark_rapids_pytools.cloud_api.sp_types import get_platform, \ ClusterBase, DeployMode, NodeHWInfo from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer, AbstractPropertiesContainer @@ -397,7 +397,7 @@ def get_rapids_tools_dependencies(cls, deploy_mode: str, json_props: AbstractPro """ # allow defining default buildver per platform buildver_from_conf = json_props.get_value_silent('dependencies', 'deployMode', deploy_mode, 'activeBuildVer') - active_buildver = get_runtime_buildver(buildver_from_conf) + active_buildver = get_spark_dep_version(buildver_from_conf) depend_arr = json_props.get_value_silent('dependencies', 'deployMode', deploy_mode,
active_buildver) if depend_arr is None: raise ValueError(f'Invalid SPARK dependency version [{active_buildver}]')
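
Reviewer note: the resolution order this series converges on (env_var first, then the platform config's `activeBuildVer`, then the packaged default) is easiest to sanity-check with a small standalone script. The sketch below is illustrative only: `PLATFORM_CONF` is a trimmed stand-in for the platform `*-configs.json` files above, the file name `spark_dep_demo.py` is hypothetical, and `get_rapids_tools_dependencies` is simplified to plain dict lookups rather than the repo's `json_props.get_value_silent` classmethod; the logic, however, mirrors the final state of `build.py` and `rapids_tool.py` after patch 5.

```python
# spark_dep_demo.py -- minimal sketch of the Spark-dependency resolution order
# introduced by this series; an illustrative stand-in, not part of the repo.
import os

SPARK_DEP_VERSION = '350'  # packaged default, mirrors __init__.py after patch 5

# Trimmed stand-in for the per-platform *-configs.json files
PLATFORM_CONF = {
    'dependencies': {
        'deployMode': {
            'LOCAL': {
                # 'activeBuildVer': '333',  # a platform may pin its own default
                '350': [{'name': 'Apache Spark', 'uri': 'spark-3.5.0-bin-hadoop3.tgz'}],
                '333': [{'name': 'Apache Spark', 'uri': 'spark-3.3.3-bin-hadoop3.tgz'}],
            }
        }
    }
}


def get_spark_dep_version(spark_dep_arg: str = None) -> str:
    # env_var wins; then the caller-supplied value; then the packaged default
    if spark_dep_arg is None:
        spark_dep_arg = SPARK_DEP_VERSION
    return os.environ.get('RAPIDS_USER_TOOLS_SPARK_DEP_VERSION', spark_dep_arg)


def get_rapids_tools_dependencies(deploy_mode: str, conf: dict) -> list:
    mode_conf = conf['dependencies']['deployMode'][deploy_mode]
    # a platform-level 'activeBuildVer' overrides the packaged default,
    # but the env_var still overrides both
    active_buildver = get_spark_dep_version(mode_conf.get('activeBuildVer'))
    depend_arr = mode_conf.get(active_buildver)
    if depend_arr is None:
        # same failure mode as the patched rapids_tool.py
        raise ValueError(f'Invalid SPARK dependency version [{active_buildver}]')
    return depend_arr


if __name__ == '__main__':
    # $ python spark_dep_demo.py                                          -> 3.5.0 deps
    # $ RAPIDS_USER_TOOLS_SPARK_DEP_VERSION=333 python spark_dep_demo.py  -> 3.3.3 deps
    # $ RAPIDS_USER_TOOLS_SPARK_DEP_VERSION=340 python spark_dep_demo.py  -> ValueError
    for dep in get_rapids_tools_dependencies('LOCAL', PLATFORM_CONF):
        print(dep['name'], '->', dep['uri'])
```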