Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow spark dependency to be configured dynamically #1326

Merged
merged 6 commits into from
Sep 6, 2024
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion user_tools/src/spark_rapids_pytools/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,10 @@

"""init file of the spark_rapids_pytools package."""

from spark_rapids_pytools.build import get_version
from spark_rapids_pytools.build import get_version, get_runtime_buildver

VERSION = '24.08.2'
# defines the default runtime build version for the user tools environment
RUNTIME_BUILDVER = '350'
tgravescs marked this conversation as resolved.
Show resolved Hide resolved
__version__ = get_version(VERSION)
__runtime_buildver__ = get_runtime_buildver()
22 changes: 20 additions & 2 deletions user_tools/src/spark_rapids_pytools/build.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2022-2023, NVIDIA CORPORATION.
# Copyright (c) 2022-2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand All @@ -18,7 +18,7 @@
import os


def get_version(main=None):
def get_version(main: str = None) -> str:
if main is None:
# pylint: disable=import-outside-toplevel
from spark_rapids_pytools import VERSION as main
Expand All @@ -27,3 +27,21 @@ def get_version(main=None):
if nightly == '1':
suffix = '.dev' + datetime.datetime.utcnow().strftime('%Y%m%d%H%M%S')
return main + suffix


def get_runtime_buildver(buildver_arg: str = None) -> str:
    """
    Get the runtime SPARK build_version for the user tools environment.
    Note that the env_var always has precedence over the input argument and the default value.
    An env_var that is defined but empty is treated as if it were not set, so that it cannot
    silently replace a valid build version with an empty string.
    :param buildver_arg: optional argument to specify the build version
    :return: the first non-empty value found in the following order:
         1- env_var RAPIDS_USER_TOOLS_SPARK_DEP_VERSION
         2- the input buildver_arg
         3- default value RUNTIME_BUILDVER
    """
    # the env_var should have precedence because this is the way user can override the default configs
    env_buildver = os.environ.get('RAPIDS_USER_TOOLS_SPARK_DEP_VERSION')
    if env_buildver:
        return env_buildver
    if buildver_arg is None:
        # The import is deferred to call time because the package __init__ itself imports this module.
        # pylint: disable=import-outside-toplevel
        from spark_rapids_pytools import RUNTIME_BUILDVER
        buildver_arg = RUNTIME_BUILDVER
    return buildver_arg
22 changes: 17 additions & 5 deletions user_tools/src/spark_rapids_pytools/rapids/rapids_tool.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,14 +25,15 @@
from concurrent.futures import ThreadPoolExecutor
from dataclasses import dataclass, field
from logging import Logger
from typing import Any, Callable, Dict, List
from typing import Any, Callable, Dict, List, Optional

import yaml

import spark_rapids_pytools
from spark_rapids_pytools import get_runtime_buildver
from spark_rapids_pytools.cloud_api.sp_types import get_platform, \
ClusterBase, DeployMode, NodeHWInfo
from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer
from spark_rapids_pytools.common.prop_manager import YAMLPropertiesContainer, AbstractPropertiesContainer
from spark_rapids_pytools.common.sys_storage import FSUtil, FileVerifier
from spark_rapids_pytools.common.utilities import ToolLogging, Utils, ToolsSpinner
from spark_rapids_pytools.rapids.rapids_job import RapidsJobPropContainer
Expand Down Expand Up @@ -389,6 +390,19 @@ def _calculate_spark_settings(self, worker_info: NodeHWInfo) -> dict:
}
return res

@classmethod
def get_rapids_tools_dependencies(cls, deploy_mode: str, json_props: AbstractPropertiesContainer) -> Optional[list]:
    """
    Resolve the tools dependencies of a deploy mode from the platform configuration.
    The build version used for the lookup is picked by get_runtime_buildver, which is given the
    optional per-platform 'activeBuildVer' entry read from the configuration.
    :param deploy_mode: name of the deploy mode used as a key under dependencies/deployMode
    :param json_props: the platform configuration container to read from
    :return: the dependency entries stored under the selected build version
    :raises ValueError: when the lookup for the selected build version yields None
    """
    # every lookup in this method shares the same key prefix
    deploy_mode_path = ('dependencies', 'deployMode', deploy_mode)
    # a platform may define its own default buildver through the 'activeBuildVer' key
    active_buildver = get_runtime_buildver(json_props.get_value_silent(*deploy_mode_path, 'activeBuildVer'))
    depend_arr = json_props.get_value_silent(*deploy_mode_path, active_buildver)
    if depend_arr is not None:
        return depend_arr
    raise ValueError(f'Invalid SPARK dependency version [{active_buildver}]')


@dataclass
class RapidsJarTool(RapidsTool):
Expand Down Expand Up @@ -581,9 +595,7 @@ def cache_all_dependencies(dep_arr: List[dict]):

# TODO: Verify the downloaded file by checking their MD5
deploy_mode = DeployMode.tostring(self.ctxt.get_deploy_mode())
depend_arr = self.ctxt.platform.configs.get_value_silent('dependencies',
'deployMode',
deploy_mode)
depend_arr = self.get_rapids_tools_dependencies(deploy_mode, self.ctxt.platform.configs)
if depend_arr:
dep_list = cache_all_dependencies(depend_arr)
if any(dep_item is None for dep_item in dep_list):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,58 +1,61 @@
{
"dependencies": {
"deployMode": {
"LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
}
],
"SPARK333-LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
}
]
"LOCAL": {
"//activeBuildVer": "Define this key in order to set the default buildVer for that platform",
tgravescs marked this conversation as resolved.
Show resolved Hide resolved
"350": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
}
],
"333": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "Hadoop AWS",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-aws/3.3.4/hadoop-aws-3.3.4.jar",
"type": "jar",
"md5": "59907e790ce713441955015d79f670bc",
"sha1": "a65839fbf1869f81a1632e09f415e586922e4f80",
"size": 962685
},
{
"name": "AWS Java SDK Bundled",
"uri": "https://repo1.maven.org/maven2/com/amazonaws/aws-java-sdk-bundle/1.12.262/aws-java-sdk-bundle-1.12.262.jar",
"type": "jar",
"md5": "8a22f2d30b7e8eee9ea44f04fb13b35a",
"sha1": "02deec3a0ad83d13d032b1812421b23d7a961eea",
"size": 280645251
}
]
}
}
},
"environment": {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,42 +1,45 @@
{
"dependencies": {
"deployMode": {
"LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
}
],
"SPARK333-LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
}
]
"LOCAL": {
"//activeBuildVer": "Define this key in order to set the default buildVer for that platform",
"350": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
}
],
"333": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "Hadoop Azure",
"uri": "https://repo1.maven.org/maven2/org/apache/hadoop/hadoop-azure/3.3.4/hadoop-azure-3.3.4.jar",
"type": "jar",
"md5": "1ec4cbd59548412010fe1515070eef73",
"sha1": "a23f621bca9b2100554150f6b0b521f94b8b419e",
"size": 574116
}
]
}
}
},
"environment": {
Expand Down Expand Up @@ -370,4 +373,3 @@
"minWorkerNodes": 2
}
}

75 changes: 39 additions & 36 deletions user_tools/src/spark_rapids_pytools/resources/dataproc-configs.json
Original file line number Diff line number Diff line change
@@ -1,42 +1,45 @@
{
"dependencies": {
"deployMode": {
"LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "GCS Connector Hadoop3",
"uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar",
"type": "jar",
"md5": "2ee6ad7215304cf5da8e731afb36ad72",
"sha1": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170",
"size": 39359477
}
],
"SPARK333-LOCAL": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "GCS Connector Hadoop3",
"uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar",
"type": "jar",
"md5": "41aea3add826dfbf3384a2c638148709",
"sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66",
"size": 38413466
}
]
"LOCAL": {
"//activeBuildVer": "Define this key in order to set the default buildVer for that platform",
"350": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "8883c67e0a138069e597f3e7d4edbbd5c3a565d50b28644aad02856a1ec1da7cb92b8f80454ca427118f69459ea326eaa073cf7b1a860c3b796f4b07c2101319",
"size": 400395283
},
{
"name": "GCS Connector Hadoop3",
"uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.19/gcs-connector-hadoop3-2.2.19-shaded.jar",
"type": "jar",
"md5": "2ee6ad7215304cf5da8e731afb36ad72",
"sha1": "3bea6d5e62663a2a5c03d8ca44dff4921aeb3170",
"size": 39359477
}
],
"333": [
{
"name": "Apache Spark",
"uri": "https://archive.apache.org/dist/spark/spark-3.3.3/spark-3.3.3-bin-hadoop3.tgz",
"type": "archive",
"relativePath": "jars/*",
"sha512": "ebf79c7861f3120d5ed9465fdd8d5302a734ff30713a0454b714bbded7ab9f218b3108dc46a5de4cc2102c86e7be53908f84d2c7a19e59bc75880766eeefeef9",
"size": 299426263
},
{
"name": "GCS Connector Hadoop3",
"uri": "https://repo1.maven.org/maven2/com/google/cloud/bigdataoss/gcs-connector/hadoop3-2.2.17/gcs-connector-hadoop3-2.2.17-shaded.jar",
"type": "jar",
"md5": "41aea3add826dfbf3384a2c638148709",
"sha1": "06438f562692ff8fae5e8555eba2b9f95cb74f66",
"size": 38413466
}
]
}
}
},
"environment": {
Expand Down
Loading