import json

from builtins import str, bytes

# Generic description of a CUDA runtime/driver/capability version string.
# The pattern accepts either "major.minor" (e.g. "11.2") or "major.minor.patch"
# (e.g. "11.2.231"); "maxLength" is the length limit handed over to check().
CUDA_VERSION_REGEX = {"re": r"^\d+\.\d+(\.\d+)?$", "maxLength": 100}

# Keys that must / may be present in a decoded GPUParams dictionary
_GPU_MANDATORY_ARGS = frozenset(["GPUMemoryMB", "CUDARuntime", "CUDACapabilities"])
_GPU_OPTIONAL_ARGS = frozenset(["GPUName", "CUDADriverVersion", "CUDARuntimeVersion"])


def gpuParameters(candidate):
    """
    Validate the spec "GPUParams" argument, which is a JSON encoded object, thus:
      * an encoded empty string (like '""')
      * an encoded dictionary with the following parameters:
        * mandatory: GPUMemoryMB (int), CUDARuntime (str), CUDACapabilities (list of str)
        * optional: GPUName (str), CUDADriverVersion (str), CUDARuntimeVersion (str)

    The key set is validated here; the values are validated by _gpuInternalParameters.

    :param candidate: a JSON encoded data to be validated
    :return: True if validation succeeded
    :raises AssertionError: if the data cannot be decoded, is neither an empty string
        nor a dictionary, misses a mandatory key, carries an unknown key, or if any
        of the values is rejected by _gpuInternalParameters
    """
    try:
        data = json.loads(candidate)
    except Exception:
        # Anything that fails to decode is reported as a validation failure.
        # Once python2 code is deprecated, the decoding traceback can be hidden with:
        #   raise AssertionError("GPUParams is not a valid JSON object") from None
        raise AssertionError("GPUParams is not a valid JSON object")

    # an (encoded) empty string means that no GPU parameters were provided
    if data == "":
        return True
    if not isinstance(data, dict):
        raise AssertionError("GPUParams is not a dictionary encoded as JSON object")

    # iterating over a dictionary yields its keys, both in python2 and python3
    paramSet = set(data)
    # the names are sorted only to make the error messages deterministic
    mandatoryArgs = sorted(_GPU_MANDATORY_ARGS)
    optionalArgs = sorted(_GPU_OPTIONAL_ARGS)
    providedArgs = sorted(paramSet)

    # is every mandatory argument also in the provided args?
    if not _GPU_MANDATORY_ARGS <= paramSet:
        msg = "GPUParams does not contain all the mandatory arguments. "
        msg += "Mandatory args: {}, while args provided are: {}".format(mandatoryArgs, providedArgs)
        raise AssertionError(msg)
    # are there unknown arguments in the data provided?
    if paramSet - _GPU_MANDATORY_ARGS - _GPU_OPTIONAL_ARGS:
        msg = "GPUParams contains arguments that are not supported. Args provided: {}, ".format(providedArgs)
        msg += "while mandatory args are: {} and optional args are: {}".format(mandatoryArgs, optionalArgs)
        raise AssertionError(msg)
    return _gpuInternalParameters(data)


def _gpuStringParameter(value, regexp, maxLength, errorMsg):
    """
    Ensure that `value` is a string accepted by check(regexp, value, maxLength).

    NOTE: check() is defined elsewhere in this module and is not visible from here.
    It presumably raises AssertionError by itself on a mismatch (to be confirmed);
    a falsy return value is treated as a validation failure as well.

    :param value: the value to be validated
    :param regexp: regular expression handed over to check()
    :param maxLength: maximum length handed over to check()
    :param errorMsg: message of the AssertionError raised by this function
    :raises AssertionError: if value is not a string or check() does not accept it
    """
    # the type test comes first, such that check() only ever sees string-like data
    if not isinstance(value, (str, bytes)) or not check(regexp, value, maxLength):
        raise AssertionError(errorMsg)


def _gpuInternalParameters(candidate):
    """
    NOTE: this function is supposed to be called only from gpuParameters, which already
    does the high level validation (data type and set of keys).

    List of **required** parameters is:
      * `GPUMemoryMB`: integer with the amount of memory, in Megabytes (MB).
        Validated as `> 0`; booleans are not accepted. E.g.: 8000
      * `CUDACapabilities`: a non-empty list of short strings (<= 100 chars), each of
        them validated against CUDA_VERSION_REGEX. E.g.: ["7.5", "8.0"]
      * `CUDARuntime`: a short string (<= 100 chars) with the runtime version,
        validated against CUDA_VERSION_REGEX. E.g.: "11.2"
    List of **optional** parameters is:
      * `GPUName`: a string with the GPU name. Validated against `<= 100 chars`.
        E.g.: "Tesla T4", "Quadro RTX 6000"
      * `CUDADriverVersion`: a string with the CUDA driver version,
        validated against CUDA_VERSION_REGEX. E.g.: "460.32.03"
      * `CUDARuntimeVersion`: a string with the CUDA runtime version,
        validated against CUDA_VERSION_REGEX. E.g.: "11.2.152"

    :param candidate: the GPUParams dictionary, already decoded from JSON
    :return: True if validation succeeded
    :raises AssertionError: if any of the values is not valid
    """
    versionRegex = CUDA_VERSION_REGEX["re"]
    versionMaxLen = CUDA_VERSION_REGEX["maxLength"]

    # GPUMemoryMB validation
    # bool is a subclass of int, thus a JSON `true` has to be rejected explicitly
    gpuMemory = candidate["GPUMemoryMB"]
    if isinstance(gpuMemory, bool) or not isinstance(gpuMemory, int) or not gpuMemory > 0:
        raise AssertionError("Mandatory GPUParams.GPUMemoryMB must be an integer and greater than 0")

    # CUDACapabilities validation
    cudaCapabilities = candidate["CUDACapabilities"]
    if not isinstance(cudaCapabilities, (list, set)) or not cudaCapabilities:
        raise AssertionError("Mandatory GPUParams.CUDACapabilities must be a non-empty list")
    for cudaCapabItem in cudaCapabilities:
        _gpuStringParameter(cudaCapabItem, versionRegex, versionMaxLen,
                            "Mandatory GPUParams.CUDACapabilities must be a list of strings")

    # CUDARuntime validation
    _gpuStringParameter(candidate["CUDARuntime"], versionRegex, versionMaxLen,
                        "Mandatory GPUParams.CUDARuntime must be a string and shorter than 100 chars")

    ### And now, validate the optional arguments
    # GPUName validation: any content is accepted, only type and length are enforced
    if "GPUName" in candidate:
        _gpuStringParameter(candidate["GPUName"], r".*", 100,
                            "Optional GPUParams.GPUName must be a string")
    # CUDADriverVersion validation
    if "CUDADriverVersion" in candidate:
        _gpuStringParameter(candidate["CUDADriverVersion"], versionRegex, versionMaxLen,
                            "Optional GPUParams.CUDADriverVersion must be a string")
    # CUDARuntimeVersion validation
    if "CUDARuntimeVersion" in candidate:
        _gpuStringParameter(candidate["CUDARuntimeVersion"], versionRegex, versionMaxLen,
                            "Optional GPUParams.CUDARuntimeVersion must be a string")
    return True
+ self.raiseValidationException(msg) diff --git a/src/python/WMCore/WMSpec/StdSpecs/StdBase.py b/src/python/WMCore/WMSpec/StdSpecs/StdBase.py index d7b4776b186..241b1896312 100644 --- a/src/python/WMCore/WMSpec/StdSpecs/StdBase.py +++ b/src/python/WMCore/WMSpec/StdSpecs/StdBase.py @@ -9,10 +9,10 @@ from builtins import range, object import logging - +import json from Utils.Utilities import makeList, makeNonEmptyList, strToBool, safeStr from WMCore.Cache.WMConfigCache import ConfigCache, ConfigCacheException -from WMCore.Lexicon import couchurl, procstring, activity, procversion, primdataset +from WMCore.Lexicon import couchurl, procstring, activity, procversion, primdataset, gpuParameters from WMCore.Lexicon import lfnBase, identifier, acqname, cmsname, dataset, block, campaign from WMCore.ReqMgr.DataStructs.RequestStatus import REQUEST_START_STATE from WMCore.ReqMgr.Tools.cms import releases, architectures @@ -1034,6 +1034,10 @@ def getWorkloadCreateArgs(): "RobustMerge": {"default": True, "type": strToBool}, "Comments": {"default": ""}, "SubRequestType": {"default": ""}, # used only(?) for RelVals + "RequiresGPU": {"default": "forbidden", + "validate": lambda x: x in ("forbidden", "optional", "required")}, + "GPUParams": {"default": json.dumps(""), "validate": gpuParameters}, + # FIXME (Alan on 27/Mar/017): maybe used by T0 during creation??? "MinMergeSize": {"default": 2 * 1024 * 1024 * 1024, "type": int,