From 86acb34bd6704592ee1ff82ceb81ff3ec93184c9 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Fri, 13 Jan 2023 22:46:14 +0100 Subject: [PATCH 1/9] Make drs configurable per rootpath --- esmvalcore/config/_config_validators.py | 48 +++++---- esmvalcore/local.py | 138 +++++++++++++----------- tests/integration/conftest.py | 3 +- tests/integration/test_local.py | 4 +- 4 files changed, 109 insertions(+), 84 deletions(-) diff --git a/esmvalcore/config/_config_validators.py b/esmvalcore/config/_config_validators.py index f202c80a9d..24e4ab004b 100644 --- a/esmvalcore/config/_config_validators.py +++ b/esmvalcore/config/_config_validators.py @@ -188,7 +188,17 @@ def validate_rootpath(value): "Correcting capitalization, project 'obs4mips' should be " "written as 'obs4MIPs' in 'rootpath' in config-user.yml") key = 'obs4MIPs' - new_mapping[key] = validate_pathlist(paths) + if isinstance(paths, Path): + paths = str(paths) + if isinstance(paths, (str, list)): + new_mapping[key] = validate_pathlist(paths) + else: + validate_dict(paths) + new_mapping[key] = { + validate_path(path): validate_string(drs) + for path, drs in paths.items() + } + return new_mapping @@ -276,38 +286,36 @@ def deprecate(func, variable, version: Optional[str] = None): _validators = { # From user config - 'log_level': validate_string, - 'exit_on_warning': validate_bool, - 'output_dir': validate_path, - 'download_dir': validate_path, + 'always_search_esgf': validate_bool, 'auxiliary_data_dir': validate_path, - 'extra_facets_dir': validate_pathtuple, 'compress_netcdf': validate_bool, - 'save_intermediary_cubes': validate_bool, - 'remove_preproc_dir': validate_bool, - 'max_parallel_tasks': validate_int_or_none, 'config_developer_file': validate_config_developer, + 'drs': validate_drs, + 'download_dir': validate_path, + 'exit_on_warning': validate_bool, + 'extra_facets_dir': validate_pathtuple, + 'log_level': validate_string, + 'max_parallel_tasks': validate_int_or_none, + 'offline': validate_bool, + 'output_dir': validate_path, + 'output_file_type': validate_string, 'profile_diagnostic': validate_bool, + 'remove_preproc_dir': validate_bool, + 'rootpath': validate_rootpath, 'run_diagnostic': validate_bool, - 'output_file_type': validate_string, - "offline": validate_bool, - 'always_search_esgf': validate_bool, + 'save_intermediary_cubes': validate_bool, # From CLI - "resume_from": validate_pathlist, - "skip_nonexistent": validate_bool, - "diagnostics": validate_diagnostics, "check_level": validate_check_level, - 'max_years': validate_int_positive_or_none, + "diagnostics": validate_diagnostics, 'max_datasets': validate_int_positive_or_none, + 'max_years': validate_int_positive_or_none, + "resume_from": validate_pathlist, + "skip_nonexistent": validate_bool, # From recipe 'write_ncl_interface': validate_bool, - # oldstyle - 'rootpath': validate_rootpath, - 'drs': validate_drs, - # config location 'config_file': validate_path, } diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 5d78045024..48322701f9 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -5,6 +5,7 @@ import logging import os import re +from dataclasses import dataclass from glob import glob from pathlib import Path from typing import Any, Union @@ -380,74 +381,85 @@ def _apply_caps(original, lower, upper): return original -def _select_drs(input_type, project): +def _select_drs(input_type: str, project: str, structure: str) -> list[str]: """Select the directory structure of input path.""" cfg = get_project_config(project) - input_path = cfg[input_type] - if isinstance(input_path, str): - return input_path + input_path_patterns = cfg[input_type] + if isinstance(input_path_patterns, str): + return [input_path_patterns] - structure = CFG['drs'].get(project, 'default') - if structure in input_path: - return input_path[structure] + if structure in input_path_patterns: + value = input_path_patterns[structure] + if isinstance(value, str): + value = [value] + return value raise KeyError( 'drs {} for {} project not specified in config-developer file'.format( structure, project)) +@dataclass(order=True, frozen=True) +class DataSource: + """Class for storing a data source and finding the associated files.""" + rootpath: Path + dirname_template: str + filename_template: str + + def get_glob_patterns(self, **facets) -> list[Path]: + """Compose the globs that will be used to look for files.""" + dirname_globs = _replace_tags(self.dirname_template, facets) + filename_globs = _replace_tags(self.filename_template, facets) + return sorted(self.rootpath / d / f for d in dirname_globs + for f in filename_globs) + + def find_files(self, **facets) -> list[LocalFile]: + """Find files.""" + globs = self.get_glob_patterns(**facets) + logger.debug("Looking for files matching %s", globs) + + files = [] + for glob_ in globs: + for filename in glob(str(glob_)): + file = LocalFile(filename) + file.facets.update(_path2facets(file, self.dirname_template)) + files.append(file) + files.sort() # sorting makes it easier to see what was found + + if 'timerange' in facets: + files = _select_files(files, facets['timerange']) + return files + + _ROOTPATH_WARNED = set() -def _get_rootpath(project): - """Select the rootpath.""" - rootpath = CFG['rootpath'] +def _get_data_sources(project: str) -> list[DataSource]: + """Get a list of data sources.""" + rootpaths = CFG['rootpath'] for key in (project, 'default'): - if key in rootpath: - nonexistent = tuple(p for p in rootpath[key] - if not os.path.exists(p)) + if key in rootpaths: + paths = rootpaths[key] + nonexistent = tuple(p for p in paths if not os.path.exists(p)) if nonexistent and (key, nonexistent) not in _ROOTPATH_WARNED: logger.warning( "'%s' rootpaths '%s' set in config-user.yml do not exist", key, ', '.join(str(p) for p in nonexistent)) _ROOTPATH_WARNED.add((key, nonexistent)) - return rootpath[key] - raise KeyError('default rootpath must be specified in config-user file') - - -def _get_globs(variable): - """Compose the globs that will be used to look for files.""" - project = variable['project'] - - rootpaths = _get_rootpath(project) - - dirname_template = _select_drs('input_dir', project) - dirname_globs = _replace_tags(dirname_template, variable) - - filename_template = _select_drs('input_file', project) - filename_globs = _replace_tags(filename_template, variable) - - globs = sorted(r / d / f for r in rootpaths for d in dirname_globs - for f in filename_globs) - return globs + if isinstance(paths, list): + structure = CFG['drs'].get(project, 'default') + paths = {p: structure for p in paths} + sources: list[DataSource] = [] + for path, structure in paths.items(): + dir_templates = _select_drs('input_dir', project, structure) + file_templates = _select_drs('input_file', project, structure) + sources.extend( + DataSource(path, d, f) + for d in dir_templates for f in file_templates + ) + return sources - -def _get_input_filelist(variable): - """Return the full path to input files.""" - variable = dict(variable) - if 'original_short_name' in variable: - variable['short_name'] = variable['original_short_name'] - - globs = _get_globs(variable) - logger.debug("Looking for files matching %s", globs) - - files = list(Path(file) for glob_ in globs for file in glob(str(glob_))) - files.sort() # sorting makes it easier to see what was found - - if 'timerange' in variable: - files = _select_files(files, variable['timerange']) - - return files, globs + raise KeyError('default rootpath must be specified in config-user file') def _get_output_file(variable: dict[str, Any], preproc_dir: Path) -> Path: @@ -616,19 +628,18 @@ def find_files( list[LocalFile] The files that were found. """ # pylint: disable=line-too-long - filenames, globs = _get_input_filelist(facets) - drs = _select_drs('input_dir', facets['project']) - if isinstance(drs, list): - # Not sure how to handle a list of DRSs - drs = '' + facets = dict(facets) + if 'original_short_name' in facets: + facets['short_name'] = facets['original_short_name'] + files = [] filter_latest = False - for filename in filenames: - file = LocalFile(filename) - file.facets.update(_path2facets(file, drs)) - if file.facets.get('version') == 'latest': - filter_latest = True - files.append(file) + data_sources = _get_data_sources(facets['project']) # type: ignore + for data_source in data_sources: + for file in data_source.find_files(**facets): + if file.facets.get('version') == 'latest': + filter_latest = True + files.append(file) if filter_latest: files = _filter_versions_called_latest(files) @@ -636,8 +647,13 @@ def find_files( if 'version' not in facets: files = _select_latest_version(files) + files.sort() # sorting makes it easier to see what was found + if debug: - return files, globs + globs = [] + for data_source in data_sources: + globs.extend(data_source.get_glob_patterns(**facets)) + return files, sorted(globs) return files diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index d40e0dbea0..53ff0cc7dc 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -18,7 +18,7 @@ def session(tmp_path, monkeypatch): # The patched_datafinder fixture does not return the correct input # directory structure, so make sure it is set to flat for every project - monkeypatch.setitem(CFG, 'drs', {}) + monkeypatch.setitem(CFG, 'rootpath', {'default': {tmp_path: 'default'}}) for project in _config.CFG: monkeypatch.setitem(_config.CFG[project]['input_dir'], 'default', '/') return session @@ -89,6 +89,7 @@ def tracking_ids(i=0): tracking_id = tracking_ids() def glob(filename): + filename = Path(filename).name # Fail for specified fx variables if 'fx_' in filename: return [] diff --git a/tests/integration/test_local.py b/tests/integration/test_local.py index 02982177a9..84e7c8e964 100644 --- a/tests/integration/test_local.py +++ b/tests/integration/test_local.py @@ -89,8 +89,8 @@ def test_find_files(monkeypatch, root, cfg): ref_globs = [ Path(root, d, f) for d in cfg['dirs'] for f in cfg['file_patterns'] ] - assert sorted([Path(f) for f in input_filelist]) == sorted(ref_files) - assert sorted([Path(g) for g in globs]) == sorted(ref_globs) + assert [Path(f) for f in input_filelist] == sorted(ref_files) + assert [Path(g) for g in globs] == sorted(ref_globs) def test_find_files_with_facets(monkeypatch, root): From f86446bd12ad2c0fcfae279ab3460a49cc04bd01 Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Mon, 6 Feb 2023 20:53:23 +0100 Subject: [PATCH 2/9] Add tests --- esmvalcore/local.py | 7 +-- tests/unit/config/test_config_validator.py | 46 +++++++++++++++++ tests/unit/local/test_get_data_sources.py | 58 ++++++++++++++++++++++ 3 files changed, 108 insertions(+), 3 deletions(-) create mode 100644 tests/unit/local/test_get_data_sources.py diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 48322701f9..b54f53e8ef 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -459,7 +459,9 @@ def _get_data_sources(project: str) -> list[DataSource]: ) return sources - raise KeyError('default rootpath must be specified in config-user file') + raise KeyError( + f"No '{project}' or 'default' path specified under 'rootpath' in " + "the user configuration.") def _get_output_file(variable: dict[str, Any], preproc_dir: Path) -> Path: @@ -502,8 +504,7 @@ def _get_multiproduct_filename(attributes: dict, preproc_dir: Path) -> Path: filename_segments = list(dict.fromkeys(filename_segments)) # Add period and extension - filename_segments.append( - f"{attributes['timerange'].replace('/', '-')}.nc") + filename_segments.append(f"{attributes['timerange'].replace('/', '-')}.nc") outfile = Path( preproc_dir, diff --git a/tests/unit/config/test_config_validator.py b/tests/unit/config/test_config_validator.py index 47da116416..cf7096fef9 100644 --- a/tests/unit/config/test_config_validator.py +++ b/tests/unit/config/test_config_validator.py @@ -17,6 +17,7 @@ validate_path, validate_path_or_none, validate_positive, + validate_rootpath, validate_string, validate_string_or_none, ) @@ -107,6 +108,7 @@ def generate_validator_testcases(valid): ('a/b/c', Path.cwd() / 'a' / 'b' / 'c'), ('/a/b/c/', Path('/', 'a', 'b', 'c')), ('~/', Path.home()), + (Path.home(), Path.home()), ), 'fail': ( (None, ValueError), @@ -120,6 +122,49 @@ def generate_validator_testcases(valid): 'success': ((None, None), ), 'fail': (), }, + { + 'validator': + validate_rootpath, + 'success': ( + # Test a single path + ({ + 'default': '/a' + }, { + 'default': [Path('/a')] + }), + ({ + 'default': Path('/a') + }, { + 'default': [Path('/a')] + }), + # Test a list of paths + ({ + 'CMIP6': ['/a', '/b'] + }, { + 'CMIP6': [Path('/a'), Path('/b')] + }), + ({ + 'CMIP6': [Path('/a'), Path('/b')] + }, { + 'CMIP6': [Path('/a'), Path('/b')] + }), + # Test a dict of paths + ( + { + 'CMIP6': { + '/a': 'DKRZ', + '/b': 'ESGF', + }, + }, + { + 'CMIP6': { + Path('/a'): 'DKRZ', + Path('/b'): 'ESGF', + }, + }, + )), + 'fail': (), + }, { 'validator': validate_positive, 'success': ( @@ -187,6 +232,7 @@ def test_validator_invalid(validator, arg, exception_type): @pytest.mark.parametrize('version', (current_version, '0.0.1', '9.9.9')) def test_deprecate(version): + def test_func(): pass diff --git a/tests/unit/local/test_get_data_sources.py b/tests/unit/local/test_get_data_sources.py new file mode 100644 index 0000000000..6229a22149 --- /dev/null +++ b/tests/unit/local/test_get_data_sources.py @@ -0,0 +1,58 @@ +from pathlib import Path + +import pytest + +from esmvalcore.config import CFG +from esmvalcore.local import DataSource, _get_data_sources + + +@pytest.mark.parametrize('rootpath_drs', [ + ( + { + 'CMIP6': { + '/climate_data': 'ESGF' + } + }, + {}, + ), + ( + { + 'CMIP6': ['/climate_data'] + }, + { + 'CMIP6': 'ESGF' + }, + ), + ( + { + 'default': ['/climate_data'] + }, + { + 'CMIP6': 'ESGF' + }, + ), +]) +def test_get_data_sources(monkeypatch, rootpath_drs): + rootpath, drs = rootpath_drs + monkeypatch.setitem(CFG, 'rootpath', rootpath) + monkeypatch.setitem(CFG, 'drs', drs) + sources = _get_data_sources('CMIP6') + source = sources[0] + assert isinstance(source, DataSource) + assert source.rootpath == Path('/climate_data') + assert '{project}' in source.dirname_template + assert '{short_name}' in source.filename_template + + +def test_get_data_sources_nodefault(monkeypatch): + monkeypatch.setitem( + CFG, + 'rootpath', + { + 'CMIP5': { + '/climate_data': 'default' + }, + }, + ) + with pytest.raises(KeyError): + _get_data_sources('CMIP6') From e5cba85366384661b2ad21cbc0ddca3a5cba6d4a Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Mon, 6 Feb 2023 21:52:36 +0100 Subject: [PATCH 3/9] Add documentation --- doc/quickstart/configure.rst | 12 ++--- doc/quickstart/find_data.rst | 92 +++++++++++++++++++++--------------- esmvalcore/config-user.yml | 55 +++++++++++++-------- 3 files changed, 96 insertions(+), 63 deletions(-) diff --git a/doc/quickstart/configure.rst b/doc/quickstart/configure.rst index 6b3c8e64e9..f6aef71fc9 100644 --- a/doc/quickstart/configure.rst +++ b/doc/quickstart/configure.rst @@ -26,7 +26,7 @@ User configuration file The ``config-user.yml`` configuration file contains all the global level -information needed by ESMValTool. It can be reused as many times the user needs +information needed by ESMValCore. It can be reused as many times the user needs to before changing any of the options stored in it. This file is essentially the gateway between the user and the machine-specific instructions to ``esmvaltool``. By default, esmvaltool looks for it in the home directory, @@ -73,7 +73,7 @@ omitted in the file. # Rootpaths to the data from different projects # This default setting will work if files have been downloaded by the - # ESMValTool via ``offline=False``. Lists are also possible. For + # ESMValCore via ``offline=False``. Lists are also possible. For # site-specific entries, see the default ``config-user.yml`` file that can be # installed with the command ``esmvaltool config get_config_user``. For each # project, this can be either a single path or a list of paths. Comment out @@ -83,7 +83,7 @@ omitted in the file. # Directory structure for input data --- [default]/ESGF/BADC/DKRZ/ETHZ/etc. # This default setting will work if files have been downloaded by the - # ESMValTool via ``offline=False``. See ``config-developer.yml`` for + # ESMValCore via ``offline=False``. See ``config-developer.yml`` for # definitions. Comment out/replace as per needed. drs: CMIP3: ESGF @@ -179,7 +179,7 @@ and memory usage. A detailed explanation of the data finding-related sections of the ``config-user.yml`` (``rootpath`` and ``drs``) is presented in the :ref:`data-retrieval` section. This section relates directly to the data -finding capabilities of ESMValTool and are very important to be understood by +finding capabilities of ESMValCore and are very important to be understood by the user. .. note:: @@ -715,7 +715,7 @@ addition of more details per project, dataset, mip table, and variable name. More precisely, one can provide this information in an extra yaml file, named `{project}-something.yml`, where `{project}` corresponds to the project as used -by ESMValTool in :ref:`Datasets` and "something" is arbitrary. +by ESMValCore in :ref:`Datasets` and "something" is arbitrary. Format of the extra facets files -------------------------------- @@ -768,7 +768,7 @@ variable of any CMIP5 dataset that does not have a ``product`` key yet: Location of the extra facets files ---------------------------------- Extra facets files can be placed in several different places. When we use them -to support a particular use-case within the ESMValTool project, they will be +to support a particular use-case within the ESMValCore project, they will be provided in the sub-folder `extra_facets` inside the package :mod:`esmvalcore.config`. If they are used from the user side, they can be either placed in `~/.esmvaltool/extra_facets` or in any other directory of the users diff --git a/doc/quickstart/find_data.rst b/doc/quickstart/find_data.rst index 3d2a09a55a..2a8a8dcaa5 100644 --- a/doc/quickstart/find_data.rst +++ b/doc/quickstart/find_data.rst @@ -7,7 +7,7 @@ Input data Overview ======== Data discovery and retrieval is the first step in any evaluation process; -ESMValTool uses a `semi-automated` data finding mechanism with inputs from both +ESMValCore uses a `semi-automated` data finding mechanism with inputs from both the user configuration file and the recipe file: this means that the user will have to provide the tool with a set of parameters related to the data needed and once these parameters have been provided, the tool will automatically find @@ -31,7 +31,7 @@ standard for naming files and structured paths; the `DRS `_ ensures that files and paths to them are named according to a standardized convention. Examples of this convention, also used by -ESMValTool for file discovery and data retrieval, include: +ESMValCore for file discovery and data retrieval, include: * CMIP6 file: ``{variable_short_name}_{mip}_{dataset_name}_{experiment}_{ensemble}_{grid}_{start-date}-{end-date}.nc`` * CMIP5 file: ``{variable_short_name}_{mip}_{dataset_name}_{experiment}_{ensemble}_{start-date}-{end-date}.nc`` @@ -44,7 +44,7 @@ ESGF data nodes, these paths differ slightly, for example: {variable_short_name}/{grid}``; * CMIP6 path for ETHZ: ``ROOT-ETHZ/{experiment}/{mip}/{variable_short_name}/{dataset_name}/{ensemble}/{grid}`` -From the ESMValTool user perspective the number of data input parameters is +From the ESMValCore user perspective the number of data input parameters is optimized to allow for ease of use. We detail this procedure in the next section. @@ -153,7 +153,7 @@ dedicated projects instead of the project ``native6``. CESM ^^^^ -ESMValTool is able to read native `CESM `__ model +ESMValCore is able to read native `CESM `__ model output. .. warning:: @@ -238,7 +238,7 @@ Key Description Default value if not EMAC ^^^^ -ESMValTool is able to read native `EMAC +ESMValCore is able to read native `EMAC `_ model output. @@ -260,7 +260,7 @@ Thus, example dataset entries could look like this: - {project: EMAC, dataset: EMAC, exp: historical, mip: Amon, short_name: ta, raw_name: tm1_p39_cav, start_year: 2000, end_year: 2014} Please note the duplication of the name ``EMAC`` in ``project`` and -``dataset``, which is necessary to comply with ESMValTool's data finding and +``dataset``, which is necessary to comply with ESMValCore's data finding and CMORizing functionalities. A variable-specific default for the facet ``channel`` is given in the extra facets (see next paragraph) for many variables, but this can be overwritten in @@ -271,7 +271,7 @@ facets`. By default, the file :download:`emac-mappings.yml ` is used for that purpose. -For some variables, extra facets are necessary; otherwise ESMValTool cannot +For some variables, extra facets are necessary; otherwise ESMValCore cannot read them properly. Supported keys for extra facets are: @@ -308,7 +308,7 @@ Key Description Default value if not ICON ^^^^ -ESMValTool is able to read native `ICON +ESMValCore is able to read native `ICON `_ model output. The default naming conventions for input directories and files for ICON are @@ -331,7 +331,7 @@ Thus, example dataset entries could look like this: end_year: 2014} Please note the duplication of the name ``ICON`` in ``project`` and -``dataset``, which is necessary to comply with ESMValTool's data finding and +``dataset``, which is necessary to comply with ESMValCore's data finding and CMORizing functionalities. A variable-specific default for the facet ``var_type`` is given in the extra facets (see next paragraph) for many variables, but this can be overwritten in @@ -342,7 +342,7 @@ facets`. By default, the file :download:`icon-mappings.yml ` is used for that purpose. -For some variables, extra facets are necessary; otherwise ESMValTool cannot +For some variables, extra facets are necessary; otherwise ESMValCore cannot read them properly. Supported keys for extra facets are: @@ -456,7 +456,7 @@ files must also undergo some data selection. Data retrieval ============== -Data retrieval in ESMValTool has two main aspects from the user's point of +Data retrieval in ESMValCore has two main aspects from the user's point of view: * data can be found by the tool, subject to availability on disk or `ESGF `_; @@ -464,7 +464,7 @@ view: The first point is self-explanatory: if the user runs the tool on a machine that has access to a data repository or multiple data repositories, then -ESMValTool will look for and find the available data requested by the user. +ESMValCore will look for and find the available data requested by the user. If the files are not found locally, the tool can search the ESGF_ and download the missing files, provided that they are available. @@ -482,7 +482,7 @@ the :ref:`user configuration file`. Setting the correct root paths ------------------------------ -The first step towards providing ESMValTool the correct set of parameters for +The first step towards providing ESMValCore the correct set of parameters for data retrieval is setting the root paths to the data. This is done in the user configuration file ``config-user.yml``. The two sections where the user will set the paths are ``rootpath`` and ``drs``. ``rootpath`` contains pointers to @@ -492,24 +492,11 @@ first discuss the ``drs`` parameter: as we've seen in the previous section, the DRS as a standard is used for both file naming conventions and for directory structures. -Synda ------ - -If the `synda install `_ command is used to download data, -it maintains the directory structure as on ESGF. To find data downloaded by -synda, use the ``SYNDA`` ``drs`` parameter. - -.. code-block:: yaml - - drs: - CMIP6: SYNDA - CMIP5: SYNDA - .. _config-user-drs: Explaining ``config-user/drs: CMIP5:`` or ``config-user/drs: CMIP6:`` --------------------------------------------------------------------- -Whereas ESMValTool will **always** use the CMOR standard for file naming (please +Whereas ESMValCore will by default use the CMOR standard for file naming (please refer above), by setting the ``drs`` parameter the user tells the tool what type of root paths they need the data from, e.g.: @@ -539,10 +526,17 @@ is another way to retrieve data from a ``ROOT`` directory that has no DRS-like structure; ``default`` indicates that the data lies in a directory that contains all the files without any structure. +The names of the directories trees that can be used under `drs` are defined in +:ref:`config-developer`. + .. note:: - When using ``CMIP6: default`` or ``CMIP5: default`` it is important to - remember that all the needed files must be in the same top-level directory - set by ``default`` (see below how to set ``default``). + When using ``CMIP6: default`` or ``CMIP5: default``, all the needed files + must be in the same top-level directory specified under ``rootpath``. + However, it is not recommended to use this, as it makes it impossible for + the tool to read the facets from the directory tree. + Moreover, this way of organizing data makes it impossible to store multiple + versions of the same file because the files typically have the same name + for different versions. .. _config-user-rootpath: @@ -552,27 +546,37 @@ Explaining ``config-user/rootpath:`` ``rootpath`` identifies the root directory for different data types (``ROOT`` as we used it above): * ``CMIP`` e.g. ``CMIP5`` or ``CMIP6``: this is the `root` path(s) to where the - CMIP files are stored; it can be a single path or a list of paths; it can + CMIP files are stored; it can be a single path, a list of paths, or a mapping + with paths as keys and `drs` names as values; it can point to an ESGF node or it can point to a user private repository. Example - for a CMIP5 root path pointing to the ESGF node on CEDA-Jasmin (formerly + for a CMIP5 root path pointing to the ESGF node mounted on CEDA-Jasmin (formerly known as BADC): .. code-block:: yaml - CMIP5: /badc/cmip5/data/cmip5/output1 + rootpath: + CMIP5: /badc/cmip5/data/cmip5/output1 Example for a CMIP6 root path pointing to the ESGF node on CEDA-Jasmin: .. code-block:: yaml - CMIP6: /badc/cmip6/data/CMIP6/CMIP + rootpath: + CMIP6: /badc/cmip6/data/CMIP6 Example for a mix of CMIP6 root path pointing to the ESGF node on CEDA-Jasmin and a user-specific data repository for extra data: .. code-block:: yaml - CMIP6: [/badc/cmip6/data/CMIP6/CMIP, /home/users/johndoe/cmip_data] + rootpath: + CMIP6: + /badc/cmip6/data/CMIP6: BADC + ~/climate_data: ESGF + + Note that this notation combines the ``rootpath`` and ``drs`` settings, so it + is not necessary to specify the directory structure in under ``drs`` in this + case. * ``OBS``: this is the `root` path(s) to where the observational datasets are stored; again, this could be a single path or a list of paths, just like for @@ -581,7 +585,8 @@ Explaining ``config-user/rootpath:`` .. code-block:: yaml - OBS: /gws/nopw/j04/esmeval/obsdata-v2 + rootpath: + OBS: /gws/nopw/j04/esmeval/obsdata-v2 * ``default``: this is the `root` path(s) where the tool will look for data from projects that do not have their own rootpath set. @@ -589,9 +594,22 @@ Explaining ``config-user/rootpath:`` * ``RAWOBS``: this is the `root` path(s) to where the raw observational data files are stored; this is used by ``esmvaltool data format``. +Synda +----- + +If the `synda install `_ command is used to download data, +it maintains the directory structure as on ESGF. To find data downloaded by +synda, use the ``SYNDA`` ``drs`` parameter. + +.. code-block:: yaml + + drs: + CMIP6: SYNDA + CMIP5: SYNDA + Dataset definitions in ``recipe`` --------------------------------- -Once the correct paths have been established, ESMValTool collects the +Once the correct paths have been established, ESMValCore collects the information on the specific datasets that are needed for the analysis. This information, together with the CMOR convention for naming files (see CMOR-DRS_) will allow the tool to search and find the right files. The specific diff --git a/esmvalcore/config-user.yml b/esmvalcore/config-user.yml index 175ac733e8..7f7d2a82bf 100644 --- a/esmvalcore/config-user.yml +++ b/esmvalcore/config-user.yml @@ -107,13 +107,24 @@ drs: CORDEX: ESGF obs4MIPs: ESGF -# Example rootpaths and directory structure that showcases the different -# projects and also the use of lists +# Example rootpaths and directory structure names for different projects. +# For each project, the entry can be a single path, a list of paths, or a +# mapping from paths to directory structure names. +# For single paths and list of paths, the directory structure names can be +# defined under 'drs'. +# If no path is defined for a project, the tool will look in the 'default' path. +# If no directory structure name is given, the name 'default' will be used. +# Directory structures corresponding to the names are defined in the file +# config-developer.yml. # For site-specific entries, see below. #rootpath: -# CMIP3: [~/cmip3_inputpath1, ~/cmip3_inputpath2] -# CMIP5: [~/cmip5_inputpath1, ~/cmip5_inputpath2] -# CMIP6: [~/cmip6_inputpath1, ~/cmip6_inputpath2] +# CMIP6: +# /path/to/data: DKRZ +# ~/path/to/more/data: ESGF +# CMIP5: +# - ~/cmip5_inputpath1 +# - ~/cmip5_inputpath2 +# CMIP3: ~/cmip6_inputpath # OBS: ~/obs_inputpath # OBS6: ~/obs6_inputpath # obs4MIPs: ~/obs4mips_inputpath @@ -122,11 +133,10 @@ drs: # RAWOBS: ~/rawobs_inputpath # default: ~/default_inputpath #drs: -# CMIP3: default -# CMIP5: default -# CMIP6: default -# CORDEX: default -# obs4MIPs: default +# CMIP3: ESGF +# CMIP5: ESGF +# CORDEX: ESGF +# obs4MIPs: ESGF # Directory tree created by automatically downloading from ESGF # Uncomment the lines below to locate data that has been automatically @@ -173,22 +183,27 @@ drs: # Uncomment the lines below to locate data on Levante at DKRZ. #auxiliary_data_dir: /work/bd0854/DATA/ESMValTool2/AUX #rootpath: -# CMIP6: /work/bd0854/DATA/ESMValTool2/CMIP6_DKRZ -# CMIP5: /work/bd0854/DATA/ESMValTool2/CMIP5_DKRZ -# CMIP3: /work/bd0854/DATA/ESMValTool2/CMIP3 -# CORDEX: /work/ik1017/C3SCORDEX/data/c3s-cordex/output +# CMIP6: +# /work/bd0854/DATA/ESMValTool2/CMIP6_DKRZ: DKRZ +# /work/bd0854/DATA/ESMValTool2/download: ESGF +# CMIP5: +# /work/bd0854/DATA/ESMValTool2/CMIP5_DKRZ: DKRZ +# /work/bd0854/DATA/ESMValTool2/download: ESGF +# CMIP3: +# /work/bd0854/DATA/ESMValTool2/CMIP3: DKRZ +# /work/bd0854/DATA/ESMValTool2/download: ESGF +# CORDEX: +# /work/ik1017/C3SCORDEX/data/c3s-cordex/output: BADC +# /work/bd0854/DATA/ESMValTool2/download: ESGF # OBS: /work/bd0854/DATA/ESMValTool2/OBS # OBS6: /work/bd0854/DATA/ESMValTool2/OBS -# obs4MIPs: /work/bd0854/DATA/ESMValTool2/OBS +# obs4MIPs: +# /work/bd0854/DATA/ESMValTool2/OBS: default +# /work/bd0854/DATA/ESMValTool2/download: ESGF # ana4mips: /work/bd0854/DATA/ESMValTool2/OBS # native6: /work/bd0854/DATA/ESMValTool2/RAWOBS # RAWOBS: /work/bd0854/DATA/ESMValTool2/RAWOBS #drs: -# CMIP6: DKRZ -# CMIP5: DKRZ -# CMIP3: DKRZ -# CORDEX: BADC -# obs4MIPs: default # ana4mips: default # OBS: default # OBS6: default From 92c5fd3d4ff4942aa490e84680e5c025886b3a9a Mon Sep 17 00:00:00 2001 From: Bouwe Andela Date: Mon, 6 Feb 2023 21:59:07 +0100 Subject: [PATCH 4/9] Fix linter issue --- esmvalcore/local.py | 1 + 1 file changed, 1 insertion(+) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index b54f53e8ef..4d016494ef 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -402,6 +402,7 @@ def _select_drs(input_type: str, project: str, structure: str) -> list[str]: @dataclass(order=True, frozen=True) class DataSource: """Class for storing a data source and finding the associated files.""" + rootpath: Path dirname_template: str filename_template: str From 37ddce277ff6aa57696814bee8a5f3db19f4ec62 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 24 May 2024 16:45:22 +0200 Subject: [PATCH 5/9] Fix test_dataset --- tests/unit/test_dataset.py | 32 ++++++++++++++++++-------------- 1 file changed, 18 insertions(+), 14 deletions(-) diff --git a/tests/unit/test_dataset.py b/tests/unit/test_dataset.py index 951f3f17f8..25129dbf5d 100644 --- a/tests/unit/test_dataset.py +++ b/tests/unit/test_dataset.py @@ -1189,28 +1189,26 @@ def test_from_recipe_with_glob(tmp_path, session, mocker): tas: project: CMIP5 mip: Amon + exp: rcp85 + ensemble: r1i1p1 additional_datasets: - - {dataset: '*'} + - {dataset: '*', institute: '*'} """) recipe = tmp_path / 'recipe_test.yml' recipe.write_text(recipe_txt, encoding='utf-8') session['drs']['CMIP5'] = 'ESGF' - + CFG['rootpath']['CMIP5'] = [tmp_path] filenames = [ - "cmip5/output1/CSIRO-QCCCE/CSIRO-Mk3-6-0/rcp85/mon/atmos/Amon/r4i1p1/" - "v20120323/tas_Amon_CSIRO-Mk3-6-0_rcp85_r4i1p1_200601-210012.nc", - "cmip5/output1/NIMR-KMA/HadGEM2-AO/historical/mon/atmos/Amon/r1i1p1/" - "v20130815/tas_Amon_HadGEM2-AO_historical_r1i1p1_186001-200512.nc", + "cmip5/output1/CSIRO-QCCCE/CSIRO-Mk3-6-0/rcp85/mon/atmos/Amon/r1i1p1/" + "v20120323/tas_Amon_CSIRO-Mk3-6-0_rcp85_r1i1p1_200601-210012.nc", + "cmip5/output1/NIMR-KMA/HadGEM2-AO/rcp85/mon/atmos/Amon/r1i1p1/" + "v20130815/tas_Amon_HadGEM2-AO_rcp85_r1i1p1_186001-200512.nc", ] - - mocker.patch.object( - esmvalcore.local, - '_get_input_filelist', - autospec=True, - spec_set=True, - return_value=(filenames, []), - ) + for filename in filenames: + path = tmp_path / filename + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text('') definitions = [ { @@ -1222,6 +1220,9 @@ def test_from_recipe_with_glob(tmp_path, session, mocker): 'short_name': 'tas', 'alias': 'CSIRO-Mk3-6-0', 'recipe_dataset_index': 0, + 'exp': 'rcp85', + 'ensemble': 'r1i1p1', + 'institute': 'CSIRO-QCCCE', }, { 'diagnostic': 'diagnostic1', @@ -1232,6 +1233,9 @@ def test_from_recipe_with_glob(tmp_path, session, mocker): 'short_name': 'tas', 'alias': 'HadGEM2-AO', 'recipe_dataset_index': 1, + 'exp': 'rcp85', + 'ensemble': 'r1i1p1', + 'institute': 'NIMR-KMA', }, ] expected = [] From c5ac5713620d99f89dfd9d54877e06c77a01bec9 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 24 May 2024 16:55:14 +0200 Subject: [PATCH 6/9] Fix test_recipe --- tests/integration/conftest.py | 12 ++++++++---- tests/integration/recipe/test_recipe.py | 2 +- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 4929dadacc..fbddec7fe0 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -56,8 +56,8 @@ def _get_files(root_path, facets, tracking_id): all_facets = [facets] # Globs without expanded facets - dir_template = _select_drs('input_dir', facets['project']) - file_template = _select_drs('input_file', facets['project']) + dir_template = _select_drs('input_dir', facets['project'], 'default') + file_template = _select_drs('input_file', facets['project'], 'default') dir_globs = _replace_tags(dir_template, facets) file_globs = _replace_tags(file_template, facets) globs = sorted( @@ -67,8 +67,12 @@ def _get_files(root_path, facets, tracking_id): files = [] for expanded_facets in all_facets: filenames = [] - dir_template = _select_drs('input_dir', expanded_facets['project']) - file_template = _select_drs('input_file', expanded_facets['project']) + dir_template = _select_drs( + 'input_dir', expanded_facets['project'], 'default' + ) + file_template = _select_drs( + 'input_file', expanded_facets['project'], 'default' + ) dir_globs = _replace_tags(dir_template, expanded_facets) file_globs = _replace_tags(file_template, expanded_facets) filename = ( diff --git a/tests/integration/recipe/test_recipe.py b/tests/integration/recipe/test_recipe.py index c50d837b68..e467ae5c47 100644 --- a/tests/integration/recipe/test_recipe.py +++ b/tests/integration/recipe/test_recipe.py @@ -2023,7 +2023,7 @@ def test_weighting_landsea_fraction_exclude(tmp_path, patched_datafinder, additional_datasets: - {dataset: CanESM2} - {dataset: GFDL-CM3} - - {dataset: TEST, project: obs4MIPs, + - {dataset: TEST, project: obs4MIPs, tier: 1, supplementary_variables: [{short_name: sftlf, mip: fx}]} scripts: null """) From a88eaea294241a546d65af3419f42e1bc4b88096 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 24 May 2024 17:36:18 +0200 Subject: [PATCH 7/9] Fix race condition for test --- esmvalcore/local.py | 1 + tests/unit/config/test_config.py | 8 +++++++- tests/unit/config/test_config_validator.py | 3 +++ tests/unit/local/test_get_data_sources.py | 7 +++++++ 4 files changed, 18 insertions(+), 1 deletion(-) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 539679f682..8f9e7e9cac 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -380,6 +380,7 @@ def _apply_caps(original, lower, upper): def _select_drs(input_type: str, project: str, structure: str) -> list[str]: """Select the directory structure of input path.""" cfg = get_project_config(project) + print(cfg) input_path_patterns = cfg[input_type] if isinstance(input_path_patterns, str): return [input_path_patterns] diff --git a/tests/unit/config/test_config.py b/tests/unit/config/test_config.py index c840027796..ec88e281e3 100644 --- a/tests/unit/config/test_config.py +++ b/tests/unit/config/test_config.py @@ -6,7 +6,7 @@ import esmvalcore from esmvalcore.cmor.check import CheckLevels -from esmvalcore.config import CFG, _config +from esmvalcore.config import CFG, _config, _config_validators from esmvalcore.config._config import ( _deep_update, _load_extra_facets, @@ -248,6 +248,9 @@ def test_project_obs4mips_case_correction(tmp_path, monkeypatch, mocker): assert 'obs4mips' not in _config.CFG assert _config.CFG['obs4MIPs'] == project_cfg + # Restore config-developer file + _config_validators.validate_config_developer(None) + def test_load_config_developer_custom(tmp_path, monkeypatch, mocker): monkeypatch.setattr(_config, 'CFG', {}) @@ -261,6 +264,9 @@ def test_load_config_developer_custom(tmp_path, monkeypatch, mocker): assert 'custom' in _config.CFG + # Restore config-developer file + _config_validators.validate_config_developer(None) + @pytest.mark.parametrize( 'project,step', diff --git a/tests/unit/config/test_config_validator.py b/tests/unit/config/test_config_validator.py index a85f05688a..36dcd763fb 100644 --- a/tests/unit/config/test_config_validator.py +++ b/tests/unit/config/test_config_validator.py @@ -332,3 +332,6 @@ def test_validate_config_developer(tmp_path): path = validate_config_developer(cfg_dev_file) assert path == cfg_dev_file + + # Restore original config-developer file + validate_config_developer(None) diff --git a/tests/unit/local/test_get_data_sources.py b/tests/unit/local/test_get_data_sources.py index 6229a22149..3def03462b 100644 --- a/tests/unit/local/test_get_data_sources.py +++ b/tests/unit/local/test_get_data_sources.py @@ -3,6 +3,7 @@ import pytest from esmvalcore.config import CFG +from esmvalcore.config._config_validators import validate_config_developer from esmvalcore.local import DataSource, _get_data_sources @@ -33,6 +34,9 @@ ), ]) def test_get_data_sources(monkeypatch, rootpath_drs): + # Make sure that default config-developer file is used + validate_config_developer(None) + rootpath, drs = rootpath_drs monkeypatch.setitem(CFG, 'rootpath', rootpath) monkeypatch.setitem(CFG, 'drs', drs) @@ -45,6 +49,9 @@ def test_get_data_sources(monkeypatch, rootpath_drs): def test_get_data_sources_nodefault(monkeypatch): + # Make sure that default config-developer file is used + validate_config_developer(None) + monkeypatch.setitem( CFG, 'rootpath', From e7d59d0cf2fd8b8cf356d4498ef3b446646e70e2 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Fri, 24 May 2024 17:41:48 +0200 Subject: [PATCH 8/9] Remove print --- esmvalcore/local.py | 1 - 1 file changed, 1 deletion(-) diff --git a/esmvalcore/local.py b/esmvalcore/local.py index 8f9e7e9cac..539679f682 100644 --- a/esmvalcore/local.py +++ b/esmvalcore/local.py @@ -380,7 +380,6 @@ def _apply_caps(original, lower, upper): def _select_drs(input_type: str, project: str, structure: str) -> list[str]: """Select the directory structure of input path.""" cfg = get_project_config(project) - print(cfg) input_path_patterns = cfg[input_type] if isinstance(input_path_patterns, str): return [input_path_patterns] From bfdd2e43bdfbd753952a0a9dd1e60acb6b470ea3 Mon Sep 17 00:00:00 2001 From: Manuel Schlund Date: Tue, 28 May 2024 16:49:14 +0200 Subject: [PATCH 9/9] Fix line break --- esmvalcore/config-user.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/esmvalcore/config-user.yml b/esmvalcore/config-user.yml index a13caa4dd3..ecdee818fc 100644 --- a/esmvalcore/config-user.yml +++ b/esmvalcore/config-user.yml @@ -114,7 +114,8 @@ drs: # mapping from paths to directory structure names. # For single paths and list of paths, the directory structure names can be # defined under 'drs'. -# If no path is defined for a project, the tool will look in the 'default' path. +# If no path is defined for a project, the tool will look in the 'default' +# path. # If no directory structure name is given, the name 'default' will be used. # Directory structures corresponding to the names are defined in the file # config-developer.yml.