ESMValGroup · thomascrocker · Apr 26, 2021 · Apr 26, 2021 · Apr 26, 2021 · May 5, 2021
diff --git a/doc/recipe/preprocessor.rst b/doc/recipe/preprocessor.rst
@@ -374,6 +374,17 @@ or alternatively:
             {'short_name': 'sftof', 'exp': 'piControl'}
             ]
 
+Additionally, it is possible to search across all ensembles and experiments (or any other keys)
+when specifying the fx variable, by using the ``*`` character, which is useful for some projects
+where the location of the fx files is not consistent.
+This makes it possible to search for fx files under multiple ensemble members or experiments.
+For example: ``ensemble: '*'``. Note that the ``*`` character must be quoted since ``*`` is a
+special character in YAML. This functionality is only supported for time-invariant fx variables
+(i.e. frequency ``fx``). Note also that if multiple folders of matching fx files are found,
-(i.e. frequency ``fx``). Note also that if multiple folders of matching fx files are found,
+(i.e. frequency ``fx`` or ``Ofx`` or ``Efx``). Note also that if multiple folders of matching fx files are found,
-(i.e. frequency ``fx``). Note also that if multiple folders of matching fx files are found,
+(i.e. frequency ``fx`` or ``Ofx`` or ``Efx``). Note also that if multiple folders of matching fx files are found,
+ESMValTool will default to ensemble r0i0p0 if it exists; otherwise, only the first
+folder found is used.
+
+
 See also :func:`esmvalcore.preprocessor.weighting_landsea_fraction`.
 
 
@@ -455,6 +466,17 @@ or alternatively:
             {'short_name': 'sftof', 'exp': 'piControl', 'ensemble': 'r2i1p1f1'}
             ]
 
+Additionally, it is possible to search across all ensembles and experiments (or any other keys)
+when specifying the fx variable, by using the ``*`` character, which is useful for some projects
+where the location of the fx files is not consistent.
+This makes it possible to search for fx files under multiple ensemble members or experiments.
+For example: ``ensemble: '*'``. Note that the ``*`` character must be quoted since ``*`` is a
+special character in YAML. This functionality is only supported for time-invariant fx variables
+(i.e. frequency ``fx``). Note also that if multiple folders of matching fx files are found,
+ESMValTool will default to ensemble r0i0p0 if it exists; otherwise, only the first
+folder found is used.
+
+
 If the corresponding fx file is not found (which is
 the case for some models and almost all observational datasets), the
 preprocessor attempts to mask the data using Natural Earth mask files (that are
@@ -507,6 +529,16 @@ or alternatively:
           mask_out: sea
           fx_variables: [{'short_name': 'sftgif', 'exp': 'piControl'}]
 
+Additionally, it is possible to search across all ensembles and experiments (or any other keys)
+when specifying the fx variable, by using the ``*`` character, which is useful for some projects
+where the location of the fx files is not consistent.
+This makes it possible to search for fx files under multiple ensemble members or experiments.
+For example: ``ensemble: '*'``. Note that the ``*`` character must be quoted since ``*`` is a
+special character in YAML. This functionality is only supported for time-invariant fx variables
+(i.e. frequency ``fx``). Note also that if multiple folders of matching fx files are found,
+ESMValTool will default to ensemble r0i0p0 if it exists; otherwise, only the first
+folder found is used.
+
 See also :func:`esmvalcore.preprocessor.mask_landseaice`.
 
 Glaciated masking

diff --git a/esmvalcore/_data_finder.py b/esmvalcore/_data_finder.py
@@ -38,19 +38,19 @@ def get_start_end_year(filename):
     start_year = end_year = None
 
     # First check for a block of two potential dates separated by _ or -
-    daterange = re.findall(r'([0-9]{4,12}[-_][0-9]{4,12})', stem)
+    daterange = re.findall(r"([0-9]{4,12}[-_][0-9]{4,12})", stem)
     if daterange:
-        start_date, end_date = re.findall(r'([0-9]{4,12})', daterange[0])
+        start_date, end_date = re.findall(r"([0-9]{4,12})", daterange[0])
         start_year = start_date[:4]
         end_year = end_date[:4]
     else:
         # Check for single dates in the filename
-        dates = re.findall(r'([0-9]{4,12})', stem)
+        dates = re.findall(r"([0-9]{4,12})", stem)
         if len(dates) == 1:
             start_year = end_year = dates[0][:4]
         elif len(dates) > 1:
             # Check for dates at start or end of filename
-            outerdates = re.findall(r'^[0-9]{4,12}|[0-9]{4,12}$', stem)
+            outerdates = re.findall(r"^[0-9]{4,12}|[0-9]{4,12}$", stem)
             if len(outerdates) == 1:
                 start_year = end_year = outerdates[0][:4]
 
@@ -61,16 +61,16 @@ def get_start_end_year(filename):
         for cube in cubes:
             logger.debug(cube)
             try:
-                time = cube.coord('time')
+                time = cube.coord("time")
             except iris.exceptions.CoordinateNotFoundError:
                 continue
             start_year = time.cell(0).point.year
             end_year = time.cell(-1).point.year
             break
 
     if start_year is None or end_year is None:
-        raise ValueError(f'File {filename} dates do not match a recognized'
-                         'pattern and time can not be read from the file')
+        raise ValueError(f"File {filename} dates do not match a recognized"
+                         "pattern and time can not be read from the file")
 
     logger.debug("Found start_year %s and end_year %s", start_year, end_year)
     return int(start_year), int(end_year)
@@ -92,7 +92,7 @@ def select_files(filenames, start_year, end_year):
 def _replace_tags(paths, variable):
     """Replace tags in the config-developer's file with actual values."""
     if isinstance(paths, str):
-        paths = set((paths.strip('/'),))
+        paths = set((paths.strip('/'), ))
     else:
         paths = set(path.strip('/') for path in paths)
     tlist = set()
@@ -101,10 +101,9 @@ def _replace_tags(paths, variable):
     if 'sub_experiment' in variable:
         new_paths = []
         for path in paths:
-            new_paths.extend((
-                re.sub(r'(\b{ensemble}\b)', r'{sub_experiment}-\1', path),
-                re.sub(r'({ensemble})', r'{sub_experiment}-\1', path)
-            ))
+            new_paths.extend(
+                (re.sub(r'(\b{ensemble}\b)', r'{sub_experiment}-\1', path),
+                 re.sub(r'({ensemble})', r'{sub_experiment}-\1', path)))
             tlist.add('sub_experiment')
         paths = new_paths
     logger.debug(tlist)
@@ -113,7 +112,7 @@ def _replace_tags(paths, variable):
         original_tag = tag
         tag, _, _ = _get_caps_options(tag)
 
-        if tag == 'latestversion':  # handled separately later
+        if tag == "latestversion":  # handled separately later
             continue
         if tag in variable:
             replacewith = variable[tag]
@@ -140,10 +139,10 @@ def _replace_tag(paths, tag, replacewith):
 def _get_caps_options(tag):
     lower = False
     upper = False
-    if tag.endswith('.lower'):
+    if tag.endswith(".lower"):
         lower = True
         tag = tag[0:-6]
-    elif tag.endswith('.upper'):
+    elif tag.endswith(".upper"):
         upper = True
         tag = tag[0:-6]
     return tag, lower, upper
@@ -163,60 +162,114 @@ def _resolve_latestversion(dirname_template):
     This implementation avoid globbing on centralized clusters with very
     large data root dirs (i.e. ESGF nodes like Jasmin/DKRZ).
     """
-    if '{latestversion}' not in dirname_template:
+    if "{latestversion}" not in dirname_template:
         return dirname_template
 
     # Find latest version
-    part1, part2 = dirname_template.split('{latestversion}')
+    part1, part2 = dirname_template.split("{latestversion}")
     part2 = part2.lstrip(os.sep)
     if os.path.exists(part1):
         versions = os.listdir(part1)
         versions.sort(reverse=True)
-        for version in ['latest'] + versions:
+        for version in ["latest"] + versions:
             dirname = os.path.join(part1, version, part2)
             if os.path.isdir(dirname):
                 return dirname
 
     return dirname_template
 
 
+def _resolve_wildcards_and_version(dirname, basepath, project, drs):
+    """Resolve wildcards and latestversion tag."""
+    if "{latestversion}" in dirname:
+        dirname_version_wildcard = dirname.replace("{latestversion}", "*")
+
+        # Find all directories that match the template
+        all_dirs = sorted(glob.glob(dirname_version_wildcard))
+
+        # Sort directories by version
+        all_dirs_dict = {}
+        for directory in all_dirs:
+            version = dir_to_var(
+                directory, basepath, project, drs)['latestversion']
+            all_dirs_dict.setdefault(version, [])
+            all_dirs_dict[version].append(directory)
+
+        # Select latest version
+        if not all_dirs_dict:
+            dirnames = []
+        elif 'latest' in all_dirs_dict:
+            dirnames = all_dirs_dict['latest']
+        else:
+            all_versions = sorted(list(all_dirs_dict))
+            dirnames = all_dirs_dict[all_versions[-1]]
+
+    # No {latestversion} tag
+    else:
+        dirnames = sorted(glob.glob(dirname))
+
+    # No directories found
+    if not dirnames:
+        logger.debug("Unable to resolve %s", dirname)
+        return dirname
+
+    # Exactly one directory found
+    if len(dirnames) == 1:
+        return dirnames[0]
+
+    # Warn if multiple directories have been found and prioritize r0i0p0
+    logger.warning("Multiple directories for fx variables found: %s", dirnames)
+    r0i0p0_matches = [d for d in dirnames if "r0i0p0" in d]
+    if r0i0p0_matches:
+        return r0i0p0_matches[0]
+    return dirnames[0]
+
+
 def _select_drs(input_type, drs, project):
     """Select the directory structure of input path."""
     cfg = get_project_config(project)
     input_path = cfg[input_type]
     if isinstance(input_path, str):
         return input_path
 
-    structure = drs.get(project, 'default')
+    structure = drs.get(project, "default")
     if structure in input_path:
         return input_path[structure]
 
     raise KeyError(
-        'drs {} for {} project not specified in config-developer file'.format(
+        "drs {} for {} project not specified in config-developer file".format(
             structure, project))
 
 
 def get_rootpath(rootpath, project):
     """Select the rootpath."""
     if project in rootpath:
         return rootpath[project]
-    if 'default' in rootpath:
-        return rootpath['default']
-    raise KeyError('default rootpath must be specified in config-user file')
+    if "default" in rootpath:
+        return rootpath["default"]
+    raise KeyError("default rootpath must be specified in config-user file")
 
 
 def _find_input_dirs(variable, rootpath, drs):
     """Return a the full paths to input directories."""
-    project = variable['project']
+    project = variable["project"]
 
     root = get_rootpath(rootpath, project)
-    path_template = _select_drs('input_dir', drs, project)
+    path_template = _select_drs("input_dir", drs, project)
 
     dirnames = []
     for dirname_template in _replace_tags(path_template, variable):
         for base_path in root:
             dirname = os.path.join(base_path, dirname_template)
-            dirname = _resolve_latestversion(dirname)
+            if variable["frequency"] == "fx" and "*" in dirname:
+                dirname = _resolve_wildcards_and_version(dirname, base_path,
+                                                         project, drs)
+                var_from_dir = dir_to_var(dirname, base_path, project, drs)
+                for (key, val) in variable.items():
+                    if val == '*':
+                        variable[key] = var_from_dir.get(key, '*')
+            else:
+                dirname = _resolve_latestversion(dirname)
             matches = glob.glob(dirname)
             matches = [match for match in matches if os.path.isdir(match)]
             if matches:
@@ -231,65 +284,104 @@ def _find_input_dirs(variable, rootpath, drs):
 
 def _get_filenames_glob(variable, drs):
     """Return patterns that can be used to look for input files."""
-    path_template = _select_drs('input_file', drs, variable['project'])
+    path_template = _select_drs("input_file", drs, variable["project"])
     filenames_glob = _replace_tags(path_template, variable)
     return filenames_glob
 
 
 def _find_input_files(variable, rootpath, drs):
-    short_name = variable['short_name']
-    variable['short_name'] = variable['original_short_name']
+    short_name = variable["short_name"]
+    variable["short_name"] = variable["original_short_name"]
     input_dirs = _find_input_dirs(variable, rootpath, drs)
     filenames_glob = _get_filenames_glob(variable, drs)
     files = find_files(input_dirs, filenames_glob)
-    variable['short_name'] = short_name
+    variable["short_name"] = short_name
     return (files, input_dirs, filenames_glob)
 
 
 def get_input_filelist(variable, rootpath, drs):
     """Return the full path to input files."""
     # change ensemble to fixed r0i0p0 for fx variables
     # this is needed and is not a duplicate effort
-    if variable['project'] == 'CMIP5' and variable['frequency'] == 'fx':
+    if all([
+            variable['project'] == 'CMIP5', variable['frequency'] == 'fx',
+            variable.get('ensemble') != '*'
+    ]):
         variable['ensemble'] = 'r0i0p0'
     (files, dirnames, filenames) = _find_input_files(variable, rootpath, drs)
+
     # do time gating only for non-fx variables
-    if variable['frequency'] != 'fx':
-        files = select_files(files, variable['start_year'],
-                             variable['end_year'])
+    if variable["frequency"] != "fx":
+        files = select_files(files, variable["start_year"],
+                             variable["end_year"])
     return (files, dirnames, filenames)
 
 
 def get_output_file(variable, preproc_dir):
     """Return the full path to the output (preprocessed) file."""
-    cfg = get_project_config(variable['project'])
+    cfg = get_project_config(variable["project"])
 
     # Join different experiment names
-    if isinstance(variable.get('exp'), (list, tuple)):
+    if isinstance(variable.get("exp"), (list, tuple)):
         variable = dict(variable)
-        variable['exp'] = '-'.join(variable['exp'])
+        variable["exp"] = "-".join(variable["exp"])
 
     outfile = os.path.join(
         preproc_dir,
-        variable['diagnostic'],
-        variable['variable_group'],
-        _replace_tags(cfg['output_file'], variable)[0],
+        variable["diagnostic"],
+        variable["variable_group"],
+        _replace_tags(cfg["output_file"], variable)[0],
     )
-    if variable['frequency'] != 'fx':
-        outfile += '_{start_year}-{end_year}'.format(**variable)
-    outfile += '.nc'
+    if variable["frequency"] != "fx":
+        outfile += "_{start_year}-{end_year}".format(**variable)
+    outfile += ".nc"
     return outfile
 
 
 def get_statistic_output_file(variable, preproc_dir):
     """Get multi model statistic filename depending on settings."""
     template = os.path.join(
         preproc_dir,
-        '{diagnostic}',
-        '{variable_group}',
-        '{dataset}_{mip}_{short_name}_{start_year}-{end_year}.nc',
+        "{diagnostic}",
+        "{variable_group}",
+        "{dataset}_{mip}_{short_name}_{start_year}-{end_year}.nc",
     )
 
     outfile = template.format(**variable)
 
     return outfile
+
+
+def dir_to_var(dirname, basepath, project, drs):
+    """Convert directory path to variable :obj:`dict`."""
+    if dirname != os.sep:
+        dirname = dirname.rstrip(os.sep)
+    if basepath != os.sep:
+        basepath = basepath.rstrip(os.sep)
+    path_template = _select_drs("input_dir", drs, project).rstrip(os.sep)
+    rel_dir = os.path.relpath(dirname, basepath)
+    keys = path_template.split(os.sep)
+    vals = rel_dir.split(os.sep)
+    if len(keys) != len(vals):
+        raise ValueError(
+            f"Cannot extract tags '{path_template}' from directory "
+            f"'{rel_dir}' (root: '{basepath}') with different numbers of "
+            f"elements")
+    variable = {}
+    for (idx, full_key) in enumerate(keys):
+        matches = re.findall(r'.*\{(.*)\}.*', full_key)
+        if len(matches) != 1:
+            continue
+        key = matches[0]
+        regex = rf"{full_key.replace(key, '(.*)')}"
+        regex = regex.replace('{', '').replace('}', '')
+        matches = re.findall(regex, vals[idx])
+        while '' in matches:
+            matches.remove('')
+        if len(matches) != 1:
+            raise ValueError(
+                f"Regex pattern '{regex}' for '{full_key}' cannot be "
+                f"(uniquely) matched to element '{vals[idx]}' in directory "
+                f"'{dirname}'")
+        variable[key] = matches[0]
+    return variable