diff --git a/nomenclature/codelist.py b/nomenclature/codelist.py index cdb6f982..81cbdc15 100644 --- a/nomenclature/codelist.py +++ b/nomenclature/codelist.py @@ -8,9 +8,9 @@ from pyam.utils import write_sheet from pydantic import BaseModel, validator +import nomenclature from nomenclature.code import Code, MetaCode, RegionCode, VariableCode from nomenclature.config import DataStructureConfig -from nomenclature.countries import countries from nomenclature.error.codelist import DuplicateCodeError from nomenclature.error.variable import ( MissingWeightError, @@ -191,8 +191,33 @@ def from_directory( instance of cls (:class:`CodeList` if not inherited) """ - code_list: List[Code] = [] + code_list = cls._parse_codelist_dir(path, file_glob_pattern) + + if config is not None: + dimension = path.name + codelistconfig = getattr(config, dimension, None) + if codelistconfig is not None and codelistconfig.repository is not None: + repo_path = ( + config.repository[codelistconfig.repository].local_path + / codelistconfig.repository_dimension_path + ) + code_list.extend( + cls._parse_codelist_dir( + repo_path, + file_glob_pattern, + ) + ) + + mapping: Dict[str, Code] = {} + for code in code_list: + if code.name in mapping: + raise DuplicateCodeError(name=name, code=code.name) + mapping[code.name] = code + return cls(name=name, mapping=mapping) + @classmethod + def _parse_codelist_dir(cls, path: Path, file_glob_pattern: str = "**/*"): + code_list: List[Code] = [] for yaml_file in ( f for f in path.glob(file_glob_pattern) @@ -200,19 +225,13 @@ def from_directory( ): with open(yaml_file, "r", encoding="utf-8") as stream: _code_list = yaml.safe_load(stream) - for code_dict in _code_list: code = cls.code_basis.from_dict(code_dict) - # add `file` attribute code.file = yaml_file.relative_to(path.parent).as_posix() code_list.append(code) + code_list = cls._parse_and_replace_tags(code_list, path, file_glob_pattern) - mapping: Dict[str, Code] = {} - for code in code_list: - if code.name 
in mapping: - raise DuplicateCodeError(name=name, code=code.name) - mapping[code.name] = code - return cls(name=name, mapping=mapping) + return code_list @classmethod def read_excel(cls, name, source, sheet_name, col, attrs=None): @@ -536,7 +555,7 @@ def from_directory( Name of the CodeList path : :class:`pathlib.Path` or path-like Directory with the codelist files - config : :class:`DataStructureConfig`, optional + config : :class:`RegionCodeListConfig`, optional Attributes for configuring the CodeList file_glob_pattern : str, optional Pattern to downselect codelist files by name, default: "**/*" (i.e. all @@ -554,7 +573,7 @@ def from_directory( if config is not None and config.region is not None: # adding all countries if config.region.country is True: - for c in countries: + for c in nomenclature.countries: try: code_list.append( RegionCode( @@ -566,12 +585,17 @@ def from_directory( code_list.append(RegionCode(name=c.name, hierarchy="Country")) # importing from an external repository - if repo := config.region.repository: - repo_path = path.parents[1] / repo - if not repo_path.exists(): - raise FileNotFoundError(f"Repository not found: {repo}") + if config.region.repository: + repo_path = ( + config.repository[config.region.repository].local_path + / config.region.repository_dimension_path + ) + code_list = cls._parse_region_code_dir( - code_list, repo_path, file_glob_pattern, repository=repo + code_list, + repo_path, + file_glob_pattern, + repository=config.repository, ) code_list = cls._parse_and_replace_tags( code_list, repo_path, file_glob_pattern diff --git a/nomenclature/config.py b/nomenclature/config.py index 427e189a..b31b279d 100644 --- a/nomenclature/config.py +++ b/nomenclature/config.py @@ -1,18 +1,69 @@ from pathlib import Path -from typing import Dict, Optional -from pydantic import BaseModel +from typing import Optional, Dict +from pydantic import BaseModel, root_validator, validator import yaml +from git import Repo class 
@root_validator
def check_repository_consistency(cls, values):
    """Check that each dimension refers to a repository defined in this config

    Parameters
    ----------
    values : dict
        Field values of the model, as handed to a root validator

    Returns
    -------
    dict
        The unmodified `values`

    Raises
    ------
    ValueError
        If the `repository` of the region or variable configuration is not a key
        of the top-level `repository` mapping; this includes the case where no
        repositories are defined at all
    """
    # `repository` has a default, so the key is only absent if the field itself
    # failed validation; that error is reported on its own, do not pile on here
    if "repository" not in values:
        return values
    known_repositories = values["repository"]

    for dimension in ("region", "variable"):
        codelist_config = values.get(dimension)
        # dimension not configured, or configured without an external repository
        if not codelist_config or not codelist_config.repository:
            continue
        # checked even if `known_repositories` is empty: a name can never be
        # valid in that case and would otherwise only fail later as a KeyError
        if codelist_config.repository not in known_repositories:
            raise ValueError(
                f"Unknown repository '{codelist_config.repository}' in"
                f" {dimension}.repository."
            )
    return values
def test_definition_from_general_config():
    """Read a definition whose `config.yaml` adds countries and an external repo"""
    definition = DataStructureDefinition(
        TEST_DATA_DIR / "general-config-definitions",
        dimensions=["region", "variable"],
    )
    expected_regions = (
        # explicitly defined in `general-config-definitions/region/regions.yaml`
        "Region A",
        # imported from https://github.com/IAMconsortium/common-definitions repo
        "World",
        # added via general-config definitions
        "Austria",
        # added via general-config definitions renamed from pycountry name
        "Bolivia",
        # added via general-config definitions in addition to pycountry.countries
        "Kosovo",
    )
    try:
        for region in expected_regions:
            assert region in definition.region

        # imported from https://github.com/IAMconsortium/common-definitions repo
        assert "Primary Energy" in definition.variable
    finally:
        # clean up the external repo
        local_paths = (
            repository.local_path
            for repository in definition.config.repository.values()
        )
        for local_path in local_paths:
            if local_path.exists():
                shutil.rmtree(local_path, ignore_errors=True)