From f0f001a001aa7cf0a78f6a7934683538c7d2985b Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 13 Aug 2024 06:55:06 +0200 Subject: [PATCH 01/10] Rename tests files --- nomenclature/processor/data_validator.py | 2 +- .../{definition => definitions}/region/region.yaml | 0 .../{definition => definitions}/variable/variable.yaml | 0 .../data/validation/validate_data/fail_unknown_region.yaml | 3 --- ...ion_unknown_region.yaml => validate_unknown_region.yaml} | 0 ...unknown_variable.yaml => validate_unknown_variable.yaml} | 0 tests/{test_data_validation.py => test_validate_data.py} | 6 +++--- 7 files changed, 4 insertions(+), 7 deletions(-) rename tests/data/validation/{definition => definitions}/region/region.yaml (100%) rename tests/data/validation/{definition => definitions}/variable/variable.yaml (100%) delete mode 100644 tests/data/validation/validate_data/fail_unknown_region.yaml rename tests/data/validation/validate_data/{validation_unknown_region.yaml => validate_unknown_region.yaml} (100%) rename tests/data/validation/validate_data/{validation_unknown_variable.yaml => validate_unknown_variable.yaml} (100%) rename tests/{test_data_validation.py => test_validate_data.py} (95%) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index 96b79d20..ef42a835 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -3,7 +3,7 @@ import yaml -from nomenclature import DataStructureDefinition +from nomenclature.definition import DataStructureDefinition from nomenclature.error import ErrorCollector from nomenclature.processor.iamc import IamcDataFilter from nomenclature.processor import Processor diff --git a/tests/data/validation/definition/region/region.yaml b/tests/data/validation/definitions/region/region.yaml similarity index 100% rename from tests/data/validation/definition/region/region.yaml rename to tests/data/validation/definitions/region/region.yaml diff --git a/tests/data/validation/definition/variable/variable.yaml b/tests/data/validation/definitions/variable/variable.yaml similarity index 100% rename from tests/data/validation/definition/variable/variable.yaml rename to tests/data/validation/definitions/variable/variable.yaml diff --git a/tests/data/validation/validate_data/fail_unknown_region.yaml b/tests/data/validation/validate_data/fail_unknown_region.yaml deleted file mode 100644 index 8987e6ae..00000000 --- a/tests/data/validation/validate_data/fail_unknown_region.yaml +++ /dev/null @@ -1,3 +0,0 @@ - - variable: Final Energy - region: Asia - diff --git a/tests/data/validation/validate_data/validation_unknown_region.yaml b/tests/data/validation/validate_data/validate_unknown_region.yaml similarity index 100% rename from tests/data/validation/validate_data/validation_unknown_region.yaml rename to tests/data/validation/validate_data/validate_unknown_region.yaml diff --git a/tests/data/validation/validate_data/validation_unknown_variable.yaml b/tests/data/validation/validate_data/validate_unknown_variable.yaml similarity index 100% rename from tests/data/validation/validate_data/validation_unknown_variable.yaml rename to tests/data/validation/validate_data/validate_unknown_variable.yaml diff --git a/tests/test_data_validation.py b/tests/test_validate_data.py similarity index 95% rename from tests/test_data_validation.py rename to tests/test_validate_data.py index 8a6f8614..366800aa 100644 --- a/tests/test_data_validation.py +++ b/tests/test_validate_data.py @@ -25,7 +25,7 @@ def test_DataValidator_from_file(): obs = DataValidator.from_file(DATA_VALIDATION_TEST_DIR / "simple_validation.yaml") assert obs == exp - dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definition") + dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definitions") assert obs.validate_with_definition(dsd) is None @@ -47,13 +47,13 @@ def test_DataValidator_validate_with_definition_raises(dimension, match): ) # validating against a DataStructure with all dimensions raises - dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definition") + dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definitions") with pytest.raises(ValueError, match=match): data_validator.validate_with_definition(dsd) # validating against a DataStructure without the offending dimension passes dsd = DataStructureDefinition( - TEST_DATA_DIR / "validation" / "definition", + TEST_DATA_DIR / "validation" / "definitions", dimensions=[dim for dim in ["region", "variable"] if dim != dimension], ) assert data_validator.validate_with_definition(dsd) is None From 7cddd20060e69d743a2a8bc0376f4317adac899b Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 13 Aug 2024 07:04:20 +0200 Subject: [PATCH 02/10] Add `validate_data` check to the `validate-project` CLI --- nomenclature/cli.py | 29 +++++++++++++------- nomenclature/testing.py | 60 +++++++++++++++++++++++++++-------------- 2 files changed, 60 insertions(+), 29 deletions(-) diff --git a/nomenclature/cli.py b/nomenclature/cli.py index d0324495..33cdafc1 100644 --- a/nomenclature/cli.py +++ b/nomenclature/cli.py @@ -38,6 +38,12 @@ def cli_valid_yaml(path: Path): type=str, default=None, ) +@click.option( + "--validate_data", + help="Optional name for validation folder", + type=str, + default=None, +) @click.option( "--dimension", "dimensions", @@ -51,6 +57,7 @@ def cli_valid_project( definitions: str, mappings: Optional[str], required_data: Optional[str], + validate_data: Optional[str], dimensions: Optional[List[str]], ): """Assert that `path` is a valid project nomenclature @@ -60,11 +67,13 @@ def cli_valid_project( path : Path Project directory to be validated definitions : str, optional - Name of the definitions folder, defaults to "definitions" + Name of 'definitions' folder, defaults to "definitions" mappings : str, optional - Name of the mappings folder, defaults to "mappings" (if this folder exists) + Name of 'mappings' folder, defaults to "mappings" required_data: str, optional - Name of the required data folder, default to "required_data" (if folder exists) + Name of folder for 'required data' criteria, default to "required_data" + validate_data: str, optional + Name of folder for data validation criteria, default to "validate_data" dimensions : List[str], optional Dimensions to be checked, defaults to all sub-folders of `definitions` @@ -76,23 +85,25 @@ def cli_valid_project( --dimension --dimension - - Note ---- This test includes three steps: - 1. Test that all yaml files in `definitions/` and `mappings/` can be correctly read + 1. Test that all yaml files in `definitions` and `mappings` can be correctly parsed as yaml files. This is a formal check for yaml syntax only. - 2. Test that all files in `definitions/` can be correctly parsed as a + 2. Test that all files in `definitions` can be correctly parsed as a :class:`DataStructureDefinition` object comprised of individual codelists. - 3. Test that all model mappings in `mappings/` can be correctly parsed as a + 3. Test that all model mappings in `mappings` can be correctly parsed as a :class:`RegionProcessor` object. This includes a check that all regions mentioned in a model mapping are defined in the region codelist. + 4. Test that all required-data and data-validation files can be parsed correctly + and are consistent with the `definitions`. """ assert_valid_yaml(path) - assert_valid_structure(path, definitions, mappings, required_data, dimensions) + assert_valid_structure( + path, definitions, mappings, required_data, validate_data, dimensions + ) @cli.command("check-region-aggregation") diff --git a/nomenclature/testing.py b/nomenclature/testing.py index 28069981..5ceb6f86 100644 --- a/nomenclature/testing.py +++ b/nomenclature/testing.py @@ -5,7 +5,12 @@ import yaml from nomenclature.definition import DataStructureDefinition -from nomenclature.processor import RegionProcessor, RequiredDataValidator +from nomenclature.processor import ( + DataValidator, + RegionProcessor, + RequiredDataValidator, + Processor, +) from nomenclature.error import ErrorCollector logger = logging.getLogger(__name__) @@ -63,35 +68,36 @@ def _check_mappings( raise FileNotFoundError(f"Mappings directory not found: {path / mappings}") -def _collect_RequiredData_errors( - required_data_dir: Path, dsd: DataStructureDefinition +def _collect_processor_errors( + path: Path, processor: Processor, dsd: DataStructureDefinition ) -> None: errors = ErrorCollector() - for file in required_data_dir.iterdir(): + for file in path.iterdir(): try: - RequiredDataValidator.from_file(file).validate_with_definition(dsd) + processor.from_file(file).validate_with_definition(dsd) except ValueError as error: errors.append(error) if errors: - raise ValueError(f"Found error(s) in required data files:\n{errors}") + raise ValueError(f"Error(s) in files for {processor.__class__}:\n{errors}") -def _check_RequiredData( +def _check_processor_directory( path: Path, - definitions: str = "definitions", + processor: Processor, + processor_arg: str, + folder: Optional[str] = None, + definitions: Optional[str] = "definitions", dimensions: Optional[List[str]] = None, - required_data: Optional[str] = None, ) -> None: dsd = DataStructureDefinition(path / definitions, dimensions) - if required_data is None: - if (path / "requiredData").is_dir(): - _collect_RequiredData_errors(path / "required_data", dsd) - - elif (path / required_data).is_dir(): - _collect_RequiredData_errors(path / required_data, dsd) + if folder is None: + if (path / processor_arg).is_dir(): + _collect_processor_errors(path / processor_arg, processor, dsd) + elif (path / folder).is_dir(): + _collect_processor_errors(path / folder, processor, dsd) else: raise FileNotFoundError( - f"Directory for required data not found at: {path / required_data}" + f"Directory for '{processor_arg}' not found at: {path / folder}" ) @@ -100,6 +106,7 @@ def assert_valid_structure( definitions: str = "definitions", mappings: Optional[str] = None, required_data: Optional[str] = None, + validate_data: Optional[str] = None, dimensions: Optional[List[str]] = None, ) -> None: """Assert that `path` can be initialized as a :class:`DataStructureDefinition` @@ -113,8 +120,11 @@ def assert_valid_structure( mappings : str, optional Name of the mappings folder, defaults to "mappings" (if this folder exists) required_data : str, optional - Name of the required data folder, defaults to "required_data" (if this folder - exists) + Name of the folder for required data, defaults to "required_data" + (if this folder exists) + validate_data : str, optional + Name of the folder for data validation criteria, defaults to "validate_date" + (if this folder exists) dimensions : List[str], optional Dimensions to be checked, defaults to all sub-folders of `definitions` @@ -122,7 +132,7 @@ def assert_valid_structure( ----- Folder structure of `path`: - A `definitions` folder is required and must be a valid - :class:`DataStructureDefinition` + :class:`DataStructureDefinition` - The `definitions` folder must contain sub-folder(s) to validate - If a `mappings` folder exists, it must be a valid :class:`RegionProcessor` @@ -139,7 +149,17 @@ def assert_valid_structure( f"`definitions` directory is empty: {path / definitions}" ) _check_mappings(path, definitions, dimensions, mappings) - _check_RequiredData(path, definitions, dimensions, required_data) + _check_processor_directory( + path, + RequiredDataValidator, + "required_data", + required_data, + definitions, + dimensions, + ) + _check_processor_directory( + path, DataValidator, "validate_data", validate_data, definitions, dimensions + ) # Todo: add function which runs `DataStructureDefinition(path).validate(scenario)` From 85cd7be9bb7499a0690920f0c0219d0ec2687f39 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 13 Aug 2024 07:04:35 +0200 Subject: [PATCH 03/10] Import `DataValidator` --- nomenclature/processor/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nomenclature/processor/__init__.py b/nomenclature/processor/__init__.py index 2423344c..4591f186 100644 --- a/nomenclature/processor/__init__.py +++ b/nomenclature/processor/__init__.py @@ -4,3 +4,4 @@ RegionProcessor, ) from nomenclature.processor.required_data import RequiredDataValidator # noqa +from nomenclature.processor.data_validator import DataValidator # noqa From f4c68e9d34080b2106179f4dc8925818206b22e2 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 13 Aug 2024 07:09:41 +0200 Subject: [PATCH 04/10] Add a test --- tests/test_cli.py | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index af08f259..de2c0ab7 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -256,6 +256,22 @@ def test_cli_missing_mappings_fails(): assert "Mappings directory not found" in str(cli_result.exception) +def test_cli_validate_data_fails(): + """Assert that validating invalid yaml fails""" + cli_result = runner.invoke( + cli, + [ + "validate-project", + str(TEST_DATA_DIR / "validation"), + ], + ) + + assert cli_result.exit_code == 1 + assert "Collected 2 errors:" in str(cli_result.exception) + assert "Asia" in str(cli_result.exception) + assert "Final Energy|Industry" in str(cli_result.exception) + + def test_cli_empty_definitions_dir(): """Assert that an error is raised when the `definitions` directory is empty""" @@ -365,7 +381,6 @@ def test_cli_add_missing_variables(simple_definition, tmp_path): def test_cli_run_workflow(tmp_path, simple_df): - simple_df.to_excel(tmp_path / "input.xlsx") runner.invoke( From ab5e7198fb9d81ba85b2d9091435ef68f96c4126 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 13 Aug 2024 08:35:58 +0200 Subject: [PATCH 05/10] Remove unused import --- tests/test_cli.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/test_cli.py b/tests/test_cli.py index de2c0ab7..9261de1c 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -12,7 +12,6 @@ from nomenclature import cli from nomenclature.testing import assert_valid_structure, assert_valid_yaml from nomenclature.codelist import VariableCodeList -from nomenclature.cli import cli_run_workflow from conftest import TEST_DATA_DIR From 7d02847d13c4c07e8e9e1aab91c05ca142ec563e Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 13 Aug 2024 08:37:39 +0200 Subject: [PATCH 06/10] Fix failing tests --- tests/test_validate_data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_validate_data.py b/tests/test_validate_data.py index 366800aa..4d339750 100644 --- a/tests/test_validate_data.py +++ b/tests/test_validate_data.py @@ -43,7 +43,7 @@ def test_DataValidator_validate_with_definition_raises(dimension, match): # TODO Undefined unit data_validator = DataValidator.from_file( - DATA_VALIDATION_TEST_DIR / f"validation_unknown_{dimension}.yaml" + DATA_VALIDATION_TEST_DIR / f"validate_unknown_{dimension}.yaml" ) # validating against a DataStructure with all dimensions raises From f0699077591a207ea8d387037d918732ae3fd262 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 13 Aug 2024 09:57:07 +0200 Subject: [PATCH 07/10] Shorter log messages for error collection --- nomenclature/error.py | 15 ++++++++++----- nomenclature/processor/data_validator.py | 4 ++-- nomenclature/testing.py | 6 ++++-- tests/test_cli.py | 2 +- 4 files changed, 17 insertions(+), 10 deletions(-) diff --git a/nomenclature/error.py b/nomenclature/error.py index 39296ce2..63926b3f 100644 --- a/nomenclature/error.py +++ b/nomenclature/error.py @@ -1,5 +1,6 @@ import textwrap from collections import namedtuple +from typing import Optional pydantic_custom_error_config = { "RegionNameCollisionError": ( @@ -47,9 +48,11 @@ class ErrorCollector: errors: list[Exception] + description: Optional[str] = None - def __init__(self) -> None: + def __init__(self, description: str = None) -> None: self.errors = [] + self.description = description def append(self, error: Exception) -> None: self.errors.append(error) @@ -57,12 +60,14 @@ def append(self, error: Exception) -> None: def __repr__(self) -> str: error = "error" if len(self.errors) == 1 else "errors" error_list_str = "\n".join( - f"{i+1}. {error}" for i, error in enumerate(self.errors) + f"{i + 1}. {error}" for i, error in enumerate(self.errors) ) - return f"Collected {len(self.errors)} {error}:\n" + textwrap.indent( - error_list_str, prefix=" " - ) + message = f"Collected {len(self.errors)} {error}" + if self.description is not None: + message += f" when checking {self.description}" + + return f"{message}:\n" + textwrap.indent(error_list_str, prefix=" ") def __bool__(self) -> bool: return bool(self.errors) diff --git a/nomenclature/processor/data_validator.py b/nomenclature/processor/data_validator.py index ef42a835..dfc5158c 100644 --- a/nomenclature/processor/data_validator.py +++ b/nomenclature/processor/data_validator.py @@ -33,11 +33,11 @@ def apply(self): pass def validate_with_definition(self, dsd: DataStructureDefinition) -> None: - errors = ErrorCollector() + errors = ErrorCollector(description=f"in file '{self.file}'") for data in self.criteria_items: try: data.validate_with_definition(dsd) except ValueError as value_error: errors.append(value_error) if errors: - raise ValueError(f"In file {get_relative_path(self.file)}:\n{errors}") + raise ValueError(errors) diff --git a/nomenclature/testing.py b/nomenclature/testing.py index 5ceb6f86..b93e0b65 100644 --- a/nomenclature/testing.py +++ b/nomenclature/testing.py @@ -71,14 +71,16 @@ def _check_mappings( def _collect_processor_errors( path: Path, processor: Processor, dsd: DataStructureDefinition ) -> None: - errors = ErrorCollector() + errors = ErrorCollector( + description=f"files for '{processor.__name__}' ({path})" + ) for file in path.iterdir(): try: processor.from_file(file).validate_with_definition(dsd) except ValueError as error: errors.append(error) if errors: - raise ValueError(f"Error(s) in files for {processor.__class__}:\n{errors}") + raise ValueError(errors) def _check_processor_directory( diff --git a/tests/test_cli.py b/tests/test_cli.py index 9261de1c..6687140a 100644 --- a/tests/test_cli.py +++ b/tests/test_cli.py @@ -266,7 +266,7 @@ def test_cli_validate_data_fails(): ) assert cli_result.exit_code == 1 - assert "Collected 2 errors:" in str(cli_result.exception) + assert "Collected 2 errors" in str(cli_result.exception) assert "Asia" in str(cli_result.exception) assert "Final Energy|Industry" in str(cli_result.exception) From 6432c443ece56edf8152ace5344183fc4816b8a7 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Tue, 13 Aug 2024 10:03:18 +0200 Subject: [PATCH 08/10] Make black --- nomenclature/testing.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/nomenclature/testing.py b/nomenclature/testing.py index b93e0b65..6df2081b 100644 --- a/nomenclature/testing.py +++ b/nomenclature/testing.py @@ -71,9 +71,7 @@ def _check_mappings( def _collect_processor_errors( path: Path, processor: Processor, dsd: DataStructureDefinition ) -> None: - errors = ErrorCollector( - description=f"files for '{processor.__name__}' ({path})" - ) + errors = ErrorCollector(description=f"files for '{processor.__name__}' ({path})") for file in path.iterdir(): try: processor.from_file(file).validate_with_definition(dsd) From 5ec1ac508d89c28295023450fe64e3c0ee9b0e93 Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 09:33:26 +0200 Subject: [PATCH 09/10] More specific type-hint per suggestion by @hackstock --- nomenclature/testing.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/nomenclature/testing.py b/nomenclature/testing.py index 6df2081b..b8704978 100644 --- a/nomenclature/testing.py +++ b/nomenclature/testing.py @@ -69,7 +69,9 @@ def _check_mappings( def _collect_processor_errors( - path: Path, processor: Processor, dsd: DataStructureDefinition + path: Path, + processor: type[RequiredDataValidator] | type[DataValidator], + dsd: DataStructureDefinition, ) -> None: errors = ErrorCollector(description=f"files for '{processor.__name__}' ({path})") for file in path.iterdir(): @@ -161,5 +163,4 @@ def assert_valid_structure( path, DataValidator, "validate_data", validate_data, definitions, dimensions ) - # Todo: add function which runs `DataStructureDefinition(path).validate(scenario)` From cd858f782d0b7d2d243dca2cc14193027dfcfd0f Mon Sep 17 00:00:00 2001 From: Daniel Huppmann Date: Fri, 16 Aug 2024 09:34:38 +0200 Subject: [PATCH 10/10] Make black --- nomenclature/testing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nomenclature/testing.py b/nomenclature/testing.py index b8704978..8a4c689f 100644 --- a/nomenclature/testing.py +++ b/nomenclature/testing.py @@ -163,4 +163,5 @@ def assert_valid_structure( path, DataValidator, "validate_data", validate_data, definitions, dimensions ) + # Todo: add function which runs `DataStructureDefinition(path).validate(scenario)`