Add check for validate_data consistency to validate-project CLI #367

Merged · 10 commits · Aug 16, 2024
nomenclature/cli.py (29 changes: 20 additions & 9 deletions)
@@ -38,6 +38,12 @@ def cli_valid_yaml(path: Path):
type=str,
default=None,
)
@click.option(
"--validate_data",
help="Optional name for validation folder",
type=str,
default=None,
)
@click.option(
"--dimension",
"dimensions",
@@ -51,6 +57,7 @@ def cli_valid_project(
definitions: str,
mappings: Optional[str],
required_data: Optional[str],
validate_data: Optional[str],
dimensions: Optional[List[str]],
):
"""Assert that `path` is a valid project nomenclature
@@ -60,11 +67,13 @@
path : Path
Project directory to be validated
definitions : str, optional
Name of the definitions folder, defaults to "definitions"
Name of the 'definitions' folder, defaults to "definitions"
mappings : str, optional
Name of the mappings folder, defaults to "mappings" (if this folder exists)
Name of the 'mappings' folder, defaults to "mappings"
required_data: str, optional
Name of the required data folder, default to "required_data" (if folder exists)
Name of the folder for 'required data' criteria, defaults to "required_data"
validate_data: str, optional
Name of the folder for data validation criteria, defaults to "validate_data"
dimensions : List[str], optional
Dimensions to be checked, defaults to all sub-folders of `definitions`

@@ -76,23 +85,25 @@
--dimension <folder2>
--dimension <folder3>



Note
----
This test includes four steps:

1. Test that all yaml files in `definitions/` and `mappings/` can be correctly read
1. Test that all yaml files in `definitions` and `mappings` can be correctly parsed
as yaml files. This is a formal check for yaml syntax only.
2. Test that all files in `definitions/` can be correctly parsed as a
2. Test that all files in `definitions` can be correctly parsed as a
:class:`DataStructureDefinition` object comprised of individual codelists.
3. Test that all model mappings in `mappings/` can be correctly parsed as a
3. Test that all model mappings in `mappings` can be correctly parsed as a
:class:`RegionProcessor` object. This includes a check that all regions mentioned
in a model mapping are defined in the region codelist.
4. Test that all required-data and data-validation files can be parsed correctly
and are consistent with the `definitions`.

"""
assert_valid_yaml(path)
assert_valid_structure(path, definitions, mappings, required_data, dimensions)
assert_valid_structure(
path, definitions, mappings, required_data, validate_data, dimensions
)


@cli.command("check-region-aggregation")
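For orientation (not part of the diff): with the new option, the command can be pointed at a custom data-validation folder. A minimal sketch using click's test runner, in the style of tests/test_cli.py; the project path "my_project" and folder name "my_criteria" are illustrative assumptions.

# Usage sketch for the new --validate_data option; names are illustrative.
from click.testing import CliRunner

from nomenclature import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    [
        "validate-project",
        "my_project",
        "--validate_data",
        "my_criteria",  # overrides the default folder name "validate_data"
    ],
)
assert result.exit_code == 0  # non-zero if any check fails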
nomenclature/error.py (15 changes: 10 additions & 5 deletions)
@@ -1,5 +1,6 @@
import textwrap
from collections import namedtuple
from typing import Optional

pydantic_custom_error_config = {
"RegionNameCollisionError": (
@@ -47,22 +48,26 @@

class ErrorCollector:
errors: list[Exception]
description: Optional[str] = None

def __init__(self) -> None:
def __init__(self, description: Optional[str] = None) -> None:
self.errors = []
self.description = description

def append(self, error: Exception) -> None:
self.errors.append(error)

def __repr__(self) -> str:
error = "error" if len(self.errors) == 1 else "errors"
error_list_str = "\n".join(
f"{i+1}. {error}" for i, error in enumerate(self.errors)
f"{i + 1}. {error}" for i, error in enumerate(self.errors)
)

return f"Collected {len(self.errors)} {error}:\n" + textwrap.indent(
error_list_str, prefix=" "
)
message = f"Collected {len(self.errors)} {error}"
if self.description is not None:
message += f" when checking {self.description}"

return f"{message}:\n" + textwrap.indent(error_list_str, prefix=" ")

def __bool__(self) -> bool:
return bool(self.errors)
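To illustrate the new `description` argument (the collected error messages here are invented for the example):

# Illustration of ErrorCollector with a description; errors are invented.
from nomenclature.error import ErrorCollector

errors = ErrorCollector(description="file 'my_criteria.yaml'")
errors.append(ValueError("region 'Asia' not defined"))
errors.append(ValueError("variable 'Final Energy|Industry' not defined"))

if errors:  # __bool__ is True once any error has been collected
    print(errors)

# Collected 2 errors when checking file 'my_criteria.yaml':
#   1. region 'Asia' not defined
#   2. variable 'Final Energy|Industry' not defined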
nomenclature/processor/__init__.py (1 change: 1 addition & 0 deletions)
@@ -4,3 +4,4 @@
RegionProcessor,
)
from nomenclature.processor.required_data import RequiredDataValidator # noqa
from nomenclature.processor.data_validator import DataValidator # noqa
nomenclature/processor/data_validator.py (6 changes: 3 additions & 3 deletions)
@@ -3,7 +3,7 @@

import yaml

from nomenclature import DataStructureDefinition
from nomenclature.definition import DataStructureDefinition
from nomenclature.error import ErrorCollector
from nomenclature.processor.iamc import IamcDataFilter
from nomenclature.processor import Processor
@@ -33,11 +33,11 @@ def apply(self):
pass

def validate_with_definition(self, dsd: DataStructureDefinition) -> None:
errors = ErrorCollector()
errors = ErrorCollector(description=f"in file '{self.file}'")
for data in self.criteria_items:
try:
data.validate_with_definition(dsd)
except ValueError as value_error:
errors.append(value_error)
if errors:
raise ValueError(f"In file {get_relative_path(self.file)}:\n{errors}")
raise ValueError(errors)
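The effect of the change, as a sketch (paths are illustrative): every criteria item in a file is checked against the definitions, and all failures are reported together through the collector's message instead of a manually assembled prefix.

# Sketch of the aggregated error reporting; paths are illustrative.
from pathlib import Path

from nomenclature.definition import DataStructureDefinition
from nomenclature.processor import DataValidator

dsd = DataStructureDefinition(Path("definitions"))
validator = DataValidator.from_file(Path("validate_data/my_criteria.yaml"))
try:
    validator.validate_with_definition(dsd)
except ValueError as error:
    # e.g. "Collected 2 errors when checking in file '...'" followed by
    # a numbered list of the individual criteria failures
    print(error)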
nomenclature/testing.py (64 changes: 43 additions & 21 deletions)
@@ -5,7 +5,12 @@
import yaml

from nomenclature.definition import DataStructureDefinition
from nomenclature.processor import RegionProcessor, RequiredDataValidator
from nomenclature.processor import (
DataValidator,
RegionProcessor,
RequiredDataValidator,
Processor,
)
from nomenclature.error import ErrorCollector

logger = logging.getLogger(__name__)
@@ -63,35 +68,38 @@ def _check_mappings(
raise FileNotFoundError(f"Mappings directory not found: {path / mappings}")


def _collect_RequiredData_errors(
required_data_dir: Path, dsd: DataStructureDefinition
def _collect_processor_errors(
path: Path,
processor: type[RequiredDataValidator] | type[DataValidator],
dsd: DataStructureDefinition,
) -> None:
errors = ErrorCollector()
for file in required_data_dir.iterdir():
errors = ErrorCollector(description=f"files for '{processor.__name__}' ({path})")
for file in path.iterdir():
try:
RequiredDataValidator.from_file(file).validate_with_definition(dsd)
processor.from_file(file).validate_with_definition(dsd)
except ValueError as error:
errors.append(error)
if errors:
raise ValueError(f"Found error(s) in required data files:\n{errors}")
raise ValueError(errors)


def _check_RequiredData(
def _check_processor_directory(
path: Path,
definitions: str = "definitions",
processor: type[Processor],
processor_arg: str,
folder: Optional[str] = None,
definitions: Optional[str] = "definitions",
dimensions: Optional[List[str]] = None,
required_data: Optional[str] = None,
) -> None:
dsd = DataStructureDefinition(path / definitions, dimensions)
if required_data is None:
if (path / "requiredData").is_dir():
_collect_RequiredData_errors(path / "required_data", dsd)

elif (path / required_data).is_dir():
_collect_RequiredData_errors(path / required_data, dsd)
if folder is None:
if (path / processor_arg).is_dir():
_collect_processor_errors(path / processor_arg, processor, dsd)
elif (path / folder).is_dir():
_collect_processor_errors(path / folder, processor, dsd)
else:
raise FileNotFoundError(
f"Directory for required data not found at: {path / required_data}"
f"Directory for '{processor_arg}' not found at: {path / folder}"
)
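
The folder resolution above works the same way for both processors: when no folder name is given, the default directory is checked only if it exists; a folder named explicitly must exist. A sketch of the two call shapes (the helper is module-internal and "my_project" is an illustrative path):

# Two call shapes of the generic helper; names are illustrative.
from pathlib import Path

from nomenclature.processor import DataValidator, RequiredDataValidator
from nomenclature.testing import _check_processor_directory

project = Path("my_project")

# No folder given: the default "required_data" directory is validated
# only if it exists; otherwise the check is silently skipped.
_check_processor_directory(project, RequiredDataValidator, "required_data", None)

# Explicit folder: a missing directory raises FileNotFoundError.
_check_processor_directory(project, DataValidator, "validate_data", "my_criteria")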


@@ -100,6 +108,7 @@ def assert_valid_structure(
definitions: str = "definitions",
mappings: Optional[str] = None,
required_data: Optional[str] = None,
validate_data: Optional[str] = None,
dimensions: Optional[List[str]] = None,
) -> None:
"""Assert that `path` can be initialized as a :class:`DataStructureDefinition`
@@ -113,16 +122,19 @@
mappings : str, optional
Name of the mappings folder, defaults to "mappings" (if this folder exists)
required_data : str, optional
Name of the required data folder, defaults to "required_data" (if this folder
exists)
Name of the folder for required data, defaults to "required_data"
(if this folder exists)
validate_data : str, optional
Name of the folder for data validation criteria, defaults to "validate_data"
(if this folder exists)
dimensions : List[str], optional
Dimensions to be checked, defaults to all sub-folders of `definitions`

Notes
-----
Folder structure of `path`:
- A `definitions` folder is required and must be a valid
:class:`DataStructureDefinition`
:class:`DataStructureDefinition`
- The `definitions` folder must contain sub-folder(s) to validate
- If a `mappings` folder exists, it must be a valid :class:`RegionProcessor`

@@ -139,7 +151,17 @@
f"`definitions` directory is empty: {path / definitions}"
)
_check_mappings(path, definitions, dimensions, mappings)
_check_RequiredData(path, definitions, dimensions, required_data)
_check_processor_directory(
path,
RequiredDataValidator,
"required_data",
required_data,
definitions,
dimensions,
)
_check_processor_directory(
path, DataValidator, "validate_data", validate_data, definitions, dimensions
)


# Todo: add function which runs `DataStructureDefinition(path).validate(scenario)`
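
Putting the pieces together, the Python-level entry point now accepts the new argument alongside the existing ones (a usage sketch; "my_project" is an illustrative path):

# Usage sketch of the extended entry point; the path is illustrative.
from pathlib import Path

from nomenclature.testing import assert_valid_structure, assert_valid_yaml

project = Path("my_project")
assert_valid_yaml(project)
assert_valid_structure(
    project,
    definitions="definitions",
    mappings="mappings",
    required_data="required_data",
    validate_data="validate_data",  # the argument added in this PR
)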
tests/data/validation/validate_data/fail_unknown_region.yaml (3 changes: 0 additions & 3 deletions)
This file was deleted.

tests/test_cli.py (18 changes: 16 additions & 2 deletions)
@@ -12,7 +12,6 @@
from nomenclature import cli
from nomenclature.testing import assert_valid_structure, assert_valid_yaml
from nomenclature.codelist import VariableCodeList
from nomenclature.cli import cli_run_workflow

from conftest import TEST_DATA_DIR

@@ -256,6 +255,22 @@ def test_cli_missing_mappings_fails():
assert "Mappings directory not found" in str(cli_result.exception)


def test_cli_validate_data_fails():
"""Assert that validating invalid yaml fails"""
cli_result = runner.invoke(
cli,
[
"validate-project",
str(TEST_DATA_DIR / "validation"),
],
)

assert cli_result.exit_code == 1
assert "Collected 2 errors" in str(cli_result.exception)
assert "Asia" in str(cli_result.exception)
assert "Final Energy|Industry" in str(cli_result.exception)


def test_cli_empty_definitions_dir():
"""Assert that an error is raised when the `definitions` directory is empty"""

Expand Down Expand Up @@ -365,7 +380,6 @@ def test_cli_add_missing_variables(simple_definition, tmp_path):


def test_cli_run_workflow(tmp_path, simple_df):

simple_df.to_excel(tmp_path / "input.xlsx")

runner.invoke(
tests/test_data_validation.py → tests/test_validate_data.py (8 changes: 4 additions & 4 deletions)
@@ -25,7 +25,7 @@ def test_DataValidator_from_file():
obs = DataValidator.from_file(DATA_VALIDATION_TEST_DIR / "simple_validation.yaml")
assert obs == exp

dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definition")
dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definitions")
assert obs.validate_with_definition(dsd) is None


@@ -43,17 +43,17 @@ def test_DataValidator_validate_with_definition_raises(dimension, match):
# TODO Undefined unit

data_validator = DataValidator.from_file(
DATA_VALIDATION_TEST_DIR / f"validation_unknown_{dimension}.yaml"
DATA_VALIDATION_TEST_DIR / f"validate_unknown_{dimension}.yaml"
)

# validating against a DataStructure with all dimensions raises
dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definition")
dsd = DataStructureDefinition(TEST_DATA_DIR / "validation" / "definitions")
with pytest.raises(ValueError, match=match):
data_validator.validate_with_definition(dsd)

# validating against a DataStructure without the offending dimension passes
dsd = DataStructureDefinition(
TEST_DATA_DIR / "validation" / "definition",
TEST_DATA_DIR / "validation" / "definitions",
dimensions=[dim for dim in ["region", "variable"] if dim != dimension],
)
assert data_validator.validate_with_definition(dsd) is None