Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MetaCode and MetaCodeList classes with allowed_values attribute #246

Merged
merged 8 commits into from
May 12, 2023
13 changes: 13 additions & 0 deletions nomenclature/code.py
Original file line number Diff line number Diff line change
Expand Up @@ -209,3 +209,16 @@ class RegionCode(Code):
"""

hierarchy: str = None


class MetaCode(Code):
    """A subclass of Code used for the entries of a `MetaCodeList`

    Attributes
    ----------
    allowed_values : Optional[List[Any]]
        An optional list of allowed values, `None` if no list is given

    """

    # explicit `None` default (same convention as `RegionCode.hierarchy`), so that
    # the attribute is optional without relying on implicit-optional handling
    allowed_values: Optional[List[Any]] = None
phackstock marked this conversation as resolved.
Show resolved Hide resolved
61 changes: 60 additions & 1 deletion nomenclature/codelist.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
from pyam.utils import write_sheet
from pydantic import BaseModel, validator

from nomenclature.code import Code, VariableCode, RegionCode
from nomenclature.code import Code, VariableCode, RegionCode, MetaCode
from nomenclature.error.codelist import DuplicateCodeError
from nomenclature.error.variable import (
MissingWeightError,
Expand Down Expand Up @@ -624,3 +624,62 @@ def filter(self, hierarchy: str) -> "RegionCodeList":
"Use `RegionCodeList.hierarchy` method for available items."
)
raise ValueError(msg)


class MetaCodeList(CodeList):
    """A subclass of CodeList specified for MetaCodes

    Attributes
    ----------
    name : str
        Name of the MetaCodeList
    mapping : dict
        Dictionary of `MetaCode` objects

    """

    # class variables
    code_basis: ClassVar = MetaCode
    validation_schema: ClassVar[str] = "meta indicators"

    # NOTE(review): in the review of this change it was requested to keep the
    # `from_directory()` convention, because `DataStructureDefinition` initializes
    # its codelists via `from_directory()`. A reviewer also noted that the method
    # inherited from `CodeList` may make this one unnecessary - confirm before
    # extending it.
    @classmethod
    def from_yaml_files(cls, name: str, path: Path, file_glob_pattern: str = "**/*"):
        """Initialize a MetaCodeList from file(s) with MetaCodes

        Each yaml file is expected to hold a list of entries; the first key of an
        entry is used as the name of the code. The allowed values are read from
        the key `allowed_values`, given either next to the name-key or nested
        below it. If the key is missing, `allowed_values` is set to `None`.

        Parameters
        ----------
        name : str
            Name of the MetaCodeList
        path : :class:`pathlib.Path` or path-like
            Directory with the codelist files
        file_glob_pattern : str, optional
            Pattern to downselect codelist files by name, default: "**/*" (i.e. all
            files in all sub-folders)

        Returns
        -------
        MetaCodeList

        """
        # accept any path-like, as stated in the docstring
        path = Path(path)
        mapping: Dict[str, MetaCode] = {}

        for yaml_file in path.glob(file_glob_pattern):
            if yaml_file.suffix not in {".yaml", ".yml"}:
                continue

            with open(yaml_file, "r", encoding="utf-8") as stream:
                # an empty file is parsed as `None`, treat it as "no codes"
                entries = yaml.safe_load(stream) or []

            for entry in entries:
                code_name = next(iter(entry))

                # look up the allowed values by key instead of by position, so
                # that the result does not depend on the order or number of keys
                attributes = entry[code_name]
                if "allowed_values" in entry:
                    allowed_values = entry["allowed_values"]
                elif isinstance(attributes, dict):
                    allowed_values = attributes.get("allowed_values")
                else:
                    allowed_values = None

                code = MetaCode(name=code_name, mapping=entry)
                code.file = yaml_file.relative_to(path.parent).as_posix()
                code.allowed_values = allowed_values
                # a code defined more than once is overwritten by the later entry
                mapping[code.name] = code

        return cls(name=name, mapping=mapping)
282 changes: 143 additions & 139 deletions nomenclature/definition.py
Original file line number Diff line number Diff line change
@@ -1,139 +1,143 @@
import logging
from pathlib import Path

import pandas as pd
from pyam import IamDataFrame
from pyam.index import replace_index_labels
from pyam.logging import adjust_log_level

from nomenclature.codelist import CodeList, RegionCodeList, VariableCodeList
from nomenclature.validation import validate

# module-level logger, named after this module
logger = logging.getLogger(__name__)

# dimensions that are initialized with a dedicated CodeList subclass
SPECIAL_CODELIST = {
    "variable": VariableCodeList,
    "region": RegionCodeList,
}


class DataStructureDefinition:
    """Definition of datastructure codelists for dimensions used in the IAMC format"""

    def __init__(self, path, dimensions=None):
        """Initialize the codelists of all dimensions from sub-folders of `path`

        Parameters
        ----------
        path : str or path-like
            The folder with the project definitions.
        dimensions : list of str, optional
            List of :meth:`CodeList` names. Each CodeList is initialized
            from a sub-folder of `path` of that name.
            Defaults to `["region", "variable"]`.

        Raises
        ------
        NotADirectoryError
            If `path` is not a directory.
        ValueError
            If the codelist of any dimension is empty.
        """
        if not isinstance(path, Path):
            path = Path(path)

        if not path.is_dir():
            raise NotADirectoryError(f"Definitions directory not found: {path}")

        # create a new list per instance, so that neither a shared default nor
        # the list object of the caller is aliased by `self.dimensions`
        if dimensions is None:
            dimensions = ["region", "variable"]
        self.dimensions = list(dimensions)

        for dim in self.dimensions:
            # use the dedicated CodeList subclass if one is registered
            codelist_cls = SPECIAL_CODELIST.get(dim, CodeList)
            setattr(self, dim, codelist_cls.from_directory(dim, path / dim))

        empty = [d for d in self.dimensions if not getattr(self, d)]
        if empty:
            raise ValueError(f"Empty codelist: {', '.join(empty)}")

    def validate(self, df: IamDataFrame, dimensions: list = None) -> None:
        """Validate that the coordinates of `df` are defined in the codelists

        Parameters
        ----------
        df : :class:`pyam.IamDataFrame`
            Scenario data to be validated against the codelists of this instance.
        dimensions : list of str, optional
            Dimensions to perform validation (defaults to all dimensions of self)

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If `df` fails validation against any codelist.
        """
        validate(self, df, dimensions=dimensions or self.dimensions)

    def check_aggregate(self, df: IamDataFrame, **kwargs) -> "pd.DataFrame | None":
        """Check for consistency of scenario data along the variable hierarchy

        Parameters
        ----------
        df : :class:`pyam.IamDataFrame`
            Scenario data to be checked for consistency along the variable hierarchy.
        kwargs : Tolerance arguments for comparison of values
            Passed to :any:`numpy.isclose` via :any:`pyam.IamDataFrame.check_aggregate`.

        Returns
        -------
        :class:`pandas.DataFrame` or None
            Data where a variable and its computed aggregate does not match.

        Raises
        ------
        ValueError
            If the :any:`DataStructureDefinition` does not have a *variable* dimension.
        """
        if "variable" not in self.dimensions:
            raise ValueError("Aggregation check requires 'variable' dimension.")

        lst = []

        with adjust_log_level(level="WARNING"):
            for code in df.variable:
                attr = self.variable.mapping[code]
                if attr.check_aggregate:
                    components = attr.components

                    # check if multiple lists of components are given for a code
                    if isinstance(components, dict):
                        for name, _components in components.items():
                            error = df.check_aggregate(code, _components, **kwargs)
                            if error is not None:
                                error.dropna(inplace=True)
                                # append components-name to variable column
                                error.index = replace_index_labels(
                                    error.index, "variable", [f"{code} [{name}]"]
                                )
                                lst.append(error)

                    # else use components provided as single list or pyam-default (None)
                    else:
                        error = df.check_aggregate(code, components, **kwargs)
                        if error is not None:
                            lst.append(error.dropna())

        if not lst:
            return None

        # there may be empty dataframes due to `dropna()` above
        error = pd.concat(lst)
        return None if error.empty else error

    def to_excel(
        self, excel_writer, sheet_name=None, sort_by_code: bool = False, **kwargs
    ):
        """Write the *variable* codelist to an Excel sheet

        Parameters
        ----------
        excel_writer : path-like, file-like, or ExcelWriter object
            File path as string or :class:`pathlib.Path`,
            or existing :class:`pandas.ExcelWriter`.
        sheet_name : str, optional
            Name of sheet that will have the codelist. If *None*, use the codelist name.
        sort_by_code : bool, optional
            Sort the codelist before exporting to file.
        **kwargs
            Passed to :class:`pandas.ExcelWriter` (if *excel_writer* is path-like).
        """
        # TODO write all dimensions to the file
        self.variable.to_excel(excel_writer, sheet_name, sort_by_code, **kwargs)
import logging
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not sure why GitHub thinks this entire file was changed?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I was wondering the same; it might even be a GitHub issue.

from pathlib import Path

import pandas as pd
from pyam import IamDataFrame
from pyam.index import replace_index_labels
from pyam.logging import adjust_log_level

from nomenclature.codelist import (
    CodeList,
    MetaCodeList,
    RegionCodeList,
    VariableCodeList,
)
from nomenclature.validation import validate

logger = logging.getLogger(__name__)

# dimensions that are initialized with a dedicated CodeList subclass,
# any other dimension uses the generic `CodeList`
SPECIAL_CODELIST = {
    "variable": VariableCodeList,
    "region": RegionCodeList,
    "meta": MetaCodeList,
}
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add the MetaCodeList here.



class DataStructureDefinition:
    """Definition of datastructure codelists for dimensions used in the IAMC format"""

    def __init__(self, path, dimensions=None):
        """Initialize the codelists of all dimensions from sub-folders of `path`

        Parameters
        ----------
        path : str or path-like
            The folder with the project definitions.
        dimensions : list of str, optional
            List of :meth:`CodeList` names. Each CodeList is initialized
            from a sub-folder of `path` of that name.
            Defaults to `["region", "variable"]`.

        Raises
        ------
        NotADirectoryError
            If `path` is not a directory.
        ValueError
            If the codelist of any dimension is empty.
        """
        if not isinstance(path, Path):
            path = Path(path)

        if not path.is_dir():
            raise NotADirectoryError(f"Definitions directory not found: {path}")

        # create a new list per instance, so that neither a shared default nor
        # the list object of the caller is aliased by `self.dimensions`
        if dimensions is None:
            dimensions = ["region", "variable"]
        self.dimensions = list(dimensions)

        for dim in self.dimensions:
            # use the dedicated CodeList subclass if one is registered
            codelist_cls = SPECIAL_CODELIST.get(dim, CodeList)
            setattr(self, dim, codelist_cls.from_directory(dim, path / dim))

        empty = [d for d in self.dimensions if not getattr(self, d)]
        if empty:
            raise ValueError(f"Empty codelist: {', '.join(empty)}")

    def validate(self, df: IamDataFrame, dimensions: list = None) -> None:
        """Validate that the coordinates of `df` are defined in the codelists

        Parameters
        ----------
        df : :class:`pyam.IamDataFrame`
            Scenario data to be validated against the codelists of this instance.
        dimensions : list of str, optional
            Dimensions to perform validation (defaults to all dimensions of self)

        Returns
        -------
        None

        Raises
        ------
        ValueError
            If `df` fails validation against any codelist.
        """
        validate(self, df, dimensions=dimensions or self.dimensions)

    def check_aggregate(self, df: IamDataFrame, **kwargs) -> "pd.DataFrame | None":
        """Check for consistency of scenario data along the variable hierarchy

        Parameters
        ----------
        df : :class:`pyam.IamDataFrame`
            Scenario data to be checked for consistency along the variable hierarchy.
        kwargs : Tolerance arguments for comparison of values
            Passed to :any:`numpy.isclose` via :any:`pyam.IamDataFrame.check_aggregate`.

        Returns
        -------
        :class:`pandas.DataFrame` or None
            Data where a variable and its computed aggregate does not match.

        Raises
        ------
        ValueError
            If the :any:`DataStructureDefinition` does not have a *variable* dimension.
        """
        if "variable" not in self.dimensions:
            raise ValueError("Aggregation check requires 'variable' dimension.")

        lst = []

        with adjust_log_level(level="WARNING"):
            for code in df.variable:
                attr = self.variable.mapping[code]
                if attr.check_aggregate:
                    components = attr.components

                    # check if multiple lists of components are given for a code
                    if isinstance(components, dict):
                        for name, _components in components.items():
                            error = df.check_aggregate(code, _components, **kwargs)
                            if error is not None:
                                error.dropna(inplace=True)
                                # append components-name to variable column
                                error.index = replace_index_labels(
                                    error.index, "variable", [f"{code} [{name}]"]
                                )
                                lst.append(error)

                    # else use components provided as single list or pyam-default (None)
                    else:
                        error = df.check_aggregate(code, components, **kwargs)
                        if error is not None:
                            lst.append(error.dropna())

        if not lst:
            return None

        # there may be empty dataframes due to `dropna()` above
        error = pd.concat(lst)
        return None if error.empty else error

    def to_excel(
        self, excel_writer, sheet_name=None, sort_by_code: bool = False, **kwargs
    ):
        """Write the *variable* codelist to an Excel sheet

        Parameters
        ----------
        excel_writer : path-like, file-like, or ExcelWriter object
            File path as string or :class:`pathlib.Path`,
            or existing :class:`pandas.ExcelWriter`.
        sheet_name : str, optional
            Name of sheet that will have the codelist. If *None*, use the codelist name.
        sort_by_code : bool, optional
            Sort the codelist before exporting to file.
        **kwargs
            Passed to :class:`pandas.ExcelWriter` (if *excel_writer* is path-like).
        """
        # TODO write all dimensions to the file
        self.variable.to_excel(excel_writer, sheet_name, sort_by_code, **kwargs)
4 changes: 4 additions & 0 deletions tests/data/meta/allowed_values_2.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- Meta cat with boolean values:
allowed_values: [True, False]
phackstock marked this conversation as resolved.
Show resolved Hide resolved
- Another category with str values:
allowed_values: ['ABC']
4 changes: 4 additions & 0 deletions tests/data/meta/meta_indicators_allowed_values.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
- Meta category with boolean values:
allowed_values: [True, False]
- Meta cat with int values:
allowed_values: [1, 2, 3]
Loading