diff --git a/batdata/data.py b/batdata/data.py index 4d649c2..be8f117 100644 --- a/batdata/data.py +++ b/batdata/data.py @@ -1,5 +1,5 @@ """Objects that represent battery datasets""" -from typing import Union +from typing import Union, Optional from pandas import HDFStore from pandas.io.common import stringify_path @@ -10,17 +10,16 @@ from batdata.schemas import BatteryMetadata, CyclingData -# TODO (wardlt): Should I be more specific and call this "BatteryCyclingDataFrame"? -class BatteryDataFrame(pd.DataFrame): - """Representation for battery dataset +class BatteryDataset: + """Holder for all of the data associated with tests for a battery. - Subclass of the Pandas DataFrame object with small additions to store - metadata about the battery along with the battery measurement data + Attributes of this class define different views of the data (e.g., raw time-series, per-cycle statistics) + or different types of data (e.g., EIS) along with the metadata for the class I/O with BatteryDataFrame ------------------------- - This data frame provides additional I/O operations that store and retrieve the battery metadata into particular + This class provides I/O operations that store and retrieve the battery metadata into particular formats. 
The operations are named ``[to|from]_batdata_[format]``, where format could be one of - ``hdf``: Data is stored the `"table" format from PyTables @@ -34,14 +33,28 @@ class BatteryDataFrame(pd.DataFrame): """ - def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False, - metadata: Union[BatteryMetadata, dict] = None): - super().__init__(data=data, index=index, columns=columns, dtype=dtype, copy=copy) + raw_data: Optional[pd.DataFrame] + """Time-series data capturing the state of the battery as a function of time""" + + metadata: BatteryMetadata + """Metadata for the battery construction and testing""" + + def __init__(self, metadata: Union[BatteryMetadata, dict] = None, raw_data: Optional[pd.DataFrame] = None): + """ + + Parameters + ---------- + metadata: BatteryMetadata or dict + Metadata that describe the battery construction, data provenance and testing routines + raw_data: pd.DataFrame + Time-series data of the battery state + """ if metadata is None: metadata = {} elif isinstance(metadata, BaseModel): metadata = metadata.dict() self.metadata = BatteryMetadata(**metadata) + self.raw_data = raw_data def validate_columns(self, allow_extra_columns: bool = True): """Determine whether the column types are appropriate @@ -56,20 +69,18 @@ def validate_columns(self, allow_extra_columns: bool = True): ValueError If the dataset fails validation """ - CyclingData.validate_dataframe(self) + CyclingData.validate_dataframe(self.raw_data, allow_extra_columns) def to_batdata_hdf(self, path_or_buf, complevel=0, complib='zlib'): """Save the data in the standardized HDF5 file format - This function wraps the :meth:`to_hdf` function of Pandas and supplies fixed values for some of the options + This function wraps the ``to_hdf`` function of Pandas and supplies fixed values for some of the options so that the data is written in a reproducible format. Parameters ---------- path_or_buf : str or pandas.HDFStore File path or HDFStore object. 
- key : str - Identifier for the group in the store. complevel : {0-9}, optional Specifies a compression level for data. A value of 0 disables compression. @@ -83,11 +94,10 @@ def to_batdata_hdf(self, path_or_buf, complevel=0, complib='zlib'): a ValueError. """ - # Cast the data as a DataFrame, as Pandas's HDF I/O logic does not support subclasses + # Store the various datasets # Note that we use the "table" format to allow for partial reads / querying - data = pd.DataFrame(self) - data.to_hdf(path_or_buf, 'raw_data', complevel=complevel, complib=complib, - append=False, format='table', index=False) + self.raw_data.to_hdf(path_or_buf, 'raw_data', complevel=complevel, complib=complib, + append=False, format='table', index=False) # Create logic for adding metadata def add_metadata(f: HDFStore): @@ -106,17 +116,16 @@ def add_metadata(f: HDFStore): add_metadata(path_or_buf) @classmethod - def from_batdata_hdf(cls, path_or_buf: Union[str, HDFStore], key=None): + def from_batdata_hdf(cls, path_or_buf: Union[str, HDFStore]): """Read the battery data from an HDF file Parameters ---------- path_or_buf : str or pandas.HDFStore File path or HDFStore object. - key : str - Identifier for the group in the store. 
""" - data = pd.read_hdf(path_or_buf, key) + # Read the available datasets + data = pd.read_hdf(path_or_buf, "raw_data") # Read out the battery metadata if isinstance(path_or_buf, str): @@ -125,7 +134,7 @@ def from_batdata_hdf(cls, path_or_buf: Union[str, HDFStore], key=None): else: metadata = BatteryMetadata.parse_raw(path_or_buf.root._v_attrs.metadata) - return cls(data=data, metadata=metadata) + return cls(raw_data=data, metadata=metadata) def to_batdata_dict(self) -> dict: """Generate data in dictionary format @@ -136,13 +145,13 @@ def to_batdata_dict(self) -> dict: """ return { 'metadata': self.metadata.dict(), - 'data': self.to_dict('list') + 'raw_data': self.raw_data.to_dict('list') } @classmethod def from_batdata_dict(cls, d): """Read battery data and metadata from """ - return cls(data=d['data'], metadata=d['metadata']) + return cls(raw_data=pd.DataFrame(d['raw_data']), metadata=d['metadata']) @staticmethod def get_metadata_from_hdf5(path: str) -> BatteryMetadata: diff --git a/batdata/extractors/base.py b/batdata/extractors/base.py index 9f56228..84d8025 100644 --- a/batdata/extractors/base.py +++ b/batdata/extractors/base.py @@ -4,7 +4,7 @@ import pandas as pd from materials_io.base import BaseParser -from batdata.data import BatteryDataFrame +from batdata.data import BatteryDataset from batdata.schemas import BatteryMetadata @@ -48,7 +48,7 @@ def parse(self, group: List[str], context: dict = None) -> dict: return df_out.to_batdata_dict() def parse_to_dataframe(self, group: List[str], - metadata: Optional[Union[BatteryMetadata, dict]] = None) -> BatteryDataFrame: + metadata: Optional[Union[BatteryMetadata, dict]] = None) -> BatteryDataset: """Parse a set of files into a Pandas dataframe Parameters @@ -84,4 +84,4 @@ def parse_to_dataframe(self, group: List[str], df_out = pd.concat(output_dfs, ignore_index=True) # Attach the metadata and return the data - return BatteryDataFrame(data=df_out, metadata=metadata) + return BatteryDataset(raw_data=df_out, 
metadata=metadata) diff --git a/batdata/postprocess/cycle_stats.py b/batdata/postprocess/cycle_stats.py index cd8d05c..94360ae 100644 --- a/batdata/postprocess/cycle_stats.py +++ b/batdata/postprocess/cycle_stats.py @@ -1,7 +1,7 @@ """Utility functions for computing properties of certain cycles""" from scipy.integrate import cumtrapz -from batdata.data import BatteryDataFrame +from batdata.data import BatteryDataset import pandas as pd import numpy as np @@ -12,14 +12,14 @@ from batdata.schemas import ChargingState -def compute_energy_per_cycle(df: BatteryDataFrame): +def compute_energy_per_cycle(data: BatteryDataset): """ Calculate the maximum energy and capacity on a per-cycle basis Parameters ---------- - df : BatteryDataFrame - Input dataframe + data : BatteryDataset + Input battery dataset. Must have raw data defined Returns ------- @@ -42,7 +42,7 @@ def compute_energy_per_cycle(df: BatteryDataFrame): cycle_ind = np.array([]) # Loop over each cycle - for cyc, cycle_data in df.query("state=='discharging'").groupby('cycle_number'): + for cyc, cycle_data in data.raw_data.query("state=='discharging'").groupby('cycle_number'): # Calculate accumulated energy/capacity for each sub-segment ene = 0 cap = 0 @@ -69,7 +69,7 @@ def compute_energy_per_cycle(df: BatteryDataFrame): return cycle_ind, energies, capacities -def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.DataFrame: +def compute_charging_curve(data: BatteryDataset, discharge: bool = True) -> pd.DataFrame: """Compute estimates for the battery capacity for each measurement of the charging or discharging sections of each cycle. @@ -78,8 +78,8 @@ def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.D Parameters ---------- - df: BatteryDataFrame - Battery dataset. Must have test_time, voltage and current columns. + data: BatteryDataset + Battery dataset with raw data available. Must have test_time, voltage and current columns. 
Processing will add "capacity" and "energy" columns with units of A-hr and W-hr, respectively discharge: bool @@ -92,14 +92,15 @@ def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.D """ # Get only the [dis]charging data - df = pd.DataFrame(df[df['state'] == (ChargingState.discharging if discharge else ChargingState.charging)]) + data = data.raw_data + data = pd.DataFrame(data[data['state'] == (ChargingState.discharging if discharge else ChargingState.charging)]) # Add columns for the capacity and energy - df['capacity'] = 0 - df['energy'] = 0 + data['capacity'] = 0 + data['energy'] = 0 # Compute the capacity and energy for each cycle - for cid, cycle in df.groupby('cycle_number'): + for cid, cycle in data.groupby('cycle_number'): # Compute in segments over each subset (avoid issues with rests) for _, subcycle in cycle.groupby('substep_index'): @@ -112,7 +113,7 @@ def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.D if discharge: cap *= -1 eng *= -1 - df.loc[subcycle.index, 'capacity'] = cap - df.loc[subcycle.index, 'energy'] = eng + data.loc[subcycle.index, 'capacity'] = cap + data.loc[subcycle.index, 'energy'] = eng - return df + return data diff --git a/batdata/schemas.py b/batdata/schemas.py index 831ff35..6e26924 100644 --- a/batdata/schemas.py +++ b/batdata/schemas.py @@ -161,8 +161,14 @@ def validate_dataframe(cls, data: DataFrame, allow_extra_columns: bool = True): raise ValueError(f'Dataset is missing a required column: {column}') continue + # Get the data type for the column + if '$ref' in col_schema['items']: + ref_name = col_schema['items']['$ref'].split("/")[-1] + col_type = schema['definitions'][ref_name]['type'] + else: + col_type = col_schema['items']['type'] + # Check data types - col_type = col_schema['items']['type'] actual_type = data_columns[column] if col_type == "number": if actual_type.kind not in ['f', 'c']: diff --git a/batdata/tests/test_data.py b/batdata/tests/test_data.py index 
b82c281..1a57b45 100644 --- a/batdata/tests/test_data.py +++ b/batdata/tests/test_data.py @@ -3,18 +3,19 @@ import os import h5py +import pandas as pd from pandas import HDFStore from pytest import fixture -from batdata.data import BatteryDataFrame +from batdata.data import BatteryDataset @fixture() def test_df(): - return BatteryDataFrame(data={ + return BatteryDataset(raw_data=pd.DataFrame({ 'current': [1, 0, -1], 'voltage': [2, 2, 2] - }, metadata={'name': 'Test data'}) + }), metadata={'name': 'Test data'}) def test_write_hdf(tmpdir, test_df): @@ -41,16 +42,16 @@ def test_read_hdf(tmpdir, test_df): test_df.to_batdata_hdf(out_path) # Test reading only the metadata - metadata = BatteryDataFrame.get_metadata_from_hdf5(out_path) + metadata = BatteryDataset.get_metadata_from_hdf5(out_path) assert metadata.name == 'Test data' # Read it - data = BatteryDataFrame.from_batdata_hdf(out_path) + data = BatteryDataset.from_batdata_hdf(out_path) assert data.metadata.name == 'Test data' # Test reading from an already-open file store = HDFStore(out_path, 'r') - data = BatteryDataFrame.from_batdata_hdf(store) + data = BatteryDataset.from_batdata_hdf(store) assert data.metadata.name == 'Test data' @@ -58,9 +59,9 @@ def test_dict(test_df): # Test writing it d = test_df.to_batdata_dict() assert d['metadata']['name'] == 'Test data' - assert 'data' in d + assert 'raw_data' in d # Test reading it - data = BatteryDataFrame.from_batdata_dict(d) - assert len(data) == 3 + data = BatteryDataset.from_batdata_dict(d) + assert len(data.raw_data) == 3 assert data.metadata.name == 'Test data' diff --git a/environment.yml b/environment.yml index c1619be..e99a4f5 100644 --- a/environment.yml +++ b/environment.yml @@ -1,14 +1,15 @@ name: batdata channels: - - defaults - conda-forge + - defaults dependencies: - python==3.7.* - - pandas==1.0.0 - - scipy==1.3.2 - - pytables==3.6.1 - - h5py==2.10.0 - - xlrd==1.0.0 + - pandas==1.* + - scipy==1.3.* + - pytables + - pydantic + - xlrd + - h5py - 
jupyterlab - matplotlib - flake8 @@ -16,6 +17,6 @@ dependencies: - tqdm - pip - pip: - - pydantic==1.4 - git+https://github.com/materials-data-facility/MaterialsIO.git + - -r requirements.txt - -e . # Installs the batdata library in development mode diff --git a/requirements.txt b/requirements.txt index a6509ba..2b8dbb1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,6 +1,6 @@ pandas>=1.0 scipy>=1.3 -pydantic>=1.6.2 +pydantic>=1.7 tables>=3.6 h5py==2.10.0 xlrd diff --git a/setup.py b/setup.py index 67864a2..07a05b2 100644 --- a/setup.py +++ b/setup.py @@ -2,7 +2,7 @@ setup( name='batdata', - version='0.0.1', + version='0.1.0', packages=find_packages(), install_requires=['pandas'], entry_points={