ROVI-org · WardLT · Aug 11, 2021 · Feb 5, 2021 · Aug 11, 2021 · Aug 11, 2021
diff --git a/batdata/data.py b/batdata/data.py
@@ -1,5 +1,5 @@
 """Objects that represent battery datasets"""
-from typing import Union
+from typing import Union, Optional
 
 from pandas import HDFStore
 from pandas.io.common import stringify_path
@@ -10,17 +10,16 @@
 from batdata.schemas import BatteryMetadata, CyclingData
 
 
-# TODO (wardlt): Should I be more specific and call this "BatteryCyclingDataFrame"?
-class BatteryDataFrame(pd.DataFrame):
-    """Representation for battery dataset
+class BatteryDataset:
+    """Holder for all of the data associated with tests for a battery.
 
-    Subclass of the Pandas DataFrame object with small additions to store
-    metadata about the battery along with the battery measurement data
+    Attributes of this class define different views of the data (e.g., raw time-series, per-cycle statistics)
+    or different types of data (e.g., EIS) along with the metadata for the class
 
     I/O with BatteryDataFrame
     -------------------------
 
-    This data frame provides additional I/O operations that store and retrieve the battery metadata into particular
+    This class provides I/O operations that store and retrieve the battery metadata into particular
     formats. The operations are named ``[to|from]_batdata_[format]``, where format could be one of
 
     - ``hdf``: Data is stored the `"table" format from PyTables
@@ -34,14 +33,28 @@ class BatteryDataFrame(pd.DataFrame):
 
     """
 
-    def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False,
-                 metadata: Union[BatteryMetadata, dict] = None):
-        super().__init__(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
+    raw_data: Optional[pd.DataFrame]
+    """Time-series data capturing the state of the battery as a function of time"""
+
+    metadata: BatteryMetadata
+    """Metadata for the battery construction and testing"""
+
+    def __init__(self, metadata: Union[BatteryMetadata, dict] = None, raw_data: Optional[pd.DataFrame] = None):
+        """
+
+        Parameters
+        ----------
+        metadata: BatteryMetadata or dict
+            Metadata that describe the battery construction, data provenance and testing routines
+        raw_data: pd.DataFrame
+            Time-series data of the battery state
+        """
         if metadata is None:
             metadata = {}
         elif isinstance(metadata, BaseModel):
             metadata = metadata.dict()
         self.metadata = BatteryMetadata(**metadata)
+        self.raw_data = raw_data
 
     def validate_columns(self, allow_extra_columns: bool = True):
         """Determine whether the column types are appropriate
@@ -56,20 +69,18 @@ def validate_columns(self, allow_extra_columns: bool = True):
         ValueError
             If the dataset fails validation
         """
-        CyclingData.validate_dataframe(self)
+        CyclingData.validate_dataframe(self.raw_data, allow_extra_columns)
 
     def to_batdata_hdf(self, path_or_buf, complevel=0, complib='zlib'):
         """Save the data in the standardized HDF5 file format
 
-        This function wraps the :meth:`to_hdf` function of Pandas and supplies fixed values for some of the options
+        This function wraps the ``to_hdf`` function of Pandas and supplies fixed values for some of the options
         so that the data is written in a reproducible format.
 
         Parameters
         ----------
         path_or_buf : str or pandas.HDFStore
             File path or HDFStore object.
-        key : str
-            Identifier for the group in the store.
         complevel : {0-9}, optional
             Specifies a compression level for data.
             A value of 0 disables compression.
@@ -83,11 +94,10 @@ def to_batdata_hdf(self, path_or_buf, complevel=0, complib='zlib'):
             a ValueError.
         """
 
-        # Cast the data as a DataFrame, as Pandas's HDF I/O logic does not support subclasses
+        # Store the various datasets
         #  Note that we use the "table" format to allow for partial reads / querying
-        data = pd.DataFrame(self)
-        data.to_hdf(path_or_buf, 'raw_data', complevel=complevel, complib=complib,
-                    append=False, format='table', index=False)
+        self.raw_data.to_hdf(path_or_buf, 'raw_data', complevel=complevel, complib=complib,
+                             append=False, format='table', index=False)
 
         # Create logic for adding metadata
         def add_metadata(f: HDFStore):
@@ -106,17 +116,16 @@ def add_metadata(f: HDFStore):
             add_metadata(path_or_buf)
 
     @classmethod
-    def from_batdata_hdf(cls, path_or_buf: Union[str, HDFStore], key=None):
+    def from_batdata_hdf(cls, path_or_buf: Union[str, HDFStore]):
         """Read the battery data from an HDF file
 
         Parameters
         ----------
         path_or_buf : str or pandas.HDFStore
             File path or HDFStore object.
-        key : str
-            Identifier for the group in the store.
         """
-        data = pd.read_hdf(path_or_buf, key)
+        # Read the available datasets
+        data = pd.read_hdf(path_or_buf, "raw_data")
 
         # Read out the battery metadata
         if isinstance(path_or_buf, str):
@@ -125,7 +134,7 @@ def from_batdata_hdf(cls, path_or_buf: Union[str, HDFStore], key=None):
         else:
             metadata = BatteryMetadata.parse_raw(path_or_buf.root._v_attrs.metadata)
 
-        return cls(data=data, metadata=metadata)
+        return cls(raw_data=data, metadata=metadata)
 
     def to_batdata_dict(self) -> dict:
         """Generate data in dictionary format
@@ -136,13 +145,13 @@ def to_batdata_dict(self) -> dict:
         """
         return {
             'metadata': self.metadata.dict(),
-            'data': self.to_dict('list')
+            'raw_data': self.raw_data.to_dict('list')
         }
 
     @classmethod
     def from_batdata_dict(cls, d):
         """Read battery data and metadata from """
-        return cls(data=d['data'], metadata=d['metadata'])
+        return cls(raw_data=pd.DataFrame(d['raw_data']), metadata=d['metadata'])
 
     @staticmethod
     def get_metadata_from_hdf5(path: str) -> BatteryMetadata:

diff --git a/batdata/extractors/base.py b/batdata/extractors/base.py
@@ -4,7 +4,7 @@
 import pandas as pd
 from materials_io.base import BaseParser
 
-from batdata.data import BatteryDataFrame
+from batdata.data import BatteryDataset
 from batdata.schemas import BatteryMetadata
 
 
@@ -48,7 +48,7 @@ def parse(self, group: List[str], context: dict = None) -> dict:
         return df_out.to_batdata_dict()
 
     def parse_to_dataframe(self, group: List[str],
-                           metadata: Optional[Union[BatteryMetadata, dict]] = None) -> BatteryDataFrame:
+                           metadata: Optional[Union[BatteryMetadata, dict]] = None) -> BatteryDataset:
         """Parse a set of  files into a Pandas dataframe
 
         Parameters
@@ -84,4 +84,4 @@ def parse_to_dataframe(self, group: List[str],
         df_out = pd.concat(output_dfs, ignore_index=True)
 
         # Attach the metadata and return the data
-        return BatteryDataFrame(data=df_out, metadata=metadata)
+        return BatteryDataset(raw_data=df_out, metadata=metadata)
diff --git a/batdata/postprocess/cycle_stats.py b/batdata/postprocess/cycle_stats.py
@@ -1,7 +1,7 @@
 """Utility functions for computing properties of certain cycles"""
 from scipy.integrate import cumtrapz
 
-from batdata.data import BatteryDataFrame
+from batdata.data import BatteryDataset
 import pandas as pd
 import numpy as np
 
@@ -12,14 +12,14 @@
 from batdata.schemas import ChargingState
 
 
-def compute_energy_per_cycle(df: BatteryDataFrame):
+def compute_energy_per_cycle(data: BatteryDataset):
     """
     Calculate the maximum energy and capacity on a per-cycle basis
 
     Parameters
     ----------
-    df : BatteryDataFrame
-        Input dataframe
+    data : BatteryDataset
+        Input battery dataset. Must have raw data defined
 
     Returns
     -------
@@ -42,7 +42,7 @@ def compute_energy_per_cycle(df: BatteryDataFrame):
     cycle_ind = np.array([])
 
     # Loop over each cycle
-    for cyc, cycle_data in df.query("state=='discharging'").groupby('cycle_number'):
+    for cyc, cycle_data in data.raw_data.query("state=='discharging'").groupby('cycle_number'):
         # Calculate accumulated energy/capacity for each sub-segment
         ene = 0
         cap = 0
@@ -69,7 +69,7 @@ def compute_energy_per_cycle(df: BatteryDataFrame):
     return cycle_ind, energies, capacities
 
 
-def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.DataFrame:
+def compute_charging_curve(data: BatteryDataset, discharge: bool = True) -> pd.DataFrame:
     """Compute estimates for the battery capacity for each measurement
     of the charging or discharging sections of each cycle.
 
@@ -78,8 +78,8 @@ def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.D
 
     Parameters
     ----------
-    df: BatteryDataFrame
-        Battery dataset. Must have test_time, voltage and current columns.
+    data: BatteryDataset
+        Battery dataset with raw data available. Must have test_time, voltage and current columns.
         Processing will add "capacity" and "energy" columns with units
         of A-hr and W-hr, respectively
     discharge: bool
@@ -92,14 +92,15 @@ def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.D
     """
 
     # Get only the [dis]charging data
-    df = pd.DataFrame(df[df['state'] == (ChargingState.discharging if discharge else ChargingState.charging)])
+    data = data.raw_data
+    data = pd.DataFrame(data[data['state'] == (ChargingState.discharging if discharge else ChargingState.charging)])
 
     # Add columns for the capacity and energy
-    df['capacity'] = 0
-    df['energy'] = 0
+    data['capacity'] = 0
+    data['energy'] = 0
 
     # Compute the capacity and energy for each cycle
-    for cid, cycle in df.groupby('cycle_number'):
+    for cid, cycle in data.groupby('cycle_number'):
 
         # Compute in segments over each subset (avoid issues with rests)
         for _, subcycle in cycle.groupby('substep_index'):
@@ -112,7 +113,7 @@ def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.D
             if discharge:
                 cap *= -1
                 eng *= -1
-            df.loc[subcycle.index, 'capacity'] = cap
-            df.loc[subcycle.index, 'energy'] = eng
+            data.loc[subcycle.index, 'capacity'] = cap
+            data.loc[subcycle.index, 'energy'] = eng
 
-    return df
+    return data
diff --git a/batdata/schemas.py b/batdata/schemas.py
@@ -161,8 +161,14 @@ def validate_dataframe(cls, data: DataFrame, allow_extra_columns: bool = True):
                     raise ValueError(f'Dataset is missing a required column: {column}')
                 continue
 
+            # Get the data type for the column
+            if '$ref' in col_schema['items']:
+                ref_name = col_schema['items']['$ref'].split("/")[-1]
+                col_type = schema['definitions'][ref_name]['type']
+            else:
+                col_type = col_schema['items']['type']
+
             # Check data types
-            col_type = col_schema['items']['type']
             actual_type = data_columns[column]
             if col_type == "number":
                 if actual_type.kind not in ['f', 'c']:

diff --git a/batdata/tests/test_data.py b/batdata/tests/test_data.py
@@ -3,18 +3,19 @@
 import os
 
 import h5py
+import pandas as pd
 from pandas import HDFStore
 from pytest import fixture
 
-from batdata.data import BatteryDataFrame
+from batdata.data import BatteryDataset
 
 
 @fixture()
 def test_df():
-    return BatteryDataFrame(data={
+    return BatteryDataset(raw_data=pd.DataFrame({
         'current': [1, 0, -1],
         'voltage': [2, 2, 2]
-    }, metadata={'name': 'Test data'})
+    }), metadata={'name': 'Test data'})
 
 
 def test_write_hdf(tmpdir, test_df):
@@ -41,26 +42,26 @@ def test_read_hdf(tmpdir, test_df):
     test_df.to_batdata_hdf(out_path)
 
     # Test reading only the metadata
-    metadata = BatteryDataFrame.get_metadata_from_hdf5(out_path)
+    metadata = BatteryDataset.get_metadata_from_hdf5(out_path)
     assert metadata.name == 'Test data'
 
     # Read it
-    data = BatteryDataFrame.from_batdata_hdf(out_path)
+    data = BatteryDataset.from_batdata_hdf(out_path)
     assert data.metadata.name == 'Test data'
 
     # Test reading from an already-open file
     store = HDFStore(out_path, 'r')
-    data = BatteryDataFrame.from_batdata_hdf(store)
+    data = BatteryDataset.from_batdata_hdf(store)
     assert data.metadata.name == 'Test data'
 
 
 def test_dict(test_df):
     # Test writing it
     d = test_df.to_batdata_dict()
     assert d['metadata']['name'] == 'Test data'
-    assert 'data' in d
+    assert 'raw_data' in d
 
     # Test reading it
-    data = BatteryDataFrame.from_batdata_dict(d)
-    assert len(data) == 3
+    data = BatteryDataset.from_batdata_dict(d)
+    assert len(data.raw_data) == 3
     assert data.metadata.name == 'Test data'
diff --git a/environment.yml b/environment.yml
@@ -1,21 +1,22 @@
 name: batdata
 channels:
-  - defaults
   - conda-forge
+  - defaults
 dependencies:
   - python==3.7.*
-  - pandas==1.0.0
-  - scipy==1.3.2
-  - pytables==3.6.1
-  - h5py==2.10.0
-  - xlrd==1.0.0
+  - pandas==1.*
+  - scipy==1.3.*
+  - pytables
+  - pydantic
+  - xlrd
+  - h5py
   - jupyterlab
   - matplotlib
   - flake8
   - pytest
   - tqdm
   - pip
   - pip:
-    - pydantic==1.4
     - git+https://github.com/materials-data-facility/MaterialsIO.git
+    - -r requirements.txt
     - -e .  # Installs the batdata library in development mode
diff --git a/requirements.txt b/requirements.txt
@@ -1,6 +1,6 @@
 pandas>=1.0
 scipy>=1.3
-pydantic>=1.6.2
+pydantic>=1.7
 tables>=3.6
 h5py==2.10.0
 xlrd

diff --git a/setup.py b/setup.py
@@ -2,7 +2,7 @@
 
 setup(
     name='batdata',
-    version='0.0.1',
+    version='0.1.0',
     packages=find_packages(),
     install_requires=['pandas'],
     entry_points={