Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support Multiple Kinds of Data in Single HDF5 #5

Merged
merged 3 commits into from
Aug 11, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
59 changes: 34 additions & 25 deletions batdata/data.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
"""Objects that represent battery datasets"""
from typing import Union
from typing import Union, Optional

from pandas import HDFStore
from pandas.io.common import stringify_path
Expand All @@ -10,17 +10,16 @@
from batdata.schemas import BatteryMetadata, CyclingData


# TODO (wardlt): Should I be more specific and call this "BatteryCyclingDataFrame"?
class BatteryDataFrame(pd.DataFrame):
"""Representation for battery dataset
class BatteryDataset:
"""Holder for all of the data associated with tests for a battery.

Subclass of the Pandas DataFrame object with small additions to store
metadata about the battery along with the battery measurement data
Attributes of this class define different views of the data (e.g., raw time-series, per-cycle statistics)
or different types of data (e.g., EIS) along with the metadata for the class

I/O with BatteryDataFrame
-------------------------

This data frame provides additional I/O operations that store and retrieve the battery metadata into particular
This class provides I/O operations that store and retrieve the battery metadata into particular
formats. The operations are named ``[to|from]_batdata_[format]``, where format could be one of

- ``hdf``: Data is stored in the `"table" format from PyTables
Expand All @@ -34,14 +33,28 @@ class BatteryDataFrame(pd.DataFrame):

"""

def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False,
metadata: Union[BatteryMetadata, dict] = None):
super().__init__(data=data, index=index, columns=columns, dtype=dtype, copy=copy)
raw_data: Optional[pd.DataFrame]
"""Time-series data capturing the state of the battery as a function of time"""

metadata: BatteryMetadata
"""Metadata for the battery construction and testing"""

def __init__(self, metadata: Union[BatteryMetadata, dict] = None, raw_data: Optional[pd.DataFrame] = None):
"""

Parameters
----------
metadata: BatteryMetadata or dict
Metadata that describe the battery construction, data provenance and testing routines
raw_data: pd.DataFrame
Time-series data of the battery state
"""
if metadata is None:
metadata = {}
elif isinstance(metadata, BaseModel):
metadata = metadata.dict()
self.metadata = BatteryMetadata(**metadata)
self.raw_data = raw_data

def validate_columns(self, allow_extra_columns: bool = True):
"""Determine whether the column types are appropriate
Expand All @@ -56,20 +69,18 @@ def validate_columns(self, allow_extra_columns: bool = True):
ValueError
If the dataset fails validation
"""
CyclingData.validate_dataframe(self)
CyclingData.validate_dataframe(self.raw_data, allow_extra_columns)

def to_batdata_hdf(self, path_or_buf, complevel=0, complib='zlib'):
"""Save the data in the standardized HDF5 file format

This function wraps the :meth:`to_hdf` function of Pandas and supplies fixed values for some of the options
This function wraps the ``to_hdf`` function of Pandas and supplies fixed values for some of the options
so that the data is written in a reproducible format.

Parameters
----------
path_or_buf : str or pandas.HDFStore
File path or HDFStore object.
key : str
Identifier for the group in the store.
complevel : {0-9}, optional
Specifies a compression level for data.
A value of 0 disables compression.
Expand All @@ -83,11 +94,10 @@ def to_batdata_hdf(self, path_or_buf, complevel=0, complib='zlib'):
a ValueError.
"""

# Cast the data as a DataFrame, as Pandas's HDF I/O logic does not support subclasses
# Store the various datasets
# Note that we use the "table" format to allow for partial reads / querying
data = pd.DataFrame(self)
data.to_hdf(path_or_buf, 'raw_data', complevel=complevel, complib=complib,
append=False, format='table', index=False)
self.raw_data.to_hdf(path_or_buf, 'raw_data', complevel=complevel, complib=complib,
append=False, format='table', index=False)

# Create logic for adding metadata
def add_metadata(f: HDFStore):
Expand All @@ -106,17 +116,16 @@ def add_metadata(f: HDFStore):
add_metadata(path_or_buf)

@classmethod
def from_batdata_hdf(cls, path_or_buf: Union[str, HDFStore], key=None):
def from_batdata_hdf(cls, path_or_buf: Union[str, HDFStore]):
"""Read the battery data from an HDF file

Parameters
----------
path_or_buf : str or pandas.HDFStore
File path or HDFStore object.
key : str
Identifier for the group in the store.
"""
data = pd.read_hdf(path_or_buf, key)
# Read the available datasets
data = pd.read_hdf(path_or_buf, "raw_data")

# Read out the battery metadata
if isinstance(path_or_buf, str):
Expand All @@ -125,7 +134,7 @@ def from_batdata_hdf(cls, path_or_buf: Union[str, HDFStore], key=None):
else:
metadata = BatteryMetadata.parse_raw(path_or_buf.root._v_attrs.metadata)

return cls(data=data, metadata=metadata)
return cls(raw_data=data, metadata=metadata)

def to_batdata_dict(self) -> dict:
"""Generate data in dictionary format
Expand All @@ -136,13 +145,13 @@ def to_batdata_dict(self) -> dict:
"""
return {
'metadata': self.metadata.dict(),
'data': self.to_dict('list')
'raw_data': self.raw_data.to_dict('list')
}

@classmethod
def from_batdata_dict(cls, d):
"""Read battery data and metadata from a dictionary"""
return cls(data=d['data'], metadata=d['metadata'])
return cls(raw_data=pd.DataFrame(d['raw_data']), metadata=d['metadata'])

@staticmethod
def get_metadata_from_hdf5(path: str) -> BatteryMetadata:
Expand Down
6 changes: 3 additions & 3 deletions batdata/extractors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas as pd
from materials_io.base import BaseParser

from batdata.data import BatteryDataFrame
from batdata.data import BatteryDataset
from batdata.schemas import BatteryMetadata


Expand Down Expand Up @@ -48,7 +48,7 @@ def parse(self, group: List[str], context: dict = None) -> dict:
return df_out.to_batdata_dict()

def parse_to_dataframe(self, group: List[str],
metadata: Optional[Union[BatteryMetadata, dict]] = None) -> BatteryDataFrame:
metadata: Optional[Union[BatteryMetadata, dict]] = None) -> BatteryDataset:
"""Parse a set of files into a Pandas dataframe

Parameters
Expand Down Expand Up @@ -84,4 +84,4 @@ def parse_to_dataframe(self, group: List[str],
df_out = pd.concat(output_dfs, ignore_index=True)

# Attach the metadata and return the data
return BatteryDataFrame(data=df_out, metadata=metadata)
return BatteryDataset(raw_data=df_out, metadata=metadata)
31 changes: 16 additions & 15 deletions batdata/postprocess/cycle_stats.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
"""Utility functions for computing properties of certain cycles"""
from scipy.integrate import cumtrapz

from batdata.data import BatteryDataFrame
from batdata.data import BatteryDataset
import pandas as pd
import numpy as np

Expand All @@ -12,14 +12,14 @@
from batdata.schemas import ChargingState


def compute_energy_per_cycle(df: BatteryDataFrame):
def compute_energy_per_cycle(data: BatteryDataset):
"""
Calculate the maximum energy and capacity on a per-cycle basis

Parameters
----------
df : BatteryDataFrame
Input dataframe
data : BatteryDataset
Input battery dataset. Must have raw data defined

Returns
-------
Expand All @@ -42,7 +42,7 @@ def compute_energy_per_cycle(df: BatteryDataFrame):
cycle_ind = np.array([])

# Loop over each cycle
for cyc, cycle_data in df.query("state=='discharging'").groupby('cycle_number'):
for cyc, cycle_data in data.raw_data.query("state=='discharging'").groupby('cycle_number'):
# Calculate accumulated energy/capacity for each sub-segment
ene = 0
cap = 0
Expand All @@ -69,7 +69,7 @@ def compute_energy_per_cycle(df: BatteryDataFrame):
return cycle_ind, energies, capacities


def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.DataFrame:
def compute_charging_curve(data: BatteryDataset, discharge: bool = True) -> pd.DataFrame:
"""Compute estimates for the battery capacity for each measurement
of the charging or discharging sections of each cycle.

Expand All @@ -78,8 +78,8 @@ def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.D

Parameters
----------
df: BatteryDataFrame
Battery dataset. Must have test_time, voltage and current columns.
data: BatteryDataset
Battery dataset with raw data available. Must have test_time, voltage and current columns.
Processing will add "capacity" and "energy" columns with units
of A-hr and W-hr, respectively
discharge: bool
Expand All @@ -92,14 +92,15 @@ def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.D
"""

# Get only the [dis]charging data
df = pd.DataFrame(df[df['state'] == (ChargingState.discharging if discharge else ChargingState.charging)])
data = data.raw_data
data = pd.DataFrame(data[data['state'] == (ChargingState.discharging if discharge else ChargingState.charging)])

# Add columns for the capacity and energy
df['capacity'] = 0
df['energy'] = 0
data['capacity'] = 0
data['energy'] = 0

# Compute the capacity and energy for each cycle
for cid, cycle in df.groupby('cycle_number'):
for cid, cycle in data.groupby('cycle_number'):

# Compute in segments over each subset (avoid issues with rests)
for _, subcycle in cycle.groupby('substep_index'):
Expand All @@ -112,7 +113,7 @@ def compute_charging_curve(df: BatteryDataFrame, discharge: bool = True) -> pd.D
if discharge:
cap *= -1
eng *= -1
df.loc[subcycle.index, 'capacity'] = cap
df.loc[subcycle.index, 'energy'] = eng
data.loc[subcycle.index, 'capacity'] = cap
data.loc[subcycle.index, 'energy'] = eng

return df
return data
8 changes: 7 additions & 1 deletion batdata/schemas.py
Original file line number Diff line number Diff line change
Expand Up @@ -161,8 +161,14 @@ def validate_dataframe(cls, data: DataFrame, allow_extra_columns: bool = True):
raise ValueError(f'Dataset is missing a required column: {column}')
continue

# Get the data type for the column
if '$ref' in col_schema['items']:
ref_name = col_schema['items']['$ref'].split("/")[-1]
col_type = schema['definitions'][ref_name]['type']
else:
col_type = col_schema['items']['type']

# Check data types
col_type = col_schema['items']['type']
actual_type = data_columns[column]
if col_type == "number":
if actual_type.kind not in ['f', 'c']:
Expand Down
19 changes: 10 additions & 9 deletions batdata/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,19 @@
import os

import h5py
import pandas as pd
from pandas import HDFStore
from pytest import fixture

from batdata.data import BatteryDataFrame
from batdata.data import BatteryDataset


@fixture()
def test_df():
return BatteryDataFrame(data={
return BatteryDataset(raw_data=pd.DataFrame({
'current': [1, 0, -1],
'voltage': [2, 2, 2]
}, metadata={'name': 'Test data'})
}), metadata={'name': 'Test data'})


def test_write_hdf(tmpdir, test_df):
Expand All @@ -41,26 +42,26 @@ def test_read_hdf(tmpdir, test_df):
test_df.to_batdata_hdf(out_path)

# Test reading only the metadata
metadata = BatteryDataFrame.get_metadata_from_hdf5(out_path)
metadata = BatteryDataset.get_metadata_from_hdf5(out_path)
assert metadata.name == 'Test data'

# Read it
data = BatteryDataFrame.from_batdata_hdf(out_path)
data = BatteryDataset.from_batdata_hdf(out_path)
assert data.metadata.name == 'Test data'

# Test reading from an already-open file
store = HDFStore(out_path, 'r')
data = BatteryDataFrame.from_batdata_hdf(store)
data = BatteryDataset.from_batdata_hdf(store)
assert data.metadata.name == 'Test data'


def test_dict(test_df):
# Test writing it
d = test_df.to_batdata_dict()
assert d['metadata']['name'] == 'Test data'
assert 'data' in d
assert 'raw_data' in d

# Test reading it
data = BatteryDataFrame.from_batdata_dict(d)
assert len(data) == 3
data = BatteryDataset.from_batdata_dict(d)
assert len(data.raw_data) == 3
assert data.metadata.name == 'Test data'
15 changes: 8 additions & 7 deletions environment.yml
Original file line number Diff line number Diff line change
@@ -1,21 +1,22 @@
name: batdata
channels:
- defaults
- conda-forge
- defaults
dependencies:
- python==3.7.*
- pandas==1.0.0
- scipy==1.3.2
- pytables==3.6.1
- h5py==2.10.0
- xlrd==1.0.0
- pandas==1.*
- scipy==1.3.*
- pytables
- pydantic
- xlrd
- h5py
- jupyterlab
- matplotlib
- flake8
- pytest
- tqdm
- pip
- pip:
- pydantic==1.4
- git+https://github.com/materials-data-facility/MaterialsIO.git
- -r requirements.txt
- -e . # Installs the batdata library in development mode
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
pandas>=1.0
scipy>=1.3
pydantic>=1.6.2
pydantic>=1.7
tables>=3.6
h5py==2.10.0
xlrd
Expand Down
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

setup(
name='batdata',
version='0.0.1',
version='0.1.0',
packages=find_packages(),
install_requires=['pandas'],
entry_points={
Expand Down