Skip to content

Commit

Permalink
Add a class for computing features (#18)
Browse files Browse the repository at this point in the history
* Add a base class for feature computers

* Added missing base class

* Removed legacy name of battery data store

* Removed duplicate implementation

* Introduce "raw data enhancers"

* Flake8 fixes

* Draft of class for integral quantities

* Flake8 fixes, renamed test file
  • Loading branch information
WardLT authored Dec 20, 2022
1 parent c5ee040 commit af61c83
Show file tree
Hide file tree
Showing 10 changed files with 358 additions and 349 deletions.
9 changes: 5 additions & 4 deletions batdata/extractors/arbin.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from batdata.extractors.base import BatteryDataExtractor
from batdata.schemas.cycling import ChargingState
from batdata.utils import drop_cycles
from batdata.postprocess.tagging import add_method, add_steps, add_substeps
from batdata.postprocess.tagging import AddMethod, AddSteps, AddSubSteps


class ArbinExtractor(BatteryDataExtractor):
Expand Down Expand Up @@ -76,12 +76,13 @@ def compute_state(x):
if abs(x) < self.eps:
return ChargingState.hold
return ChargingState.charging if x > 0 else ChargingState.discharging

df_out['state'] = df_out['current'].apply(compute_state)

# Determine the method used to control charging/discharging
add_steps(df_out)
add_method(df_out)
add_substeps(df_out)
AddSteps().enhance(df_out)
AddMethod().enhance(df_out)
AddSubSteps().enhance(df_out)
return df_out

def implementors(self) -> List[str]:
Expand Down
2 changes: 1 addition & 1 deletion batdata/extractors/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def parse_to_dataframe(self, group: List[str],
Returns
-------
pd.DataFrame
BatteryDataset
DataFrame containing the information from all files
"""

Expand Down
9 changes: 4 additions & 5 deletions batdata/extractors/maccor.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@

from batdata.extractors.base import BatteryDataExtractor
from batdata.schemas.cycling import ChargingState
from batdata.postprocess.tagging import add_method, add_steps, add_substeps
from batdata.postprocess.tagging import AddMethod, AddSteps, AddSubSteps
from batdata.utils import drop_cycles


Expand Down Expand Up @@ -62,10 +62,9 @@ def generate_dataframe(self, file: str, file_number: int = 0, start_cycle: int =

df_out['voltage'] = df['Volts']
df_out = drop_cycles(df_out)
add_steps(df_out)
add_method(df_out)
add_substeps(df_out)

AddSteps().enhance(df_out)
AddMethod().enhance(df_out)
AddSubSteps().enhance(df_out)
return df_out

def implementors(self) -> List[str]:
Expand Down
85 changes: 85 additions & 0 deletions batdata/postprocess/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
"""Base class and utilities related to post-processing on battery data"""
from typing import List

import pandas as pd

from batdata.data import BatteryDataset


class BaseFeatureComputer:
    """Base class for methods that produce new features given battery data

    Features can be anything but are often collected statistics about a certain cycle.
    """

    def compute_features(self, data: BatteryDataset) -> pd.DataFrame:
        """Compute features for a battery dataset

        Parameters
        ----------
        data: BatteryDataset
            Battery data object

        Returns
        -------
        features: pd.DataFrame
            A dataframe of features where rows are different cycles or steps, columns are different features

        Raises
        ------
        NotImplementedError
            Always, when called on this base class. Subclasses must override this method.
        """
        # Fail loudly rather than silently returning None where a dataframe is promised
        raise NotImplementedError(f'{type(self).__name__} does not implement compute_features')


class RawDataEnhancer(BaseFeatureComputer):
    """Base class for methods that derive new data from the existing columns in raw data"""

    # Names of the columns which `enhance` adds to the raw data. Subclasses replace the placeholder.
    column_names: List[str] = ...

    def compute_features(self, data: BatteryDataset) -> pd.DataFrame:
        """Add the new columns to the raw data of a dataset, then return only those columns

        Parameters
        ----------
        data: BatteryDataset
            Battery data object. Its ``raw_data`` is passed to :meth:`enhance`

        Returns
        -------
        features: pd.DataFrame
            The columns of ``data.raw_data`` listed in ``column_names``
        """
        self.enhance(data.raw_data)
        return data.raw_data[self.column_names]

    def enhance(self, data: pd.DataFrame):
        """Add additional columns to the raw data

        Parameters
        ----------
        data: pd.DataFrame
            Raw data to be modified

        Raises
        ------
        NotImplementedError
            Always, when called on this base class. Subclasses must override this method.
        """
        # A silent no-op here would surface later as a confusing column lookup failure
        raise NotImplementedError(f'{type(self).__name__} does not implement enhance')


class CycleSummarizer(BaseFeatureComputer):
    """Classes which produce a summary of certain cycles given the raw data from a cycle"""

    # Names of the columns which `_summarize` adds to the cycle statistics. Subclasses replace the placeholder.
    column_names: List[str] = ...

    def compute_features(self, data: BatteryDataset) -> pd.DataFrame:
        """Add the summaries to the dataset, then return them alongside the cycle number

        Parameters
        ----------
        data: BatteryDataset
            Battery data object. Its ``cycle_stats`` are created if missing and updated

        Returns
        -------
        features: pd.DataFrame
            The ``cycle_number`` column and the columns listed in ``column_names`` from ``data.cycle_stats``
        """
        self.add_summaries(data)
        return data.cycle_stats[['cycle_number'] + self.column_names]

    def add_summaries(self, data: BatteryDataset):
        """Add cycle-level summaries to a battery dataset

        Parameters
        ----------
        data: BatteryDataset
            Dataset to be modified
        """

        # Add a cycle summary if not already available
        if data.cycle_stats is None:
            data.cycle_stats = pd.DataFrame({'cycle_number': sorted(set(data.raw_data['cycle_number']))})

        # Perform the update
        self._summarize(data.raw_data, data.cycle_stats)

    def _summarize(self, raw_data: pd.DataFrame, cycle_data: pd.DataFrame):
        """Add additional data to a cycle summary dataframe

        Parameters
        ----------
        raw_data: pd.DataFrame
            Raw data describing the initial cycles. Is not modified
        cycle_data: pd.DataFrame
            Cycle data frame to be updated

        Raises
        ------
        NotImplementedError
            Always, when called on this base class. Subclasses must override this method.
        """
        # Without this, a subclass that forgets the hook fails later on a missing column
        raise NotImplementedError(f'{type(self).__name__} does not implement _summarize')
188 changes: 1 addition & 187 deletions batdata/postprocess/cycle_stats.py
Original file line number Diff line number Diff line change
@@ -1,189 +1,3 @@
"""Utility functions for computing properties of certain cycles"""
from scipy.integrate import cumtrapz
import pandas as pd
import numpy as np


# TODO (wardlt): Add back in features I removed to simplify the code as other functions:
# - [ ] Dropping outliers
# - [ ] Smoothing with Gaussian Process regression
from batdata.schemas.cycling import ChargingState
from batdata.data import BatteryDataset


def compute_energy_per_cycle(data: BatteryDataset):
    """
    Calculate the maximum energy and capacity on a per-cycle basis

    Only rows whose ``state`` is ``'discharging'`` are used. Each substep of a cycle
    is integrated separately and the results are summed over the cycle.

    Parameters
    ----------
    data : BatteryDataset
        Input battery dataset. Must have raw data defined

    Returns
    -------
    cycle_ind : array
        array of cycle numbers
    energies : array
        array of maximum for each cycle. Units: W-hr
    capacities : array
        array of maximum for each cycle. Units: A-hr

    Examples
    --------
    none yet
    """

    # `np.trapz` is deprecated in NumPy 2.x in favor of `np.trapezoid`; use whichever is available
    trapezoid = getattr(np, 'trapezoid', None) or np.trapz

    # Collect per-cycle results in lists; appending to arrays re-allocates on every cycle
    cycle_ind = []
    energies = []
    capacities = []

    # Loop over each cycle
    for cyc, cycle_data in data.raw_data.query("state=='discharging'").groupby('cycle_number'):
        # Calculate accumulated energy/capacity for each sub-segment
        ene = 0
        cap = 0
        for _, subseg in cycle_data.groupby('substep_index'):
            # Sort by test time, just in case
            subseg_sorted = subseg.sort_values('test_time')

            # Use current as always positive convention, opposite of what our standard uses
            t = subseg_sorted['test_time'].values
            i = -1 * subseg_sorted['current'].values
            v = subseg_sorted['voltage'].values

            # Integrate over time (seconds) and divide by 3600 to get W-hr and A-hr, respectively
            ene += trapezoid(i * v, t) / 3600
            cap += trapezoid(i, t) / 3600

        cycle_ind.append(cyc)
        energies.append(ene)
        capacities.append(cap)

    # Always return float arrays, including when no discharging data was found
    return (np.array(cycle_ind, dtype=float),
            np.array(energies, dtype=float),
            np.array(capacities, dtype=float))


def compute_capacity_energy(data: BatteryDataset) -> pd.DataFrame:
    """Compute estimates for the battery capacity and energy
    for each measurement of the charging and discharging sections of
    each cycle.

    The capacity/energy for each cycle are determined independently,
    and is assumed to start at zero at the beginning of the cycle.
    Substeps without any charging or discharging rows hold the value
    reached at the end of the previous substep.

    Parameters
    ----------
    data: BatteryDataset or dataframe
        Battery dataset with raw data available, or the raw dataframe itself.
        Must have test_time, voltage and current columns.
        Processing will add "capacity" and "energy" columns with units
        of A-hr and W-hr, respectively. The dataframe is modified in place.

    Returns
    -------
    data: pd.DataFrame
        The raw dataframe, with the "capacity" and "energy" columns filled in
    """
    # `cumtrapz` was removed from recent SciPy releases; prefer the current name when available
    try:
        from scipy.integrate import cumulative_trapezoid
    except ImportError:
        from scipy.integrate import cumtrapz as cumulative_trapezoid

    if not isinstance(data, pd.DataFrame):
        data = data.raw_data

    # Add columns for the capacity and energy.
    #  Use floats so the integrated values below match the column dtype
    data['capacity'] = 0.0
    data['energy'] = 0.0

    # Compute the capacity and energy for each cycle
    for _, cycle in data.groupby('cycle_number'):

        initial_cap = 0.0
        initial_ene = 0.0

        # Compute in segments over each subset (avoid issues with rests)
        for _, subcycle in cycle.groupby('substep_index'):
            # Check whether any row of this segment is charging or discharging
            is_active = (subcycle['state'] == ChargingState.discharging) \
                | (subcycle['state'] == ChargingState.charging)
            if not is_active.any():
                # Nothing to integrate: carry the running totals through this segment
                data.loc[subcycle.index, 'capacity'] = initial_cap
                data.loc[subcycle.index, 'energy'] = initial_ene
                continue

            # Integrate over time (seconds) and divide by 3600 to convert to hours
            cap = cumulative_trapezoid(subcycle['current'],
                                       subcycle['test_time'],
                                       initial=0) / 3600  # Capacity in A-hr
            ene = cumulative_trapezoid(subcycle['current'] * subcycle['voltage'],
                                       subcycle['test_time'],
                                       initial=0) / 3600  # Energy in W-hr

            # Continue from where the previous segment of this cycle ended
            cap += initial_cap
            ene += initial_ene

            data.loc[subcycle.index, 'capacity'] = cap
            data.loc[subcycle.index, 'energy'] = ene

            initial_cap = cap[-1]
            initial_ene = ene[-1]

    return data


def compute_charging_curve(data: BatteryDataset, discharge: bool = True) -> pd.DataFrame:
    """Compute estimates for the battery capacity for each measurement
    of the charging or discharging sections of each cycle.

    The capacity for each substep of each cycle is determined independently,
    and is assumed to start at zero at the beginning of the substep.

    Parameters
    ----------
    data: BatteryDataset or dataframe
        Battery dataset with raw data available, or the raw dataframe itself.
        Must have test_time, voltage and current columns.
        The input is not modified; a filtered copy is returned.
    discharge: bool
        Whether to compute the discharge or charge curve

    Returns
    -------
    curves: pd.DataFrame
        Rows of the selected state (discharging or charging) for every cycle,
        with added "capacity" and "energy" columns in units of A-hr and W-hr, respectively
    """
    # `cumtrapz` was removed from recent SciPy releases; prefer the current name when available
    try:
        from scipy.integrate import cumulative_trapezoid
    except ImportError:
        from scipy.integrate import cumtrapz as cumulative_trapezoid

    if not isinstance(data, pd.DataFrame):
        data = data.raw_data

    # Get only the [dis]charging data, as a copy so the caller's dataframe is untouched
    target_state = ChargingState.discharging if discharge else ChargingState.charging
    data = data[data['state'] == target_state].copy()

    # Add columns for the capacity and energy.
    #  Use floats so the integrated values below match the column dtype
    data['capacity'] = 0.0
    data['energy'] = 0.0

    # Compute the capacity and energy for each cycle
    for _, cycle in data.groupby('cycle_number'):

        # Compute in segments over each subset (avoid issues with rests)
        for _, subcycle in cycle.groupby('substep_index'):
            # Integrate over time (seconds) and divide by 3600 to convert to hours
            cap = cumulative_trapezoid(subcycle['current'], subcycle['test_time'],
                                       initial=0) / 3600  # Capacity in A-hr
            eng = cumulative_trapezoid(subcycle['current'] * subcycle['voltage'],
                                       subcycle['test_time'], initial=0) / 3600  # Energy in W-hr

            # Multiply by -1 for the discharging segment
            if discharge:
                cap *= -1
                eng *= -1

            data.loc[subcycle.index, 'capacity'] = cap
            data.loc[subcycle.index, 'energy'] = eng

    return data
# TBD (wardlt): Still working on what goes here
Loading

0 comments on commit af61c83

Please sign in to comment.