Add a class for computing features (#18)

* Add a base class for feature computers * Added missing base class * Removed legacy name of battery data store * Removed duplicate implementation * Introduce "raw data enhancers" * Flake8 fixes * Draft of class for integral quantities * Flake8 fixes, renamed test file
ROVI-org · Dec 20, 2022 · af61c83 · af61c83
1 parent c5ee040
commit af61c83
Show file tree

Hide file tree

Showing 10 changed files with 358 additions and 349 deletions.
diff --git a/batdata/extractors/arbin.py b/batdata/extractors/arbin.py
@@ -7,7 +7,7 @@
 from batdata.extractors.base import BatteryDataExtractor
 from batdata.schemas.cycling import ChargingState
 from batdata.utils import drop_cycles
-from batdata.postprocess.tagging import add_method, add_steps, add_substeps
+from batdata.postprocess.tagging import AddMethod, AddSteps, AddSubSteps
 
 
 class ArbinExtractor(BatteryDataExtractor):
@@ -76,12 +76,13 @@ def compute_state(x):
             if abs(x) < self.eps:
                 return ChargingState.hold
             return ChargingState.charging if x > 0 else ChargingState.discharging
+
         df_out['state'] = df_out['current'].apply(compute_state)
 
         # Determine the method used to control charging/discharging
-        add_steps(df_out)
-        add_method(df_out)
-        add_substeps(df_out)
+        AddSteps().enhance(df_out)
+        AddMethod().enhance(df_out)
+        AddSubSteps().enhance(df_out)
         return df_out
 
     def implementors(self) -> List[str]:

diff --git a/batdata/extractors/base.py b/batdata/extractors/base.py
@@ -60,7 +60,7 @@ def parse_to_dataframe(self, group: List[str],
 
         Returns
         -------
-        pd.DataFrame
+        BatteryDataset
             DataFrame containing the information from all files
         """
 

diff --git a/batdata/extractors/maccor.py b/batdata/extractors/maccor.py
@@ -7,7 +7,7 @@
 
 from batdata.extractors.base import BatteryDataExtractor
 from batdata.schemas.cycling import ChargingState
-from batdata.postprocess.tagging import add_method, add_steps, add_substeps
+from batdata.postprocess.tagging import AddMethod, AddSteps, AddSubSteps
 from batdata.utils import drop_cycles
 
 
@@ -62,10 +62,9 @@ def generate_dataframe(self, file: str, file_number: int = 0, start_cycle: int =
 
         df_out['voltage'] = df['Volts']
         df_out = drop_cycles(df_out)
-        add_steps(df_out)
-        add_method(df_out)
-        add_substeps(df_out)
-
+        AddSteps().enhance(df_out)
+        AddMethod().enhance(df_out)
+        AddSubSteps().enhance(df_out)
         return df_out
 
     def implementors(self) -> List[str]:

diff --git a/batdata/postprocess/base.py b/batdata/postprocess/base.py
@@ -0,0 +1,85 @@
+"""Base class and utilities related to post-processing on battery data"""
+from typing import List
+
+import pandas as pd
+
+from batdata.data import BatteryDataset
+
+
+class BaseFeatureComputer:
+    """Base class for methods that produce new features given battery data
+
+    Features can be anything but are often collected statistics about a certain cycle.
+    """
+
+    def compute_features(self, data: BatteryDataset) -> pd.DataFrame:
+        """Compute features from a battery dataset
+
+        Parameters
+        ----------
+        data: BatteryDataset
+            Battery data object
+
+        Returns
+        -------
+        features: pd.DataFrame
+            A dataframe of features where rows are different cycles or steps, columns are different features
+        """
+        pass
+
+
+class RawDataEnhancer(BaseFeatureComputer):
+    """Base class for methods that derive new data from the existing columns in raw data"""
+
+    column_names: List[str] = ...
+
+    def compute_features(self, data: BatteryDataset) -> pd.DataFrame:
+        self.enhance(data.raw_data)
+        return data.raw_data[self.column_names]
+
+    def enhance(self, data: pd.DataFrame):
+        """Add additional columns to the raw data
+
+        Parameters
+        ----------
+        data: pd.DataFrame
+            Raw data to be modified
+        """
+        ...
+
+
+class CycleSummarizer(BaseFeatureComputer):
+    """Classes which produce a summary of certain cycles given the raw data from a cycle"""
+
+    column_names: List[str] = ...
+
+    def compute_features(self, data: BatteryDataset) -> pd.DataFrame:
+        self.add_summaries(data)
+        return data.cycle_stats[['cycle_number'] + self.column_names]
+
+    def add_summaries(self, data: BatteryDataset):
+        """Add cycle-level summaries to a battery dataset
+
+        Parameters
+        ----------
+        data: BatteryDataset
+            Dataset to be modified
+        """
+
+        # Add a cycle summary if not already available
+        if data.cycle_stats is None:
+            data.cycle_stats = pd.DataFrame({'cycle_number': sorted(set(data.raw_data['cycle_number']))})
+
+        # Perform the update
+        self._summarize(data.raw_data, data.cycle_stats)
+
+    def _summarize(self, raw_data: pd.DataFrame, cycle_data: pd.DataFrame):
+        """Add additional data to a cycle summary dataframe
+
+        Parameters
+        ----------
+        raw_data: pd.DataFrame
+            Raw data describing the initial cycles. Is not modified
+        cycle_data: pd.DataFrame
+            Cycle data frame to be updated
+        """
diff --git a/batdata/postprocess/cycle_stats.py b/batdata/postprocess/cycle_stats.py
@@ -1,189 +1,3 @@
 """Utility functions for computing properties of certain cycles"""
-from scipy.integrate import cumtrapz
-import pandas as pd
-import numpy as np
 
-
-# TODO (wardlt): Add back in features I removed to simplify the code as other functions:
-#   - [ ] Dropping outliers
-#   - [ ] Smoothing with Gaussian Process regression
-from batdata.schemas.cycling import ChargingState
-from batdata.data import BatteryDataset
-
-
-def compute_energy_per_cycle(data: BatteryDataset):
-    """
-    Calculate the maximum energy and capacity on a per-cycle basis
-
-    Parameters
-    ----------
-    data : BatteryDataset
-        Input battery dataset. Must have raw data defined
-
-    Returns
-    -------
-    cycle_ind : array
-        array of cycle numbers
-    energies : array
-        array of maximum for each cycle. Units: W-hr
-    capacities : array
-        array of maximum for each cycle. Units: A-hr
-
-    Examples
-    --------
-    none yet
-
-    """
-
-    # Initialize the output arrays
-    energies = np.array([])
-    capacities = np.array([])
-    cycle_ind = np.array([])
-
-    # Loop over each cycle
-    for cyc, cycle_data in data.raw_data.query("state=='discharging'").groupby('cycle_number'):
-        # Calculate accumulated energy/capacity for each sub-segment
-        ene = 0
-        cap = 0
-        for _, subseg in cycle_data.groupby('substep_index'):
-            # Sort by test time, just in case
-            subseg_sorted = subseg.sort_values('test_time')
-
-            # Use current as always positive convention, opposite of what our standard uses
-            t = subseg_sorted['test_time'].values
-            i = -1 * subseg_sorted['current'].values
-            v = subseg_sorted['voltage'].values
-
-            # integrate for energy and capacity and convert to
-            # Watt/hrs. and Amp/hrs. respectively
-            ene += np.trapz(i * v, t) / 3600
-            cap += np.trapz(i, t) / 3600
-
-        # TODO (wardlt): This version of append re-allocates arrays, O(n). Consider using list.append instead,
-        #  which uses linked lists O(1)
-        energies = np.append(energies, ene)
-        capacities = np.append(capacities, cap)
-        cycle_ind = np.append(cycle_ind, cyc)
-
-    return cycle_ind, energies, capacities
-
-
-def compute_capacity_energy(data: BatteryDataset) -> pd.DataFrame:
-    """Compute estimates for the battery capacity and energy
-    for each measurement of the charging and discharging sections of
-    each cycle.
-
-    The capacity/energy for each cycle are determined independently,
-    and is assumed to start at zero at the beginning of the cycle.
-
-    Parameters
-    ----------
-    data: BatteryDataset or dataframe
-        Battery dataset with raw data available, or the raw dataframe itself.
-        Must have test_time, voltage and current columns.
-        Processing will add "capacity" and "energy" columns with units
-        of A-hr and W-hr, respectively.
-    discharge: bool
-        Whether to compute the discharge or charge curve
-
-    Returns
-    -------
-    curves: pd.DataFrame
-        Charge and discharge curves for each cycle in a single dataframe
-    """
-
-    if not isinstance(data, pd.DataFrame):
-        data = data.raw_data
-
-    # Add columns for the capacity and energy
-    data['capacity'] = 0
-    data['energy'] = 0
-
-    # Compute the capacity and energy for each cycle
-    for cid, cycle in data.groupby('cycle_number'):
-
-        initial_cap = 0
-        initial_ene = 0
-
-        # Compute in segments over each subset (avoid issues with rests)
-        for _, subcycle in cycle.groupby('substep_index'):
-            # Integrate over it
-
-            sel = subcycle['state'] == ChargingState.discharging
-            sel += subcycle['state'] == ChargingState.charging
-            if sum(sel) == 0:
-                data.loc[subcycle.index, 'capacity'] = initial_cap
-                data.loc[subcycle.index, 'energy'] = initial_ene
-                continue
-
-            cap = cumtrapz(subcycle['current'],
-                           subcycle['test_time'],
-                           initial=0) / 3600  # Computes capacity in A-hr
-            ene = cumtrapz(subcycle['current'] * subcycle['voltage'],
-                           subcycle['test_time'],
-                           initial=0) / 3600  # Energy in A-hr
-
-            cap += initial_cap
-            ene += initial_ene
-
-            data.loc[subcycle.index, 'capacity'] = cap
-            data.loc[subcycle.index, 'energy'] = ene
-
-            initial_cap = cap[-1]
-            initial_ene = ene[-1]
-
-    return data
-
-
-def compute_charging_curve(data: BatteryDataset, discharge: bool = True) -> pd.DataFrame:
-    """Compute estimates for the battery capacity for each measurement
-    of the charging or discharging sections of each cycle.
-
-    The capacity for each cycle are determined independently,
-    and is assumed to start at zero at the beginning of the cycle.
-
-    Parameters
-    ----------
-    data: BatteryDataset or dataframe
-        Battery dataset with raw data available, or the raw dataframe itself.
-        Must have test_time, voltage and current columns.
-        Processing will add "capacity" and "energy" columns with units
-        of A-hr and W-hr, respectively.
-    discharge: bool
-        Whether to compute the discharge or charge curve
-
-    Returns
-    -------
-    curves: pd.DataFrame
-        Charge and discharge curves for each cycle in a single dataframe
-    """
-
-    if not isinstance(data, pd.DataFrame):
-        data = data.raw_data
-
-    # Get only the [dis]charging data
-    data = pd.DataFrame(data[data['state'] == (ChargingState.discharging if discharge else ChargingState.charging)])
-
-    # Add columns for the capacity and energy
-    data['capacity'] = 0
-    data['energy'] = 0
-
-    # Compute the capacity and energy for each cycle
-    for cid, cycle in data.groupby('cycle_number'):
-
-        # Compute in segments over each subset (avoid issues with rests)
-        for _, subcycle in cycle.groupby('substep_index'):
-            # Integrate over it
-            cap = cumtrapz(subcycle['current'], subcycle['test_time'], initial=0) / 3600  # Computes capacity in A-hr
-            eng = cumtrapz(subcycle['current'] * subcycle['voltage'],
-                           subcycle['test_time'], initial=0) / 3600  # Energy in A-hr
-
-            # Multiply by -1 for the discharging segment
-            if discharge:
-                cap *= -1
-                eng *= -1
-
-            data.loc[subcycle.index, 'capacity'] = cap
-            data.loc[subcycle.index, 'energy'] = eng
-
-    return data
+# TBD (wardlt): Still working on what goes here