Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added timestamp adapter #268

Merged
merged 11 commits into from
May 13, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Unreleased
- ipynb files from docs/examples are now also used as (optional) tests (PR `#263 <https://github.com/TUW-GEO/pytesmo/pull/263>`_)
- ``yapf`` for code formatting (see developers guide) (Fix #248, PR `#263 <https://github.com/TUW-GEO/pytesmo/pull/263>`_)
- validation framework option to force dataset combinations that include reference dataset updated (PR `#265 <https://github.com/TUW-GEO/pytesmo/pull/265>`_)
- Added `TimestampAdapter` to the validation framework to deal with datasets that have different date/time specification fields (PR `#268 <https://github.com/TUW-GEO/pytesmo/pull/268>`_)

Version 0.13.4, 2022-01-12
==========================
Expand Down
213 changes: 196 additions & 17 deletions src/pytesmo/validation_framework/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,19 @@
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""
Module containing adapters that can be used together with the validation
framework.
"""

import operator

import pandas as pd
from pytesmo.time_series.anomaly import calc_anomaly
from pytesmo.time_series.anomaly import calc_climatology
from pytesmo.utils import deprecated
from pandas import DataFrame
import numpy as np
import warnings

_op_lookup = {
Expand Down Expand Up @@ -89,27 +90,24 @@ def __init__(self, cls, data_property_name="data", read_name=None):
setattr(self, read_name, self._adapt_custom)

def __get_dataframe(self, data):
if (
(not isinstance(data, DataFrame))
and (hasattr(data, self.data_property_name))
and (isinstance(getattr(data, self.data_property_name), DataFrame))
):
if ((not isinstance(data, DataFrame)) and
(hasattr(data, self.data_property_name)) and
(isinstance(getattr(data, self.data_property_name), DataFrame))):
data = getattr(data, self.data_property_name)
return data

def __drop_tz_info(self, data):
if hasattr(data.index, "tz") and (data.index.tz is not None):
warnings.warn(
f"Dropping timezone information ({data.index.tz})"
f" for data from reader {self.cls.__class__.__name__}"
)
f" for data from reader {self.cls.__class__.__name__}")
data.index = data.index.tz_convert(None)
return data

def _adapt(self, df: DataFrame) -> DataFrame:
# drop time zone info and extract df from ASCAT TS object
return self.__drop_tz_info(self.__get_dataframe(df)
if df is not None else DataFrame())
return self.__drop_tz_info(
self.__get_dataframe(df) if df is not None else DataFrame())

def _adapt_custom(self, *args, **kwargs):
# modifies data from whatever function was set as `read_name`.
Expand All @@ -133,10 +131,8 @@ def grid(self):
return self.cls.grid


@deprecated(
"`MaskingAdapter` is deprecated, use `SelfMaskingAdapter` "
"or `AdvancedMaskingAdapter` instead."
)
@deprecated("`MaskingAdapter` is deprecated, use `SelfMaskingAdapter` "
"or `AdvancedMaskingAdapter` instead.")
class MaskingAdapter(BasicAdapter):
"""
Transform the given class to return a boolean dataset given the operator
Expand Down Expand Up @@ -370,8 +366,7 @@ def _adapt(self, data):
ite = self.columns
for column in ite:
data[column] = calc_anomaly(
data[column], window_size=self.window_size
)
data[column], window_size=self.window_size)
return data


Expand Down Expand Up @@ -502,3 +497,187 @@ def _adapt(self, data: DataFrame) -> DataFrame:
new_col = data[columns].apply(self.func, **self.func_kwargs)
data[self.new_name] = new_col
return data


class TimestampAdapter(BasicAdapter):
    """
    Class that combines two or more timestamp fields to a single exact
    measurement time. The fields of interest specify:

    1. A basic observation time (e.g. days at midnight) which can
       be expressed in timestamp (YYYY-mm-dd) or with respect to a
       reference time (days since YYYY-mm-dd)
    2. One or more (minute, s, µs) offset times to be added cumulatively

    -------------------
    Example input:

        variable    base_time [w.r.t. 2005-02-01]  offset [min]  offset [sec]
    100 0.889751    100.0                           38.0          999.0
    101 0.108279    101.0                           40.0          1000.0
    102 -1.201708   102.0                           39.0          999.0

    Example output:

                            variable
    2005-05-12 00:55:42     0.889751
    2005-05-13 00:57:39     0.108279
    2005-05-14 00:56:38     -1.201708

    Parameters
    ----------
    cls: object
        Reader object, has to have a `read_ts` or `read` method or a method
        name must be specified in the `read_name` kwarg. The same method will
        be available for the adapted version of the reader.
    time_offset_fields: str, list or None
        name or list of names of the fields that provide information on the time offset.
        If a list is given, all values will contribute to the offset, assuming that
        each refers to the previous. For instance:
        offset = minutes + seconds in the minute + µs in the second
        NOTE: np.nan values are counted as 0 offset
        NOTE: if None, no offset is considered
    time_units: str or list
        time units that the time_offset_fields are specified in. If a list is given,
        it should have the same size as the 'time_offset_fields' parameter
        Can be any of the np.datetime[64] units:
        https://numpy.org/doc/stable/reference/arrays.datetime.html
    base_time_field: str, optional. Default is None.
        If a name is provided, the generic time field will be searched for
        in the columns; otherwise, it is assumed to be the index
        NOTE: np.nan values in this field are dropped
    base_time_reference: str, optional. Default is None.
        String of format 'YYYY-mm-dd' that can be specified to transform the
        'base_time_field' from [units since base_time_reference] to
        np.datetime[64]. If not provided, it will be assumed that the base_time_field
        is already in np.datetime[64] units
    base_time_units: str, optional. Default is "D"
        Units that the base_time_field is specified in. Only applicable with 'base_time_reference'
    replace_index: bool, optional. Default is True.
        If True, the exact timestamp is used as index. Else, it will be added
        to the dataframe on the column 'output_field'
    output_field: str, optional. Default is None.
        If a name is specified, an additional column is generated under the name,
        with the exact timestamp. Only with 'replace_index' == False
    drop_original: bool, optional. Default is True.
        Whether the base_time_field and time_offset_fields should be dropped in the
        final DataFrame
    handle_invalid: str, optional. Default is 'return_original'
        How to handle cases where all the entries in the generic observation time
        (base_time_field) are NaT. Options:
        - 'return_original': the original dataframe is returned
        - 'return_null': an empty dataframe is returned, with columns defined by
          the 'drop_original' parameter
    """

    def __init__(self,
                 cls: object,
                 time_offset_fields: str or list,
                 time_units: str or list = "s",
                 base_time_field: str = None,
                 base_time_reference: str = None,
                 base_time_units: str = "D",
                 replace_index: bool = True,
                 output_field: str = None,
                 drop_original: bool = True,
                 handle_invalid: str = 'return_original',
                 **kwargs):
        super().__init__(cls, **kwargs)

        # Normalize to a list, but preserve None ("no offset"); previously
        # None was wrapped into [None], which broke the `is None` check in
        # `_adapt` and the column-drop logic.
        if time_offset_fields is None or isinstance(time_offset_fields, list):
            self.time_offset_fields = time_offset_fields
        else:
            self.time_offset_fields = [time_offset_fields]
        self.time_units = time_units if isinstance(time_units,
                                                   list) else [time_units]

        self.base_time_field = base_time_field
        self.base_time_reference = np.datetime64(
            base_time_reference) if base_time_reference is not None else None
        self.base_time_units = base_time_units

        self.replace_index = replace_index
        if not replace_index and output_field is None:
            raise ValueError(
                "'output_field' should be specified in case the new timestamp "
                "should not be used as index. Alternatively, set 'replace_index' to True"
            )
        elif replace_index and output_field is not None:
            warnings.warn(
                "Ignoring the 'output_field' value. Set 'replace_index' to False to "
                "avoid this behavior")
            # always define the attribute, even when the value is ignored
            self.output_field = None
        else:
            self.output_field = output_field

        self.drop_original = drop_original
        self.handle_invalid = handle_invalid

    def convert_generic(self, time_arr: np.array) -> np.array:
        """Convert the generic time field to np.datetime64 dtype.

        The values are interpreted as offsets of `base_time_units` (not
        hard-coded days) from `base_time_reference`.
        """
        time_delta = time_arr.astype(int).astype(
            f"timedelta64[{self.base_time_units}]")
        time_date = np.full(time_delta.shape,
                            self.base_time_reference) + time_delta

        return time_date

    def add_offset_cumulative(self, data: DataFrame) -> np.array:
        """Return an array of timedelta calculated with all the
        time_offset_fields (np.nan entries contribute a zero offset)."""
        total_offset = np.full(data.index.shape, 0, dtype='timedelta64[s]')
        for field, unit in zip(self.time_offset_fields, self.time_units):
            deltas = data[field].map(
                lambda x: np.timedelta64(int(x), unit)
                if not np.isnan(x) else np.timedelta64(0, unit)).values
            # out-of-place addition promotes the result to the finer time
            # resolution, so sub-second (e.g. µs) offsets are not truncated
            # as they were with in-place accumulation into seconds
            total_offset = total_offset + deltas

        return total_offset

    def _drop_source_fields(self, data: DataFrame) -> DataFrame:
        """Drop the base time and offset columns used to build the exact
        timestamp from `data`."""
        if self.time_offset_fields is not None:
            data = data.drop(columns=self.time_offset_fields)
        if self.base_time_field in data.columns:
            data = data.drop(columns=[self.base_time_field])
        return data

    def _adapt(self, data: DataFrame) -> DataFrame:
        """
        Adapt the timestamps in the original with the specified offset
        NOTE: assumes the index dtype is 'datetime64[ns]'
        """
        data = super()._adapt(data)
        original = data.copy()

        # Get the generic time array
        if self.base_time_field is not None:
            base_time = data[self.base_time_field]
        else:
            base_time = data.index

        # Take only the valid dates; .copy() avoids pandas
        # SettingWithCopyWarning on the assignments below
        data = data[base_time.notna()].copy()
        base_time_values = base_time.dropna().values

        # Make sure the dataframe contains values after dropna()
        if data.empty and self.handle_invalid == "return_original":
            return original

        elif data.empty and self.handle_invalid == "return_null":
            # Define the shape of the output (empty) dataframe
            if self.drop_original:
                data = self._drop_source_fields(data)

            return data

        if self.base_time_reference is not None:
            base_time_values = self.convert_generic(base_time_values)

        # If no offset is specified
        if self.time_offset_fields is None:
            exact_time = base_time_values
        else:
            # Add time offset to the generic time
            offset = self.add_offset_cumulative(data)
            exact_time = base_time_values + offset

        # generate the final frame
        if not self.replace_index:
            data[self.output_field] = exact_time
        else:
            data.index = exact_time

        if self.drop_original:
            data = self._drop_source_fields(data)

        return data
Loading