Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added timestamp adapter #268

Merged
merged 11 commits into from
May 13, 2022
Merged
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ Unreleased
- ipynb files from docs/examples are now also used as (optional) tests (PR `#263 <https://github.com/TUW-GEO/pytesmo/pull/263>`_)
- ``yapf`` for code formatting (see developers guide) (Fix #248, PR `#263 <https://github.com/TUW-GEO/pytesmo/pull/263>`_)
- validation framework option to force dataset combinations that include reference dataset updated (PR `#265 <https://github.com/TUW-GEO/pytesmo/pull/265>`_)
- Added `TimestampAdapter` to the validation framework to deal with datasets that have different date/time specification fields (PR `#268 <https://github.com/TUW-GEO/pytesmo/pull/268>`_)

Version 0.13.4, 2022-01-12
==========================
Expand Down
213 changes: 196 additions & 17 deletions src/pytesmo/validation_framework/adapters.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,18 +24,19 @@
# ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
# THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.


"""
Module containing adapters that can be used together with the validation
framework.
"""

import operator

import pandas as pd
from pytesmo.time_series.anomaly import calc_anomaly
from pytesmo.time_series.anomaly import calc_climatology
from pytesmo.utils import deprecated
from pandas import DataFrame
import numpy as np
import warnings

_op_lookup = {
Expand Down Expand Up @@ -89,27 +90,24 @@ def __init__(self, cls, data_property_name="data", read_name=None):
setattr(self, read_name, self._adapt_custom)

def __get_dataframe(self, data):
if (
(not isinstance(data, DataFrame))
and (hasattr(data, self.data_property_name))
and (isinstance(getattr(data, self.data_property_name), DataFrame))
):
if ((not isinstance(data, DataFrame)) and
(hasattr(data, self.data_property_name)) and
(isinstance(getattr(data, self.data_property_name), DataFrame))):
data = getattr(data, self.data_property_name)
return data

def __drop_tz_info(self, data):
if hasattr(data.index, "tz") and (data.index.tz is not None):
warnings.warn(
f"Dropping timezone information ({data.index.tz})"
f" for data from reader {self.cls.__class__.__name__}"
)
f" for data from reader {self.cls.__class__.__name__}")
data.index = data.index.tz_convert(None)
return data

def _adapt(self, df: DataFrame) -> DataFrame:
# drop time zone info and extract df from ASCAT TS object
return self.__drop_tz_info(self.__get_dataframe(df)
if df is not None else DataFrame())
return self.__drop_tz_info(
self.__get_dataframe(df) if df is not None else DataFrame())

def _adapt_custom(self, *args, **kwargs):
# modifies data from whatever function was set as `read_name`.
Expand All @@ -133,10 +131,8 @@ def grid(self):
return self.cls.grid


@deprecated(
"`MaskingAdapter` is deprecated, use `SelfMaskingAdapter` "
"or `AdvancedMaskingAdapter` instead."
)
@deprecated("`MaskingAdapter` is deprecated, use `SelfMaskingAdapter` "
"or `AdvancedMaskingAdapter` instead.")
class MaskingAdapter(BasicAdapter):
"""
Transform the given class to return a boolean dataset given the operator
Expand Down Expand Up @@ -370,8 +366,7 @@ def _adapt(self, data):
ite = self.columns
for column in ite:
data[column] = calc_anomaly(
data[column], window_size=self.window_size
)
data[column], window_size=self.window_size)
return data


Expand Down Expand Up @@ -502,3 +497,187 @@ def _adapt(self, data: DataFrame) -> DataFrame:
new_col = data[columns].apply(self.func, **self.func_kwargs)
data[self.new_name] = new_col
return data


class TimestampAdapter(BasicAdapter):
    """
    Class that combines two or more timestamp fields to a single exact
    measurement time. The fields of interest specify:

    1. A basic observation time (e.g. days at midnight) which can
       be expressed in timestamp (YYYY-mm-dd) or with respect to a
       reference time (days since YYYY-mm-dd)
    2. One or more (minute, s, µs) offset times to be added cumulatively

    -------------------
    Example input:

        variable    base_time [w.r.t. 2005-02-01]  offset [min]  offset [sec]
    100 0.889751    100.0                           38.0          999.0
    101 0.108279    101.0                           40.0          1000.0
    102 -1.201708   102.0                           39.0          999.0

    Example output:

                            variable
    2005-05-12 00:55:42     0.889751
    2005-05-13 00:57:39     0.108279
    2005-05-14 00:56:38     -1.201708

    Parameters
    ----------
    cls: object
        Reader object, has to have a `read_ts` or `read` method or a method
        name must be specified in the `read_name` kwarg. The same method will
        be available for the adapted version of the reader.
    time_offset_fields: str, list or None
        name or list of names of the fields that provide information on the time offset.
        If a list is given, all values will contribute to the offset, assuming that
        each refers to the previous. For instance:
        offset = minutes + seconds in the minute + µs in the second
        NOTE: np.nan values are counted as 0 offset
        NOTE: if None, no offset is considered
    time_units: str or list
        time units that the time_offset_fields are specified in. If a list is given,
        it should have the same size as the 'time_offset_fields' parameter
        Can be any of the np.datetime[64] units:
        https://numpy.org/doc/stable/reference/arrays.datetime.html
    base_time_field: str, optional. Default is None.
        If a name is provided, the generic time field will be searched for
        in the columns; otherwise, it is assumed to be the index
        NOTE: np.nan values in this field are dropped
    base_time_reference: str, optional. Default is None.
        String of format 'YYYY-mm-dd' that can be specified to transform the
        'base_time_field' from [units since base_time_reference] to
        np.datetime[64]. If not provided, it will be assumed that the base_time_field
        is already in np.datetime[64] units
    base_time_units: str, optional. Default is "D"
        Units that the base_time_field is specified in. Only applicable with 'base_time_reference'
    replace_index: bool, optional. Default is True.
        If True, the exact timestamp is used as index. Else, it will be added
        to the dataframe on the column 'output_field'
    output_field: str, optional. Default is None.
        If a name is specified, an additional column is generated under the name,
        with the exact timestamp. Only with 'replace_index' == False
    drop_original: bool, optional. Default is True.
        Whether the base_time_field and time_offset_fields should be dropped in the
        final DataFrame
    handle_invalid: str, optional. Default is 'return_original'
        How to handle cases where all the entries in the generic observation time
        (base_time_field) are NaT. Options:
        - 'return_original': the original dataframe is returned
        - 'return_null': an empty dataframe is returned, with columns defined by
          the 'drop_original' parameter
    """

    def __init__(self,
                 cls: object,
                 time_offset_fields: str or list,
                 time_units: str or list = "s",
                 base_time_field: str = None,
                 base_time_reference: str = None,
                 base_time_units: str = "D",
                 replace_index: bool = True,
                 output_field: str = None,
                 drop_original: bool = True,
                 handle_invalid: str = 'return_original',
                 **kwargs):
        super().__init__(cls, **kwargs)

        # Normalize to a list, but preserve None ("no offset"); previously
        # None was wrapped into [None], which broke the `is None` check in
        # `_adapt` and the column-drop logic.
        if time_offset_fields is None or isinstance(time_offset_fields, list):
            self.time_offset_fields = time_offset_fields
        else:
            self.time_offset_fields = [time_offset_fields]
        self.time_units = time_units if isinstance(time_units,
                                                   list) else [time_units]

        self.base_time_field = base_time_field
        self.base_time_reference = np.datetime64(
            base_time_reference) if base_time_reference is not None else None
        self.base_time_units = base_time_units

        self.replace_index = replace_index
        if not replace_index and output_field is None:
            raise ValueError(
                "'output_field' should be specified in case the new timestamp "
                "should not be used as index. Alternatively, set 'replace_index' to True"
            )
        elif replace_index and output_field is not None:
            warnings.warn(
                "Ignoring the 'output_field' value. Set 'replace_index' to False to "
                "avoid this behavior")
            # always define the attribute, even when the value is ignored
            self.output_field = None
        else:
            self.output_field = output_field

        self.drop_original = drop_original
        self.handle_invalid = handle_invalid

    def convert_generic(self, time_arr: np.array) -> np.array:
        """Convert the generic time field to np.datetime64 dtype.

        The values are interpreted as offsets of `base_time_units` (not
        hard-coded days) from `base_time_reference`.
        """
        time_delta = time_arr.astype(int).astype(
            f"timedelta64[{self.base_time_units}]")
        time_date = np.full(time_delta.shape,
                            self.base_time_reference) + time_delta

        return time_date

    def add_offset_cumulative(self, data: DataFrame) -> np.array:
        """Return an array of timedelta calculated with all the
        time_offset_fields (np.nan entries contribute a zero offset)."""
        total_offset = np.full(data.index.shape, 0, dtype='timedelta64[s]')
        for field, unit in zip(self.time_offset_fields, self.time_units):
            deltas = data[field].map(
                lambda x: np.timedelta64(int(x), unit)
                if not np.isnan(x) else np.timedelta64(0, unit)).values
            # out-of-place addition promotes the result to the finer time
            # resolution, so sub-second (e.g. µs) offsets are not truncated
            # as they were with in-place accumulation into seconds
            total_offset = total_offset + deltas

        return total_offset

    def _drop_source_fields(self, data: DataFrame) -> DataFrame:
        """Drop the base time and offset columns used to build the exact
        timestamp from `data`."""
        if self.time_offset_fields is not None:
            data = data.drop(columns=self.time_offset_fields)
        if self.base_time_field in data.columns:
            data = data.drop(columns=[self.base_time_field])
        return data

    def _adapt(self, data: DataFrame) -> DataFrame:
        """
        Adapt the timestamps in the original with the specified offset
        NOTE: assumes the index dtype is 'datetime64[ns]'
        """
        data = super()._adapt(data)
        original = data.copy()

        # Get the generic time array
        if self.base_time_field is not None:
            base_time = data[self.base_time_field]
        else:
            base_time = data.index

        # Take only the valid dates; .copy() avoids pandas
        # SettingWithCopyWarning on the assignments below
        data = data[base_time.notna()].copy()
        base_time_values = base_time.dropna().values

        # Make sure the dataframe contains values after dropna()
        if data.empty and self.handle_invalid == "return_original":
            return original

        elif data.empty and self.handle_invalid == "return_null":
            # Define the shape of the output (empty) dataframe
            if self.drop_original:
                data = self._drop_source_fields(data)

            return data

        if self.base_time_reference is not None:
            base_time_values = self.convert_generic(base_time_values)

        # If no offset is specified
        if self.time_offset_fields is None:
            exact_time = base_time_values
        else:
            # Add time offset to the generic time
            offset = self.add_offset_cumulative(data)
            exact_time = base_time_values + offset

        # generate the final frame
        if not self.replace_index:
            data[self.output_field] = exact_time
        else:
            data.index = exact_time

        if self.drop_original:
            data = self._drop_source_fields(data)

        return data
Loading