Skip to content

Commit

Permalink
313 metadata comparison slightly too strict for hatyan.read_dia (#314)
Browse files Browse the repository at this point in the history
* add testcase that fails with the current setup

* added tests for multiblock diafile errors

* removed tstart/tstop/tzone metadata from timeseries dataframes

* removed timestep_min attr throughout code

* prevent overwriting of block_ids argument in file loop

* updated whatsnew
  • Loading branch information
veenstrajelmer authored Jun 4, 2024
1 parent 8a004bc commit 5befefd
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 95 deletions.
3 changes: 3 additions & 0 deletions docs/whats-new.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## UNRELEASED

### Fix
- repaired support for equidistant multiblock diafiles with varying timesteps in [#314](https://github.com/Deltares/hatyan/pull/314)


## 2.8.0 (2024-05-08)
This release contains many changes of which several also impact the user. These changes benefit the user friendliness and robustness of hatyan.
Expand Down
13 changes: 10 additions & 3 deletions hatyan/analysis_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ def analysis(ts, const_list,
metadata['xfac'] = hatyan_settings.xfac
metadata['fu_alltimes'] = hatyan_settings.fu_alltimes
metadata['source'] = hatyan_settings.source
metadata['tstart'] = ts.index.min().tz_localize(None)
metadata['tstop'] = ts.index.max().tz_localize(None)
metadata['tzone'] = ts.index.tz
COMP_mean_pd = metadata_add_to_obj(COMP_mean_pd, metadata)

Expand All @@ -280,8 +282,8 @@ def analysis_singleperiod(ts, const_list, hatyan_settings):
if bool_ts_duplicated.any():
raise ValueError(f'{bool_ts_duplicated.sum()} duplicate timesteps in provided timeseries, remove them e.g. with: ts = ts[~ts.index.duplicated(keep="first")]')
message = (f'#timesteps = {len(ts)}\n'
f'tstart = {ts.index[0].strftime("%Y-%m-%d %H:%M:%S")}\n'
f'tstop = {ts.index[-1].strftime("%Y-%m-%d %H:%M:%S")}\n'
f'tstart = {ts.index.min().strftime("%Y-%m-%d %H:%M:%S")}\n'
f'tstop = {ts.index.max().strftime("%Y-%m-%d %H:%M:%S")}\n'
f'timestep = {ts.index.freq}')
logger.info(message)

Expand Down Expand Up @@ -417,7 +419,7 @@ def split_components(comp, dood_date_mid, hatyan_settings):
def prediction_singleperiod(comp:pd.DataFrame, times:pd.DatetimeIndex, hatyan_settings) -> pd.DataFrame:

metadata_comp = metadata_from_obj(comp)
tzone_comp = metadata_comp.pop("tzone")
tzone_comp = metadata_comp.pop('tzone')

if not isinstance(times, pd.DatetimeIndex):
raise TypeError(f'times argument can be of type pd.DatetimeIndex or slice, not {type(times)}')
Expand Down Expand Up @@ -566,8 +568,13 @@ def prediction(comp, times=None, timestep=None):
# update metadata
if metadata_comp['grootheid'] == 'WATHTE':
metadata_comp['grootheid'] = 'WATHTBRKD'
# prevent adding time metadata from component dataframe to prediction dataframe
if 'tzone' in metadata_comp.keys():
metadata_comp.pop('tzone')
if 'tstart' in metadata_comp.keys():
metadata_comp.pop("tstart")
if 'tstop' in metadata_comp.keys():
metadata_comp.pop("tstop")
ts_prediction = metadata_add_to_obj(ts_prediction, metadata_comp)

return ts_prediction
4 changes: 2 additions & 2 deletions hatyan/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def write_components(comp, filename):
tstop = metadata.pop('tstop')
tzone = metadata.pop('tzone')
if tzone is None:
raise AttributeError("write_components() encountered tzone=None in components dataframe, not allowed.")
raise ValueError("write_components() encountered tzone=None in components dataframe, not allowed.")
tzone_min = tzone._minutes
tstart_str = tstart.strftime("%Y%m%d %H%M")
tstop_str = tstop.strftime("%Y%m%d %H%M")
Expand Down Expand Up @@ -243,7 +243,7 @@ def merge_componentgroups(comp_main, comp_sec):
comp_sec_meta = metadata_from_obj(comp_sec).copy()
comp_sec_list = comp_sec.index.tolist()

meta_settings_list = ['origin','groepering','timestep_min','timestep_unit','TYP']
meta_settings_list = ['origin','groepering','tstart','tstop','TYP']
comp_main_meta_others = {}
for key in meta_settings_list:
if key in comp_main_meta:
Expand Down
29 changes: 4 additions & 25 deletions hatyan/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
@author: veenstra
"""

import numpy as np
import pandas as pd


Expand All @@ -24,22 +23,13 @@ def metadata_from_diablocks(diablocks_pd, block_id):
diablocks_pd_onerow = diablocks_pd.iloc[block_id]

metadata_keys = ['station', 'grootheid', 'eenheid',
'vertref',
'tstart', 'tstop',
'timestep_min', 'timestep_unit',
'TYP', 'groepering']
'vertref', 'TYP', 'groepering']

#TODO: align with metadata from hatyan.read_components()
metadata = {key:diablocks_pd_onerow[key] for key in metadata_keys}

# add origin
metadata['origin'] = 'from timeseries dia file'

# replace nan with None (otherwise metadata_compare fails)
#TODO: avoid nan in metadata (timestep for hoek_har.dia)
if np.isnan(metadata['timestep_min']): #non-equidistant, nan in py38 and none in py39 (pandas 2.1.2)
metadata['timestep_min'] = None
metadata['timestep_unit'] = None
return metadata


Expand All @@ -49,21 +39,10 @@ def metadata_from_obj(obj):


def metadata_compare(metadata_list):

# remove tstart/tstop since they cannot be compared on equality in case of multifile dia
metadata_list_notstartstop = []
for meta in metadata_list:
meta_new = meta.copy()
if 'tstart' in meta_new:
meta_new.pop('tstart')
if 'tstop' in meta_new:
meta_new.pop('tstop')
metadata_list_notstartstop.append(meta_new)

nmeta = len(metadata_list_notstartstop)
nmeta = len(metadata_list)
for i in range(1,nmeta):
meta1 = metadata_list_notstartstop[i-1]
meta2 = metadata_list_notstartstop[i]
meta1 = metadata_list[i-1]
meta2 = metadata_list[i]
if meta1!=meta2:
meta_12_df = pd.concat([pd.Series(meta1), pd.Series(meta2)],axis=1)
meta_12_df["equal"] = meta_12_df[0]==meta_12_df[1]
Expand Down
49 changes: 22 additions & 27 deletions hatyan/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,13 +610,12 @@ def write_netcdf(ts, filename, ts_ext=None, nosidx=False, mode='w'):

times_all = ts.index
timeseries = ts['values']
times_stepmin = (ts.index[1]-ts.index[0]).total_seconds()/60
dt_analysistime = dt.datetime.now()
data_nc = Dataset(filename, mode, format="NETCDF3_CLASSIC")
attr_dict = {'title': 'tidal prediction for %s to %s'%(times_all[0].strftime('%Y-%m-%d %H:%M:%S'), times_all[-1].strftime('%Y-%m-%d %H:%M:%S')),
'institution': 'Rijkswaterstaat',
'source': 'hatyan-%s tidal analysis program of Rijkswaterstaat'%(version_no),
'timestep_min': times_stepmin}
}
data_nc.setncatts(attr_dict)

ncvarlist = list(data_nc.variables.keys())
Expand Down Expand Up @@ -1060,10 +1059,7 @@ def crop_timeseries(ts, times, onlyfull=True):

# add metadata
metadata = metadata_from_obj(ts)
metadata['tstart'] = pd.Timestamp(tstart)
metadata['tstop'] = pd.Timestamp(tstop)
ts_pd_out = metadata_add_to_obj(ts_pd_out,metadata)

return ts_pd_out


Expand Down Expand Up @@ -1104,9 +1100,6 @@ def resample_timeseries(ts, timestep_min, tstart=None, tstop=None):

# add metadata
metadata = metadata_from_obj(ts)
metadata['tstart'] = pd.Timestamp(tstart)
metadata['tstop'] = pd.Timestamp(tstop)
metadata['timestep_min'] = timestep_min
data_pd_resample = metadata_add_to_obj(data_pd_resample,metadata)

return data_pd_resample
Expand Down Expand Up @@ -1533,49 +1526,52 @@ def read_dia(filename, station=None, block_ids=None, allow_duplicates=False):
pd.set_option('display.width', 200) #default was 80, but need more to display groepering
print_cols = ['block_starts', 'station', 'grootheid', 'groepering', 'tstart', 'tstop']
logger.info('blocks in diafile:\n%s'%(diablocks_pd[print_cols]))
str_getdiablockspd = 'A summary of the available blocks is printed above, obtain a full DataFrame of available diablocks with "diablocks_pd=hatyan.get_diablocks(filename)"'

#get equidistant timeseries from metadata
if block_ids is None or block_ids=='allstation':
if station is None:
if len(diablocks_pd)==1:
station = diablocks_pd.loc[0,'station']
else:
raise ValueError(('If block_ids argument is not provided (or None) or is "allstation", station '
f'argument should be provided.\n{diablocks_pd[print_cols]}'))
raise ValueError('If block_ids=None or block_ids="allstation", station argument should be provided. '
f'Available blocks:\n{diablocks_pd[print_cols]}')
bool_station = diablocks_pd['station']==station
ids_station = diablocks_pd[bool_station].index.tolist()
if len(ids_station)<1:
raise ValueError(f"No data block with requested station ({station}) present in dia file. {str_getdiablockspd}")
raise ValueError(f"No data block with requested station ({station}) present in dia file. "
f"Available blocks:\n{diablocks_pd[print_cols]}")
elif len(ids_station)>1 and block_ids is None:
raise ValueError(f"More than one data block with requested station ({station}) "
"present in dia file. Provide block_ids argument to read_dia() (int, list of int or 'allstation'). "
f"{str_getdiablockspd}")
f"Available blocks:\n{diablocks_pd[print_cols]}")
else: #exactly one occurrence or block_ids is provided or block_ids='allstation'
block_ids = ids_station
block_ids_one = ids_station
elif isinstance(block_ids,int):
block_ids_one = [block_ids]
else:
# prevent overwriting of block_ids in this file loop
block_ids_one = block_ids

#check validity of blockids of type listlist
if isinstance(block_ids,int):
block_ids = [block_ids]
if not isinstance(block_ids,list):
if not isinstance(block_ids_one,list):
raise TypeError('Invalid type for block_ids (should be int, list of int or "allstation")')
if not pd.Series(block_ids).isin(diablocks_pd.index).all():
raise ValueError(f"Invalid values in block_ids list ({block_ids}), "
if not pd.Series(block_ids_one).isin(diablocks_pd.index).all():
raise ValueError(f"Invalid values in block_ids list ({block_ids_one}), "
f"possible are {diablocks_pd.index.tolist()} (all integers)")

if station is not None:
if not isinstance(station,str):
raise TypeError('Station argument should be of type string')
bool_samestation = diablocks_pd.loc[block_ids,'station']==station
bool_samestation = diablocks_pd.loc[block_ids_one,'station']==station
if not bool_samestation.all():
raise ValueError("Both the arguments station and block_ids are provided, "
"but at least one of the requested block_ids corresponds to a different station. "
f"{str_getdiablockspd}")
f"Available blocks:\n{diablocks_pd[print_cols]}")

for block_id in block_ids:
if np.isnan(diablocks_pd.loc[block_id,'timestep_min']):
for block_id in block_ids_one:
if np.isnan(diablocks_pd.loc[block_id,'timestep_min']): # non-equidistant
data_pd_oneblock = read_dia_nonequidistant(filename_one, diablocks_pd, block_id)
else: #equidistant
else: # equidistant
data_pd_oneblock = read_dia_equidistant(filename_one, diablocks_pd, block_id)
data_pd_list.append(data_pd_oneblock)
metadata = metadata_from_obj(data_pd_oneblock)
Expand All @@ -1585,8 +1581,6 @@ def read_dia(filename, station=None, block_ids=None, allow_duplicates=False):
data_pd_all = pd.concat(data_pd_list)
metadata_compare(metadata_list)
metadata = metadata_list[0].copy()
metadata['tstart'] = metadata_list[0]['tstart']
metadata['tstop'] = metadata_list[-1]['tstop']
data_pd_all = metadata_add_to_obj(data_pd_all,metadata)

if allow_duplicates:
Expand All @@ -1595,7 +1589,8 @@ def read_dia(filename, station=None, block_ids=None, allow_duplicates=False):
#check overlapping timesteps, sort values on time
if data_pd_all.index.duplicated().any():
raise ValueError("Merged datasets have duplicate/overlapping timesteps, "
"clean up your input data or provide one file instead of a list")
"clean up your input data or provide one file instead of a list. "
"Or pass `allow_duplicates=True`")
if not data_pd_all.index.is_monotonic_increasing:
data_pd_all = data_pd_all.sort_index()

Expand Down
20 changes: 20 additions & 0 deletions tests/test_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,26 @@ def test_read_write_components_nondefaultsettings():
hatyan.write_components(comp_orig, filename=file_new)
assert "source" in str(e.value)

comp_orig = hatyan.read_components(filename=file_orig)
comp_orig.attrs['tzone'] = None
with pytest.raises(ValueError) as e:
hatyan.write_components(comp_orig, filename=file_new)
assert "tzone=None" in str(e.value)


@pytest.mark.unittest
def test_writecomponents_fromanalysis(tmp_path):
"""
this is of added value to check if all required metadata is present from an analysis
"""
current_station = 'VLISSGN'
file_data_comp0 = os.path.join(dir_testdata,f'{current_station}_obs1.txt')
ts = hatyan.read_dia(filename=file_data_comp0, station=current_station)

comp = hatyan.analysis(ts=ts, const_list='month', fu_alltimes=False)
file_comp = os.path.join(tmp_path, "temp_comp.txt")
hatyan.write_components(comp, file_comp)


@pytest.mark.unittest
def test_plot_components_validation():
Expand Down
29 changes: 1 addition & 28 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import os
import pytest
import pytz
import pandas as pd
import hatyan
from hatyan.metadata import metadata_from_obj, metadata_compare, wns_from_metadata

Expand All @@ -28,10 +27,6 @@ def test_readts_dia_metadata_multifile():
'grootheid': 'WATHTE',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('2009-01-01 00:00:00'),
'tstop': pd.Timestamp('2012-12-31 23:00:00'),
'timestep_min': 60.0,
'timestep_unit': 'min',
'TYP': 'TE',
'groepering': 'NVT',
'origin': 'from timeseries dia file'}
Expand All @@ -56,7 +51,6 @@ def test_metadata_compare_valueerror():

@pytest.mark.unittest
def test_anapred_metadata():

current_station = 'VLISSGN'
file_ts = os.path.join(dir_testdata, f'{current_station}_obs1.txt')
ts_measurements_group0 = hatyan.read_dia(filename=file_ts, station=current_station)
Expand All @@ -75,10 +69,6 @@ def test_anapred_metadata():
'grootheid': 'WATHTBRKD',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('2009-01-01 00:00:00'),
'tstop': pd.Timestamp('2009-12-31 23:00:00'),
'timestep_min': 60.0,
'timestep_unit': 'min',
'TYP': 'TE',
'groepering': 'NVT',
'origin': 'from timeseries dia file',
Expand All @@ -100,7 +90,6 @@ def test_anapred_metadata():

@pytest.mark.unittest
def test_hwlw_metadata():

current_station = 'VLISSGN'
file_ts = os.path.join(dir_testdata, f'{current_station}_obs1.txt')
ts_measurements_group0 = hatyan.read_dia(filename=file_ts, station=current_station)
Expand All @@ -119,10 +108,6 @@ def test_hwlw_metadata():
'grootheid': 'WATHTE',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('2009-01-01 00:00:00'),
'tstop': pd.Timestamp('2009-12-31 23:00:00'),
'timestep_min': 60.0,
'timestep_unit': 'min',
'TYP': 'TE',
'groepering': 'NVT',
'origin': 'from timeseries dia file'}
Expand All @@ -131,10 +116,6 @@ def test_hwlw_metadata():
'grootheid': 'WATHTBRKD',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('2009-01-01 00:00:00'),
'tstop': pd.Timestamp('2009-12-31 23:00:00'),
'timestep_min': 60.0,
'timestep_unit': 'min',
'TYP': 'TE',
'groepering': 'NVT',
'origin': 'from timeseries dia file',
Expand Down Expand Up @@ -163,10 +144,6 @@ def test_readts_dia_metadata_multiblock():
'grootheid': 'WATHTE',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('1980-01-01 01:32:00'),
'tstop': pd.Timestamp('1991-12-31 23:45:00'),
'timestep_min': None,
'timestep_unit': None,
'TYP': 'TN',
'groepering': 'GETETM2',
'origin': 'from timeseries dia file'}
Expand All @@ -184,10 +161,7 @@ def test_metadata_compare():
'groepering': 'NVT',
'grootheid': 'WATHTE',
'eenheid': 'cm',
'timestep_min': 60.0,
'timestep_unit': 'min',
'tstart': None,
'tstop': None}
}

metadata_compare([metadata,metadata,metadata])

Expand Down Expand Up @@ -220,4 +194,3 @@ def test_metadata_persistence():
assert len(comp.iloc[:10].attrs) > 0
assert len(comp["A"].attrs) > 0
# assert len(comp.max().attrs) > 0 # TODO: max is not persistent in python 3.8

Loading

0 comments on commit 5befefd

Please sign in to comment.