Skip to content

Commit

Permalink
313 metadata comparison slightly too strict for hatyan.read_dia (#314)
Browse files Browse the repository at this point in the history
* add testcase that fails with the current setup

* added tests for multiblock diafile errors

* removed tstart/tstop/tzone metadata from timeseries dataframes

* removed timestep_min attr throughout code

* prevent overwriting of block_ids argument in file loop

* updated whatsnew
  • Loading branch information
veenstrajelmer authored Jun 4, 2024
1 parent 8a004bc commit 5befefd
Show file tree
Hide file tree
Showing 8 changed files with 118 additions and 95 deletions.
3 changes: 3 additions & 0 deletions docs/whats-new.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,9 @@

## UNRELEASED

### Fix
- repaired support for equidistant multiblock diafiles with varying timesteps in [#314](https://github.com/Deltares/hatyan/pull/314)


## 2.8.0 (2024-05-08)
This release contains many changes of which several also impact the user. These changes benefit the user friendliness and robustness of hatyan.
Expand Down
13 changes: 10 additions & 3 deletions hatyan/analysis_prediction.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,6 +255,8 @@ def analysis(ts, const_list,
metadata['xfac'] = hatyan_settings.xfac
metadata['fu_alltimes'] = hatyan_settings.fu_alltimes
metadata['source'] = hatyan_settings.source
metadata['tstart'] = ts.index.min().tz_localize(None)
metadata['tstop'] = ts.index.max().tz_localize(None)
metadata['tzone'] = ts.index.tz
COMP_mean_pd = metadata_add_to_obj(COMP_mean_pd, metadata)

Expand All @@ -280,8 +282,8 @@ def analysis_singleperiod(ts, const_list, hatyan_settings):
if bool_ts_duplicated.any():
raise ValueError(f'{bool_ts_duplicated.sum()} duplicate timesteps in provided timeseries, remove them e.g. with: ts = ts[~ts.index.duplicated(keep="first")]')
message = (f'#timesteps = {len(ts)}\n'
f'tstart = {ts.index[0].strftime("%Y-%m-%d %H:%M:%S")}\n'
f'tstop = {ts.index[-1].strftime("%Y-%m-%d %H:%M:%S")}\n'
f'tstart = {ts.index.min().strftime("%Y-%m-%d %H:%M:%S")}\n'
f'tstop = {ts.index.max().strftime("%Y-%m-%d %H:%M:%S")}\n'
f'timestep = {ts.index.freq}')
logger.info(message)

Expand Down Expand Up @@ -417,7 +419,7 @@ def split_components(comp, dood_date_mid, hatyan_settings):
def prediction_singleperiod(comp:pd.DataFrame, times:pd.DatetimeIndex, hatyan_settings) -> pd.DataFrame:

metadata_comp = metadata_from_obj(comp)
tzone_comp = metadata_comp.pop("tzone")
tzone_comp = metadata_comp.pop('tzone')

if not isinstance(times, pd.DatetimeIndex):
raise TypeError(f'times argument can be of type pd.DatetimeIndex or slice, not {type(times)}')
Expand Down Expand Up @@ -566,8 +568,13 @@ def prediction(comp, times=None, timestep=None):
# update metadata
if metadata_comp['grootheid'] == 'WATHTE':
metadata_comp['grootheid'] = 'WATHTBRKD'
# prevent adding time metadata from component dataframe to prediction dataframe
if 'tzone' in metadata_comp.keys():
metadata_comp.pop('tzone')
if 'tstart' in metadata_comp.keys():
metadata_comp.pop("tstart")
if 'tstop' in metadata_comp.keys():
metadata_comp.pop("tstop")
ts_prediction = metadata_add_to_obj(ts_prediction, metadata_comp)

return ts_prediction
4 changes: 2 additions & 2 deletions hatyan/components.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def write_components(comp, filename):
tstop = metadata.pop('tstop')
tzone = metadata.pop('tzone')
if tzone is None:
raise AttributeError("write_components() encountered tzone=None in components dataframe, not allowed.")
raise ValueError("write_components() encountered tzone=None in components dataframe, not allowed.")
tzone_min = tzone._minutes
tstart_str = tstart.strftime("%Y%m%d %H%M")
tstop_str = tstop.strftime("%Y%m%d %H%M")
Expand Down Expand Up @@ -243,7 +243,7 @@ def merge_componentgroups(comp_main, comp_sec):
comp_sec_meta = metadata_from_obj(comp_sec).copy()
comp_sec_list = comp_sec.index.tolist()

meta_settings_list = ['origin','groepering','timestep_min','timestep_unit','TYP']
meta_settings_list = ['origin','groepering','tstart','tstop','TYP']
comp_main_meta_others = {}
for key in meta_settings_list:
if key in comp_main_meta:
Expand Down
29 changes: 4 additions & 25 deletions hatyan/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
@author: veenstra
"""

import numpy as np
import pandas as pd


Expand All @@ -24,22 +23,13 @@ def metadata_from_diablocks(diablocks_pd, block_id):
diablocks_pd_onerow = diablocks_pd.iloc[block_id]

metadata_keys = ['station', 'grootheid', 'eenheid',
'vertref',
'tstart', 'tstop',
'timestep_min', 'timestep_unit',
'TYP', 'groepering']
'vertref', 'TYP', 'groepering']

#TODO: align with metadata from hatyan.read_components()
metadata = {key:diablocks_pd_onerow[key] for key in metadata_keys}

# add origin
metadata['origin'] = 'from timeseries dia file'

# replace nan with None (otherwise metadata_compare fails)
#TODO: avoid nan in metadata (timestep for hoek_har.dia)
if np.isnan(metadata['timestep_min']): #non-equidistant, nan in py38 and none in py39 (pandas 2.1.2)
metadata['timestep_min'] = None
metadata['timestep_unit'] = None
return metadata


Expand All @@ -49,21 +39,10 @@ def metadata_from_obj(obj):


def metadata_compare(metadata_list):

# remove tstart/tstop since they cannot be compared on equality in case of multifile dia
metadata_list_notstartstop = []
for meta in metadata_list:
meta_new = meta.copy()
if 'tstart' in meta_new:
meta_new.pop('tstart')
if 'tstop' in meta_new:
meta_new.pop('tstop')
metadata_list_notstartstop.append(meta_new)

nmeta = len(metadata_list_notstartstop)
nmeta = len(metadata_list)
for i in range(1,nmeta):
meta1 = metadata_list_notstartstop[i-1]
meta2 = metadata_list_notstartstop[i]
meta1 = metadata_list[i-1]
meta2 = metadata_list[i]
if meta1!=meta2:
meta_12_df = pd.concat([pd.Series(meta1), pd.Series(meta2)],axis=1)
meta_12_df["equal"] = meta_12_df[0]==meta_12_df[1]
Expand Down
49 changes: 22 additions & 27 deletions hatyan/timeseries.py
Original file line number Diff line number Diff line change
Expand Up @@ -610,13 +610,12 @@ def write_netcdf(ts, filename, ts_ext=None, nosidx=False, mode='w'):

times_all = ts.index
timeseries = ts['values']
times_stepmin = (ts.index[1]-ts.index[0]).total_seconds()/60
dt_analysistime = dt.datetime.now()
data_nc = Dataset(filename, mode, format="NETCDF3_CLASSIC")
attr_dict = {'title': 'tidal prediction for %s to %s'%(times_all[0].strftime('%Y-%m-%d %H:%M:%S'), times_all[-1].strftime('%Y-%m-%d %H:%M:%S')),
'institution': 'Rijkswaterstaat',
'source': 'hatyan-%s tidal analysis program of Rijkswaterstaat'%(version_no),
'timestep_min': times_stepmin}
}
data_nc.setncatts(attr_dict)

ncvarlist = list(data_nc.variables.keys())
Expand Down Expand Up @@ -1060,10 +1059,7 @@ def crop_timeseries(ts, times, onlyfull=True):

# add metadata
metadata = metadata_from_obj(ts)
metadata['tstart'] = pd.Timestamp(tstart)
metadata['tstop'] = pd.Timestamp(tstop)
ts_pd_out = metadata_add_to_obj(ts_pd_out,metadata)

return ts_pd_out


Expand Down Expand Up @@ -1104,9 +1100,6 @@ def resample_timeseries(ts, timestep_min, tstart=None, tstop=None):

# add metadata
metadata = metadata_from_obj(ts)
metadata['tstart'] = pd.Timestamp(tstart)
metadata['tstop'] = pd.Timestamp(tstop)
metadata['timestep_min'] = timestep_min
data_pd_resample = metadata_add_to_obj(data_pd_resample,metadata)

return data_pd_resample
Expand Down Expand Up @@ -1533,49 +1526,52 @@ def read_dia(filename, station=None, block_ids=None, allow_duplicates=False):
pd.set_option('display.width', 200) #default was 80, but need more to display groepering
print_cols = ['block_starts', 'station', 'grootheid', 'groepering', 'tstart', 'tstop']
logger.info('blocks in diafile:\n%s'%(diablocks_pd[print_cols]))
str_getdiablockspd = 'A summary of the available blocks is printed above, obtain a full DataFrame of available diablocks with "diablocks_pd=hatyan.get_diablocks(filename)"'

#get equidistant timeseries from metadata
if block_ids is None or block_ids=='allstation':
if station is None:
if len(diablocks_pd)==1:
station = diablocks_pd.loc[0,'station']
else:
raise ValueError(('If block_ids argument is not provided (or None) or is "allstation", station '
f'argument should be provided.\n{diablocks_pd[print_cols]}'))
raise ValueError('If block_ids=None or block_ids="allstation", station argument should be provided. '
f'Available blocks:\n{diablocks_pd[print_cols]}')
bool_station = diablocks_pd['station']==station
ids_station = diablocks_pd[bool_station].index.tolist()
if len(ids_station)<1:
raise ValueError(f"No data block with requested station ({station}) present in dia file. {str_getdiablockspd}")
raise ValueError(f"No data block with requested station ({station}) present in dia file. "
f"Available blocks:\n{diablocks_pd[print_cols]}")
elif len(ids_station)>1 and block_ids is None:
raise ValueError(f"More than one data block with requested station ({station}) "
"present in dia file. Provide block_ids argument to read_dia() (int, list of int or 'allstation'). "
f"{str_getdiablockspd}")
f"Available blocks:\n{diablocks_pd[print_cols]}")
else: #exactly one occurrence or block_ids is provided or block_ids='allstation'
block_ids = ids_station
block_ids_one = ids_station
elif isinstance(block_ids,int):
block_ids_one = [block_ids]
else:
# prevent overwriting of block_ids in this file loop
block_ids_one = block_ids

#check validity of blockids of type listlist
if isinstance(block_ids,int):
block_ids = [block_ids]
if not isinstance(block_ids,list):
if not isinstance(block_ids_one,list):
raise TypeError('Invalid type for block_ids (should be int, list of int or "allstation")')
if not pd.Series(block_ids).isin(diablocks_pd.index).all():
raise ValueError(f"Invalid values in block_ids list ({block_ids}), "
if not pd.Series(block_ids_one).isin(diablocks_pd.index).all():
raise ValueError(f"Invalid values in block_ids list ({block_ids_one}), "
f"possible are {diablocks_pd.index.tolist()} (all integers)")

if station is not None:
if not isinstance(station,str):
raise TypeError('Station argument should be of type string')
bool_samestation = diablocks_pd.loc[block_ids,'station']==station
bool_samestation = diablocks_pd.loc[block_ids_one,'station']==station
if not bool_samestation.all():
raise ValueError("Both the arguments station and block_ids are provided, "
"but at least one of the requested block_ids corresponds to a different station. "
f"{str_getdiablockspd}")
f"Available blocks:\n{diablocks_pd[print_cols]}")

for block_id in block_ids:
if np.isnan(diablocks_pd.loc[block_id,'timestep_min']):
for block_id in block_ids_one:
if np.isnan(diablocks_pd.loc[block_id,'timestep_min']): # non-equidistant
data_pd_oneblock = read_dia_nonequidistant(filename_one, diablocks_pd, block_id)
else: #equidistant
else: # equidistant
data_pd_oneblock = read_dia_equidistant(filename_one, diablocks_pd, block_id)
data_pd_list.append(data_pd_oneblock)
metadata = metadata_from_obj(data_pd_oneblock)
Expand All @@ -1585,8 +1581,6 @@ def read_dia(filename, station=None, block_ids=None, allow_duplicates=False):
data_pd_all = pd.concat(data_pd_list)
metadata_compare(metadata_list)
metadata = metadata_list[0].copy()
metadata['tstart'] = metadata_list[0]['tstart']
metadata['tstop'] = metadata_list[-1]['tstop']
data_pd_all = metadata_add_to_obj(data_pd_all,metadata)

if allow_duplicates:
Expand All @@ -1595,7 +1589,8 @@ def read_dia(filename, station=None, block_ids=None, allow_duplicates=False):
#check overlapping timesteps, sort values on time
if data_pd_all.index.duplicated().any():
raise ValueError("Merged datasets have duplicate/overlapping timesteps, "
"clean up your input data or provide one file instead of a list")
"clean up your input data or provide one file instead of a list. "
"Or pass `allow_duplicates=True`")
if not data_pd_all.index.is_monotonic_increasing:
data_pd_all = data_pd_all.sort_index()

Expand Down
20 changes: 20 additions & 0 deletions tests/test_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,6 +89,26 @@ def test_read_write_components_nondefaultsettings():
hatyan.write_components(comp_orig, filename=file_new)
assert "source" in str(e.value)

comp_orig = hatyan.read_components(filename=file_orig)
comp_orig.attrs['tzone'] = None
with pytest.raises(ValueError) as e:
hatyan.write_components(comp_orig, filename=file_new)
assert "tzone=None" in str(e.value)


@pytest.mark.unittest
def test_writecomponents_fromanalysis(tmp_path):
"""
this is of added value to check if all required metadata is present from an analysis
"""
current_station = 'VLISSGN'
file_data_comp0 = os.path.join(dir_testdata,f'{current_station}_obs1.txt')
ts = hatyan.read_dia(filename=file_data_comp0, station=current_station)

comp = hatyan.analysis(ts=ts, const_list='month', fu_alltimes=False)
file_comp = os.path.join(tmp_path, "temp_comp.txt")
hatyan.write_components(comp, file_comp)


@pytest.mark.unittest
def test_plot_components_validation():
Expand Down
29 changes: 1 addition & 28 deletions tests/test_metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
import os
import pytest
import pytz
import pandas as pd
import hatyan
from hatyan.metadata import metadata_from_obj, metadata_compare, wns_from_metadata

Expand All @@ -28,10 +27,6 @@ def test_readts_dia_metadata_multifile():
'grootheid': 'WATHTE',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('2009-01-01 00:00:00'),
'tstop': pd.Timestamp('2012-12-31 23:00:00'),
'timestep_min': 60.0,
'timestep_unit': 'min',
'TYP': 'TE',
'groepering': 'NVT',
'origin': 'from timeseries dia file'}
Expand All @@ -56,7 +51,6 @@ def test_metadata_compare_valueerror():

@pytest.mark.unittest
def test_anapred_metadata():

current_station = 'VLISSGN'
file_ts = os.path.join(dir_testdata, f'{current_station}_obs1.txt')
ts_measurements_group0 = hatyan.read_dia(filename=file_ts, station=current_station)
Expand All @@ -75,10 +69,6 @@ def test_anapred_metadata():
'grootheid': 'WATHTBRKD',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('2009-01-01 00:00:00'),
'tstop': pd.Timestamp('2009-12-31 23:00:00'),
'timestep_min': 60.0,
'timestep_unit': 'min',
'TYP': 'TE',
'groepering': 'NVT',
'origin': 'from timeseries dia file',
Expand All @@ -100,7 +90,6 @@ def test_anapred_metadata():

@pytest.mark.unittest
def test_hwlw_metadata():

current_station = 'VLISSGN'
file_ts = os.path.join(dir_testdata, f'{current_station}_obs1.txt')
ts_measurements_group0 = hatyan.read_dia(filename=file_ts, station=current_station)
Expand All @@ -119,10 +108,6 @@ def test_hwlw_metadata():
'grootheid': 'WATHTE',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('2009-01-01 00:00:00'),
'tstop': pd.Timestamp('2009-12-31 23:00:00'),
'timestep_min': 60.0,
'timestep_unit': 'min',
'TYP': 'TE',
'groepering': 'NVT',
'origin': 'from timeseries dia file'}
Expand All @@ -131,10 +116,6 @@ def test_hwlw_metadata():
'grootheid': 'WATHTBRKD',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('2009-01-01 00:00:00'),
'tstop': pd.Timestamp('2009-12-31 23:00:00'),
'timestep_min': 60.0,
'timestep_unit': 'min',
'TYP': 'TE',
'groepering': 'NVT',
'origin': 'from timeseries dia file',
Expand Down Expand Up @@ -163,10 +144,6 @@ def test_readts_dia_metadata_multiblock():
'grootheid': 'WATHTE',
'eenheid': 'cm',
'vertref': 'NAP',
'tstart': pd.Timestamp('1980-01-01 01:32:00'),
'tstop': pd.Timestamp('1991-12-31 23:45:00'),
'timestep_min': None,
'timestep_unit': None,
'TYP': 'TN',
'groepering': 'GETETM2',
'origin': 'from timeseries dia file'}
Expand All @@ -184,10 +161,7 @@ def test_metadata_compare():
'groepering': 'NVT',
'grootheid': 'WATHTE',
'eenheid': 'cm',
'timestep_min': 60.0,
'timestep_unit': 'min',
'tstart': None,
'tstop': None}
}

metadata_compare([metadata,metadata,metadata])

Expand Down Expand Up @@ -220,4 +194,3 @@ def test_metadata_persistence():
assert len(comp.iloc[:10].attrs) > 0
assert len(comp["A"].attrs) > 0
# assert len(comp.max().attrs) > 0 # TODO: max is not persistent in python 3.8

Loading

0 comments on commit 5befefd

Please sign in to comment.