From 38bb1ed3dc4a4ce349059965559a63c493a47a52 Mon Sep 17 00:00:00 2001 From: Callum Rollo Date: Fri, 9 Dec 2022 10:10:22 +0100 Subject: [PATCH 1/6] raise error if no valid timebase specified --- pyglider/seaexplorer.py | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py index 2c98c41..1ba8748 100644 --- a/pyglider/seaexplorer.py +++ b/pyglider/seaexplorer.py @@ -313,25 +313,14 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', if atts != 'coordinates': attr[atts] = ncvar[name][atts] - # If present, use the timebase specified in ncva: timebase in the - # deployment yaml. Otherwise, the ctd will be our timebase. - # It oversamples the nav data, but mildly undersamples the optics and - # oxygen.... - if 'timebase' in ncvar: - vals = sensor.select([ncvar['timebase']['source']]).to_numpy()[:, 0] - indctd = np.where(~np.isnan(vals))[0] - elif 'GPCTD_TEMPERATURE' in list(sensor.variables): - _log.warning('No timebase specified. Using GPCTD_TEMPERATURE as time' - 'base') - indctd = np.where(~np.isnan(sensor.GPCTD_TEMPERATURE))[0] - elif 'LEGATO_TEMPERATURE' in list(sensor.variables): - _log.warning('No timebase specified. Using LEGATO_TEMPERATURE as time' - 'base') - indctd = np.where(~np.isnan(sensor.LEGATO_TEMPERATURE))[0] - else: - _log.warning('No gpctd or legato data found. Using NAV_DEPTH as time' - 'base') - indctd = np.where(~np.isnan(sensor.NAV_DEPTH))[0] + # Use the timebase specified in ncvar: timebase in the + # deployment yaml. It is mandatory: a missing or unknown source raises ValueError.
+ if 'timebase' not in ncvar: + raise ValueError("Must specify timebase:source in netcdf_variables section of deployment yaml") + if ncvar['timebase']['source'] not in sensor.columns: + raise ValueError(f"timebase source: {ncvar['timebase']['source']} not found in pld1 columns") + vals = sensor.select([ncvar['timebase']['source']]).to_numpy()[:, 0] + indctd = np.where(~np.isnan(vals))[0] ds['time'] = (('time'), sensor.select('time').to_numpy()[indctd, 0], attr) thenames = list(ncvar.keys()) for i in ['time', 'timebase', 'keep_variables']: From b2e24ea3d184c62a0889944289a8e55276d40687 Mon Sep 17 00:00:00 2001 From: Callum Rollo Date: Mon, 12 Dec 2022 10:29:24 +0100 Subject: [PATCH 2/6] test for bad/missing timebase --- tests/test_seaexplorer.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/tests/test_seaexplorer.py b/tests/test_seaexplorer.py index d0ed977..30ba9c1 100644 --- a/tests/test_seaexplorer.py +++ b/tests/test_seaexplorer.py @@ -2,6 +2,7 @@ import pytest from pathlib import Path import os +import yaml os.system('rm tests/data/realtime_rawnc/*') library_dir = Path(__file__).parent.parent.absolute() example_dir = library_dir / 'tests/example-data/' @@ -88,3 +89,26 @@ def test_raw_to_timeseries(): assert 'No such file or directory' in str(missing_file_exc) assert result_sub == 'tests/data/l0-profiles/dfo-eva035-20190718.nc' + +def test_missing_bad_timebase(): + # Prepare yaml files with bad timebase and no timebase + with open(example_dir / 'example-seaexplorer/deploymentRealtime.yml') as fin: + deployment = yaml.safe_load(fin) + deployment['netcdf_variables']['timebase']['source'] = "non existing sensor" + with open(example_dir / 'example-seaexplorer/bad_timebase.yml', "w") as fin: + yaml.dump(deployment, fin) + deployment['netcdf_variables'].pop('timebase') + with open(example_dir / 'example-seaexplorer/no_timebase.yml', "w") as fin: + yaml.dump(deployment, fin) + with pytest.raises(ValueError) as bad_timebase_exc: + 
result_bad_timebase = seaexplorer.raw_to_timeseries('tests/data/realtime_rawnc/', + 'tests/data/l0-profiles/', + example_dir / 'example-seaexplorer/bad_timebase.yml', + kind='sub') + with pytest.raises(ValueError) as no_timebase_exc: + result_no_timebase = seaexplorer.raw_to_timeseries('tests/data/realtime_rawnc/', + 'tests/data/l0-profiles/', + example_dir / 'example-seaexplorer/no_timebase.yml', + kind='sub') + assert "sensor not found in pld1 columns" in str(bad_timebase_exc) + assert "Must specify timebase" in str(no_timebase_exc) From 918d3aa3138c2929b04210cc61b6740424634f07 Mon Sep 17 00:00:00 2001 From: Hayley Dosser Date: Thu, 5 Jan 2023 12:15:22 -0800 Subject: [PATCH 3/6] Fixed profile_time variable units to be IOOS compliant. The variable profile_time had units that were e.g., 'seconds from 2022-12-01' but with the time changing for each profile. Code has been modified so the correct units will show up for all profiles. This fix also solved an issue where the value of profile_time was showing up as zero for some reason.
--- pyglider/ncprocess.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index 5a71a17..1d57b67 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -111,12 +111,12 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml): # outname = outdir + '/' + utils.get_file_id(dss) + '.nc' _log.info('Writing %s', outname) - if 'units' in dss.profile_time.attrs: - dss.profile_time.attrs.pop('units') timeunits = 'seconds since 1970-01-01T00:00:00Z' timecalendar = 'gregorian' dss.to_netcdf(outname, encoding={'time': {'units': timeunits, - 'calendar': timecalendar}}) + 'calendar': timecalendar}, + 'profile_time': + {'units': timeunits}}) # add traj_strlen using bare ntcdf to make IOOS happy with netCDF4.Dataset(outname, 'r+') as nc: From d3950ce1f543732624cf6092d5c0fcb62e270e2e Mon Sep 17 00:00:00 2001 From: Callum Rollo Date: Tue, 17 Jan 2023 09:35:11 +0100 Subject: [PATCH 4/6] replace 9999 data values with nan (#143) * replace 9999 data values with nan * add test for _remove_fill_values --- pyglider/seaexplorer.py | 15 +++++++++++++++ tests/test_seaexplorer.py | 9 +++++++++ 2 files changed, 24 insertions(+) diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py index 1ba8748..7d0cd6d 100644 --- a/pyglider/seaexplorer.py +++ b/pyglider/seaexplorer.py @@ -287,6 +287,20 @@ def _interp_pld_to_pld(pld, ds, val, indctd): return val +def _remove_fill_values(df, fill_value=9999): + """ + For input dataframe df, this function converts all Float64 values equal to fill_value to null. Columns of other
+ """ + df = df.with_columns( + pl.when(pl.col(pl.Float64) == fill_value) + .then(None) + .otherwise(pl.col(pl.Float64)) + .keep_name() + ) + return df + + def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', profile_filt_time=100, profile_min_time=300): """ @@ -303,6 +317,7 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', gli = pl.read_parquet(f'{indir}/{id}-rawgli.parquet') _log.info(f'Opening combined payload file {indir}/{id}-{kind}pld.parquet') sensor = pl.read_parquet(f'{indir}/{id}-{kind}pld.parquet') + sensor = _remove_fill_values(sensor) # build a new data set based on info in `deploymentyaml.` # We will use ctd as the interpolant diff --git a/tests/test_seaexplorer.py b/tests/test_seaexplorer.py index 30ba9c1..37f2b6b 100644 --- a/tests/test_seaexplorer.py +++ b/tests/test_seaexplorer.py @@ -1,4 +1,5 @@ import polars as pl +import numpy as np import pytest from pathlib import Path import os @@ -64,6 +65,14 @@ def test_merge_rawnc(): kind='sub') assert result_default is False assert result_sub is True + + +def test__remove_fill_values(): + # This should convert values equal to 9999 in the original df to nan + df_in = pl.read_parquet('tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet') + df_out = seaexplorer._remove_fill_values(df_in) + assert (df_in.select("GPCTD_DOF").to_numpy()[:, 0] == 9999).all() + assert np.isnan(df_out.select("GPCTD_DOF").to_numpy()[:, 0]).all() def test__interp_gli_to_pld(): From db6fd9350621364b2767ef8503b61195e99a4b4a Mon Sep 17 00:00:00 2001 From: C-PROOF Date: Sat, 4 Feb 2023 15:11:18 -0800 Subject: [PATCH 5/6] Add fnamesuffix --- pyglider/ncprocess.py | 6 +++--- pyglider/slocum.py | 7 ++++--- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/pyglider/ncprocess.py b/pyglider/ncprocess.py index 1d57b67..c78e0d0 100644 --- a/pyglider/ncprocess.py +++ b/pyglider/ncprocess.py @@ -115,7 +115,7 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml): timecalendar =
'gregorian' dss.to_netcdf(outname, encoding={'time': {'units': timeunits, 'calendar': timecalendar}, - 'profile_time': + 'profile_time': {'units': timeunits}}) # add traj_strlen using bare ntcdf to make IOOS happy @@ -123,7 +123,7 @@ def extract_timeseries_profiles(inname, outdir, deploymentyaml): nc.renameDimension('string%d' % trajlen, 'traj_strlen') -def make_gridfiles(inname, outdir, deploymentyaml, dz=1): +def make_gridfiles(inname, outdir, deploymentyaml, *, fnamesuffix='', dz=1): """ Turn a timeseries netCDF file into a vertically gridded netCDF. @@ -241,7 +241,7 @@ def make_gridfiles(inname, outdir, deploymentyaml, dz=1): 'water_velocity_northward']) dsout.attrs = ds.attrs - outname = outdir + '/' + ds.attrs['deployment_name'] + '_grid.nc' + outname = outdir + '/' + ds.attrs['deployment_name'] + '_grid' + fnamesuffix + '.nc' _log.info('Writing %s', outname) timeunits = 'seconds since 1970-01-01T00:00:00Z' dsout.to_netcdf(outname, encoding={'time': {'units': timeunits}}) diff --git a/pyglider/slocum.py b/pyglider/slocum.py index d279b43..a1e675a 100644 --- a/pyglider/slocum.py +++ b/pyglider/slocum.py @@ -789,8 +789,9 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, *, def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, - search='*.[D|E]BD', time_base='sci_water_temp', - profile_filt_time=100, profile_min_time=300): + search='*.[D|E]BD', fnamesuffix='', + time_base='sci_water_temp', profile_filt_time=100, + profile_min_time=300): """ Convert directly from binary files to netcdf timeseries file. Requires dbdreader to be installed. 
@@ -934,7 +935,7 @@ def binary_to_timeseries(indir, cachedir, outdir, deploymentyaml, *, os.mkdir(outdir) except: pass - outname = (outdir + '/' + ds.attrs['deployment_name'] + '.nc') + outname = (outdir + '/' + ds.attrs['deployment_name'] + fnamesuffix + '.nc') _log.info('writing %s', outname) ds.to_netcdf(outname, 'w', encoding={'time': {'units': 'seconds since 1970-01-01T00:00:00Z'}}) From 7e07b72bfcad7d232b82962741e0298a8631a9fc Mon Sep 17 00:00:00 2001 From: C-PROOF Date: Sat, 4 Feb 2023 15:13:13 -0800 Subject: [PATCH 6/6] Add fnamesuffix --- pyglider/seaexplorer.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py index 2c98c41..21b3918 100644 --- a/pyglider/seaexplorer.py +++ b/pyglider/seaexplorer.py @@ -288,7 +288,8 @@ def _interp_pld_to_pld(pld, ds, val, indctd): def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', - profile_filt_time=100, profile_min_time=300): + profile_filt_time=100, profile_min_time=300, + fnamesuffix=''): """ A little different than above, for the 4-file version of the data set. """ @@ -455,7 +456,7 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw', except: pass id0 = ds.attrs['deployment_name'] - outname = outdir + id0 + '.nc' + outname = outdir + id0 + fnamesuffix + '.nc' _log.info('writing %s', outname) if 'units' in ds.time.attrs.keys(): ds.time.attrs.pop('units')