diff --git a/pyglider/seaexplorer.py b/pyglider/seaexplorer.py
index 2c98c41..1ede7b1 100644
--- a/pyglider/seaexplorer.py
+++ b/pyglider/seaexplorer.py
@@ -287,6 +287,32 @@ def _interp_pld_to_pld(pld, ds, val, indctd):
     return val
 
 
+def _remove_fill_values(df, fill_value=9999):
+    """
+    Replace fill values in the Float64 columns of a dataframe with null.
+
+    Parameters
+    ----------
+    df : polars.DataFrame
+        Dataframe to clean.
+    fill_value : float, optional
+        Value marking missing data in the Float64 columns. Default is 9999.
+
+    Returns
+    -------
+    polars.DataFrame
+        Dataframe in which Float64 values equal to ``fill_value`` are null.
+        Columns of other datatypes are not affected.
+    """
+    df = df.with_columns(
+        pl.when(pl.col(pl.Float64) == fill_value)
+        .then(None)
+        .otherwise(pl.col(pl.Float64))
+        .keep_name()
+    )
+    return df
+
+
 def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw',
                       profile_filt_time=100, profile_min_time=300):
     """
@@ -303,6 +329,7 @@ def raw_to_timeseries(indir, outdir, deploymentyaml, kind='raw',
     gli = pl.read_parquet(f'{indir}/{id}-rawgli.parquet')
     _log.info(f'Opening combined payload file {indir}/{id}-{kind}pld.parquet')
     sensor = pl.read_parquet(f'{indir}/{id}-{kind}pld.parquet')
+    sensor = _remove_fill_values(sensor)
 
     # build a new data set based on info in `deploymentyaml.`
     # We will use ctd as the interpolant
diff --git a/tests/test_seaexplorer.py b/tests/test_seaexplorer.py
index d0ed977..6b7c866 100644
--- a/tests/test_seaexplorer.py
+++ b/tests/test_seaexplorer.py
@@ -1,4 +1,5 @@
 import polars as pl
+import numpy as np
 import pytest
 from pathlib import Path
 import os
@@ -63,6 +64,14 @@ def test_merge_rawnc():
                                            kind='sub')
     assert result_default is False
     assert result_sub is True
+
+
+def test__remove_fill_values():
+    # Values equal to 9999 in the original df should become null, which to_numpy() exposes as nan
+    df_in = pl.read_parquet('tests/data/realtime_rawnc/sea035.0012.pld1.sub.0036.parquet')
+    df_out = seaexplorer._remove_fill_values(df_in)
+    assert (df_in.select("GPCTD_DOF").to_numpy()[:, 0] == 9999).all()
+    assert np.isnan(df_out.select("GPCTD_DOF").to_numpy()[:, 0]).all()
 
 
 def test__interp_gli_to_pld():