Skip to content

Commit

Permalink
942 fix test prevent dtype int (#943)
Browse files Browse the repository at this point in the history
* added extra dtype assertions to test_download_era5

* simplified prevent_dtype_int() and merged with dfmt.preprocess_ERA5()

* removed prevent_dtype_int() also in merge_meteofiles()
  • Loading branch information
veenstrajelmer authored Aug 9, 2024
1 parent abb1b0f commit ed77eb0
Show file tree
Hide file tree
Showing 4 changed files with 17 additions and 71 deletions.
47 changes: 10 additions & 37 deletions dfm_tools/xarray_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import pandas as pd
import logging
import numpy as np
import netCDF4
from dfm_tools.errors import OutOfRangeError

__all__ = [
Expand Down Expand Up @@ -109,8 +108,16 @@ def preprocess_ERA5(ds):
if 'valid_time' in ds.coords:
ds = ds.rename({'valid_time':'time'})

# prevent incorrect scaling/offset when merging files
prevent_dtype_int(ds)
# Prevent writing to (incorrectly scaled) int, since it might mess up mfdataset (https://github.com/Deltares/dfm_tools/issues/239)
# By dropping scaling/offset encoding and converting to float32 (will result in a larger dataset)
# ERA5 datasets retrieved with the new CDS-beta are zipped float32 instead of scaled int, so this is only needed for backwards compatibility with old files.
for var in ds.data_vars:
if not set(['dtype','scale_factor','add_offset']).issubset(ds[var].encoding.keys()):
continue
# the _FillValue will still be -32767 (int default), but this is no issue for float32
ds[var].encoding.pop('scale_factor')
ds[var].encoding.pop('add_offset')
ds[var].encoding["dtype"] = "float32"

return ds

Expand All @@ -124,37 +131,6 @@ def preprocess_woa(ds):
return ds


def prevent_dtype_int(ds, zlib:bool = True):
    """
    Avoid writing variables as (scaled) integers, since that can corrupt data
    when files are combined via mfdataset (https://github.com/Deltares/dfm_tools/issues/239).
    Storing floats instead of ints enlarges the dataset on disk; zlib=True
    compresses it back to roughly the int filesize at the cost of slower writing.

    Modifies the encoding of ``ds`` in place.
    """
    scaling_keys = {'dtype', 'scale_factor', 'add_offset'}
    for varname in ds.data_vars:
        enc = ds[varname].encoding
        # only act on variables that carry the full int scaling encoding
        if not scaling_keys.issubset(enc.keys()):
            continue
        if 'int' not in str(enc['dtype']):
            continue

        # drop scaling/offset so values are no longer stored as scaled integers
        enc.pop('scale_factor')
        enc.pop('add_offset')

        # missing_value is a non-convention attribute, discard it if present
        enc.pop('missing_value', None)

        # store as float32 (optionally compressed) instead of int to reduce filesize
        enc["dtype"] = "float32"
        if '_FillValue' in enc:
            enc['_FillValue'] = netCDF4.default_fillvals['f4']
        enc["zlib"] = zlib


def merge_meteofiles(file_nc:str, preprocess=None,
time_slice:slice = slice(None,None),
add_global_overlap:bool = False, zerostart:bool = False,
Expand Down Expand Up @@ -266,9 +242,6 @@ def merge_meteofiles(file_nc:str, preprocess=None,
field_zerostart['time'] = [times_pd.index[0]-dt.timedelta(days=2),times_pd.index[0]-dt.timedelta(days=1)] #TODO: is one zero field not enough? (is replacing first field not also ok? (results in 1hr transition period)
data_xr = xr.concat([field_zerostart,data_xr],dim='time',combine_attrs='no_conflicts') #combine_attrs argument prevents attrs from being dropped

# converting from int16 with scalefac/offset to float32 with zlib, relevant for ERA5
prevent_dtype_int(data_xr)

return data_xr


Expand Down
1 change: 1 addition & 0 deletions docs/whats-new.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- making `dfmt.open_dataset_extra()` more modular by partly moving it to separate private functions in [#913](https://github.com/Deltares/dfm_tools/pull/913)
- added station_id variable to dataset returned by `dfmt.interp_uds_to_plipoints()` in [#914](https://github.com/Deltares/dfm_tools/pull/914)
- update private functions under `dfmt.download_ERA5()` to CDS-Beta (requires ECMWF apikey instead) in [#925](https://github.com/Deltares/dfm_tools/pull/925)
- simplified prevention of int dtypes in `dfmt.preprocess_ERA5()` in [#943](https://github.com/Deltares/dfm_tools/pull/943)

### Fix
- also apply convert_360to180 to longitude variable in `dfmt.open_dataset_curvilinear()` in [#913](https://github.com/Deltares/dfm_tools/pull/913)
Expand Down
6 changes: 6 additions & 0 deletions tests/test_download.py
Original file line number Diff line number Diff line change
Expand Up @@ -184,6 +184,12 @@ def test_download_era5(file_nc_era5_pattern):
assert ds.sizes[timedim] == 1416
assert ds[timedim].to_pandas().iloc[0] == pd.Timestamp('2010-01-01')
assert ds[timedim].to_pandas().iloc[-1] == pd.Timestamp('2010-02-28 23:00')

# check if there are no integers in the dataset anymore
# this was the case before CDS-beta in https://github.com/Deltares/dfm_tools/issues/239
msl_encoding = ds['msl'].encoding
assert str(msl_encoding['dtype']) == 'float32'
assert 'scale_factor' not in msl_encoding.keys()


@pytest.mark.requiressecrets
Expand Down
34 changes: 0 additions & 34 deletions tests/test_xarray_helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,47 +5,13 @@
@author: veenstra
"""


import os
import pytest
import xarray as xr
import numpy as np
from dfm_tools.xarray_helpers import prevent_dtype_int
import dfm_tools as dfmt
import pandas as pd

#TODO: many xarray_helpers tests are still in test_dfm_tools.py


@pytest.mark.requiressecrets
@pytest.mark.unittest
@pytest.mark.timeout(60) # CDS downloads can be terribly slow; time out and skip in that case
def test_prevent_dtype_int(tmp_path, file_nc_era5_pattern):
    # file_nc_era5_pattern comes from file_nc_era5_pattern() in conftest.py
    varn = "msl"
    file_nc = os.path.join(tmp_path,f'era5_{varn}_*.nc')

    # exercise both compression settings; zlib=True should give a smaller file
    for zlib, max_filesize in [(False, 480000), (True, 250000)]:
        data_xr = xr.open_mfdataset(file_nc)
        prevent_dtype_int(data_xr, zlib=zlib)

        # round-trip via netcdf and compare against the in-memory dataset
        file_out = os.path.join(tmp_path, f'era5_prevent_dtype_int_{varn}_zlib_{zlib}.nc')
        data_xr.to_netcdf(file_out)
        data_xr_check = xr.open_dataset(file_out)
        print(data_xr_check[varn].encoding)

        assert np.allclose(data_xr[varn], data_xr_check[varn])

        # release file handles before checking the written file on disk
        del data_xr
        del data_xr_check

        file_size = os.path.getsize(file_out)
        print(file_size)
        assert file_size < max_filesize


@pytest.mark.requiressecrets
@pytest.mark.unittest
def test_merge_meteofiles(file_nc_era5_pattern):
Expand Down

0 comments on commit ed77eb0

Please sign in to comment.