From 08aba0bc996d3ee37ffd5218fead982852d0d3fb Mon Sep 17 00:00:00 2001
From: Scott Henderson
Date: Wed, 20 Feb 2019 13:42:26 -0800
Subject: [PATCH 1/9] attempt at loading remote hdf5

---
 xarray/backends/api.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 61efcfdedf2..892c71446f7 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -352,11 +352,15 @@ def maybe_decode_store(store, lock=False):
         with close_on_error(store):
             ds = maybe_decode_store(store)
     else:
-        if engine is not None and engine != 'scipy':
+        if engine == 'h5netcdf':
+            store = backends.H5NetCDFStore(
+                filename_or_obj, group=group, lock=lock, **backend_kwargs)
+        elif engine is not None and engine != 'scipy' and engine != 'h5netcdf':
             raise ValueError('can only read file-like objects with '
                              "default engine or engine='scipy'")
-        # assume filename_or_obj is a file-like object
-        store = backends.ScipyDataStore(filename_or_obj)
+        else:
+            # assume filename_or_obj is a file-like object
+            store = backends.ScipyDataStore(filename_or_obj)
 
     ds = maybe_decode_store(store)
 
     # Ensure source filename always stored in dataset object (GH issue #2550)
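This first patch only wires up the explicit case: a file-like object plus engine='h5netcdf'. A minimal sketch of the call pattern it enables (assumes this patch plus h5py and h5netcdf are installed; the filename is hypothetical):

```python
# Sketch only: the usage patch 1 enables, not part of the diff.
import xarray as xr

with open('example.nc', 'rb') as f:          # 'example.nc' is hypothetical
    ds = xr.open_dataset(f, engine='h5netcdf')
    print(ds)
```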
From 8ec34a680eb28119d55d72fea833eda7e8e510e9 Mon Sep 17 00:00:00 2001
From: Scott Henderson
Date: Mon, 25 Feb 2019 22:17:44 -0800
Subject: [PATCH 2/9] added a couple tests

---
 xarray/backends/api.py        | 44 +++++++++++++++++------------------
 xarray/tests/test_backends.py | 32 +++++++++++++++++++++++++
 2 files changed, 54 insertions(+), 22 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 892c71446f7..7faf978f668 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -170,8 +170,8 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
         Strings and Path objects are interpreted as a path to a netCDF file
         or an OpenDAP URL and opened with python-netCDF4, unless the filename
         ends with .gz, in which case the file is gunzipped and opened with
-        scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
-        with scipy.io.netcdf (only netCDF3 supported).
+        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
+        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
     group : str, optional
         Path to the netCDF4 group in the given file to open (only works for
         netCDF4 files).
@@ -310,17 +310,9 @@ def maybe_decode_store(store, lock=False):
     if isinstance(filename_or_obj, backends.AbstractDataStore):
         store = filename_or_obj
         ds = maybe_decode_store(store)
-    elif isinstance(filename_or_obj, str):
-        if (isinstance(filename_or_obj, bytes) and
-                filename_or_obj.startswith(b'\x89HDF')):
-            raise ValueError('cannot read netCDF4/HDF5 file images')
-        elif (isinstance(filename_or_obj, bytes) and
-                filename_or_obj.startswith(b'CDF')):
-            # netCDF3 file images are handled by scipy
-            pass
-        elif isinstance(filename_or_obj, str):
-            filename_or_obj = _normalize_path(filename_or_obj)
+    elif isinstance(filename_or_obj, str):
+        filename_or_obj = _normalize_path(filename_or_obj)
 
         if engine is None:
             engine = _get_default_engine(filename_or_obj,
@@ -352,15 +344,23 @@ def maybe_decode_store(store, lock=False):
         with close_on_error(store):
             ds = maybe_decode_store(store)
     else:
-        if engine == 'h5netcdf':
-            store = backends.H5NetCDFStore(
-                filename_or_obj, group=group, lock=lock, **backend_kwargs)
-        elif engine is not None and engine != 'scipy' and engine != 'h5netcdf':
-            raise ValueError('can only read file-like objects with '
-                             "default engine or engine='scipy'")
+        if engine is not None and engine != 'scipy' and engine != 'h5netcdf':
+            raise ValueError('can only read bytes or file-like objects with '
+                             "engine = None, 'scipy', or 'h5netcdf'")
         else:
-            # assume filename_or_obj is a file-like object
-            store = backends.ScipyDataStore(filename_or_obj)
+            if isinstance(filename_or_obj, bytes):
+                filename_or_obj = BytesIO(filename_or_obj)
+            # read first bytes of file-like object to determine engine
+            magic_number = filename_or_obj.read(8)
+            if magic_number.startswith(b'CDF'):
+                store = backends.ScipyDataStore(filename_or_obj,
+                                                **backend_kwargs)
+            elif magic_number.startswith(b'\211HDF\r\n\032\n'):
+                store = backends.H5NetCDFStore(filename_or_obj, group=group,
+                                               lock=lock, **backend_kwargs)
+            else:
+                raise ValueError("byte header doesn't match netCDF3 or "
+                                 "netCDF4/HDF5: {}".format(magic_number))
 
     ds = maybe_decode_store(store)
 
     # Ensure source filename always stored in dataset object (GH issue #2550)
@@ -387,8 +387,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
         Strings and Paths are interpreted as a path to a netCDF file or an
         OpenDAP URL and opened with python-netCDF4, unless the filename ends
         with .gz, in which case the file is gunzipped and opened with
-        scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
-        with scipy.io.netcdf (only netCDF3 supported).
+        scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
+        objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
     group : str, optional
         Path to the netCDF4 group in the given file to open (only works for
         netCDF4 files).

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index f610dba1352..84d9a619c7c 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1955,6 +1955,38 @@ def test_dump_encodings_h5py(self):
         assert actual.x.encoding['compression_opts'] is None
 
 
+# Requires h5py>2.9.0
+@requires_h5netcdf
+class TestH5NetCDFFileObject(TestH5NetCDFData):
+    engine = 'h5netcdf'
+
+    @network
+    def test_h5remote(self):
+        # alternative: http://era5-pds.s3.amazonaws.com/2008/01/main.nc
+        import requests
+        url = ('https://www.unidata.ucar.edu/'
+               'software/netcdf/examples/test_hgroups.nc')
+        print(url)
+        bytes = requests.get(url).content
+        with xr.open_dataset(bytes) as ds:
+            assert len(ds['UTC_time']) == 74
+            assert ds['UTC_time'].attrs['name'] == 'time'
+
+    def test_h5bytes(self):
+        import h5py
+        bio = BytesIO()
+        with h5py.File(bio) as ds:
+            v = np.array(2.0)
+            ds['scalar'] = v
+        bio.seek(0)
+        with xr.open_dataset(bio) as ds:
+            v = ds['scalar']
+            assert v == np.array(2.0)
+            assert v.dtype == 'float64'
+            assert v.ndim == 0
+            assert list(v.attrs) == []
+
+
 @requires_h5netcdf
 @requires_dask
 @pytest.mark.filterwarnings('ignore:deallocating CachingFileManager')
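Patch 2 replaces the explicit-engine requirement with dispatch on the file's magic number: netCDF3 files begin with b'CDF', HDF5 files with the 8-byte signature b'\x89HDF\r\n\x1a\n' (written with octal escapes as b'\211HDF\r\n\032\n' in the patch). A standalone sketch of that idea, outside xarray (the function name is illustrative, not xarray API):

```python
# Illustrative byte-sniffing, mirroring the dispatch added in patch 2.
from io import BytesIO

def sniff_netcdf_flavor(obj):
    if isinstance(obj, bytes):
        obj = BytesIO(obj)
    magic = obj.read(8)
    obj.seek(0)  # rewind; patch 2 omits this, which patch 3 below fixes
    if magic.startswith(b'CDF'):
        return 'scipy'
    if magic.startswith(b'\x89HDF\r\n\x1a\n'):
        return 'h5netcdf'
    raise ValueError('byte header matches neither netCDF3 nor netCDF4/HDF5')

print(sniff_netcdf_flavor(b'CDF\x01' + b'\x00' * 28))  # -> 'scipy'
```

Note that this sketch rewinds the stream after sniffing; the patch as committed does not, which is exactly the bug the next patch addresses.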
From 48b23b669609ac5f1b226b9e7bc6c52d711b6af6 Mon Sep 17 00:00:00 2001
From: Scott Henderson
Date: Mon, 4 Mar 2019 22:18:52 -0800
Subject: [PATCH 3/9] rewind bytes after reading header

---
 xarray/backends/api.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index 7faf978f668..c2e9c5c5df2 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -344,7 +344,7 @@ def maybe_decode_store(store, lock=False):
         with close_on_error(store):
             ds = maybe_decode_store(store)
     else:
-        if engine is not None and engine != 'scipy' and engine != 'h5netcdf':
+        if engine not in [None, 'scipy', 'h5netcdf']:
             raise ValueError('can only read bytes or file-like objects with '
                              "engine = None, 'scipy', or 'h5netcdf'")
         else:
@@ -352,6 +352,7 @@ def maybe_decode_store(store, lock=False):
                 filename_or_obj = BytesIO(filename_or_obj)
             # read first bytes of file-like object to determine engine
             magic_number = filename_or_obj.read(8)
+            filename_or_obj.seek(0)
             if magic_number.startswith(b'CDF'):
                 store = backends.ScipyDataStore(filename_or_obj,
                                                 **backend_kwargs)
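In isolation, the bug this one-line fix addresses: sniffing consumes the first 8 bytes, while scipy and h5py expect to read the stream from byte 0. A small self-contained illustration:

```python
# Why the rewind matters: a minimal in-memory demonstration.
from io import BytesIO

bio = BytesIO(b'CDF\x02' + b'\x00' * 28)
magic = bio.read(8)
print(bio.tell())  # -> 8: a backend starting here would miss the header
bio.seek(0)
print(bio.tell())  # -> 0: the backend now sees the stream from the start
```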
From 4a7e56055bab5cb9f87218f045d9b5e7152a84be Mon Sep 17 00:00:00 2001
From: Scott Henderson
Date: Tue, 5 Mar 2019 17:46:42 -0800
Subject: [PATCH 4/9] addressed comments for tests and error message

---
 xarray/backends/api.py        |  5 +++--
 xarray/tests/__init__.py      |  6 ++++++
 xarray/tests/test_backends.py | 33 +++++++++++++++++----------------
 3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index c2e9c5c5df2..c5a37b8209e 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -360,8 +360,9 @@ def maybe_decode_store(store, lock=False):
                 store = backends.H5NetCDFStore(filename_or_obj, group=group,
                                                lock=lock, **backend_kwargs)
             else:
-                raise ValueError("byte header doesn't match netCDF3 or "
-                                 "netCDF4/HDF5: {}".format(magic_number))
+                print(magic_number)
+                raise ValueError("file-like object is not a netCDF file: {}"
+                                 .format(filename_or_obj))
 
     ds = maybe_decode_store(store)
 
     # Ensure source filename always stored in dataset object (GH issue #2550)

diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index 281fc662197..841a56dbd8b 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -77,6 +77,12 @@ def LooseVersion(vstring):
 has_cfgrib, requires_cfgrib = _importorskip('cfgrib')
 
 # some special cases
+has_h5netcdf07, requires_h5netcdf07 = _importorskip('h5netcdf',
+                                                    minversion='0.7')
+has_h5py29, requires_h5py29 = _importorskip('h5py', minversion='2.9.0')
+has_h5fileobj = has_h5netcdf07 and has_h5py29
+requires_h5fileobj = pytest.mark.skipif(
+        not has_h5fileobj, reason='requires h5py>2.9.0 & h5netcdf>0.7')
 has_scipy_or_netCDF4 = has_scipy or has_netCDF4
 requires_scipy_or_netCDF4 = pytest.mark.skipif(
     not has_scipy_or_netCDF4, reason='requires scipy or netCDF4')

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 84d9a619c7c..9e4845c6ad2 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -35,7 +35,7 @@
     requires_cftime, requires_dask, requires_h5netcdf, requires_netCDF4,
     requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio,
     requires_rasterio, requires_scipy, requires_scipy_or_netCDF4,
-    requires_zarr)
+    requires_zarr, requires_h5fileobj)
 from .test_coding_times import (_STANDARD_CALENDARS, _NON_STANDARD_CALENDARS,
                                 _ALL_CALENDARS)
 from .test_dataset import create_test_data
@@ -1955,9 +1955,9 @@ def test_dump_encodings_h5py(self):
         assert actual.x.encoding['compression_opts'] is None
 
 
-# Requires h5py>2.9.0
-@requires_h5netcdf
+@requires_h5fileobj
 class TestH5NetCDFFileObject(TestH5NetCDFData):
+    h5py = pytest.importorskip('h5py', minversion='2.9.0')
     engine = 'h5netcdf'
 
     @network
@@ -1972,19 +1972,20 @@ def test_h5remote(self):
             assert len(ds['UTC_time']) == 74
             assert ds['UTC_time'].attrs['name'] == 'time'
 
-    def test_h5bytes(self):
-        import h5py
-        bio = BytesIO()
-        with h5py.File(bio) as ds:
-            v = np.array(2.0)
-            ds['scalar'] = v
-        bio.seek(0)
-        with xr.open_dataset(bio) as ds:
-            v = ds['scalar']
-            assert v == np.array(2.0)
-            assert v.dtype == 'float64'
-            assert v.ndim == 0
-            assert list(v.attrs) == []
+    def test_h5binary(self):
+        expected = create_test_data().drop('dim3')
+        expected.attrs['foo'] = 'bar'
+        with create_tmp_file() as tmp_file:
+            expected.to_netcdf(tmp_file, engine='h5netcdf')
+
+            with open(tmp_file, 'rb') as f:
+                with open_dataset(f, engine='h5netcdf') as actual:
+                    assert_identical(expected, actual)
+
+                f.seek(0)
+                with BytesIO(f.read()) as bio:
+                    with open_dataset(bio, engine='h5netcdf') as actual:
+                        assert_identical(expected, actual)
 
 
 @requires_h5netcdf

From 2aa7349f8c968df1fcc8d00eef594b59df7484f0 Mon Sep 17 00:00:00 2001
From: Scott Henderson
Date: Tue, 5 Mar 2019 17:47:54 -0800
Subject: [PATCH 5/9] fixed pep8 formatting

---
 xarray/tests/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/xarray/tests/__init__.py b/xarray/tests/__init__.py
index 841a56dbd8b..4ebcc29a61e 100644
--- a/xarray/tests/__init__.py
+++ b/xarray/tests/__init__.py
@@ -82,7 +82,7 @@ def LooseVersion(vstring):
 has_h5py29, requires_h5py29 = _importorskip('h5py', minversion='2.9.0')
 has_h5fileobj = has_h5netcdf07 and has_h5py29
 requires_h5fileobj = pytest.mark.skipif(
-        not has_h5fileobj, reason='requires h5py>2.9.0 & h5netcdf>0.7')
+    not has_h5fileobj, reason='requires h5py>2.9.0 & h5netcdf>0.7')
 has_scipy_or_netCDF4 = has_scipy or has_netCDF4
 requires_scipy_or_netCDF4 = pytest.mark.skipif(
     not has_scipy_or_netCDF4, reason='requires scipy or netCDF4')
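The new requires_h5fileobj marker combines two version gates, following the _importorskip pattern the test suite already uses. For readers unfamiliar with that helper, a rough standalone rendering (the real helper lives in xarray/tests/__init__.py and may differ in detail):

```python
# Illustrative stand-alone version of an _importorskip-style helper.
import importlib

import pytest
from distutils.version import LooseVersion

def importorskip(modname, minversion=None):
    try:
        mod = importlib.import_module(modname)
        has = True
        if minversion is not None:
            if LooseVersion(mod.__version__) < LooseVersion(minversion):
                has = False
    except ImportError:
        has = False
    reason = 'requires {}'.format(modname)
    if minversion is not None:
        reason += '>={}'.format(minversion)
    return has, pytest.mark.skipif(not has, reason=reason)

has_h5py29, requires_h5py29 = importorskip('h5py', minversion='2.9.0')
```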
From 1a4c4f3ce87321cb89cb465f2c6be7e34423521e Mon Sep 17 00:00:00 2001
From: Scott Henderson
Date: Thu, 7 Mar 2019 17:06:26 -0800
Subject: [PATCH 6/9] created _get_engine_from_magic_number function, new tests

---
 xarray/backends/api.py        | 69 +++++++++++++++++++++++------------
 xarray/tests/test_backends.py | 43 ++++++++++++++--------
 2 files changed, 73 insertions(+), 39 deletions(-)

diff --git a/xarray/backends/api.py b/xarray/backends/api.py
index c5a37b8209e..8e5c29ba86f 100644
--- a/xarray/backends/api.py
+++ b/xarray/backends/api.py
@@ -75,6 +75,34 @@ def _get_default_engine_netcdf():
     return engine
 
 
+def _get_engine_from_magic_number(filename_or_obj):
+    # check byte header to determine file type
+    if isinstance(filename_or_obj, bytes):
+        magic_number = filename_or_obj[:8]
+    else:
+        if filename_or_obj.tell() != 0:
+            raise ValueError("file-like object read/write pointer not at zero "
+                             "please close and reopen, or use a context "
+                             "manager")
+        magic_number = filename_or_obj.read(8)
+        filename_or_obj.seek(0)
+
+    if magic_number.startswith(b'CDF'):
+        engine = 'scipy'
+    elif magic_number.startswith(b'\211HDF\r\n\032\n'):
+        engine = 'h5netcdf'
+        if isinstance(filename_or_obj, bytes):
+            raise ValueError("can't open netCDF4/HDF5 as bytes "
+                             "try passing a path or file-like object")
+    else:
+        if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80:
+            filename_or_obj = filename_or_obj[:80] + b'...'
+        raise ValueError('{} is not a valid netCDF file '
+                         'did you mean to pass a string for a path instead?'
+                         .format(filename_or_obj))
+    return engine
+
+
 def _get_default_engine(path, allow_remote=False):
     if allow_remote and is_remote_uri(path):
         engine = _get_default_engine_remote_uri()
@@ -251,6 +279,13 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
     --------
     open_mfdataset
     """
+    engines = [None, 'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio',
+               'cfgrib', 'pseudonetcdf']
+    if engine not in engines:
+        raise ValueError('unrecognized engine for open_dataset: {}\n'
+                         'must be one of: {}'
+                         .format(engine, engines))
+
     if autoclose is not None:
         warnings.warn(
             'The autoclose argument is no longer used by '
@@ -309,7 +344,6 @@ def maybe_decode_store(store, lock=False):
 
     if isinstance(filename_or_obj, backends.AbstractDataStore):
         store = filename_or_obj
-        ds = maybe_decode_store(store)
     elif isinstance(filename_or_obj, str):
         filename_or_obj = _normalize_path(filename_or_obj)
 
@@ -337,32 +371,19 @@ def maybe_decode_store(store, lock=False):
         elif engine == 'cfgrib':
             store = backends.CfGribDataStore(
                 filename_or_obj, lock=lock, **backend_kwargs)
-        else:
-            raise ValueError('unrecognized engine for open_dataset: %r'
-                             % engine)
-
-        with close_on_error(store):
-            ds = maybe_decode_store(store)
     else:
         if engine not in [None, 'scipy', 'h5netcdf']:
-            raise ValueError('can only read bytes or file-like objects with '
-                             "engine = None, 'scipy', or 'h5netcdf'")
-        else:
-            if isinstance(filename_or_obj, bytes):
-                filename_or_obj = BytesIO(filename_or_obj)
-            # read first bytes of file-like object to determine engine
-            magic_number = filename_or_obj.read(8)
-            filename_or_obj.seek(0)
-            if magic_number.startswith(b'CDF'):
-                store = backends.ScipyDataStore(filename_or_obj,
-                                                **backend_kwargs)
-            elif magic_number.startswith(b'\211HDF\r\n\032\n'):
-                store = backends.H5NetCDFStore(filename_or_obj, group=group,
-                                               lock=lock, **backend_kwargs)
-            else:
-                print(magic_number)
-                raise ValueError("file-like object is not a netCDF file: {}"
-                                 .format(filename_or_obj))
+            raise ValueError("can only read bytes or file-like objects "
+                             "with engine='scipy' or 'h5netcdf'")
+        engine = _get_engine_from_magic_number(filename_or_obj)
+        if engine == 'scipy':
+            store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
+        elif engine == 'h5netcdf':
+            store = backends.H5NetCDFStore(filename_or_obj, group=group,
+                                           lock=lock, **backend_kwargs)
+
+    with close_on_error(store):
         ds = maybe_decode_store(store)
 
     # Ensure source filename always stored in dataset object (GH issue #2550)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 9e4845c6ad2..81a0f61d67b 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1770,7 +1770,7 @@ def test_engine(self):
             open_dataset(tmp_file, engine='foobar')
 
         netcdf_bytes = data.to_netcdf()
-        with raises_regex(ValueError, 'can only read'):
+        with raises_regex(ValueError, 'unrecognized engine'):
             open_dataset(BytesIO(netcdf_bytes), engine='foobar')
 
     def test_cross_engine_read_write_netcdf3(self):
@@ -1957,22 +1957,35 @@ def test_dump_encodings_h5py(self):
 
 @requires_h5fileobj
 class TestH5NetCDFFileObject(TestH5NetCDFData):
-    h5py = pytest.importorskip('h5py', minversion='2.9.0')
     engine = 'h5netcdf'
 
-    @network
-    def test_h5remote(self):
-        # alternative: http://era5-pds.s3.amazonaws.com/2008/01/main.nc
-        import requests
-        url = ('https://www.unidata.ucar.edu/'
-               'software/netcdf/examples/test_hgroups.nc')
-        print(url)
-        bytes = requests.get(url).content
-        with xr.open_dataset(bytes) as ds:
-            assert len(ds['UTC_time']) == 74
-            assert ds['UTC_time'].attrs['name'] == 'time'
-
-    def test_h5binary(self):
+    def test_open_badbytes(self):
+        with raises_regex(ValueError, "HDF5 as bytes"):
+            with open_dataset(b'\211HDF\r\n\032\n', engine='h5netcdf'):
+                pass
+        with raises_regex(ValueError, "not a valid netCDF"):
+            with open_dataset(b'garbage'):
+                pass
+        with raises_regex(ValueError, "can only read bytes"):
+            with open_dataset(b'garbage', engine='netcdf4'):
+                pass
+        with raises_regex(ValueError, "not a valid netCDF"):
+            with open_dataset(BytesIO(b'garbage'), engine='h5netcdf'):
+                pass
+
+    def test_open_twice(self):
+        expected = create_test_data()
+        expected.attrs['foo'] = 'bar'
+        with raises_regex(ValueError, 'read/write pointer not at zero'):
+            with create_tmp_file() as tmp_file:
+                expected.to_netcdf(tmp_file, engine='h5netcdf')
+                f = open(tmp_file, 'rb')
+                with open_dataset(f, engine='h5netcdf'):
+                    with open_dataset(f, engine='h5netcdf'):
+                        pass
+
+    def test_open_fileobj(self):
+        # open in-memory datasets instead of local file paths
         expected = create_test_data().drop('dim3')
         expected.attrs['foo'] = 'bar'
         with create_tmp_file() as tmp_file:
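After this refactor, engine validation happens up front and byte/file-like dispatch goes through _get_engine_from_magic_number, which also drops the stray print(magic_number) debug call from patch 4. A quick demonstration of the intended fail-fast behavior (error messages paraphrased from the patch; exact wording may differ between xarray versions):

```python
# Both bad inputs should now fail fast with a pointed ValueError:
# raw HDF5 bytes -> "can't open netCDF4/HDF5 as bytes ..."
# unrecognized bytes -> "b'garbage' is not a valid netCDF file ..."
import xarray as xr

for bad in (b'\x89HDF\r\n\x1a\n', b'garbage'):
    try:
        xr.open_dataset(bad)
    except ValueError as err:
        print(err)
```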
From 94a3afeddb2c15af662b29e888e28412a74456ff Mon Sep 17 00:00:00 2001
From: Scott Henderson
Date: Fri, 8 Mar 2019 09:30:29 -0800
Subject: [PATCH 7/9] added description in whats-new

---
 doc/whats-new.rst | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/doc/whats-new.rst b/doc/whats-new.rst
index 4218771848c..1d69848728b 100644
--- a/doc/whats-new.rst
+++ b/doc/whats-new.rst
@@ -33,7 +33,9 @@ Breaking changes
 
 Enhancements
 ~~~~~~~~~~~~
-
+- Added ability to open netcdf4/hdf5 file-like objects with ``open_dataset``.
+  Requires (h5netcdf>0.7 and h5py>2.9.0). (:issue:`2781`)
+  By `Scott Henderson `_
 - Internal plotting now supports ``cftime.datetime`` objects as time series.
   (:issue:`2164`)
   By `Julius Busecke `_ and
@@ -77,8 +79,8 @@ Enhancements
   :py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence
   serialization warnings raised if dates from a standard calendar are found to
   be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By
-  `Spencer Clark `_.
-
+  `Spencer Clark `_.
+
 - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`).
   By `Kevin Squire `_.
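An end-to-end sketch of the capability this whats-new entry describes: fetch a netCDF4/HDF5 file over HTTP and open it from memory (assumes h5netcdf > 0.7 and h5py > 2.9.0; the URL is the Unidata example file used in the earlier test and may move). Since patch 6, raw HDF5 bytes are rejected by design, so the payload is wrapped in a BytesIO:

```python
from io import BytesIO

import requests
import xarray as xr

url = ('https://www.unidata.ucar.edu/'
       'software/netcdf/examples/test_hgroups.nc')
resp = requests.get(url)
resp.raise_for_status()

# HDF5 content must arrive as a file-like object, not raw bytes.
with xr.open_dataset(BytesIO(resp.content), engine='h5netcdf') as ds:
    print(ds)
```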
From c067fa09191e30c38a3fefb495a5091728b0319a Mon Sep 17 00:00:00 2001
From: Scott Henderson
Date: Fri, 15 Mar 2019 12:41:56 -0700
Subject: [PATCH 8/9] fixed test failure on windows

---
 xarray/tests/test_backends.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index 81a0f61d67b..8214e102a86 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1976,13 +1976,17 @@ def test_open_badbytes(self):
     def test_open_twice(self):
         expected = create_test_data()
         expected.attrs['foo'] = 'bar'
-        with raises_regex(ValueError, 'read/write pointer not at zero'):
+        if ON_WINDOWS:
+            error = raises_regex(PermissionError, 'cannot access the file')
+        else:
+            error = raises_regex(ValueError, 'read/write pointer not at zero')
+        with error:
             with create_tmp_file() as tmp_file:
                 expected.to_netcdf(tmp_file, engine='h5netcdf')
-                f = open(tmp_file, 'rb')
-                with open_dataset(f, engine='h5netcdf'):
+                with open(tmp_file, 'rb') as f:
                     with open_dataset(f, engine='h5netcdf'):
-                        pass
+                        with open_dataset(f, engine='h5netcdf'):
+                            pass
 
     def test_open_fileobj(self):
         # open in-memory datasets instead of local file paths

From 73c022e641146e8a1b1daf6b175948bceb323102 Mon Sep 17 00:00:00 2001
From: Joseph Hamman
Date: Fri, 15 Mar 2019 16:35:12 -0700
Subject: [PATCH 9/9] same error on windows and nix

---
 xarray/tests/test_backends.py | 6 +-----
 1 file changed, 1 insertion(+), 5 deletions(-)

diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py
index eccd96c4d85..a20ba2df229 100644
--- a/xarray/tests/test_backends.py
+++ b/xarray/tests/test_backends.py
@@ -1976,11 +1976,7 @@ def test_open_badbytes(self):
     def test_open_twice(self):
         expected = create_test_data()
         expected.attrs['foo'] = 'bar'
-        if ON_WINDOWS:
-            error = raises_regex(PermissionError, 'cannot access the file')
-        else:
-            error = raises_regex(ValueError, 'read/write pointer not at zero')
-        with error:
+        with raises_regex(ValueError, 'read/write pointer not at zero'):
             with create_tmp_file() as tmp_file:
                 expected.to_netcdf(tmp_file, engine='h5netcdf')
                 with open(tmp_file, 'rb') as f:
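The guard that patches 8 and 9 settle on can be exercised directly: once h5netcdf has consumed a file handle, its read pointer is no longer at zero, so a second open on the same handle raises instead of silently misreading the file. A hedged sketch of that failure mode (the filename is hypothetical):

```python
import xarray as xr

with open('example.nc', 'rb') as f:          # 'example.nc' is hypothetical
    ds1 = xr.open_dataset(f, engine='h5netcdf')
    try:
        xr.open_dataset(f, engine='h5netcdf')
    except ValueError as err:
        print(err)  # file-like object read/write pointer not at zero ...
```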