enable loading remote hdf5 files (#2782)
* attempt at loading remote hdf5

* added a couple tests

* rewind bytes after reading header

* addressed comments for tests and error message

* fixed pep8 formatting

* created _get_engine_from_magic_number function, new tests

* added description in whats-new

* fixed test failure on windows

* same error on windows and nix
scottyhq authored and shoyer committed Mar 16, 2019
1 parent f382fd8 commit 225868d
Showing 4 changed files with 111 additions and 30 deletions.
8 changes: 5 additions & 3 deletions doc/whats-new.rst
@@ -33,7 +33,9 @@ Breaking changes

Enhancements
~~~~~~~~~~~~

- Added ability to open netcdf4/hdf5 file-like objects with ``open_dataset``.
Requires (h5netcdf>0.7 and h5py>2.9.0). (:issue:`2781`)
By `Scott Henderson <https://github.com/scottyhq>`_
- Internal plotting now supports ``cftime.datetime`` objects as time series.
(:issue:`2164`)
By `Julius Busecke <https://github.com/jbusecke>`_ and
@@ -81,8 +83,8 @@ Enhancements
:py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence
serialization warnings raised if dates from a standard calendar are found to
be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By
`Spencer Clark <https://github.com/spencerkclark>`_.
`Spencer Clark <https://github.com/spencerkclark>`_.

- Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`).
By `Kevin Squire <https://github.com/kmsquire>`_.
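Below is a minimal, self-contained usage sketch of the enhancement announced in the whats-new entry above. The file name and variable names are illustrative only, and h5py > 2.9.0 plus h5netcdf > 0.7 are assumed to be installed.

import numpy as np
import xarray as xr

# Write a small dataset to a placeholder netCDF4/HDF5 file on disk.
ds = xr.Dataset({'x': ('t', np.arange(3))})
ds.to_netcdf('example.nc', engine='h5netcdf')

# open_dataset now also accepts an open file-like object (or raw bytes for
# netCDF3) instead of a path; previously only scipy/netCDF3 was supported.
with open('example.nc', 'rb') as f:
    with xr.open_dataset(f, engine='h5netcdf') as reopened:
        assert reopened.identical(ds)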

77 changes: 52 additions & 25 deletions xarray/backends/api.py
@@ -75,6 +75,34 @@ def _get_default_engine_netcdf():
return engine


def _get_engine_from_magic_number(filename_or_obj):
# check byte header to determine file type
if isinstance(filename_or_obj, bytes):
magic_number = filename_or_obj[:8]
else:
if filename_or_obj.tell() != 0:
raise ValueError("file-like object read/write pointer not at zero "
"please close and reopen, or use a context "
"manager")
magic_number = filename_or_obj.read(8)
filename_or_obj.seek(0)

if magic_number.startswith(b'CDF'):
engine = 'scipy'
elif magic_number.startswith(b'\211HDF\r\n\032\n'):
engine = 'h5netcdf'
if isinstance(filename_or_obj, bytes):
raise ValueError("can't open netCDF4/HDF5 as bytes "
"try passing a path or file-like object")
else:
if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80:
filename_or_obj = filename_or_obj[:80] + b'...'
raise ValueError('{} is not a valid netCDF file '
'did you mean to pass a string for a path instead?'
.format(filename_or_obj))
return engine


def _get_default_engine(path, allow_remote=False):
if allow_remote and is_remote_uri(path):
engine = _get_default_engine_remote_uri()
@@ -170,8 +198,8 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
Strings and Path objects are interpreted as a path to a netCDF file
or an OpenDAP URL and opened with python-netCDF4, unless the filename
ends with .gz, in which case the file is gunzipped and opened with
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
with scipy.io.netcdf (only netCDF3 supported).
scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
group : str, optional
Path to the netCDF4 group in the given file to open (only works for
netCDF4 files).
@@ -258,6 +286,13 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
--------
open_mfdataset
"""
engines = [None, 'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio',
'cfgrib', 'pseudonetcdf']
if engine not in engines:
raise ValueError('unrecognized engine for open_dataset: {}\n'
'must be one of: {}'
.format(engine, engines))

if autoclose is not None:
warnings.warn(
'The autoclose argument is no longer used by '
@@ -316,18 +351,9 @@ def maybe_decode_store(store, lock=False):

if isinstance(filename_or_obj, backends.AbstractDataStore):
store = filename_or_obj
ds = maybe_decode_store(store)
elif isinstance(filename_or_obj, str):

if (isinstance(filename_or_obj, bytes) and
filename_or_obj.startswith(b'\x89HDF')):
raise ValueError('cannot read netCDF4/HDF5 file images')
elif (isinstance(filename_or_obj, bytes) and
filename_or_obj.startswith(b'CDF')):
# netCDF3 file images are handled by scipy
pass
elif isinstance(filename_or_obj, str):
filename_or_obj = _normalize_path(filename_or_obj)
elif isinstance(filename_or_obj, str):
filename_or_obj = _normalize_path(filename_or_obj)

if engine is None:
engine = _get_default_engine(filename_or_obj,
@@ -352,18 +378,19 @@ def maybe_decode_store(store, lock=False):
elif engine == 'cfgrib':
store = backends.CfGribDataStore(
filename_or_obj, lock=lock, **backend_kwargs)
else:
raise ValueError('unrecognized engine for open_dataset: %r'
% engine)

with close_on_error(store):
ds = maybe_decode_store(store)
else:
if engine is not None and engine != 'scipy':
raise ValueError('can only read file-like objects with '
"default engine or engine='scipy'")
# assume filename_or_obj is a file-like object
store = backends.ScipyDataStore(filename_or_obj)
if engine not in [None, 'scipy', 'h5netcdf']:
raise ValueError("can only read bytes or file-like objects "
"with engine='scipy' or 'h5netcdf'")
engine = _get_engine_from_magic_number(filename_or_obj)
if engine == 'scipy':
store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
elif engine == 'h5netcdf':
store = backends.H5NetCDFStore(filename_or_obj, group=group,
lock=lock, **backend_kwargs)

with close_on_error(store):
ds = maybe_decode_store(store)

# Ensure source filename always stored in dataset object (GH issue #2550)
@@ -390,8 +417,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
Strings and Paths are interpreted as a path to a netCDF file or an
OpenDAP URL and opened with python-netCDF4, unless the filename ends
with .gz, in which case the file is gunzipped and opened with
scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
with scipy.io.netcdf (only netCDF3 supported).
scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
group : str, optional
Path to the netCDF4 group in the given file to open (only works for
netCDF4 files).
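For readers skimming the api.py changes above, here is a standalone sketch (not the library code itself) of the magic-number dispatch that the new _get_engine_from_magic_number helper performs; guess_engine and example.nc are hypothetical names used only for illustration.

def guess_engine(header: bytes) -> str:
    # netCDF3 files begin with the ASCII bytes b'CDF'; netCDF4/HDF5 files
    # begin with the 8-byte HDF5 signature \x89HDF\r\n\x1a\n.
    if header.startswith(b'CDF'):
        return 'scipy'
    if header.startswith(b'\211HDF\r\n\032\n'):
        return 'h5netcdf'
    raise ValueError('not a recognized netCDF byte signature')

with open('example.nc', 'rb') as f:  # placeholder path
    print(guess_engine(f.read(8)))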
6 changes: 6 additions & 0 deletions xarray/tests/__init__.py
@@ -77,6 +77,12 @@ def LooseVersion(vstring):
has_cfgrib, requires_cfgrib = _importorskip('cfgrib')

# some special cases
has_h5netcdf07, requires_h5netcdf07 = _importorskip('h5netcdf',
minversion='0.7')
has_h5py29, requires_h5py29 = _importorskip('h5py', minversion='2.9.0')
has_h5fileobj = has_h5netcdf07 and has_h5py29
requires_h5fileobj = pytest.mark.skipif(
not has_h5fileobj, reason='requires h5py>2.9.0 & h5netcdf>0.7')
has_scipy_or_netCDF4 = has_scipy or has_netCDF4
requires_scipy_or_netCDF4 = pytest.mark.skipif(
not has_scipy_or_netCDF4, reason='requires scipy or netCDF4')
50 changes: 48 additions & 2 deletions xarray/tests/test_backends.py
@@ -35,7 +35,7 @@
requires_cftime, requires_dask, requires_h5netcdf, requires_netCDF4,
requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio,
requires_rasterio, requires_scipy, requires_scipy_or_netCDF4,
requires_zarr)
requires_zarr, requires_h5fileobj)
from .test_coding_times import (_STANDARD_CALENDARS, _NON_STANDARD_CALENDARS,
_ALL_CALENDARS)
from .test_dataset import create_test_data
@@ -1770,7 +1770,7 @@ def test_engine(self):
open_dataset(tmp_file, engine='foobar')

netcdf_bytes = data.to_netcdf()
with raises_regex(ValueError, 'can only read'):
with raises_regex(ValueError, 'unrecognized engine'):
open_dataset(BytesIO(netcdf_bytes), engine='foobar')

def test_cross_engine_read_write_netcdf3(self):
@@ -1955,6 +1955,52 @@ def test_dump_encodings_h5py(self):
assert actual.x.encoding['compression_opts'] is None


@requires_h5fileobj
class TestH5NetCDFFileObject(TestH5NetCDFData):
engine = 'h5netcdf'

def test_open_badbytes(self):
with raises_regex(ValueError, "HDF5 as bytes"):
with open_dataset(b'\211HDF\r\n\032\n', engine='h5netcdf'):
pass
with raises_regex(ValueError, "not a valid netCDF"):
with open_dataset(b'garbage'):
pass
with raises_regex(ValueError, "can only read bytes"):
with open_dataset(b'garbage', engine='netcdf4'):
pass
with raises_regex(ValueError, "not a valid netCDF"):
with open_dataset(BytesIO(b'garbage'), engine='h5netcdf'):
pass

def test_open_twice(self):
expected = create_test_data()
expected.attrs['foo'] = 'bar'
with raises_regex(ValueError, 'read/write pointer not at zero'):
with create_tmp_file() as tmp_file:
expected.to_netcdf(tmp_file, engine='h5netcdf')
with open(tmp_file, 'rb') as f:
with open_dataset(f, engine='h5netcdf'):
with open_dataset(f, engine='h5netcdf'):
pass

def test_open_fileobj(self):
# open in-memory datasets instead of local file paths
expected = create_test_data().drop('dim3')
expected.attrs['foo'] = 'bar'
with create_tmp_file() as tmp_file:
expected.to_netcdf(tmp_file, engine='h5netcdf')

with open(tmp_file, 'rb') as f:
with open_dataset(f, engine='h5netcdf') as actual:
assert_identical(expected, actual)

f.seek(0)
with BytesIO(f.read()) as bio:
with open_dataset(bio, engine='h5netcdf') as actual:
assert_identical(expected, actual)


@requires_h5netcdf
@requires_dask
@pytest.mark.filterwarnings('ignore:deallocating CachingFileManager')
