enable loading remote hdf5 files #2782

Merged: 12 commits, Mar 16, 2019
doc/whats-new.rst (8 changes: 5 additions & 3 deletions)
@@ -33,7 +33,9 @@ Breaking changes

Enhancements
~~~~~~~~~~~~

- Added ability to open netcdf4/hdf5 file-like objects with ``open_dataset``.
  Requires (h5netcdf>0.7 and h5py>2.9.0). (:issue:`2781`)
  By `Scott Henderson <https://github.com/scottyhq>`_
- Internal plotting now supports ``cftime.datetime`` objects as time series.
  (:issue:`2164`)
  By `Julius Busecke <https://github.com/jbusecke>`_ and
@@ -81,8 +83,8 @@ Enhancements
  :py:meth:`~xarray.open_mfdataset` (:issue:`1263`) and/or to silence
  serialization warnings raised if dates from a standard calendar are found to
  be outside the :py:class:`pandas.Timestamp`-valid range (:issue:`2754`). By
  `Spencer Clark <https://github.com/spencerkclark>`_.

- Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`).
  By `Kevin Squire <https://github.com/kmsquire>`_.

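A short usage sketch of the enhancement above (the file name and dataset are hypothetical; the pattern mirrors the tests added at the bottom of this PR):

import xarray as xr

# write a small dataset as netCDF4/HDF5, then reopen it from an open
# binary file object instead of a path
ds = xr.Dataset({'x': ('t', [1, 2, 3])})
ds.to_netcdf('example.nc', engine='h5netcdf')

with open('example.nc', 'rb') as f:
    with xr.open_dataset(f, engine='h5netcdf') as reopened:
        print(reopened)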
xarray/backends/api.py (77 changes: 52 additions & 25 deletions)
@@ -75,6 +75,34 @@ def _get_default_engine_netcdf():
    return engine


def _get_engine_from_magic_number(filename_or_obj):
    # check byte header to determine file type
    if isinstance(filename_or_obj, bytes):
        magic_number = filename_or_obj[:8]
    else:
        if filename_or_obj.tell() != 0:
            raise ValueError("file-like object read/write pointer not at zero "
                             "please close and reopen, or use a context "
                             "manager")
        magic_number = filename_or_obj.read(8)
        filename_or_obj.seek(0)

    if magic_number.startswith(b'CDF'):
        engine = 'scipy'
    elif magic_number.startswith(b'\211HDF\r\n\032\n'):
        engine = 'h5netcdf'
        if isinstance(filename_or_obj, bytes):
            raise ValueError("can't open netCDF4/HDF5 as bytes "
                             "try passing a path or file-like object")
    else:
        if isinstance(filename_or_obj, bytes) and len(filename_or_obj) > 80:
            filename_or_obj = filename_or_obj[:80] + b'...'
        raise ValueError('{} is not a valid netCDF file '
                         'did you mean to pass a string for a path instead?'
                         .format(filename_or_obj))
    return engine

Review comment (Member), on the bytes error branch above: Just a note: we could support this in the future, by wrapping bytes in a io.BytesIO object (like we do for the scipy backend). But no need to add it now -- I like explicitly providing file objects.
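To make the detection concrete, here is a small self-contained demo (not part of the PR) of the two magic numbers the function looks for; note that b'\211HDF\r\n\032\n' is the octal-escape spelling of b'\x89HDF\r\n\x1a\n':

import io

# netCDF3 images begin with b'CDF'; netCDF4/HDF5 images begin with
# the 8-byte HDF5 signature
netcdf3_image = io.BytesIO(b'CDF\x01' + b'\x00' * 100)
hdf5_image = io.BytesIO(b'\x89HDF\r\n\x1a\n' + b'\x00' * 100)

for f in (netcdf3_image, hdf5_image):
    magic_number = f.read(8)
    f.seek(0)  # rewind, as _get_engine_from_magic_number does
    print('scipy' if magic_number.startswith(b'CDF') else 'h5netcdf')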


def _get_default_engine(path, allow_remote=False):
    if allow_remote and is_remote_uri(path):
        engine = _get_default_engine_remote_uri()
@@ -170,8 +198,8 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
        Strings and Path objects are interpreted as a path to a netCDF file
        or an OpenDAP URL and opened with python-netCDF4, unless the filename
        ends with .gz, in which case the file is gunzipped and opened with
-       scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
-       with scipy.io.netcdf (only netCDF3 supported).
+       scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
+       objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
    group : str, optional
        Path to the netCDF4 group in the given file to open (only works for
        netCDF4 files).
@@ -258,6 +286,13 @@ def open_dataset(filename_or_obj, group=None, decode_cf=True,
    --------
    open_mfdataset
    """
    engines = [None, 'netcdf4', 'scipy', 'pydap', 'h5netcdf', 'pynio',
               'cfgrib', 'pseudonetcdf']
    if engine not in engines:
        raise ValueError('unrecognized engine for open_dataset: {}\n'
                         'must be one of: {}'
                         .format(engine, engines))

    if autoclose is not None:
        warnings.warn(
            'The autoclose argument is no longer used by '
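With the validation above, an unknown engine now fails fast, before any file is touched. A hedged sketch of the resulting behaviour (file name hypothetical):

import xarray as xr

try:
    xr.open_dataset('example.nc', engine='foobar')
except ValueError as err:
    print(err)
    # unrecognized engine for open_dataset: foobar
    # must be one of: [None, 'netcdf4', 'scipy', 'pydap', 'h5netcdf',
    #                  'pynio', 'cfgrib', 'pseudonetcdf']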
@@ -316,18 +351,9 @@ def maybe_decode_store(store, lock=False):

    if isinstance(filename_or_obj, backends.AbstractDataStore):
        store = filename_or_obj
        ds = maybe_decode_store(store)
-    elif isinstance(filename_or_obj, str):
-
-        if (isinstance(filename_or_obj, bytes) and
-                filename_or_obj.startswith(b'\x89HDF')):
-            raise ValueError('cannot read netCDF4/HDF5 file images')
-        elif (isinstance(filename_or_obj, bytes) and
-                filename_or_obj.startswith(b'CDF')):
-            # netCDF3 file images are handled by scipy
-            pass
-        elif isinstance(filename_or_obj, str):
-            filename_or_obj = _normalize_path(filename_or_obj)
+    elif isinstance(filename_or_obj, str):
+        filename_or_obj = _normalize_path(filename_or_obj)

        if engine is None:
            engine = _get_default_engine(filename_or_obj,
@@ -352,18 +378,19 @@ def maybe_decode_store(store, lock=False):
        elif engine == 'cfgrib':
            store = backends.CfGribDataStore(
                filename_or_obj, lock=lock, **backend_kwargs)
-        else:
-            raise ValueError('unrecognized engine for open_dataset: %r'
-                             % engine)

        with close_on_error(store):
            ds = maybe_decode_store(store)
    else:
-        if engine is not None and engine != 'scipy':
-            raise ValueError('can only read file-like objects with '
-                             "default engine or engine='scipy'")
-        # assume filename_or_obj is a file-like object
-        store = backends.ScipyDataStore(filename_or_obj)
+        if engine not in [None, 'scipy', 'h5netcdf']:
+            raise ValueError("can only read bytes or file-like objects "
+                             "with engine='scipy' or 'h5netcdf'")
+        engine = _get_engine_from_magic_number(filename_or_obj)
+        if engine == 'scipy':
+            store = backends.ScipyDataStore(filename_or_obj, **backend_kwargs)
+        elif engine == 'h5netcdf':
+            store = backends.H5NetCDFStore(filename_or_obj, group=group,
+                                           lock=lock, **backend_kwargs)

        with close_on_error(store):
            ds = maybe_decode_store(store)

# Ensure source filename always stored in dataset object (GH issue #2550)
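From the caller's side, the new else branch means bytes and file-like objects are sniffed and dispatched automatically. A sketch relying only on behaviour visible in this diff (to_netcdf() with no path returns a netCDF3 byte string via the scipy backend):

from io import BytesIO

import xarray as xr

ds = xr.Dataset({'x': ('t', [1, 2, 3])})
netcdf3_bytes = ds.to_netcdf()  # netCDF3 byte string

# bytes beginning with b'CDF' are routed to the scipy backend
with xr.open_dataset(netcdf3_bytes) as reopened:
    print(reopened)

# the same bytes wrapped in a file object also work
with xr.open_dataset(BytesIO(netcdf3_bytes)) as reopened:
    print(reopened)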
@@ -390,8 +417,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True,
        Strings and Paths are interpreted as a path to a netCDF file or an
        OpenDAP URL and opened with python-netCDF4, unless the filename ends
        with .gz, in which case the file is gunzipped and opened with
-       scipy.io.netcdf (only netCDF3 supported). File-like objects are opened
-       with scipy.io.netcdf (only netCDF3 supported).
+       scipy.io.netcdf (only netCDF3 supported). Byte-strings or file-like
+       objects are opened by scipy.io.netcdf (netCDF3) or h5py (netCDF4/HDF).
    group : str, optional
        Path to the netCDF4 group in the given file to open (only works for
        netCDF4 files).
xarray/tests/__init__.py (6 changes: 6 additions & 0 deletions)
@@ -77,6 +77,12 @@ def LooseVersion(vstring):
has_cfgrib, requires_cfgrib = _importorskip('cfgrib')

# some special cases
has_h5netcdf07, requires_h5netcdf07 = _importorskip('h5netcdf',
                                                    minversion='0.7')
has_h5py29, requires_h5py29 = _importorskip('h5py', minversion='2.9.0')
has_h5fileobj = has_h5netcdf07 and has_h5py29
requires_h5fileobj = pytest.mark.skipif(
    not has_h5fileobj, reason='requires h5py>2.9.0 & h5netcdf>0.7')
has_scipy_or_netCDF4 = has_scipy or has_netCDF4
requires_scipy_or_netCDF4 = pytest.mark.skipif(
    not has_scipy_or_netCDF4, reason='requires scipy or netCDF4')
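For reference, a sketch of what the _importorskip helper used above plausibly looks like; the real helper is defined earlier in this file and is not part of this diff, so treat this reconstruction as an assumption inferred from its call sites:

import importlib

import pytest
from distutils.version import LooseVersion


def _importorskip(modname, minversion=None):
    # returns (has_module, skip_marker), the shape consumed above (assumed)
    try:
        mod = importlib.import_module(modname)
        has = True
        if minversion is not None:
            if LooseVersion(mod.__version__) < LooseVersion(minversion):
                raise ImportError('Minimum version not satisfied')
    except ImportError:
        has = False
    func = pytest.mark.skipif(not has,
                              reason='requires {}'.format(modname))
    return has, func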
xarray/tests/test_backends.py (50 changes: 48 additions & 2 deletions)
@@ -35,7 +35,7 @@
    requires_cftime, requires_dask, requires_h5netcdf, requires_netCDF4,
    requires_pathlib, requires_pseudonetcdf, requires_pydap, requires_pynio,
    requires_rasterio, requires_scipy, requires_scipy_or_netCDF4,
-    requires_zarr)
+    requires_zarr, requires_h5fileobj)
from .test_coding_times import (_STANDARD_CALENDARS, _NON_STANDARD_CALENDARS,
                                _ALL_CALENDARS)
from .test_dataset import create_test_data
@@ -1770,7 +1770,7 @@ def test_engine(self):
            open_dataset(tmp_file, engine='foobar')

        netcdf_bytes = data.to_netcdf()
-        with raises_regex(ValueError, 'can only read'):
+        with raises_regex(ValueError, 'unrecognized engine'):
            open_dataset(BytesIO(netcdf_bytes), engine='foobar')

    def test_cross_engine_read_write_netcdf3(self):
@@ -1955,6 +1955,52 @@ def test_dump_encodings_h5py(self):
        assert actual.x.encoding['compression_opts'] is None


@requires_h5fileobj
class TestH5NetCDFFileObject(TestH5NetCDFData):
    engine = 'h5netcdf'

    def test_open_badbytes(self):
        with raises_regex(ValueError, "HDF5 as bytes"):
            with open_dataset(b'\211HDF\r\n\032\n', engine='h5netcdf'):
                pass
        with raises_regex(ValueError, "not a valid netCDF"):
            with open_dataset(b'garbage'):
                pass
        with raises_regex(ValueError, "can only read bytes"):
            with open_dataset(b'garbage', engine='netcdf4'):
                pass
        with raises_regex(ValueError, "not a valid netCDF"):
            with open_dataset(BytesIO(b'garbage'), engine='h5netcdf'):
                pass

    def test_open_twice(self):
        expected = create_test_data()
        expected.attrs['foo'] = 'bar'
        with raises_regex(ValueError, 'read/write pointer not at zero'):
            with create_tmp_file() as tmp_file:
                expected.to_netcdf(tmp_file, engine='h5netcdf')
                with open(tmp_file, 'rb') as f:
                    with open_dataset(f, engine='h5netcdf'):
                        with open_dataset(f, engine='h5netcdf'):
                            pass

    def test_open_fileobj(self):
        # open in-memory datasets instead of local file paths
        expected = create_test_data().drop('dim3')
        expected.attrs['foo'] = 'bar'
        with create_tmp_file() as tmp_file:
            expected.to_netcdf(tmp_file, engine='h5netcdf')

            with open(tmp_file, 'rb') as f:
                with open_dataset(f, engine='h5netcdf') as actual:
                    assert_identical(expected, actual)

                f.seek(0)
                with BytesIO(f.read()) as bio:
                    with open_dataset(bio, engine='h5netcdf') as actual:
                        assert_identical(expected, actual)
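The PR title promises remote hdf5 files; the tests above use local file objects, but any object with read/seek/tell works. A hedged sketch of remote use (gcsfs and the bucket path are assumptions, not part of this PR):

import gcsfs
import xarray as xr

fs = gcsfs.GCSFileSystem(token='anon')
# hypothetical public bucket path
with fs.open('hypothetical-bucket/example.nc', 'rb') as f:
    with xr.open_dataset(f, engine='h5netcdf') as ds:
        print(ds)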


@requires_h5netcdf
@requires_dask
@pytest.mark.filterwarnings('ignore:deallocating CachingFileManager')