From f17f3ad1a4a2069cd70385af8ad331f644ec66ba Mon Sep 17 00:00:00 2001 From: Dan Nowacki Date: Wed, 17 Apr 2019 22:16:55 -0700 Subject: [PATCH 1/3] Partial fix for #2841 to improve formatting. Updates formatting to use .format() instead of % operator. Changed all instances of % to .format() and added test for using tuple as key, which errored using % operator. --- xarray/core/formatting.py | 43 ++++++++++++++++++--------------- xarray/tests/test_formatting.py | 11 +++++++++ 2 files changed, 34 insertions(+), 20 deletions(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index f3fcc1ecb37..b51fdd6673a 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -116,7 +116,7 @@ def format_timestamp(t): if time_str == '00:00:00': return date_str else: - return '%sT%s' % (date_str, time_str) + return '{0}T{1}'.format(date_str, time_str) def format_timedelta(t, timedelta_format=None): @@ -212,12 +212,12 @@ def summarize_variable(name, var, col_width, show_values=True, marker=' ', max_width=None): if max_width is None: max_width = OPTIONS['display_width'] - first_col = pretty_print(' %s %s ' % (marker, name), col_width) + first_col = pretty_print(' {0} {1} '.format(marker, name), col_width) if var.dims: - dims_str = '(%s) ' % ', '.join(map(str, var.dims)) + dims_str = '({0}) '.format(', '.join(map(str, var.dims))) else: dims_str = '' - front_str = '%s%s%s ' % (first_col, dims_str, var.dtype) + front_str = '{0}{1}{2} '.format(first_col, dims_str, var.dtype) if show_values: values_str = format_array_flat(var, max_width - len(front_str)) elif isinstance(var._data, dask_array_type): @@ -229,8 +229,9 @@ def summarize_variable(name, var, col_width, show_values=True, def _summarize_coord_multiindex(coord, col_width, marker): - first_col = pretty_print(' %s %s ' % (marker, coord.name), col_width) - return '%s(%s) MultiIndex' % (first_col, str(coord.dims[0])) + first_col = pretty_print(' {0} {1} '.format( + marker, coord.name), col_width) + 
return '{0}({1}) MultiIndex'.format(first_col, str(coord.dims[0])) def _summarize_coord_levels(coord, col_width, marker='-'): @@ -264,13 +265,14 @@ def summarize_coord(name, var, col_width): def summarize_attr(key, value, col_width=None): """Summary for __repr__ - use ``X.attrs[key]`` for full value.""" # Indent key and add ':', then right-pad if col_width is not None - k_str = ' %s:' % key + k_str = ' {0}:'.format(key) if col_width is not None: k_str = pretty_print(k_str, col_width) # Replace tabs and newlines, so we print on one line in known width v_str = str(value).replace('\t', '\\t').replace('\n', '\\n') # Finally, truncate to the desired display width - return maybe_truncate('%s %s' % (k_str, v_str), OPTIONS['display_width']) + return maybe_truncate('{0} {1}'.format(k_str, v_str), + OPTIONS['display_width']) EMPTY_REPR = ' *empty*' @@ -303,7 +305,7 @@ def _calculate_col_width(col_items): def _mapping_repr(mapping, title, summarizer, col_width=None): if col_width is None: col_width = _calculate_col_width(mapping) - summary = ['%s:' % title] + summary = ['{0}:'.format(title)] if mapping: summary += [summarizer(k, v, col_width) for k, v in mapping.items()] else: @@ -329,19 +331,19 @@ def coords_repr(coords, col_width=None): def indexes_repr(indexes): summary = [] for k, v in indexes.items(): - summary.append(wrap_indent(repr(v), '%s: ' % k)) + summary.append(wrap_indent(repr(v), '{0}: '.format(k))) return '\n'.join(summary) def dim_summary(obj): - elements = ['%s: %s' % (k, v) for k, v in obj.sizes.items()] + elements = ['{0}: {1}'.format(k, v) for k, v in obj.sizes.items()] return ', '.join(elements) def unindexed_dims_repr(dims, coords): unindexed_dims = [d for d in dims if d not in coords] if unindexed_dims: - dims_str = ', '.join('%s' % d for d in unindexed_dims) + dims_str = ', '.join('{0}'.format(d) for d in unindexed_dims) return 'Dimensions without coordinates: ' + dims_str else: return None @@ -382,10 +384,11 @@ def short_dask_repr(array, 
show_dtype=True): """ chunksize = tuple(c[0] for c in array.chunks) if show_dtype: - return 'dask.array' % ( + return 'dask.array'.format( array.shape, array.dtype, chunksize) else: - return 'dask.array' % (array.shape, chunksize) + return 'dask.array'.format( + array.shape, chunksize) def short_data_repr(array): @@ -394,18 +397,18 @@ def short_data_repr(array): elif array._in_memory or array.size < 1e5: return short_array_repr(array.values) else: - return u'[%s values with dtype=%s]' % (array.size, array.dtype) + return u'[{0} values with dtype={1}]'.format(array.size, array.dtype) def array_repr(arr): # used for DataArray, Variable and IndexVariable if hasattr(arr, 'name') and arr.name is not None: - name_str = '%r ' % arr.name + name_str = '{0} '.format(arr.name) else: name_str = '' - summary = ['' - % (type(arr).__name__, name_str, dim_summary(arr))] + summary = [''.format( + type(arr).__name__, name_str, dim_summary(arr))] summary.append(short_data_repr(arr)) @@ -424,12 +427,12 @@ def array_repr(arr): def dataset_repr(ds): - summary = ['' % type(ds).__name__] + summary = [''.format(type(ds).__name__)] col_width = _calculate_col_width(_get_col_items(ds.variables)) dims_start = pretty_print('Dimensions:', col_width) - summary.append('%s(%s)' % (dims_start, dim_summary(ds))) + summary.append('{0}({1})'.format(dims_start, dim_summary(ds))) if ds.coords: summary.append(coords_repr(ds.coords, col_width=col_width)) diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 82b7b86bb76..71096c4dfb8 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -303,6 +303,17 @@ def test_diff_dataset_repr(self): actual = formatting.diff_dataset_repr(ds_a, ds_b, 'identical') assert actual == expected + def test_array_repr(self): + ds = xr.Dataset(coords={'foo':[1,2,3], 'bar':[1,2,3]}) + ds[(1,2)] = xr.DataArray([0], dims='test') + actual = formatting.array_repr(ds[(1,2)]) + expected = dedent("""\ + + array([0]) + 
Dimensions without coordinates: test""") + + assert actual == expected + def test_set_numpy_options(): original_options = np.get_printoptions() From 28df9097bd3a79156a3526d8a8c95779f6bd8098 Mon Sep 17 00:00:00 2001 From: Dan Nowacki Date: Wed, 17 Apr 2019 22:25:28 -0700 Subject: [PATCH 2/3] Revert "Partial fix for #2841 to improve formatting." This reverts commit f17f3ad1a4a2069cd70385af8ad331f644ec66ba. --- xarray/core/formatting.py | 43 +++++++++++++++------------------ xarray/tests/test_formatting.py | 11 --------- 2 files changed, 20 insertions(+), 34 deletions(-) diff --git a/xarray/core/formatting.py b/xarray/core/formatting.py index b51fdd6673a..f3fcc1ecb37 100644 --- a/xarray/core/formatting.py +++ b/xarray/core/formatting.py @@ -116,7 +116,7 @@ def format_timestamp(t): if time_str == '00:00:00': return date_str else: - return '{0}T{1}'.format(date_str, time_str) + return '%sT%s' % (date_str, time_str) def format_timedelta(t, timedelta_format=None): @@ -212,12 +212,12 @@ def summarize_variable(name, var, col_width, show_values=True, marker=' ', max_width=None): if max_width is None: max_width = OPTIONS['display_width'] - first_col = pretty_print(' {0} {1} '.format(marker, name), col_width) + first_col = pretty_print(' %s %s ' % (marker, name), col_width) if var.dims: - dims_str = '({0}) '.format(', '.join(map(str, var.dims))) + dims_str = '(%s) ' % ', '.join(map(str, var.dims)) else: dims_str = '' - front_str = '{0}{1}{2} '.format(first_col, dims_str, var.dtype) + front_str = '%s%s%s ' % (first_col, dims_str, var.dtype) if show_values: values_str = format_array_flat(var, max_width - len(front_str)) elif isinstance(var._data, dask_array_type): @@ -229,9 +229,8 @@ def summarize_variable(name, var, col_width, show_values=True, def _summarize_coord_multiindex(coord, col_width, marker): - first_col = pretty_print(' {0} {1} '.format( - marker, coord.name), col_width) - return '{0}({1}) MultiIndex'.format(first_col, str(coord.dims[0])) + first_col = pretty_print(' 
%s %s ' % (marker, coord.name), col_width) + return '%s(%s) MultiIndex' % (first_col, str(coord.dims[0])) def _summarize_coord_levels(coord, col_width, marker='-'): @@ -265,14 +264,13 @@ def summarize_coord(name, var, col_width): def summarize_attr(key, value, col_width=None): """Summary for __repr__ - use ``X.attrs[key]`` for full value.""" # Indent key and add ':', then right-pad if col_width is not None - k_str = ' {0}:'.format(key) + k_str = ' %s:' % key if col_width is not None: k_str = pretty_print(k_str, col_width) # Replace tabs and newlines, so we print on one line in known width v_str = str(value).replace('\t', '\\t').replace('\n', '\\n') # Finally, truncate to the desired display width - return maybe_truncate('{0} {1}'.format(k_str, v_str), - OPTIONS['display_width']) + return maybe_truncate('%s %s' % (k_str, v_str), OPTIONS['display_width']) EMPTY_REPR = ' *empty*' @@ -305,7 +303,7 @@ def _calculate_col_width(col_items): def _mapping_repr(mapping, title, summarizer, col_width=None): if col_width is None: col_width = _calculate_col_width(mapping) - summary = ['{0}:'.format(title)] + summary = ['%s:' % title] if mapping: summary += [summarizer(k, v, col_width) for k, v in mapping.items()] else: @@ -331,19 +329,19 @@ def coords_repr(coords, col_width=None): def indexes_repr(indexes): summary = [] for k, v in indexes.items(): - summary.append(wrap_indent(repr(v), '{0}: '.format(k))) + summary.append(wrap_indent(repr(v), '%s: ' % k)) return '\n'.join(summary) def dim_summary(obj): - elements = ['{0}: {1}'.format(k, v) for k, v in obj.sizes.items()] + elements = ['%s: %s' % (k, v) for k, v in obj.sizes.items()] return ', '.join(elements) def unindexed_dims_repr(dims, coords): unindexed_dims = [d for d in dims if d not in coords] if unindexed_dims: - dims_str = ', '.join('{0}'.format(d) for d in unindexed_dims) + dims_str = ', '.join('%s' % d for d in unindexed_dims) return 'Dimensions without coordinates: ' + dims_str else: return None @@ -384,11 +382,10 @@ 
def short_dask_repr(array, show_dtype=True): """ chunksize = tuple(c[0] for c in array.chunks) if show_dtype: - return 'dask.array'.format( + return 'dask.array' % ( array.shape, array.dtype, chunksize) else: - return 'dask.array'.format( - array.shape, chunksize) + return 'dask.array' % (array.shape, chunksize) def short_data_repr(array): @@ -397,18 +394,18 @@ def short_data_repr(array): elif array._in_memory or array.size < 1e5: return short_array_repr(array.values) else: - return u'[{0} values with dtype={1}]'.format(array.size, array.dtype) + return u'[%s values with dtype=%s]' % (array.size, array.dtype) def array_repr(arr): # used for DataArray, Variable and IndexVariable if hasattr(arr, 'name') and arr.name is not None: - name_str = '{0} '.format(arr.name) + name_str = '%r ' % arr.name else: name_str = '' - summary = [''.format( - type(arr).__name__, name_str, dim_summary(arr))] + summary = ['' + % (type(arr).__name__, name_str, dim_summary(arr))] summary.append(short_data_repr(arr)) @@ -427,12 +424,12 @@ def array_repr(arr): def dataset_repr(ds): - summary = [''.format(type(ds).__name__)] + summary = ['' % type(ds).__name__] col_width = _calculate_col_width(_get_col_items(ds.variables)) dims_start = pretty_print('Dimensions:', col_width) - summary.append('{0}({1})'.format(dims_start, dim_summary(ds))) + summary.append('%s(%s)' % (dims_start, dim_summary(ds))) if ds.coords: summary.append(coords_repr(ds.coords, col_width=col_width)) diff --git a/xarray/tests/test_formatting.py b/xarray/tests/test_formatting.py index 71096c4dfb8..82b7b86bb76 100644 --- a/xarray/tests/test_formatting.py +++ b/xarray/tests/test_formatting.py @@ -303,17 +303,6 @@ def test_diff_dataset_repr(self): actual = formatting.diff_dataset_repr(ds_a, ds_b, 'identical') assert actual == expected - def test_array_repr(self): - ds = xr.Dataset(coords={'foo':[1,2,3], 'bar':[1,2,3]}) - ds[(1,2)] = xr.DataArray([0], dims='test') - actual = formatting.array_repr(ds[(1,2)]) - expected = 
dedent("""\ - - array([0]) - Dimensions without coordinates: test""") - - assert actual == expected - def test_set_numpy_options(): original_options = np.get_printoptions() From cf993618fd9d051f6a87592177a61da44b0f7442 Mon Sep 17 00:00:00 2001 From: Dan Nowacki Date: Wed, 24 Apr 2019 20:57:20 -0700 Subject: [PATCH 3/3] Implement load_dataset() and load_dataarray() BUG: Fixes #2887 by adding @shoyer solution for load_dataset and load_dataarray, wrappers around open_dataset and open_dataarray which open, load, and close the file and return the Dataset/DataArray TST: Add tests for sequentially opening and writing to files using new functions DOC: Add to whats-new.rst. Also a tiny change to the open_dataset docstring Update docstrings and check for cache in kwargs Undeprecate load_dataset Add to api.rst, fix whats-new.rst typo, raise error instead of warning --- doc/api.rst | 2 ++ doc/whats-new.rst | 12 ++++++-- xarray/__init__.py | 2 +- xarray/backends/api.py | 57 +++++++++++++++++++++++++++++++++-- xarray/tests/test_backends.py | 19 +++++++++++- xarray/tutorial.py | 15 +++------ 6 files changed, 90 insertions(+), 17 deletions(-) diff --git a/doc/api.rst b/doc/api.rst index 00b33959eed..0e766f2cf9a 100644 --- a/doc/api.rst +++ b/doc/api.rst @@ -460,6 +460,7 @@ Dataset methods :toctree: generated/ open_dataset + load_dataset open_mfdataset open_rasterio open_zarr @@ -487,6 +488,7 @@ DataArray methods :toctree: generated/ open_dataarray + load_dataarray DataArray.to_dataset DataArray.to_netcdf DataArray.to_pandas diff --git a/doc/whats-new.rst b/doc/whats-new.rst index ac1b5269bfa..d904a3814f1 100644 --- a/doc/whats-new.rst +++ b/doc/whats-new.rst @@ -29,6 +29,12 @@ Enhancements By `James McCreight `_. - Clean up Python 2 compatibility in code (:issue:`2950`) By `Guido Imperiale `_. 
+- Implement ``load_dataset()`` and ``load_dataarray()`` as alternatives to + ``open_dataset()`` and ``open_dataarray()`` to open, load into memory, + and close files, returning the Dataset or DataArray. These functions are + helpful for avoiding file-lock errors when trying to write to files opened + using ``open_dataset()`` or ``open_dataarray()``. (:issue:`2887`) + By `Dan Nowacki `_. Bug fixes ~~~~~~~~~ @@ -153,9 +159,9 @@ Other enhancements By `Keisuke Fujii `_. - Added :py:meth:`~xarray.Dataset.drop_dims` (:issue:`1949`). By `Kevin Squire `_. -- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` - parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for - backwards compatibility. The ``overwrite_encoded_chunks`` parameter is +- ``xr.open_zarr`` now accepts manually specified chunks with the ``chunks=`` + parameter. ``auto_chunk=True`` is equivalent to ``chunks='auto'`` for + backwards compatibility. The ``overwrite_encoded_chunks`` parameter is added to remove the original zarr chunk encoding. By `Lily Wang `_. diff --git a/xarray/__init__.py b/xarray/__init__.py index 773dfe19d01..506cb46de26 100644 --- a/xarray/__init__.py +++ b/xarray/__init__.py @@ -17,7 +17,7 @@ from .core.options import set_options from .backends.api import (open_dataset, open_dataarray, open_mfdataset, - save_mfdataset) + save_mfdataset, load_dataset, load_dataarray) from .backends.rasterio_ import open_rasterio from .backends.zarr import open_zarr diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 7c5040580fe..01188e92752 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -185,12 +185,64 @@ def _finalize_store(write, store): store.close() +def load_dataset(filename_or_obj, **kwargs): + """Open, load into memory, and close a Dataset from a file or file-like + object. + + This is a thin wrapper around :py:meth:`~xarray.open_dataset`. 
It differs + from `open_dataset` in that it loads the Dataset into memory, closes the + file, and returns the Dataset. In contrast, `open_dataset` keeps the file + handle open and lazy loads its contents. All parameters are passed directly + to `open_dataset`. See that documentation for further details. + + Returns + ------- + dataset : Dataset + The newly created Dataset. + + See Also + -------- + open_dataset + """ + if 'cache' in kwargs: + raise TypeError('cache has no effect in this context') + + with open_dataset(filename_or_obj, **kwargs) as ds: + return ds.load() + + +def load_dataarray(filename_or_obj, **kwargs): + """Open, load into memory, and close a DataArray from a file or file-like + object containing a single data variable. + + This is a thin wrapper around :py:meth:`~xarray.open_dataarray`. It differs + from `open_dataarray` in that it loads the DataArray into memory, closes + the file, and returns the DataArray. In contrast, `open_dataarray` keeps + the file handle open and lazy loads its contents. All parameters are passed + directly to `open_dataarray`. See that documentation for further details. + + Returns + ------- + dataarray : DataArray + The newly created DataArray. + + See Also + -------- + open_dataarray + """ + if 'cache' in kwargs: + raise TypeError('cache has no effect in this context') + + with open_dataarray(filename_or_obj, **kwargs) as da: + return da.load() + + def open_dataset(filename_or_obj, group=None, decode_cf=True, mask_and_scale=None, decode_times=True, autoclose=None, concat_characters=True, decode_coords=True, engine=None, chunks=None, lock=None, cache=None, drop_variables=None, backend_kwargs=None, use_cftime=None): - """Load and decode a dataset from a file or file-like object. + """Open and decode a dataset from a file or file-like object.
Parameters ---------- @@ -406,7 +458,8 @@ def open_dataarray(filename_or_obj, group=None, decode_cf=True, concat_characters=True, decode_coords=True, engine=None, chunks=None, lock=None, cache=None, drop_variables=None, backend_kwargs=None, use_cftime=None): - """Open an DataArray from a netCDF file containing a single data variable. + """Open an DataArray from a file or file-like object containing a single + data variable. This is designed to read netCDF files with only one data variable. If multiple variables are present then a ValueError is raised. diff --git a/xarray/tests/test_backends.py b/xarray/tests/test_backends.py index a4c0374e158..f31d3bf4f9b 100644 --- a/xarray/tests/test_backends.py +++ b/xarray/tests/test_backends.py @@ -19,7 +19,7 @@ import xarray as xr from xarray import ( DataArray, Dataset, backends, open_dataarray, open_dataset, open_mfdataset, - save_mfdataset) + save_mfdataset, load_dataset, load_dataarray) from xarray.backends.common import robust_getitem from xarray.backends.netCDF4_ import _extract_nc4_variable_encoding from xarray.backends.pydap_ import PydapDataStore @@ -2641,6 +2641,23 @@ def test_save_mfdataset_compute_false_roundtrip(self): with open_mfdataset([tmp1, tmp2]) as actual: assert_identical(actual, original) + def test_load_dataset(self): + with create_tmp_file() as tmp: + original = Dataset({'foo': ('x', np.random.randn(10))}) + original.to_netcdf(tmp) + ds = load_dataset(tmp) + # this would fail if we used open_dataset instead of load_dataset + ds.to_netcdf(tmp) + + def test_load_dataarray(self): + with create_tmp_file() as tmp: + original = Dataset({'foo': ('x', np.random.randn(10))}) + original.to_netcdf(tmp) + ds = load_dataarray(tmp) + # this would fail if we used open_dataarray instead of + # load_dataarray + ds.to_netcdf(tmp) + @requires_scipy_or_netCDF4 @requires_pydap diff --git a/xarray/tutorial.py b/xarray/tutorial.py index f54cf7b3889..1a977450ed6 100644 --- a/xarray/tutorial.py +++ b/xarray/tutorial.py @@ 
-27,7 +27,7 @@ def open_dataset(name, cache=True, cache_dir=_default_cache_dir, github_url='https://github.com/pydata/xarray-data', branch='master', **kws): """ - Load a dataset from the online repository (requires internet). + Open a dataset from the online repository (requires internet). If a local copy is found then always use that to avoid network traffic. @@ -91,17 +91,12 @@ def open_dataset(name, cache=True, cache_dir=_default_cache_dir, def load_dataset(*args, **kwargs): """ - `load_dataset` will be removed a future version of xarray. The current - behavior of this function can be achived by using - `tutorial.open_dataset(...).load()`. + Open, load into memory, and close a dataset from the online repository + (requires internet). See Also -------- open_dataset """ - warnings.warn( - "load_dataset` will be removed in a future version of xarray. The " - "current behavior of this function can be achived by using " - "`tutorial.open_dataset(...).load()`.", - DeprecationWarning, stacklevel=2) - return open_dataset(*args, **kwargs).load() + with open_dataset(*args, **kwargs) as ds: + return ds.load()