Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use real array for data of small netCDF variables. #5229

Merged
merged 6 commits into from
Apr 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 8 additions & 8 deletions docs/src/further_topics/ugrid/operations.rst
Original file line number Diff line number Diff line change
Expand Up @@ -430,20 +430,20 @@ creating any associated :class:`~iris.cube.Cube`\s:
node
node_dimension: 'Mesh2d_node'
node coordinates
<AuxCoord: longitude / (degrees) <lazy> shape(5,)>
<AuxCoord: latitude / (unknown) <lazy> shape(5,)>
<AuxCoord: longitude / (degrees) [...] shape(5,)>
<AuxCoord: latitude / (unknown) [...] shape(5,)>
edge
edge_dimension: 'Mesh2d_edge'
edge_node_connectivity: <Connectivity: mesh2d_edge / (unknown) <lazy> shape(6, 2)>
edge_node_connectivity: <Connectivity: mesh2d_edge / (unknown) [...] shape(6, 2)>
edge coordinates
<AuxCoord: longitude / (unknown) <lazy> shape(6,)>
<AuxCoord: latitude / (unknown) <lazy> shape(6,)>
<AuxCoord: longitude / (unknown) [...] shape(6,)>
<AuxCoord: latitude / (unknown) [...] shape(6,)>
face
face_dimension: 'Mesh2d_face'
face_node_connectivity: <Connectivity: mesh2d_face / (unknown) <lazy> shape(2, 4)>
face_node_connectivity: <Connectivity: mesh2d_face / (unknown) [...] shape(2, 4)>
face coordinates
<AuxCoord: longitude / (unknown) <lazy> shape(2,)>
<AuxCoord: latitude / (unknown) <lazy> shape(2,)>
<AuxCoord: longitude / (unknown) [...] shape(2,)>
<AuxCoord: latitude / (unknown) [...] shape(2,)>
long_name: 'my_mesh'
var_name: 'my_mesh'

Expand Down
15 changes: 7 additions & 8 deletions docs/src/userguide/real_and_lazy_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -188,17 +188,17 @@ coordinates' lazy points and bounds:

.. doctest::

>>> cube = iris.load_cube(iris.sample_data_path('hybrid_height.nc'), 'air_potential_temperature')
>>> cube = iris.load_cube(iris.sample_data_path('orca2_votemper.nc'),'votemper')

>>> dim_coord = cube.coord('model_level_number')
>>> dim_coord = cube.coord('depth')
>>> print(dim_coord.has_lazy_points())
False
>>> print(dim_coord.has_bounds())
False
True
>>> print(dim_coord.has_lazy_bounds())
False

>>> aux_coord = cube.coord('sigma')
>>> aux_coord = cube.coord('longitude')
>>> print(aux_coord.has_lazy_points())
True
>>> print(aux_coord.has_bounds())
Expand All @@ -213,17 +213,16 @@ coordinates' lazy points and bounds:
>>> print(aux_coord.has_lazy_bounds())
True

>>> derived_coord = cube.coord('altitude')
# Fetch a derived coordinate, from a different file: These can also have lazy data.
>>> cube2 = iris.load_cube(iris.sample_data_path('hybrid_height.nc'), 'air_potential_temperature')
>>> derived_coord = cube2.coord('altitude')
>>> print(derived_coord.has_lazy_points())
True
>>> print(derived_coord.has_bounds())
True
>>> print(derived_coord.has_lazy_bounds())
True

.. note::
Printing a lazy :class:`~iris.coords.AuxCoord` will realise its points and bounds arrays!


Dask Processing Options
-----------------------
Expand Down
5 changes: 4 additions & 1 deletion docs/src/whatsnew/latest.rst
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,10 @@ This document explains the changes made to Iris for this release
🚀 Performance Enhancements
===========================

#. N/A
#. `@pp-mo`_ changed the netCDF loader to fetch data immediately from small netCDF
variables, instead of creating a dask array: This saves both time and memory.
Note that some cubes, coordinates etc. loaded from netCDF will now have real data
where previously it was lazy. (:pull:`5229`)


🔥 Deprecations
Expand Down
65 changes: 46 additions & 19 deletions lib/iris/fileformats/netcdf/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -173,26 +173,53 @@ def _get_actual_dtype(cf_var):
return dummy_data.dtype


# An arbitrary variable array size, below which we will fetch real data from a variable
# rather than making a lazy array for deferred access.
# Set by experiment at roughly the point where it begins to save us memory, but actually
# mostly done for speed improvement. See https://github.com/SciTools/iris/pull/5069
_LAZYVAR_MIN_BYTES = 5000


def _get_cf_var_data(cf_var, filename):
# Get lazy chunked data out of a cf variable.
dtype = _get_actual_dtype(cf_var)

# Create cube with deferred data, but no metadata
fill_value = getattr(
cf_var.cf_data,
"_FillValue",
_thread_safe_nc.default_fillvals[cf_var.dtype.str[1:]],
)
proxy = NetCDFDataProxy(
cf_var.shape, dtype, filename, cf_var.cf_name, fill_value
)
# Get the chunking specified for the variable : this is either a shape, or
# maybe the string "contiguous".
chunks = cf_var.cf_data.chunking()
# In the "contiguous" case, pass chunks=None to 'as_lazy_data'.
if chunks == "contiguous":
chunks = None
return as_lazy_data(proxy, chunks=chunks)
"""
Get an array representing the data of a CF variable.

This is typically a lazy array based around a NetCDFDataProxy, but if the variable
is "sufficiently small", we instead fetch the data as a real (numpy) array.
The latter is especially valuable for scalar coordinates, which are otherwise
unnecessarily slow + wasteful of memory.

"""
total_bytes = cf_var.size * cf_var.dtype.itemsize
if total_bytes < _LAZYVAR_MIN_BYTES:
# Don't make a lazy array, as it will cost more memory AND more time to access.
# Instead fetch the data immediately, as a real array, and return that.
result = cf_var[:]

else:
# Get lazy chunked data out of a cf variable.
dtype = _get_actual_dtype(cf_var)

# Make a data-proxy that mimics array access and can fetch from the file.
fill_value = getattr(
cf_var.cf_data,
"_FillValue",
_thread_safe_nc.default_fillvals[cf_var.dtype.str[1:]],
)
proxy = NetCDFDataProxy(
cf_var.shape, dtype, filename, cf_var.cf_name, fill_value
)
# Get the chunking specified for the variable : this is either a shape, or
# maybe the string "contiguous".
chunks = cf_var.cf_data.chunking()
# In the "contiguous" case, pass chunks=None to 'as_lazy_data'.
if chunks == "contiguous":
chunks = None

# Return a dask array providing deferred access.
result = as_lazy_data(proxy, chunks=chunks)

return result


class _OrderedAddableList(list):
Expand Down
7 changes: 6 additions & 1 deletion lib/iris/tests/integration/netcdf/test_general.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
import os.path
import shutil
import tempfile
from unittest import mock
import warnings

import numpy as np
Expand All @@ -34,7 +35,11 @@ def test_lazy_preserved_save(self):
fpath = tests.get_data_path(
("NetCDF", "label_and_climate", "small_FC_167_mon_19601101.nc")
)
acube = iris.load_cube(fpath, "air_temperature")
# While loading, "turn off" loading small variables as real data.
with mock.patch(
"iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES", 0
):
acube = iris.load_cube(fpath, "air_temperature")
self.assertTrue(acube.has_lazy_data())
# Also check a coord with lazy points + bounds.
self.assertTrue(acube.coord("forecast_period").has_lazy_points())
Expand Down
10 changes: 9 additions & 1 deletion lib/iris/tests/integration/test_cube.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
# importing anything else.
import iris.tests as tests # isort:skip

from unittest import mock

import numpy as np

import iris
Expand All @@ -23,7 +25,13 @@ def test_agg_by_aux_coord(self):
problem_test_file = tests.get_data_path(
("NetCDF", "testing", "small_theta_colpex.nc")
)
cube = iris.load_cube(problem_test_file, "air_potential_temperature")
# While loading, "turn off" loading small variables as real data.
with mock.patch(
"iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES", 0
):
cube = iris.load_cube(
problem_test_file, "air_potential_temperature"
)

# Test aggregating by aux coord, notably the `forecast_period` aux
# coord on `cube`, whose `_points` attribute is a lazy array.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
</coord>
</coords>
<cellMethods/>
<data dtype="float64" shape="(2, 2)" state="deferred"/>
<data dtype="float64" shape="(2, 2)" state="loaded"/>
</cube>
</cubes>
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
</coord>
</coords>
<cellMethods/>
<data dtype="float64" shape="(2, 2)" state="deferred"/>
<data dtype="float64" shape="(2, 2)" state="loaded"/>
</cube>
</cubes>
2 changes: 1 addition & 1 deletion lib/iris/tests/results/netcdf/save_load_traj.cml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,6 @@
<coord name="time"/>
</cellMethod>
</cellMethods>
<data dtype="float32" shape="(10,)" state="deferred"/>
<data dtype="float32" shape="(10,)" state="loaded"/>
</cube>
</cubes>
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
</coord>
</coords>
<cellMethods/>
<data dtype="float64" shape="(2, 2)" state="deferred"/>
<data dtype="float64" shape="(2, 2)" state="loaded"/>
</cube>
</cubes>
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,6 @@
</coord>
</coords>
<cellMethods/>
<data dtype="float64" shape="(2, 2)" state="deferred"/>
<data dtype="float64" shape="(2, 2)" state="loaded"/>
</cube>
</cubes>
8 changes: 7 additions & 1 deletion lib/iris/tests/unit/aux_factory/test_AuxCoordFactory.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,8 @@
# importing anything else.
import iris.tests as tests # isort:skip

from unittest import mock

import numpy as np

import iris
Expand Down Expand Up @@ -143,7 +145,11 @@ def setUp(self):
path = tests.get_data_path(
["NetCDF", "testing", "small_theta_colpex.nc"]
)
self.cube = iris.load_cube(path, "air_potential_temperature")
# While loading, "turn off" loading small variables as real data.
with mock.patch(
"iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES", 0
):
self.cube = iris.load_cube(path, "air_potential_temperature")

def _check_lazy(self):
coords = self.cube.aux_coords + self.cube.derived_coords
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def mock_cf_av_var(monkeypatch):
long_name="wibble",
units="m2",
shape=data.shape,
size=np.prod(data.shape),
dtype=data.dtype,
__getitem__=lambda self, key: data[key],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,11 @@
build_auxiliary_coordinate`.

"""

# import iris tests first so that some things can be initialised before
# importing anything else
import iris.tests as tests # isort:skip

import contextlib
from unittest import mock

import numpy as np
Expand Down Expand Up @@ -48,6 +48,7 @@ def setUp(self):
long_name="wibble",
units="m",
shape=points.shape,
size=np.prod(points.shape),
dtype=points.dtype,
__getitem__=lambda self, key: points[key],
)
Expand Down Expand Up @@ -111,6 +112,7 @@ def _make_cf_bounds_var(self, dimension_names):
cf_name="wibble_bnds",
cf_data=cf_data,
shape=bounds.shape,
size=np.prod(bounds.shape),
dtype=bounds.dtype,
__getitem__=lambda self, key: bounds[key],
)
Expand Down Expand Up @@ -165,6 +167,7 @@ def setUp(self):
long_name="wibble",
units="m",
shape=points.shape,
size=np.prod(points.shape),
dtype=points.dtype,
__getitem__=lambda self, key: points[key],
)
Expand All @@ -176,21 +179,29 @@ def setUp(self):
cube_parts=dict(coordinates=[]),
)

@contextlib.contextmanager
def deferred_load_patch(self):
def patched__getitem__(proxy_self, keys):
if proxy_self.variable_name == self.cf_coord_var.cf_name:
return self.cf_coord_var[keys]
raise RuntimeError()

self.deferred_load_patch = mock.patch(
# Fix for deferred load, *AND* avoid loading small variable data in real arrays.
with mock.patch(
"iris.fileformats.netcdf.NetCDFDataProxy.__getitem__",
new=patched__getitem__,
)
):
# While loading, "turn off" loading small variables as real data.
with mock.patch(
"iris.fileformats.netcdf.loader._LAZYVAR_MIN_BYTES", 0
):
yield

def test_scale_factor_add_offset_int(self):
self.cf_coord_var.scale_factor = 3
self.cf_coord_var.add_offset = 5

with self.deferred_load_patch:
with self.deferred_load_patch():
build_auxiliary_coordinate(self.engine, self.cf_coord_var)

coord, _ = self.engine.cube_parts["coordinates"][0]
Expand All @@ -199,7 +210,7 @@ def test_scale_factor_add_offset_int(self):
def test_scale_factor_float(self):
self.cf_coord_var.scale_factor = 3.0

with self.deferred_load_patch:
with self.deferred_load_patch():
build_auxiliary_coordinate(self.engine, self.cf_coord_var)

coord, _ = self.engine.cube_parts["coordinates"][0]
Expand All @@ -208,7 +219,7 @@ def test_scale_factor_float(self):
def test_add_offset_float(self):
self.cf_coord_var.add_offset = 5.0

with self.deferred_load_patch:
with self.deferred_load_patch():
build_auxiliary_coordinate(self.engine, self.cf_coord_var)

coord, _ = self.engine.cube_parts["coordinates"][0]
Expand Down Expand Up @@ -239,6 +250,7 @@ def setUp(self):
units="days since 1970-01-01",
calendar=None,
shape=points.shape,
size=np.prod(points.shape),
dtype=points.dtype,
__getitem__=lambda self, key: points[key],
)
Expand All @@ -251,6 +263,7 @@ def setUp(self):
cf_name="wibble_bnds",
cf_data=mock.MagicMock(chunking=mock.Mock(return_value=None)),
shape=bounds.shape,
size=np.prod(bounds.shape),
dtype=bounds.dtype,
__getitem__=lambda self, key: bounds[key],
)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def mock_cf_cm_var(monkeypatch):
long_name="wibble",
units="m2",
shape=data.shape,
size=np.prod(data.shape),
dtype=data.dtype,
__getitem__=lambda self, key: data[key],
cf_measure="area",
Expand Down
Loading