From b6a1376c9d4f879727af088f606601832ab6d404 Mon Sep 17 00:00:00 2001 From: James Bourbeau Date: Wed, 15 May 2024 13:07:39 -0500 Subject: [PATCH 1/3] Add rechunking example --- rechunk.ipynb | 185 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 185 insertions(+) create mode 100644 rechunk.ipynb diff --git a/rechunk.ipynb b/rechunk.ipynb new file mode 100644 index 0000000..b89929c --- /dev/null +++ b/rechunk.ipynb @@ -0,0 +1,185 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "75688ac6-879d-4449-b73e-74f03a5f991f", + "metadata": { + "tags": [], + "user_expressions": [] + }, + "source": [ + "\n", + "\n", + "# Geospatial Dataset Rechunking\n", + "\n", + "This is a national water model: https://registry.opendata.aws/nwm-archive/" + ] + }, + { + "cell_type": "markdown", + "id": "5dd71599-465f-4c97-baaa-19d900d2a070", + "metadata": { + "user_expressions": [] + }, + "source": [ + "## Set up cluster" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "24beda07-03c8-4a23-8600-80dbe10298ce", + "metadata": {}, + "outputs": [], + "source": [ + "import dask\n", + "\n", + "dask.config.set({\n", + " \"array.rechunk.method\": \"p2p\",\n", + " \"optimization.fuse.active\": False,\n", + " \"distributed.comm.retry.count\": 20,\n", + " \"distributed.comm.timeouts.connect\": 120,\n", + "});" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "60b08a1c-d042-40f2-aaaa-e7665ca85d64", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import coiled\n", + "\n", + "cluster = coiled.Cluster(\n", + " n_workers=100,\n", + " region=\"us-east-1\",\n", + ")\n", + "client = cluster.get_client()\n", + "client" + ] + }, + { + "cell_type": "markdown", + "id": "8185966d-6659-482b-bcbb-826b8f30b1e3", + "metadata": {}, + "source": [ + "## Load NWM data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8b1749a-0d64-4278-823c-892120bf1a5b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import xarray as xr\n", + "\n", + "ds = xr.open_zarr(\n", + " \"s3://noaa-nwm-retrospective-2-1-zarr-pds/rtout.zarr\",\n", + " consolidated=True,\n", + ").drop_encoding()\n", + "ds" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2147fc5c-60ee-4409-8c22-69c5e68a4c63", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "ds.nbytes / 1e12 # half-petabyte" + ] + }, + { + "cell_type": "markdown", + "id": "0911fb96-7c08-4ca6-a35a-22e2a5a908cd", + "metadata": { + "tags": [] + }, + "source": [ + "## Time-optimized rechunking\n", + "\n", + "Let's look at two months worth of data (~1 TB) and rechunk it to be optimized for time dimension selections." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "2a6fb91d-6a02-4afc-8d8a-ec3529f805f4", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "data = ds.zwattablrt.sel(time=slice(\"2020-01-01\", \"2020-03-01\")) # 1 TB of data\n", + "data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "8057c72c-7212-49fa-ad18-7aa346beb8cc", + "metadata": {}, + "outputs": [], + "source": [ + "result = data.chunk({\"time\": 1, \"x\": \"auto\", \"y\": \"auto\"})\n", + "result" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "68c7e99e-dec7-4201-9344-2738e5f8bca3", + "metadata": {}, + "outputs": [], + "source": [ + "result.to_zarr(\"s3://oss-scratch-space/nwm-time-optimized.zarr\", mode=\"w\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "57e3a741-ad69-4f54-9094-78586a59d29e", + "metadata": {}, + "outputs": [], + "source": [ + "import fsspec\n", + "\n", + "fs = fsspec.filesystem(\"s3\")\n", + "fs.ls(\"s3://oss-scratch-space/nwm-time-optimized.zarr/\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} From 0ccfa8e6ffe0dd4dc216bd13abc475caf6d44f78 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 23 May 2024 10:17:13 -0500 Subject: [PATCH 2/3] add rechunking example to xarray example --- xarray.ipynb | 119 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 76 insertions(+), 43 deletions(-) diff --git a/xarray.ipynb b/xarray.ipynb index 3e77935..117d72d 100644 --- a/xarray.ipynb +++ b/xarray.ipynb @@ -17,6 +17,34 @@ "This is a national water model: https://registry.opendata.aws/nwm-archive/" ] }, + { + "cell_type": "markdown", + "id": "8185966d-6659-482b-bcbb-826b8f30b1e3", + "metadata": { + "tags": [] + }, + "source": [ + "## Load NWM data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "e8b1749a-0d64-4278-823c-892120bf1a5b", + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "import xarray as xr\n", + "\n", + "ds = xr.open_zarr(\n", + " \"s3://noaa-nwm-retrospective-2-1-zarr-pds/rtout.zarr\",\n", + " consolidated=True,\n", + ").drop_encoding()\n", + "ds" + ] + }, { "cell_type": "markdown", "id": "5dd71599-465f-4c97-baaa-19d900d2a070", @@ -39,7 +67,7 @@ "import coiled\n", "\n", "cluster = coiled.Cluster(\n", - " n_workers=40,\n", + " n_workers=100,\n", " region=\"us-east-1\",\n", ")\n", "client = cluster.get_client()" @@ -47,99 +75,104 @@ }, { "cell_type": "markdown", - "id": "8185966d-6659-482b-bcbb-826b8f30b1e3", - "metadata": {}, + "id": "0911fb96-7c08-4ca6-a35a-22e2a5a908cd", + "metadata": { + "tags": [] + }, "source": [ - "## Load NWM data" + "## Compute average over space" ] }, { "cell_type": "code", "execution_count": null, - "id": "e8b1749a-0d64-4278-823c-892120bf1a5b", + "id": "2a6fb91d-6a02-4afc-8d8a-ec3529f805f4", "metadata": { "tags": [] }, "outputs": [], "source": [ - "import xarray as xr\n", - "\n", - "ds = xr.open_zarr(\n", - " \"s3://noaa-nwm-retrospective-2-1-zarr-pds/rtout.zarr\",\n", - " consolidated=True,\n", - ")\n", - "ds" + "subset = ds.zwattablrt.sel(time=slice(\"2001-01-01\", \"2001-03-31\"))\n", + "subset" ] }, { "cell_type": "code", "execution_count": null, - "id": "2147fc5c-60ee-4409-8c22-69c5e68a4c63", + "id": "8ae07b31-383c-4cc9-b94a-cbbb68369746", "metadata": { "tags": [] }, "outputs": [], "source": [ - "ds.nbytes / 1e12 # half-petabyte" + "avg = subset.mean(dim=[\"x\", \"y\"]).compute()\n", + "avg.plot()" ] }, { "cell_type": "markdown", - "id": "0911fb96-7c08-4ca6-a35a-22e2a5a908cd", - "metadata": { - "tags": [] - }, + "id": "b237d221-a2db-44fb-924d-6003cd73f933", + "metadata": {}, "source": [ - "## Compute average over space" + "## Rechunk" ] }, { "cell_type": "code", "execution_count": null, - "id": "2a6fb91d-6a02-4afc-8d8a-ec3529f805f4", - "metadata": { - "tags": [] - }, + "id": "da1eca49-8362-42c6-aea7-ac986df36ef3", + "metadata": {}, "outputs": [], "source": [ - "subset = ds.zwattablrt.sel(time=slice(\"2001-01-01\", \"2001-12-31\"))\n", - "subset" + "import dask\n", + "\n", + "dask.config.set({\n", + " \"array.rechunk.method\": \"p2p\",\n", + " \"optimization.fuse.active\": False,\n", + "});" ] }, { "cell_type": "code", "execution_count": null, - "id": "8ae07b31-383c-4cc9-b94a-cbbb68369746", - "metadata": { - "tags": [] - }, + "id": "8ac6e24d-24d6-445d-a532-438b9d3a13f9", + "metadata": {}, "outputs": [], "source": [ - "avg = subset.mean(dim=[\"x\", \"y\"]).persist()" + "result = subset.chunk({\"time\": \"auto\", \"x\": -1, \"y\": \"auto\"})\n", + "result" ] }, { "cell_type": "code", "execution_count": null, - "id": "c42ef712-c60c-4049-816f-fcbd115a27a5", - "metadata": { - "tags": [] - }, + "id": "657ce639-b644-42ea-b98b-b70c2cb3170a", + "metadata": {}, "outputs": [], "source": [ - "cluster.scale(300)" + "%%time\n", + "\n", + "result.to_zarr(\"s3://oss-scratch-space/nwm-x-optimized.zarr\", mode=\"w\")" ] }, { "cell_type": "code", "execution_count": null, - "id": "fdb5ac6f-48c3-4457-bf80-aab4336598f3", - "metadata": { - "tags": [] - }, + "id": "1d615e48-29dc-48b8-a185-c2fd20c3fdda", + "metadata": {}, "outputs": [], "source": [ - "avg.compute().plot()" + "result.chunk(" + ] + }, + { + "cell_type": "markdown", + "id": "268ccde2-7e83-4e97-9fb7-4887a52adbe6", + "metadata": {}, + "source": [ + "## Cleanup if you like\n", + "\n", + "(but we'll clean up automatically eventually)" ] }, { @@ -155,9 +188,9 @@ ], "metadata": { "kernelspec": { - "display_name": "Python [conda env:deepak]", + "display_name": "Python [conda env:coiled]", "language": "python", - "name": "conda-env-deepak-py" + "name": "conda-env-coiled-py" }, "language_info": { "codemirror_mode": { @@ -169,7 +202,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.11.3" + "version": "3.11.8" } }, "nbformat": 4, From 78928c237e21fd0b71c99acd1b1db0386861d462 Mon Sep 17 00:00:00 2001 From: Matthew Rocklin Date: Thu, 23 May 2024 10:17:25 -0500 Subject: [PATCH 3/3] remove rechunk notebook --- rechunk.ipynb | 185 -------------------------------------------------- 1 file changed, 185 deletions(-) delete mode 100644 rechunk.ipynb diff --git a/rechunk.ipynb b/rechunk.ipynb deleted file mode 100644 index b89929c..0000000 --- a/rechunk.ipynb +++ /dev/null @@ -1,185 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "75688ac6-879d-4449-b73e-74f03a5f991f", - "metadata": { - "tags": [], - "user_expressions": [] - }, - "source": [ - "\n", - "\n", - "# Geospatial Dataset Rechunking\n", - "\n", - "This is a national water model: https://registry.opendata.aws/nwm-archive/" - ] - }, - { - "cell_type": "markdown", - "id": "5dd71599-465f-4c97-baaa-19d900d2a070", - "metadata": { - "user_expressions": [] - }, - "source": [ - "## Set up cluster" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "24beda07-03c8-4a23-8600-80dbe10298ce", - "metadata": {}, - "outputs": [], - "source": [ - "import dask\n", - "\n", - "dask.config.set({\n", - " \"array.rechunk.method\": \"p2p\",\n", - " \"optimization.fuse.active\": False,\n", - " \"distributed.comm.retry.count\": 20,\n", - " \"distributed.comm.timeouts.connect\": 120,\n", - "});" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "60b08a1c-d042-40f2-aaaa-e7665ca85d64", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import coiled\n", - "\n", - "cluster = coiled.Cluster(\n", - " n_workers=100,\n", - " region=\"us-east-1\",\n", - ")\n", - "client = cluster.get_client()\n", - "client" - ] - }, - { - "cell_type": "markdown", - "id": "8185966d-6659-482b-bcbb-826b8f30b1e3", - "metadata": {}, - "source": [ - "## Load NWM data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e8b1749a-0d64-4278-823c-892120bf1a5b", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "import xarray as xr\n", - "\n", - "ds = xr.open_zarr(\n", - " \"s3://noaa-nwm-retrospective-2-1-zarr-pds/rtout.zarr\",\n", - " consolidated=True,\n", - ").drop_encoding()\n", - "ds" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2147fc5c-60ee-4409-8c22-69c5e68a4c63", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "ds.nbytes / 1e12 # half-petabyte" - ] - }, - { - "cell_type": "markdown", - "id": "0911fb96-7c08-4ca6-a35a-22e2a5a908cd", - "metadata": { - "tags": [] - }, - "source": [ - "## Time-optimized rechunking\n", - "\n", - "Let's look at two months worth of data (~1 TB) and rechunk it to be optimized for time dimension selections." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a6fb91d-6a02-4afc-8d8a-ec3529f805f4", - "metadata": { - "tags": [] - }, - "outputs": [], - "source": [ - "data = ds.zwattablrt.sel(time=slice(\"2020-01-01\", \"2020-03-01\")) # 1 TB of data\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8057c72c-7212-49fa-ad18-7aa346beb8cc", - "metadata": {}, - "outputs": [], - "source": [ - "result = data.chunk({\"time\": 1, \"x\": \"auto\", \"y\": \"auto\"})\n", - "result" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "68c7e99e-dec7-4201-9344-2738e5f8bca3", - "metadata": {}, - "outputs": [], - "source": [ - "result.to_zarr(\"s3://oss-scratch-space/nwm-time-optimized.zarr\", mode=\"w\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "57e3a741-ad69-4f54-9094-78586a59d29e", - "metadata": {}, - "outputs": [], - "source": [ - "import fsspec\n", - "\n", - "fs = fsspec.filesystem(\"s3\")\n", - "fs.ls(\"s3://oss-scratch-space/nwm-time-optimized.zarr/\")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.9" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}