Skip to content

Commit

Permalink
Feature/coarse grain c384 diagnostic data (#122)
Browse files Browse the repository at this point in the history
Add script  `workflows/coarse_grain_c384_diags/coarse_grain.py` to coarsen and save C48 diagnostic variables from the high res data.

Merge features from above dataset into the training data.
  • Loading branch information
nbren12 authored Feb 4, 2020
1 parent 816f12c commit d87ee75
Show file tree
Hide file tree
Showing 9 changed files with 319 additions and 113 deletions.
19 changes: 14 additions & 5 deletions catalog.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ sources:
project: 'vcm-ml'
access: read_only
urlpath: 'gcs://vcm-ml-data/2019-07-17-GFDL_FV3_DYAMOND_0.25deg_15minute/3d.zarr'

marat_sam_tend:
description: Initial data sample provided by Lucas Harris. 1 day of 3D fields sampled every 3 hours.
driver: netcdf
Expand All @@ -43,7 +43,7 @@ sources:
description: 2D and 3D data sampled every 15 minutes from a 2-day 3-km simulation, regridded to a roughly 0.25 degree x 0.25 degree regular lat-lon grid.
driver: zarr
metadata:
data_transforms:
data_transforms:
- _rename_SHiELD_varnames_to_orig
args:
storage_options:
Expand All @@ -61,18 +61,27 @@ sources:
urlpath: 'gcs://vcm-ml-data/2019-09-24_GFDL-SHiELD-15-minute-2-days_regrid_1degree.zarr'

2019-10-09-SAM-SOCRATES_tend_9216x4608x74_7.5s_4km_nudge24h:
description: 5 day forecast for SOCRATES with 24 hour nudging to ERA5, with bug fixes for tendency calculations, from Marat Khairoutdinov.
description: 5 day forecast for SOCRATES with 24 hour nudging to ERA5, with bug fixes for tendency calculations, from Marat Khairoutdinov.
driver: zarr
args:
storage_options:
project: 'vcm-ml'
access: read_only
urlpath: 'gcs://vcm-ml-data/2019-10-09-SAM-SOCRATES_tend_9216x4608x74_7.5s_4km_nudge24h.zarr'


40day_c384_diags_time_avg:
description: Time-averaged diagnostics for 40-day nudged simulation at C384 resolution
driver: zarr
args:
storage_options:
project: 'vcm-ml'
access: read_only
urlpath: 'gs://vcm-ml-data/2019-12-05-40-day-X-SHiELD-simulation-C384-diagnostics/gfsphysics_15min_coarse.zarr/'

## Local Data Intake ##
# TODO: Could this be replicated with intake caching? Or switch to an ignored file?
local_2019-09-24-GFDL-SHiELD-15-minute-2-days_regrid_1degree:
description: Local version of 2D and 3D SHiELD data at 1 degree resolution
driver: zarr
args:
urlpath: '{{ CATALOG_DIR }}/data/interim/2019-09-24_GFDL-SHiELD-15-minute-2-days_regrid_1degree.zarr'
urlpath: '{{ CATALOG_DIR }}/data/interim/2019-09-24_GFDL-SHiELD-15-minute-2-days_regrid_1degree.zarr'
1 change: 1 addition & 0 deletions external/vcm/vcm/cubedsphere/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@
GRID_VARS = [VAR_LAT_CENTER, VAR_LAT_OUTER, VAR_LON_CENTER, VAR_LON_OUTER, "area"]
INIT_TIME_DIM = "initialization_time"
FORECAST_TIME_DIM = "forecast_time"
TILE_COORDS = range(6)
8 changes: 8 additions & 0 deletions fv3net/pipelines/create_training_data/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@
help="Location of input data in Google Cloud Storage bucket. "
"Don't include bucket in path.",
)
parser.add_argument(
"--diag-c48-path",
type=str,
required=False,
help="Location of C48 (coarsened from C384) high res diagnostic zarr for "
"features (SHF, LHF, etc.) that are not saved in restarts. If not provided, "
"features from diagnostics will not be in the final training data set.",
)
parser.add_argument(
"--gcs-output-data-dir",
type=str,
Expand Down
100 changes: 100 additions & 0 deletions fv3net/pipelines/create_training_data/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from datetime import timedelta
import fsspec
import logging
import os
import xarray as xr

from vcm.fv3_restarts import _split_url
from vcm.cubedsphere.constants import (
INIT_TIME_DIM,
FORECAST_TIME_DIM,
TIME_FMT,
TILE_COORDS,
)


# Module-level logger. NOTE(review): this configures the *root* logger
# (no name passed), so the INFO level applies process-wide, not just to
# this module — presumably intentional for a batch pipeline; confirm.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def _round_time(t):
""" The high res data timestamps are often +/- a few 1e-2 seconds off the
initialization times of the restarts, which makes it difficult to merge on
time. This rounds time to the nearest second, assuming the init time is at most
1 sec away from a round minute.
Args:
t: datetime or cftime object
Returns:
datetime or cftime object rounded to nearest minute
"""
if t.second == 0:
return t.replace(microsecond=0)
elif t.second == 59:
return t.replace(microsecond=0) + timedelta(seconds=1)
else:
raise ValueError(
f"Time value > 1 second from 1 minute timesteps for "
"C48 initialization time {t}. Are you sure you're joining "
"the correct high res data?"
)


def _path_from_first_timestep(ds, train_test_labels=None):
    """ Uses first init time as zarr filename, and appends a 'train'/'test' subdir
    if a dict of labels is provided
    Args:
        ds: input dataset
        train_test_labels: optional dict with keys ["test", "train"] and values lists of
            timestep strings that go to each set
    Returns:
        path in args.gcs_output_dir to write the zarr to
    """
    timestep = min(ds[INIT_TIME_DIM].values).strftime(TIME_FMT)
    # Bug fix: default assigned up front. Previously, a labels dict with both
    # keys but containing this timestep in neither list raised no KeyError and
    # left train_test_subdir unassigned -> UnboundLocalError at the return.
    train_test_subdir = ""
    if isinstance(train_test_labels, dict):
        try:
            if timestep in train_test_labels["train"]:
                train_test_subdir = "train"
            elif timestep in train_test_labels["test"]:
                train_test_subdir = "test"
        except KeyError:
            logger.warning(
                "train_test_labels dict does not have keys ['train', 'test']."
                "Will write zarrs directly to gcs_output_dir."
            )
    else:
        logger.info(
            "No train_test_labels dict provided."
            "Will write zarrs directly to gcs_output_dir."
        )
    return os.path.join(train_test_subdir, timestep + ".zarr")


def _set_relative_forecast_time_coord(ds):
    """ Replace the absolute forecast-time coordinate with times relative to
    initialization: [0, delta_t].

    NOTE(review): the assigned coordinate has exactly two entries, so this
    assumes the forecast time dimension has length 2 — confirm with callers.
    Args:
        ds: dataset with a FORECAST_TIME_DIM coordinate of absolute times
    Returns:
        dataset with FORECAST_TIME_DIM coordinate [0 s, delta_t]
    """
    # Step size inferred from the spacing of the last two forecast times.
    delta_t_forecast = (
        ds[FORECAST_TIME_DIM].values[-1] - ds[FORECAST_TIME_DIM].values[-2]
    )
    # Bug fix: xarray's reset_index returns a new Dataset; the original call
    # discarded its result, making the line a no-op.
    ds = ds.reset_index([FORECAST_TIME_DIM], drop=True)
    return ds.assign_coords(
        {FORECAST_TIME_DIM: [timedelta(seconds=0), delta_t_forecast]}
    )


def load_diag(diag_data_path, init_times):
    """ Open the coarsened high-res diagnostics zarr and subset it to the given
    initialization times.
    Args:
        diag_data_path: url (e.g. gs://...) of the C48 diagnostics zarr
        init_times: sequence of initialization times to select
    Returns:
        diagnostics dataset with its "time" dim renamed to INIT_TIME_DIM,
        times rounded to the nearest minute, tile coords assigned, and
        subset to init_times
    """
    # Only the protocol is needed to build the filesystem; the original also
    # unpacked an unused `path` local.
    protocol, _ = _split_url(diag_data_path)
    fs = fsspec.filesystem(protocol)
    ds_diag = xr.open_zarr(fs.get_mapper(diag_data_path), consolidated=True).rename(
        {"time": INIT_TIME_DIM}
    )
    # Rounding works around diagnostic timestamps that sit a few hundredths of
    # a second off the restart init times (see _round_time).
    ds_diag = ds_diag.assign_coords(
        {
            INIT_TIME_DIM: [_round_time(t) for t in ds_diag[INIT_TIME_DIM].values],
            "tile": TILE_COORDS,
        }
    )
    return ds_diag.sel({INIT_TIME_DIM: init_times})
Loading

0 comments on commit d87ee75

Please sign in to comment.