Skip to content

Commit

Permalink
Feature/coarse grain c384 diagnostic data (#122)
Browse files Browse the repository at this point in the history
Add script  `workflows/coarse_grain_c384_diags/coarse_grain.py` to coarsen and save C48 diagnostic variables from the high res data.

Merge features from above dataset into the training data.
  • Loading branch information
nbren12 authored Feb 4, 2020
1 parent 816f12c commit d87ee75
Show file tree
Hide file tree
Showing 9 changed files with 319 additions and 113 deletions.
19 changes: 14 additions & 5 deletions catalog.yml
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ sources:
project: 'vcm-ml'
access: read_only
urlpath: 'gcs://vcm-ml-data/2019-07-17-GFDL_FV3_DYAMOND_0.25deg_15minute/3d.zarr'

marat_sam_tend:
description: Initial data sample provided by Lucas Harris. 1 day of 3D fields sampled every 3 hours.
driver: netcdf
Expand All @@ -43,7 +43,7 @@ sources:
description: 2D and 3D data sampled every 15 minutes from a 2-day 3-km simulation, regridded to a roughly 0.25 degree x 0.25 degree regular lat-lon grid.
driver: zarr
metadata:
data_transforms:
data_transforms:
- _rename_SHiELD_varnames_to_orig
args:
storage_options:
Expand All @@ -61,18 +61,27 @@ sources:
urlpath: 'gcs://vcm-ml-data/2019-09-24_GFDL-SHiELD-15-minute-2-days_regrid_1degree.zarr'

2019-10-09-SAM-SOCRATES_tend_9216x4608x74_7.5s_4km_nudge24h:
description: 5 day forecast for SOCRATES with 24 hour nudging to ERA5, with bug fixes for tendency calculations, from Marat Khairoutdinov.
description: 5 day forecast for SOCRATES with 24 hour nudging to ERA5, with bug fixes for tendency calculations, from Marat Khairoutdinov.
driver: zarr
args:
storage_options:
project: 'vcm-ml'
access: read_only
urlpath: 'gcs://vcm-ml-data/2019-10-09-SAM-SOCRATES_tend_9216x4608x74_7.5s_4km_nudge24h.zarr'


40day_c384_diags_time_avg:
description: Time-averaged diagnostics for 40-day nudged simulation at C384 resolution
driver: zarr
args:
storage_options:
project: 'vcm-ml'
access: read_only
urlpath: 'gs://vcm-ml-data/2019-12-05-40-day-X-SHiELD-simulation-C384-diagnostics/gfsphysics_15min_coarse.zarr/'

## Local Data Intake ##
# TODO: Could this be replicated with intake caching? Or switch to an ignored file?
local_2019-09-24-GFDL-SHiELD-15-minute-2-days_regrid_1degree:
description: Local version of 2D and 3D SHiELD data at 1 degree resolution
driver: zarr
args:
urlpath: '{{ CATALOG_DIR }}/data/interim/2019-09-24_GFDL-SHiELD-15-minute-2-days_regrid_1degree.zarr'
urlpath: '{{ CATALOG_DIR }}/data/interim/2019-09-24_GFDL-SHiELD-15-minute-2-days_regrid_1degree.zarr'
1 change: 1 addition & 0 deletions external/vcm/vcm/cubedsphere/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,3 +32,4 @@
GRID_VARS = [VAR_LAT_CENTER, VAR_LAT_OUTER, VAR_LON_CENTER, VAR_LON_OUTER, "area"]
INIT_TIME_DIM = "initialization_time"
FORECAST_TIME_DIM = "forecast_time"
TILE_COORDS = range(6)
8 changes: 8 additions & 0 deletions fv3net/pipelines/create_training_data/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,14 @@
help="Location of input data in Google Cloud Storage bucket. "
"Don't include bucket in path.",
)
parser.add_argument(
"--diag-c48-path",
type=str,
required=False,
help="Location of C48 (coarsened from C384) high res diagnostic zarr for "
"features (SHF, LHF, etc.) that are not saved in restarts. If not provided, "
"features from diagnostics will not be in the final training data set.",
)
parser.add_argument(
"--gcs-output-data-dir",
type=str,
Expand Down
100 changes: 100 additions & 0 deletions fv3net/pipelines/create_training_data/helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from datetime import timedelta
import fsspec
import logging
import os
import xarray as xr

from vcm.fv3_restarts import _split_url
from vcm.cubedsphere.constants import (
INIT_TIME_DIM,
FORECAST_TIME_DIM,
TIME_FMT,
TILE_COORDS,
)


# Module-level logger. NOTE(review): this configures the *root* logger
# (no name passed), so the INFO level applies process-wide, not just to
# this module — presumably intentional for a batch pipeline; confirm.
logger = logging.getLogger()
logger.setLevel(logging.INFO)


def _round_time(t):
""" The high res data timestamps are often +/- a few 1e-2 seconds off the
initialization times of the restarts, which makes it difficult to merge on
time. This rounds time to the nearest second, assuming the init time is at most
1 sec away from a round minute.
Args:
t: datetime or cftime object
Returns:
datetime or cftime object rounded to nearest minute
"""
if t.second == 0:
return t.replace(microsecond=0)
elif t.second == 59:
return t.replace(microsecond=0) + timedelta(seconds=1)
else:
raise ValueError(
f"Time value > 1 second from 1 minute timesteps for "
"C48 initialization time {t}. Are you sure you're joining "
"the correct high res data?"
)


def _path_from_first_timestep(ds, train_test_labels=None):
    """ Uses first init time as zarr filename, and appends a 'train'/'test' subdir
    if a dict of labels is provided
    Args:
        ds: input dataset
        train_test_labels: optional dict with keys ["test", "train"] and values lists of
            timestep strings that go to each set
    Returns:
        path in args.gcs_output_dir to write the zarr to
    """
    timestep = min(ds[INIT_TIME_DIM].values).strftime(TIME_FMT)
    # Bug fix: default assigned up front. Previously, a labels dict with both
    # keys but containing this timestep in neither list raised no KeyError and
    # left train_test_subdir unassigned -> UnboundLocalError at the return.
    train_test_subdir = ""
    if isinstance(train_test_labels, dict):
        try:
            if timestep in train_test_labels["train"]:
                train_test_subdir = "train"
            elif timestep in train_test_labels["test"]:
                train_test_subdir = "test"
        except KeyError:
            logger.warning(
                "train_test_labels dict does not have keys ['train', 'test']."
                "Will write zarrs directly to gcs_output_dir."
            )
    else:
        logger.info(
            "No train_test_labels dict provided."
            "Will write zarrs directly to gcs_output_dir."
        )
    return os.path.join(train_test_subdir, timestep + ".zarr")


def _set_relative_forecast_time_coord(ds):
    """ Replace the absolute forecast-time coordinate with times relative to
    initialization: [0, delta_t].

    NOTE(review): the assigned coordinate has exactly two entries, so this
    assumes the forecast time dimension has length 2 — confirm with callers.
    Args:
        ds: dataset with a FORECAST_TIME_DIM coordinate of absolute times
    Returns:
        dataset with FORECAST_TIME_DIM coordinate [0 s, delta_t]
    """
    # Step size inferred from the spacing of the last two forecast times.
    delta_t_forecast = (
        ds[FORECAST_TIME_DIM].values[-1] - ds[FORECAST_TIME_DIM].values[-2]
    )
    # Bug fix: xarray's reset_index returns a new Dataset; the original call
    # discarded its result, making the line a no-op.
    ds = ds.reset_index([FORECAST_TIME_DIM], drop=True)
    return ds.assign_coords(
        {FORECAST_TIME_DIM: [timedelta(seconds=0), delta_t_forecast]}
    )


def load_diag(diag_data_path, init_times):
    """ Open the coarsened high-res diagnostics zarr and subset it to the given
    initialization times.
    Args:
        diag_data_path: url (e.g. gs://...) of the C48 diagnostics zarr
        init_times: sequence of initialization times to select
    Returns:
        diagnostics dataset with its "time" dim renamed to INIT_TIME_DIM,
        times rounded to the nearest minute, tile coords assigned, and
        subset to init_times
    """
    # Only the protocol is needed to build the filesystem; the original also
    # unpacked an unused `path` local.
    protocol, _ = _split_url(diag_data_path)
    fs = fsspec.filesystem(protocol)
    ds_diag = xr.open_zarr(fs.get_mapper(diag_data_path), consolidated=True).rename(
        {"time": INIT_TIME_DIM}
    )
    # Rounding works around diagnostic timestamps that sit a few hundredths of
    # a second off the restart init times (see _round_time).
    ds_diag = ds_diag.assign_coords(
        {
            INIT_TIME_DIM: [_round_time(t) for t in ds_diag[INIT_TIME_DIM].values],
            "tile": TILE_COORDS,
        }
    )
    return ds_diag.sel({INIT_TIME_DIM: init_times})
Loading

0 comments on commit d87ee75

Please sign in to comment.