
Commit

Update benchmark datasets
abarciauskas-bgse committed Oct 17, 2023
1 parent 3509b52 commit 458fc02
Showing 4 changed files with 35 additions and 41 deletions.
7 changes: 7 additions & 0 deletions 01-generate-datasets/external-datasets.json
@@ -9,6 +9,13 @@
     "dataset_url": "https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/pangeo-forge/aws-noaa-oisst-feedstock/aws-noaa-oisst-avhrr-only.zarr/reference.json",
     "variable": "sst",
     "extra_args": {"reference": true }
+  }, "prod-giovanni-cache-GPM_3IMERGHH_06_precipitationCal": {
+    "dataset_url": "s3://prod-giovanni-cache/zarr/GPM_3IMERGHH_06_precipitationCal/",
+    "variable": "variable",
+    "extra_args": { "consolidated": false }
+  }, "pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc": {
+    "dataset_url": "https://nex-gddp-cmip6.s3-us-west-2.amazonaws.com/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/pr/pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc",
+    "variable": "pr"
+  }
 }
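For reference, a minimal sketch of how the two new entries could be opened with xarray. These are standard xarray/fsspec patterns rather than code from this repository, and they assume s3fs/h5netcdf are installed and S3 access to the prod-giovanni-cache bucket is configured:

import fsspec
import xarray as xr

# Zarr store whose metadata is not consolidated, hence "consolidated": false above.
ds = xr.open_zarr(
    "s3://prod-giovanni-cache/zarr/GPM_3IMERGHH_06_precipitationCal/",
    consolidated=False,
)
precip = ds["variable"]  # the data variable is literally named "variable"

# NetCDF file served over HTTPS, read through fsspec with the h5netcdf engine.
url = ("https://nex-gddp-cmip6.s3-us-west-2.amazonaws.com/NEX-GDDP-CMIP6/"
       "ACCESS-CM2/historical/r1i1p1f1/pr/pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc")
ds_nc = xr.open_dataset(fsspec.open(url).open(), engine="h5netcdf")
pr = ds_nc["pr"]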

14 changes: 7 additions & 7 deletions 03-e2e/gen_test_urls.py
@@ -15,15 +15,13 @@
 import sys
 sys.path.append('..')
 import helpers.zarr_helpers as zarr_helpers
-from titiler_xarray.titiler.xarray.reader import xarray_open_dataset, ZarrReader
+from titiler_xarray.titiler.xarray.reader import xarray_open_dataset, get_variable

 # Step 2: Merge the dictionaries
-sources = json.loads(open('../01-generate-datasets/all-datasets.json', 'r').read())
+sources = json.loads(open('../01-generate-datasets/external-datasets.json', 'r').read())

 # remove pyramids and https dataset for now
 sources = list(filter(lambda x: 'pyramid' not in x[0], sources.items()))
-# Also, skip HTTPS for now
-sources = list(filter(lambda x: 'https' not in x[1]['dataset_url'], sources))

 def get_arguments():
     parser = argparse.ArgumentParser(description="Set environment for the script.")
@@ -120,17 +118,19 @@ def generate_extremas(bounds: list[float]):
     variable = value["variable"]
     reference = value.get("extra_args", {}).get("reference", False)
     multiscale = value.get("extra_args", {}).get("multiscale", False)
-    ds = xarray_open_dataset(source, reference=reference)
+    consolidated = value.get("extra_args", {}).get("consolidated", True)
+    ds = xarray_open_dataset(source, reference=reference, consolidated=consolidated)
     bounds = default_bounds
     if not multiscale:
-        lat_extent, lon_extent = zarr_helpers.get_lat_lon_extents(ds)
+        da = get_variable(ds, variable=variable)
+        lat_extent, lon_extent = zarr_helpers.get_lat_lon_extents(da)
         bounds = [lon_extent[0], lat_extent[0], lon_extent[1], lat_extent[1]]

     array_specs = {
         'collection_name': collection_name,
         'source': source
     }
-    array_specs.update(zarr_helpers.get_array_chunk_information(ds, variable=variable, multiscale=multiscale))
+    array_specs.update(zarr_helpers.get_array_chunk_information(da, multiscale=multiscale))
     mode = "w" if idx == 0 else "a"
     with open(csv_file, mode, newline="") as csvfile:
         writer = csv.DictWriter(csvfile, fieldnames=array_specs.keys())
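The call sites above assume a reader module of roughly the following shape. Only the two signatures are taken from this diff; the bodies below are illustrative assumptions about what titiler_xarray's reader does, not its actual implementation (in particular, the kerchunk remote protocol is a guess):

import fsspec
import xarray as xr

def xarray_open_dataset(src: str, reference: bool = False,
                        consolidated: bool = True) -> xr.Dataset:
    if reference:
        # Kerchunk reference JSON: expose it as a Zarr store via fsspec's
        # reference filesystem. Reference stores are never consolidated.
        fs = fsspec.filesystem("reference", fo=src, remote_protocol="s3")
        return xr.open_zarr(fs.get_mapper(""), consolidated=False)
    # Plain Zarr store; honor the per-dataset "consolidated" flag
    # from external-datasets.json.
    return xr.open_zarr(src, consolidated=consolidated)

def get_variable(ds: xr.Dataset, variable: str) -> xr.DataArray:
    # Hand downstream helpers a single DataArray instead of the whole Dataset.
    return ds[variable]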
23 changes: 5 additions & 18 deletions 03-e2e/zarr_info.csv
@@ -1,19 +1,6 @@
 collection_name,source,chunks,shape_dict,dtype,chunk_size_mb,compression,number_of_spatial_chunks,number_coordinate_chunks
-cmip6-kerchunk,s3://nasa-eodc-data-store/test-data/cmip6-kerchunk/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json,"{'time': 1, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,3.2958984375,Zlib(level=5),1.0,3
-600_1440_29_CMIP6_daily_GISS-E2-1-G_tas.zarr,s3://nasa-eodc-data-store/test-data/cmip6-zarr/600_1440_29_CMIP6_daily_GISS-E2-1-G_tas.zarr,"{'time': 29, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,95.5810546875,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-600_1440_1_CMIP6_daily_GISS-E2-1-G_tas.zarr,s3://nasa-eodc-data-store/test-data/cmip6-zarr/600_1440_1_CMIP6_daily_GISS-E2-1-G_tas.zarr,"{'time': 1, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,3.2958984375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-365_262_262_CMIP6_daily_GISS-E2-1-G_tas.zarr,s3://nasa-eodc-data-store/test-data/cmip6-zarr/365_262_262_CMIP6_daily_GISS-E2-1-G_tas.zarr,"{'time': 365, 'lat': 262, 'lon': 262}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,95.57746887207031,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",12.586679097954665,3
-power_901_monthly_meteorology_utc.zarr,s3://power-analysis-ready-datastore/power_901_monthly_meteorology_utc.zarr,"{'time': 504, 'lat': 25, 'lon': 25}","{'time': 504, 'lat': 361, 'lon': 576}",float64,2.40325927734375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",332.69759999999997,44
-cmip6-pds_GISS-E2-1-G_historical_tas,s3://cmip6-pds/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/historical/r2i1p1f1/Amon/tas/gn/v20180827/,"{'time': 600, 'lat': 90, 'lon': 144}","{'time': 1980, 'lat': 90, 'lon': 144}",float32,29.6630859375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,6
-single_chunk_store_lat1024_lon2048.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat1024_lon2048.zarr,"{'time': 1, 'lat': 1024, 'lon': 2048}","{'time': 1, 'lat': 1024, 'lon': 2048}",float64,16.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat1448_lon2896.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat1448_lon2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 1448, 'lon': 2896}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat2048_lon4096.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat2048_lon4096.zarr,"{'time': 1, 'lat': 2048, 'lon': 4096}","{'time': 1, 'lat': 2048, 'lon': 4096}",float64,64.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat2896_lon5792.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat2896_lon5792.zarr,"{'time': 1, 'lat': 2896, 'lon': 5792}","{'time': 1, 'lat': 2896, 'lon': 5792}",float64,127.97265625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat4096_lon8192.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat4096_lon8192.zarr,"{'time': 1, 'lat': 4096, 'lon': 8192}","{'time': 1, 'lat': 4096, 'lon': 8192}",float64,256.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat512_lon1024.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat512_lon1024.zarr,"{'time': 1, 'lat': 512, 'lon': 1024}","{'time': 1, 'lat': 512, 'lon': 1024}",float64,4.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat724_lon1448.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat724_lon1448.zarr,"{'time': 1, 'lat': 724, 'lon': 1448}","{'time': 1, 'lat': 724, 'lon': 1448}",float64,7.998291015625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-with_chunks_store_lat1448_lon2896.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat1448_lon2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 1448, 'lon': 2896}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-with_chunks_store_lat2048_lon4096.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat2048_lon4096.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 2048, 'lon': 4096}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",2.0004273373828636,3
-with_chunks_store_lat2896_lon5792.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat2896_lon5792.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 2896, 'lon': 5792}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",4.0,3
-with_chunks_store_lat4096_lon8192.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat4096_lon8192.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 4096, 'lon': 8192}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",8.001709349531454,3
-with_chunks_store_lat5793_lon11586.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat5793_lon11586.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 5793, 'lon': 11586}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",16.0055253388175,3
+power_901_monthly_meteorology_utc.zarr,s3://power-analysis-ready-datastore/power_901_monthly_meteorology_utc.zarr,"{'y': 504, 'x': 25}","{'y': 361, 'x': 576}",float64,2.40325927734375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",16.502857142857142,2
+cmip6-pds_GISS-E2-1-G_historical_tas,s3://cmip6-pds/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/historical/r2i1p1f1/Amon/tas/gn/v20180827/,"{'y': 600, 'x': 90}","{'y': 90, 'x': 144}",float32,29.6630859375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",0.24,1
+aws-noaa-oisst-feedstock_reference,https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/pangeo-forge/aws-noaa-oisst-feedstock/aws-noaa-oisst-avhrr-only.zarr/reference.json,"{'zlev': 1, 'y': 1, 'x': 720}","{'zlev': 1, 'y': 720, 'x': 1440}",int16,1.9775390625,Zlib(level=4),1440.0,2
+prod-giovanni-cache-GPM_3IMERGHH_06_precipitationCal,s3://prod-giovanni-cache/zarr/GPM_3IMERGHH_06_precipitationCal/,"{'y': 36, 'x': 72}","{'y': 1800, 'x': 3600}",float32,1.9775390625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",2500.0,100
+pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc,https://nex-gddp-cmip6.s3-us-west-2.amazonaws.com/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/pr/pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc,"{'y': 'N', 'x': '/'}","{'y': 600, 'x': 1440}",float32,N/A,N/A,N/A,0
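As a sanity check, the number_of_spatial_chunks column can be reproduced from the chunks and shape_dict columns; for the prod-giovanni-cache-GPM_3IMERGHH_06_precipitationCal row:

chunks = {"y": 36, "x": 72}     # chunks column
shape = {"y": 1800, "x": 3600}  # shape_dict column
n = (shape["y"] / chunks["y"]) * (shape["x"] / chunks["x"])
print(n)  # 50 * 50 = 2500.0, matching number_of_spatial_chunks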
32 changes: 16 additions & 16 deletions helpers/zarr_helpers.py
Expand Up @@ -10,43 +10,43 @@ def get_dataarray_size(da: xr.DataArray):
dtype = da.encoding.get("dtype", "N/A")
return np.prod(da.shape) * dtype.itemsize/1024/1024

def get_number_coord_chunks(ds: xr.Dataset):
def get_number_coord_chunks(da: xr.DataArray):
number_coord_chunks = 0
for key in ds.coords.keys():
if ds[key].shape != ():
number_coord_chunks += round(ds[key].shape[0]/ds[key].encoding['chunks'][0])
for key in da.coords.keys():
if da[key].shape != () and da[key].encoding.get('chunks'):
number_coord_chunks += round(da[key].shape[0]/da[key].encoding['chunks'][0])
return number_coord_chunks

def get_lat_lon_extents(ds: xr.Dataset):
lat_values = ds.lat.values
lon_values = ds.lon.values
if (ds.lon > 180).any():
# Adjust the longitude coordinates to the -180 to 180 range
lon_values = (ds.lon + 180) % 360 - 180
def get_lat_lon_extents(da: xr.DataArray):
lat_values = da.y.values
lon_values = da.x.values
lat_extent= [math.ceil(np.min(lat_values)), math.floor(np.max(lat_values))]
lon_extent = [math.ceil(np.min(lon_values)), math.floor(np.max(lon_values))]
return lat_extent, lon_extent

def get_array_chunk_information(ds: xr.Dataset, variable: str, multiscale: bool = False):
def get_array_chunk_information(da: xr.DataArray, multiscale: bool = False):
if multiscale: # TODO
chunks, shape_dict, chunks_dict, dtype, chunk_size_mb, compression, number_of_spatial_chunks = ["N/A"] * 7
else:
da = ds[variable]
chunks = da.encoding.get("chunks", "N/A")
chunks_dict = dict(zip(da.dims, chunks))
shape_dict = dict(zip(da.dims, da.shape))
dtype = da.encoding.get("dtype", "N/A")
chunk_size_mb = "N/A" if chunks is None else (np.prod(chunks) * dtype.itemsize)/1024/1024
dtype = da.encoding.get("dtype", "N/A")
# import pdb; pdb.set_trace()
chunk_size_mb = "N/A" if chunks == 'N/A' else (np.prod(chunks) * dtype.itemsize)/1024/1024
compression = da.encoding.get("compressor", "N/A")
number_of_spatial_chunks = (shape_dict['lat']/chunks_dict['lat']) * (shape_dict['lon']/chunks_dict['lon'])
try:
number_of_spatial_chunks = (shape_dict['y']/chunks_dict['y']) * (shape_dict['x']/chunks_dict['x'])
except:
number_of_spatial_chunks = "N/A"
return {
'chunks': chunks_dict,
'shape_dict': shape_dict,
'dtype': str(dtype),
'chunk_size_mb': chunk_size_mb,
'compression': str(compression),
'number_of_spatial_chunks': number_of_spatial_chunks,
'number_coordinate_chunks': get_number_coord_chunks(ds)
'number_coordinate_chunks': get_number_coord_chunks(da)
}

def generate_data_store(
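A usage sketch for the refactored helpers, which now take an xr.DataArray (one variable) rather than a whole Dataset. The data here is synthetic, and the example assumes it runs from the repository root so the helpers package is importable:

import numpy as np
import xarray as xr
import helpers.zarr_helpers as zarr_helpers

# Synthetic global grid with the renamed y/x spatial dimensions.
da = xr.DataArray(
    np.zeros((10, 180, 360), dtype="float32"),
    dims=("time", "y", "x"),
    coords={"y": np.linspace(-89.5, 89.5, 180),
            "x": np.linspace(-179.5, 179.5, 360)},
)
# Chunk/dtype metadata normally comes from the Zarr store's encoding.
da.encoding.update({"chunks": (1, 90, 90), "dtype": np.dtype("float32")})

lat_extent, lon_extent = zarr_helpers.get_lat_lon_extents(da)
print(lat_extent, lon_extent)  # [-89, 89] [-179, 179]
info = zarr_helpers.get_array_chunk_information(da)
print(info["number_of_spatial_chunks"])  # (180/90) * (360/90) = 8.0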
