
Commit

Update benchmark datasets
abarciauskas-bgse committed Oct 17, 2023
1 parent 3509b52 commit 458fc02
Showing 4 changed files with 35 additions and 41 deletions.
7 changes: 7 additions & 0 deletions 01-generate-datasets/external-datasets.json
@@ -9,6 +9,13 @@
     "dataset_url": "https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/pangeo-forge/aws-noaa-oisst-feedstock/aws-noaa-oisst-avhrr-only.zarr/reference.json",
     "variable": "sst",
     "extra_args": {"reference": true }
+  }, "prod-giovanni-cache-GPM_3IMERGHH_06_precipitationCal": {
+    "dataset_url": "s3://prod-giovanni-cache/zarr/GPM_3IMERGHH_06_precipitationCal/",
+    "variable": "variable",
+    "extra_args": { "consolidated": false }
+  }, "pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc": {
+    "dataset_url": "https://nex-gddp-cmip6.s3-us-west-2.amazonaws.com/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/pr/pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc",
+    "variable": "pr"
+  }
 }
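For reference, a minimal sketch of how the two new entries could be opened with xarray. These are standard xarray/fsspec patterns rather than code from this repository, and they assume s3fs/h5netcdf are installed and S3 access to the prod-giovanni-cache bucket is configured:

import fsspec
import xarray as xr

# Zarr store whose metadata is not consolidated, hence "consolidated": false above.
ds = xr.open_zarr(
    "s3://prod-giovanni-cache/zarr/GPM_3IMERGHH_06_precipitationCal/",
    consolidated=False,
)
precip = ds["variable"]  # the data variable is literally named "variable"

# NetCDF file served over HTTPS, read through fsspec with the h5netcdf engine.
url = ("https://nex-gddp-cmip6.s3-us-west-2.amazonaws.com/NEX-GDDP-CMIP6/"
       "ACCESS-CM2/historical/r1i1p1f1/pr/pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc")
ds_nc = xr.open_dataset(fsspec.open(url).open(), engine="h5netcdf")
pr = ds_nc["pr"]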

14 changes: 7 additions & 7 deletions 03-e2e/gen_test_urls.py
@@ -15,15 +15,13 @@
 import sys
 sys.path.append('..')
 import helpers.zarr_helpers as zarr_helpers
-from titiler_xarray.titiler.xarray.reader import xarray_open_dataset, ZarrReader
+from titiler_xarray.titiler.xarray.reader import xarray_open_dataset, get_variable

 # Step 2: Merge the dictionaries
-sources = json.loads(open('../01-generate-datasets/all-datasets.json', 'r').read())
+sources = json.loads(open('../01-generate-datasets/external-datasets.json', 'r').read())

 # remove pyramids and https dataset for now
 sources = list(filter(lambda x: 'pyramid' not in x[0], sources.items()))
-# Also, skip HTTPS for now
-sources = list(filter(lambda x: 'https' not in x[1]['dataset_url'], sources))

 def get_arguments():
     parser = argparse.ArgumentParser(description="Set environment for the script.")
@@ -120,17 +118,19 @@ def generate_extremas(bounds: list[float]):
     variable = value["variable"]
     reference = value.get("extra_args", {}).get("reference", False)
     multiscale = value.get("extra_args", {}).get("multiscale", False)
-    ds = xarray_open_dataset(source, reference=reference)
+    consolidated = value.get("extra_args", {}).get("consolidated", True)
+    ds = xarray_open_dataset(source, reference=reference, consolidated=consolidated)
     bounds = default_bounds
     if not multiscale:
-        lat_extent, lon_extent = zarr_helpers.get_lat_lon_extents(ds)
+        da = get_variable(ds, variable=variable)
+        lat_extent, lon_extent = zarr_helpers.get_lat_lon_extents(da)
         bounds = [lon_extent[0], lat_extent[0], lon_extent[1], lat_extent[1]]

     array_specs = {
         'collection_name': collection_name,
         'source': source
     }
-    array_specs.update(zarr_helpers.get_array_chunk_information(ds, variable=variable, multiscale=multiscale))
+    array_specs.update(zarr_helpers.get_array_chunk_information(da, multiscale=multiscale))
     mode = "w" if idx == 0 else "a"
     with open(csv_file, mode, newline="") as csvfile:
         writer = csv.DictWriter(csvfile, fieldnames=array_specs.keys())
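The call sites above assume a reader module of roughly the following shape. Only the two signatures are taken from this diff; the bodies below are illustrative assumptions about what titiler_xarray's reader does, not its actual implementation (in particular, the kerchunk remote protocol is a guess):

import fsspec
import xarray as xr

def xarray_open_dataset(src: str, reference: bool = False,
                        consolidated: bool = True) -> xr.Dataset:
    if reference:
        # Kerchunk reference JSON: expose it as a Zarr store via fsspec's
        # reference filesystem. Reference stores are never consolidated.
        fs = fsspec.filesystem("reference", fo=src, remote_protocol="s3")
        return xr.open_zarr(fs.get_mapper(""), consolidated=False)
    # Plain Zarr store; honor the per-dataset "consolidated" flag
    # from external-datasets.json.
    return xr.open_zarr(src, consolidated=consolidated)

def get_variable(ds: xr.Dataset, variable: str) -> xr.DataArray:
    # Hand downstream helpers a single DataArray instead of the whole Dataset.
    return ds[variable]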
23 changes: 5 additions & 18 deletions 03-e2e/zarr_info.csv
@@ -1,19 +1,6 @@
 collection_name,source,chunks,shape_dict,dtype,chunk_size_mb,compression,number_of_spatial_chunks,number_coordinate_chunks
-cmip6-kerchunk,s3://nasa-eodc-data-store/test-data/cmip6-kerchunk/combined_CMIP6_daily_GISS-E2-1-G_tas_kerchunk.json,"{'time': 1, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,3.2958984375,Zlib(level=5),1.0,3
-600_1440_29_CMIP6_daily_GISS-E2-1-G_tas.zarr,s3://nasa-eodc-data-store/test-data/cmip6-zarr/600_1440_29_CMIP6_daily_GISS-E2-1-G_tas.zarr,"{'time': 29, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,95.5810546875,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-600_1440_1_CMIP6_daily_GISS-E2-1-G_tas.zarr,s3://nasa-eodc-data-store/test-data/cmip6-zarr/600_1440_1_CMIP6_daily_GISS-E2-1-G_tas.zarr,"{'time': 1, 'lat': 600, 'lon': 1440}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,3.2958984375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-365_262_262_CMIP6_daily_GISS-E2-1-G_tas.zarr,s3://nasa-eodc-data-store/test-data/cmip6-zarr/365_262_262_CMIP6_daily_GISS-E2-1-G_tas.zarr,"{'time': 365, 'lat': 262, 'lon': 262}","{'time': 730, 'lat': 600, 'lon': 1440}",float32,95.57746887207031,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",12.586679097954665,3
-power_901_monthly_meteorology_utc.zarr,s3://power-analysis-ready-datastore/power_901_monthly_meteorology_utc.zarr,"{'time': 504, 'lat': 25, 'lon': 25}","{'time': 504, 'lat': 361, 'lon': 576}",float64,2.40325927734375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",332.69759999999997,44
-cmip6-pds_GISS-E2-1-G_historical_tas,s3://cmip6-pds/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/historical/r2i1p1f1/Amon/tas/gn/v20180827/,"{'time': 600, 'lat': 90, 'lon': 144}","{'time': 1980, 'lat': 90, 'lon': 144}",float32,29.6630859375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,6
-single_chunk_store_lat1024_lon2048.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat1024_lon2048.zarr,"{'time': 1, 'lat': 1024, 'lon': 2048}","{'time': 1, 'lat': 1024, 'lon': 2048}",float64,16.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat1448_lon2896.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat1448_lon2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 1448, 'lon': 2896}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat2048_lon4096.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat2048_lon4096.zarr,"{'time': 1, 'lat': 2048, 'lon': 4096}","{'time': 1, 'lat': 2048, 'lon': 4096}",float64,64.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat2896_lon5792.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat2896_lon5792.zarr,"{'time': 1, 'lat': 2896, 'lon': 5792}","{'time': 1, 'lat': 2896, 'lon': 5792}",float64,127.97265625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat4096_lon8192.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat4096_lon8192.zarr,"{'time': 1, 'lat': 4096, 'lon': 8192}","{'time': 1, 'lat': 4096, 'lon': 8192}",float64,256.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat512_lon1024.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat512_lon1024.zarr,"{'time': 1, 'lat': 512, 'lon': 1024}","{'time': 1, 'lat': 512, 'lon': 1024}",float64,4.0,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-single_chunk_store_lat724_lon1448.zarr,s3://nasa-eodc-data-store/test-data/fake-data/single_chunk/store_lat724_lon1448.zarr,"{'time': 1, 'lat': 724, 'lon': 1448}","{'time': 1, 'lat': 724, 'lon': 1448}",float64,7.998291015625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-with_chunks_store_lat1448_lon2896.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat1448_lon2896.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 1448, 'lon': 2896}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",1.0,3
-with_chunks_store_lat2048_lon4096.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat2048_lon4096.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 2048, 'lon': 4096}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",2.0004273373828636,3
-with_chunks_store_lat2896_lon5792.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat2896_lon5792.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 2896, 'lon': 5792}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",4.0,3
-with_chunks_store_lat4096_lon8192.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat4096_lon8192.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 4096, 'lon': 8192}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",8.001709349531454,3
-with_chunks_store_lat5793_lon11586.zarr,s3://nasa-eodc-data-store/test-data/fake-data/with_chunks/store_lat5793_lon11586.zarr,"{'time': 1, 'lat': 1448, 'lon': 2896}","{'time': 1, 'lat': 5793, 'lon': 11586}",float64,31.9931640625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",16.0055253388175,3
+power_901_monthly_meteorology_utc.zarr,s3://power-analysis-ready-datastore/power_901_monthly_meteorology_utc.zarr,"{'y': 504, 'x': 25}","{'y': 361, 'x': 576}",float64,2.40325927734375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",16.502857142857142,2
+cmip6-pds_GISS-E2-1-G_historical_tas,s3://cmip6-pds/CMIP6/CMIP/NASA-GISS/GISS-E2-1-G/historical/r2i1p1f1/Amon/tas/gn/v20180827/,"{'y': 600, 'x': 90}","{'y': 90, 'x': 144}",float32,29.6630859375,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",0.24,1
+aws-noaa-oisst-feedstock_reference,https://ncsa.osn.xsede.org/Pangeo/pangeo-forge/pangeo-forge/aws-noaa-oisst-feedstock/aws-noaa-oisst-avhrr-only.zarr/reference.json,"{'zlev': 1, 'y': 1, 'x': 720}","{'zlev': 1, 'y': 720, 'x': 1440}",int16,1.9775390625,Zlib(level=4),1440.0,2
+prod-giovanni-cache-GPM_3IMERGHH_06_precipitationCal,s3://prod-giovanni-cache/zarr/GPM_3IMERGHH_06_precipitationCal/,"{'y': 36, 'x': 72}","{'y': 1800, 'x': 3600}",float32,1.9775390625,"Blosc(cname='lz4', clevel=5, shuffle=SHUFFLE, blocksize=0)",2500.0,100
+pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc,https://nex-gddp-cmip6.s3-us-west-2.amazonaws.com/NEX-GDDP-CMIP6/ACCESS-CM2/historical/r1i1p1f1/pr/pr_day_ACCESS-CM2_historical_r1i1p1f1_gn_1950.nc,"{'y': 'N', 'x': '/'}","{'y': 600, 'x': 1440}",float32,N/A,N/A,N/A,0
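As a sanity check, the number_of_spatial_chunks column can be reproduced from the chunks and shape_dict columns; for the prod-giovanni-cache-GPM_3IMERGHH_06_precipitationCal row:

chunks = {"y": 36, "x": 72}     # chunks column
shape = {"y": 1800, "x": 3600}  # shape_dict column
n = (shape["y"] / chunks["y"]) * (shape["x"] / chunks["x"])
print(n)  # 50 * 50 = 2500.0, matching number_of_spatial_chunks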
32 changes: 16 additions & 16 deletions helpers/zarr_helpers.py
Expand Up @@ -10,43 +10,43 @@ def get_dataarray_size(da: xr.DataArray):
dtype = da.encoding.get("dtype", "N/A")
return np.prod(da.shape) * dtype.itemsize/1024/1024

def get_number_coord_chunks(ds: xr.Dataset):
def get_number_coord_chunks(da: xr.DataArray):
number_coord_chunks = 0
for key in ds.coords.keys():
if ds[key].shape != ():
number_coord_chunks += round(ds[key].shape[0]/ds[key].encoding['chunks'][0])
for key in da.coords.keys():
if da[key].shape != () and da[key].encoding.get('chunks'):
number_coord_chunks += round(da[key].shape[0]/da[key].encoding['chunks'][0])
return number_coord_chunks

def get_lat_lon_extents(ds: xr.Dataset):
lat_values = ds.lat.values
lon_values = ds.lon.values
if (ds.lon > 180).any():
# Adjust the longitude coordinates to the -180 to 180 range
lon_values = (ds.lon + 180) % 360 - 180
def get_lat_lon_extents(da: xr.DataArray):
lat_values = da.y.values
lon_values = da.x.values
lat_extent= [math.ceil(np.min(lat_values)), math.floor(np.max(lat_values))]
lon_extent = [math.ceil(np.min(lon_values)), math.floor(np.max(lon_values))]
return lat_extent, lon_extent

def get_array_chunk_information(ds: xr.Dataset, variable: str, multiscale: bool = False):
def get_array_chunk_information(da: xr.DataArray, multiscale: bool = False):
if multiscale: # TODO
chunks, shape_dict, chunks_dict, dtype, chunk_size_mb, compression, number_of_spatial_chunks = ["N/A"] * 7
else:
da = ds[variable]
chunks = da.encoding.get("chunks", "N/A")
chunks_dict = dict(zip(da.dims, chunks))
shape_dict = dict(zip(da.dims, da.shape))
dtype = da.encoding.get("dtype", "N/A")
chunk_size_mb = "N/A" if chunks is None else (np.prod(chunks) * dtype.itemsize)/1024/1024
dtype = da.encoding.get("dtype", "N/A")
# import pdb; pdb.set_trace()
chunk_size_mb = "N/A" if chunks == 'N/A' else (np.prod(chunks) * dtype.itemsize)/1024/1024
compression = da.encoding.get("compressor", "N/A")
number_of_spatial_chunks = (shape_dict['lat']/chunks_dict['lat']) * (shape_dict['lon']/chunks_dict['lon'])
try:
number_of_spatial_chunks = (shape_dict['y']/chunks_dict['y']) * (shape_dict['x']/chunks_dict['x'])
except:
number_of_spatial_chunks = "N/A"
return {
'chunks': chunks_dict,
'shape_dict': shape_dict,
'dtype': str(dtype),
'chunk_size_mb': chunk_size_mb,
'compression': str(compression),
'number_of_spatial_chunks': number_of_spatial_chunks,
'number_coordinate_chunks': get_number_coord_chunks(ds)
'number_coordinate_chunks': get_number_coord_chunks(da)
}

def generate_data_store(
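A usage sketch for the refactored helpers, which now take an xr.DataArray (one variable) rather than a whole Dataset. The data here is synthetic, and the example assumes it runs from the repository root so the helpers package is importable:

import numpy as np
import xarray as xr
import helpers.zarr_helpers as zarr_helpers

# Synthetic global grid with the renamed y/x spatial dimensions.
da = xr.DataArray(
    np.zeros((10, 180, 360), dtype="float32"),
    dims=("time", "y", "x"),
    coords={"y": np.linspace(-89.5, 89.5, 180),
            "x": np.linspace(-179.5, 179.5, 360)},
)
# Chunk/dtype metadata normally comes from the Zarr store's encoding.
da.encoding.update({"chunks": (1, 90, 90), "dtype": np.dtype("float32")})

lat_extent, lon_extent = zarr_helpers.get_lat_lon_extents(da)
print(lat_extent, lon_extent)  # [-89, 89] [-179, 179]
info = zarr_helpers.get_array_chunk_information(da)
print(info["number_of_spatial_chunks"])  # (180/90) * (360/90) = 8.0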
