Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add MMEarth dataset #2202

Open
wants to merge 28 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 24 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,11 @@ Million-AID

.. autoclass:: MillionAID

MMEarth
^^^^^^^

.. autoclass:: MMEarth

NASA Marine Debris
^^^^^^^^^^^^^^^^^^

Expand Down
1 change: 1 addition & 0 deletions docs/api/datasets/non_geo_datasets.csv
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@ Dataset,Task,Source,License,# Samples,# Classes,Size (px),Resolution (m),Bands
`LoveDA`_,S,Google Earth,"CC-BY-NC-SA-4.0","5,987",7,"1,024x1,024",0.3,RGB
`MapInWild`_,S,"Sentinel-1/2, ESA WorldCover, NOAA VIIRS DNB","CC-BY-4.0",1018,1,1920x1920,10--463.83,"SAR, MSI, 2020_Map, avg_rad"
`Million-AID`_,C,Google Earth,-,1M,51--73,,0.5--153,RGB
`MMEarth`_,"C, S","Aster, Sentinel, ERA5","CC-BY-4.0","100K--1M",,"128x128 or 64x64",10,MSI
`NASA Marine Debris`_,OD,PlanetScope,"Apache-2.0",707,1,256x256,3,RGB
`OSCD`_,CD,Sentinel-2,"CC-BY-4.0",24,2,"40--1,180",60,MSI
`PASTIS`_,I,Sentinel-1/2,"CC-BY-4.0","2,433",19,128x128xT,10,MSI
Expand Down
221 changes: 221 additions & 0 deletions tests/data/mmearth/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,221 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import json
import os
import shutil

import h5py
import numpy as np

# Number of dummy tiles written to every generated HDF5 file.
num_tiles = 10

# String form of every tile index: '0', '1', ..., str(num_tiles - 1).
meta_id_strings = list(map(str, range(num_tiles)))

# Template for the per-tile metadata; copied for each tile before
# tile-specific fields are overwritten.
meta_dummy_dict = {
    'S2_DATE': '2018-07-16', 'S2_type': 'l1c', 'CRS': 'EPSG:32721',
    'lat': -14.499441524746077, 'lon': -56.98355999998649,
}

# Per-modality layout of the dummy data: number of bands and the NumPy dtype
# used for the corresponding HDF5 dataset. Listed as (name, bands, dtype).
modalities = {
    name: {'bands': n_bands, 'dtype': dtype}
    for name, n_bands, dtype in (
        ('aster', 2, np.int16),
        ('biome', 14, np.uint8),
        ('canopy_height_eth', 2, np.int8),
        ('dynamic_world', 1, np.uint8),
        ('eco_region', 846, np.uint16),
        ('era5', 12, np.float32),
        ('esa_worldcover', 1, np.uint8),
        ('sentinel1', 8, np.float32),
        ('sentinel2', 13, np.uint16),
        ('sentinel2_cloudmask', 1, np.uint16),
        ('sentinel2_cloudprod', 1, np.uint16),
        ('sentinel2_scl', 1, np.uint16),
    )
}

# Band names per modality; stored under the 'BANDS' key of every tile's
# metadata entry.
all_modality_bands = {
    # Sentinel-2 imagery and its auxiliary products
    'sentinel2': [
        'B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7',
        'B8A', 'B8', 'B9', 'B10', 'B11', 'B12',
    ],
    'sentinel2_cloudmask': ['QA60'],
    'sentinel2_cloudprod': ['MSK_CLDPRB'],
    'sentinel2_scl': ['SCL'],
    # Sentinel-1, listed separately for the 'asc' and 'desc' entries
    'sentinel1_asc': ['VV', 'VH', 'HH', 'HV'],
    'sentinel1_desc': ['VV', 'VH', 'HH', 'HV'],
    'aster': ['elevation', 'slope'],
    # ERA5: previous month, current month, then whole year
    'era5': [
        'prev_month_avg_temp', 'prev_month_min_temp',
        'prev_month_max_temp', 'prev_month_total_precip',
        'curr_month_avg_temp', 'curr_month_min_temp',
        'curr_month_max_temp', 'curr_month_total_precip',
        'year_avg_temp', 'year_min_temp',
        'year_max_temp', 'year_total_precip',
    ],
    'dynamic_world': ['landcover'],
    'canopy_height_eth': ['height', 'std'],
    'lat': ['sin', 'cos'],
    'lon': ['sin', 'cos'],
    'biome': ['biome'],
    'eco_region': ['eco_region'],
    'month': ['sin_month', 'cos_month'],
    'esa_worldcover': ['map'],
}


def create_hd5f(
    dataset_name: str, px_dim: tuple[int, ...]
) -> dict[str, dict[str, object]]:
    """Write a dummy HDF5 data file and return the per-tile metadata.

    Creates ``{dataset_name}.h5`` (opened in ``'w'`` mode) containing one
    dataset per entry of the module-level ``modalities`` mapping plus the
    ``lat``, ``lon``, ``month`` and ``metadata`` datasets, each with
    ``num_tiles`` rows filled with random sample values.

    Args:
        dataset_name: output path without the ``.h5`` extension.
        px_dim: spatial dimensions of every image tile, e.g. ``(128, 128)``.

    Returns:
        Mapping from tile id (``'0'`` .. ``str(num_tiles - 1)``) to that
        tile's metadata dictionary, suitable for dumping as JSON.

    Raises:
        ValueError: if ``modalities`` contains an entry for which no
            sample-data rule is defined below.
    """
    # Modalities stored as one vector per tile instead of an image.
    vector_modalities = ('era5', 'eco_region', 'biome')
    tile_info: dict[str, dict[str, object]] = {}

    with h5py.File(f'{dataset_name}.h5', 'w') as h5file:
        # Create datasets for each modality
        for modality, modal_info in modalities.items():
            bands = modal_info['bands']
            if modality in vector_modalities:
                shape = (num_tiles, bands)
            else:
                shape = (num_tiles, bands, *px_dim)
            h5file.create_dataset(modality, shape, dtype=modal_info['dtype'])

        # Create datasets for metadata
        h5file.create_dataset('lat', (num_tiles, 2), dtype=np.float32)
        h5file.create_dataset('lon', (num_tiles, 2), dtype=np.float32)
        h5file.create_dataset('month', (num_tiles, 2), dtype=np.int32)
        h5file.create_dataset(
            'metadata',
            (num_tiles,),
            dtype=np.dtype([('meta_id', 'S10'), ('S2_type', 'S3')]),
        )

        # Populate the datasets with sample data
        for i in range(num_tiles):
            # Iterate over items() so that the band count and dtype always
            # belong to the modality being written; reusing the variables left
            # over from the creation loop would apply the last modality's
            # settings to every dataset.
            for modality, modal_info in modalities.items():
                bands = modal_info['bands']
                if modality == 'dynamic_world':
                    old_values = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
                    data = np.random.choice(old_values, size=(bands, *px_dim))
                elif modality == 'esa_worldcover':
                    old_values = [10, 20, 30, 40, 50, 60, 70, 80, 90, 95, 100, 255]
                    data = np.random.choice(old_values, size=(bands, *px_dim))
                elif modality == 'era5':
                    # only vector not image data
                    data = np.random.random(size=(bands,))
                elif modality in ['biome', 'eco_region']:
                    data = np.random.randint(0, 2, size=(bands,))
                elif modality == 'sentinel2':
                    data = np.random.randint(0, 65535, size=(bands, *px_dim))
                elif modality in ['aster', 'canopy_height_eth', 'sentinel1']:
                    data = np.random.random(size=(bands, *px_dim))
                elif modality in [
                    'sentinel2_cloudmask',
                    'sentinel2_cloudprod',
                    'sentinel2_scl',
                ]:
                    data = np.random.randint(0, 2, size=(bands, *px_dim))
                else:
                    # Fail loudly instead of silently re-writing the previous
                    # modality's array.
                    raise ValueError(f'No sample data rule for modality: {modality}')

                h5file[modality][i] = data.astype(modal_info['dtype'])

            # add other data for lat, lon, month
            h5file['lat'][i] = np.random.random(size=(2,))
            h5file['lon'][i] = np.random.random(size=(2,))
            # NOTE(review): 'month' is an int32 dataset but receives floats
            # drawn from [0, 1) -- confirm this is the intended sample data.
            h5file['month'][i] = np.random.random(size=(2,))

            # Assign S2_type and store in metadata
            S2_type = np.random.choice(['l1c', 'l2a']).encode('utf-8')
            meta_id = str(i).encode('utf-8')
            h5file['metadata'][i] = (meta_id, S2_type)

            # Collect tile info for JSON file
            tile_meta = meta_dummy_dict.copy()
            tile_meta['S2_type'] = S2_type.decode('utf-8')
            tile_meta['BANDS'] = all_modality_bands
            tile_info[str(i)] = tile_meta

    return tile_info


# Additional band-statistics entries, listed as (name, bands, dtype).
extra_band_stats = {
    name: {'bands': n_bands, 'dtype': dtype}
    for name, n_bands, dtype in (
        ('sentinel2_l1c', 13, np.uint16),
        ('sentinel2_l2a', 13, np.uint16),
        ('lat', 2, np.float32),
        ('lon', 2, np.float32),
        ('month', 2, np.float32),
    )
}

# Entries that receive band statistics: everything in `modalities` and
# `extra_band_stats` (the latter wins on duplicate keys) except the four
# modalities excluded below.
band_modalities = {
    name: info
    for name, info in (modalities | extra_band_stats).items()
    if name not in ('biome', 'eco_region', 'dynamic_world', 'esa_worldcover')
}

# Contents of the band-stats JSON file: for every entry of `band_modalities`,
# one random value per band for each statistic.
# Sentinel-2 has separate l1c and l2a statistics even though the data file
# holds a single common sentinel2 entry.
band_stats = {
    modality: {
        stat: np.random.random(size=mod_info['bands']).tolist()
        for stat in ('mean', 'std', 'min', 'max')
    }
    for modality, mod_info in band_modalities.items()
}

# Split sizes: every tile goes to 'train'; 'val' and 'test' stay empty.
train_split, val_split, test_split = num_tiles, 0, 0

# Contents of the splits JSON file: consecutive index ranges per split.
splits = {
    'train': [*range(train_split)],
    'val': [*range(train_split, train_split + val_split)],
    'test': [*range(train_split + val_split, num_tiles)],
}

if __name__ == '__main__':
    # One output directory per dataset variant, with the tile size to use.
    filenames = {
        'MMEarth': {'dirname': 'data_1M_v001', 'px_dim': (128, 128)},
        'MMEarth64': {'dirname': 'data_1M_v001_64', 'px_dim': (64, 64)},
        'MMEarth100k': {'dirname': 'data_100k_v001', 'px_dim': (128, 128)},
    }
    for key, vals in filenames.items():
        dirname = vals['dirname']

        # Start from a clean directory so stale files never survive a re-run.
        if os.path.exists(dirname):
            shutil.rmtree(dirname)
        os.makedirs(dirname)

        # The HDF5 file is named after its directory: <dirname>/<dirname>.h5
        tile_info = create_hd5f(os.path.join(dirname, dirname), vals['px_dim'])

        # Companion JSON files: <dirname>/<dirname>_<suffix>.json
        json_payloads = {
            'splits': splits,
            'band_stats': band_stats,
            'tile_info': tile_info,
        }
        for suffix, payload in json_payloads.items():
            with open(os.path.join(dirname, f'{dirname}_{suffix}.json'), 'w') as f:
                json.dump(payload, f, indent=4)

        # Report success only after the data file and all JSON files exist.
        print(f'{key} data file and JSON files created successfully.')
Binary file not shown.
Loading