From 0373233ac5487d0c2c7893c8123a3c57de7b012e Mon Sep 17 00:00:00 2001
From: "Adam J. Stewart"
Date: Sat, 7 May 2022 11:45:15 -0500
Subject: [PATCH] Add iNaturalist dataset

---
 docs/api/datasets.rst                         |   5 +
 tests/data/inaturalist/data.py                |  58 ++++++++
 .../data/inaturalist/observations-012345.csv  |   5 +
 tests/datasets/test_inaturalist.py            |  83 ++++++++++++
 torchgeo/datasets/__init__.py                 |   2 +
 torchgeo/datasets/inaturalist.py              | 124 ++++++++++++++++++
 6 files changed, 277 insertions(+)
 create mode 100755 tests/data/inaturalist/data.py
 create mode 100644 tests/data/inaturalist/observations-012345.csv
 create mode 100644 tests/datasets/test_inaturalist.py
 create mode 100644 torchgeo/datasets/inaturalist.py

diff --git a/docs/api/datasets.rst b/docs/api/datasets.rst
index dfc63e08bc7..e5dd579b96d 100644
--- a/docs/api/datasets.rst
+++ b/docs/api/datasets.rst
@@ -77,6 +77,11 @@ GlobBiomass
 
 .. autoclass:: GlobBiomass
 
+iNaturalist
+^^^^^^^^^^^
+
+.. autoclass:: INaturalist
+
 Landsat
 ^^^^^^^
 
diff --git a/tests/data/inaturalist/data.py b/tests/data/inaturalist/data.py
new file mode 100755
index 00000000000..6bfbc685008
--- /dev/null
+++ b/tests/data/inaturalist/data.py
@@ -0,0 +1,58 @@
+#!/usr/bin/env python3
+
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import pandas as pd
+
+filename = "observations-012345.csv"
+
+# User can select which columns to export. The following are the default columns.
+# Not all columns may exist in the actual dataset.
+size = 4
+data = {
+    "id": [""] * size,
+    "observed_on_string": [""] * size,
+    "observed_on": ["", "", "2022-05-07", "2022-05-07"],
+    "time_observed_at": ["", "", "", "2022-05-07 11:02:53 +0100"],
+    "time_zone": ["Central Time (US & Canada)"] * size,
+    "user_id": [123] * size,
+    "user_login": ["darwin"] * size,
+    "created_at": ["2022-05-07 11:02:53 +0100"] * size,
+    "updated_at": ["2022-05-07 11:02:53 +0100"] * size,
+    "quality_grade": ["research"] * size,
+    "license": ["CCO"] * size,
+    "url": ["https://inaturalist.org/observations/123"] * size,
+    "image_url": [
+        "https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg"
+    ]
+    * size,
+    "sound_url": ["https://static.inaturalist.org/sounds/123.m4a?123"] * size,
+    "tag_list": ["Chicago"] * size,
+    "description": [""] * size,
+    "num_identification_agreements": [1] * size,
+    "num_identification_disagreements": [0] * size,
+    "captive_cultivated": ["false"] * size,
+    "oauth_application_id": [""] * size,
+    "place_guess": ["Chicago"] * size,
+    "latitude": [41.881832] * size,
+    "longitude": [""] + [-87.623177] * (size - 1),
+    "positional_accuracy": [5] * size,
+    "private_place_guess": [""] * size,
+    "private_latitude": [""] * size,
+    "private_longitude": [""] * size,
+    "public_positional_accuracy": [5] * size,
+    "geoprivacy": [""] * size,
+    "taxon_geoprivacy": [""] * size,
+    "coordinates_obscured": ["false"] * size,
+    "positioning_method": ["gps"] * size,
+    "positioning_device": ["gps"] * size,
+    "species_guess": ["Homo sapiens"] * size,
+    "scientific_name": ["Homo sapiens"] * size,
+    "common_name": ["human"] * size,
+    "iconic_taxon_name": ["Animalia"] * size,
+    "taxon_id": [123] * size,
+}
+
+df = pd.DataFrame(data)
+df.to_csv(filename, index=False)
diff --git a/tests/data/inaturalist/observations-012345.csv b/tests/data/inaturalist/observations-012345.csv
new file mode 100644
index 00000000000..dc340cbe0a3
--- /dev/null
+++ b/tests/data/inaturalist/observations-012345.csv
@@ -0,0 +1,5 @@
+id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,created_at,updated_at,quality_grade,license,url,image_url,sound_url,tag_list,description,num_identification_agreements,num_identification_disagreements,captive_cultivated,oauth_application_id,place_guess,latitude,longitude,positional_accuracy,private_place_guess,private_latitude,private_longitude,public_positional_accuracy,geoprivacy,taxon_geoprivacy,coordinates_obscured,positioning_method,positioning_device,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id
+,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
+,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
+,,2022-05-07,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
+,,2022-05-07,2022-05-07 11:02:53 +0100,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
diff --git a/tests/datasets/test_inaturalist.py b/tests/datasets/test_inaturalist.py
new file mode 100644
index 00000000000..623c64837bd
--- /dev/null
+++ b/tests/datasets/test_inaturalist.py
@@ -0,0 +1,83 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+import builtins
+import os
+from pathlib import Path
+from typing import Any
+
+import pytest
+from _pytest.monkeypatch import MonkeyPatch
+
+from torchgeo.datasets import (
+    BoundingBox,
+    INaturalist,
+    IntersectionDataset,
+    UnionDataset,
+)
+
+pytest.importorskip("pandas", minversion="0.23.2")
+
+
+class TestINaturalist:
+    @pytest.fixture(scope="class")
+    def dataset(self) -> INaturalist:
+        root = os.path.join("tests", "data", "inaturalist")
+        return INaturalist(root)
+
+    def test_getitem(self, dataset: INaturalist) -> None:
+        x = dataset[dataset.bounds]
+        assert isinstance(x, dict)
+
+    def test_len(self, dataset: INaturalist) -> None:
+        assert len(dataset) == 3
+
+    def test_column_order(self, tmp_path: Path) -> None:
+        # pandas keeps the file's own column order no matter how usecols is written
+        header = "longitude,latitude,time_observed_at,observed_on"
+        row = "-87.623177,41.881832,2022-05-07 11:02:53 +0100,2022-05-07"
+        path = tmp_path / "observations-reordered.csv"
+        path.write_text(f"{header}\n{row}\n")
+        ds = INaturalist(str(tmp_path))
+        assert len(ds) == 1
+        assert ds.bounds.minx == pytest.approx(-87.623177)
+        assert ds.bounds.miny == pytest.approx(41.881832)
+
+    def test_and(self, dataset: INaturalist) -> None:
+        ds = dataset & dataset
+        assert isinstance(ds, IntersectionDataset)
+
+    def test_or(self, dataset: INaturalist) -> None:
+        ds = dataset | dataset
+        assert isinstance(ds, UnionDataset)
+
+    def test_no_data(self, tmp_path: Path) -> None:
+        with pytest.raises(FileNotFoundError, match="Dataset not found"):
+            INaturalist(str(tmp_path))
+
+    @pytest.fixture
+    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
+        import_orig = builtins.__import__
+
+        def mocked_import(name: str, *args: Any, **kwargs: Any) -> Any:
+            if name == "pandas":
+                raise ImportError()
+            return import_orig(name, *args, **kwargs)
+
+        monkeypatch.setattr(builtins, "__import__", mocked_import)
+
+    def test_mock_missing_module(
+        self, dataset: INaturalist, mock_missing_module: None
+    ) -> None:
+        with pytest.raises(
+            ImportError,
+            match="pandas is not installed and is required to use this dataset",
+        ):
+            INaturalist(dataset.root)
+
+    def test_invalid_query(self, dataset: INaturalist) -> None:
+        query = BoundingBox(0, 0, 0, 0, 0, 0)
+        with pytest.raises(
+            IndexError, match="query: .* not found in index with bounds:"
+        ):
+            dataset[query]
diff --git a/torchgeo/datasets/__init__.py b/torchgeo/datasets/__init__.py
index d3a5f9fe3f0..1e00587b898 100644
--- a/torchgeo/datasets/__init__.py
+++ b/torchgeo/datasets/__init__.py
@@ -48,6 +48,7 @@
 from .gid15 import GID15
 from .globbiomass import GlobBiomass
 from .idtrees import IDTReeS
+from .inaturalist import INaturalist
 from .inria import InriaAerialImageLabeling
 from .landcoverai import LandCoverAI
 from .landsat import (
@@ -121,6 +122,7 @@
     "EUDEM",
     "GBIF",
     "GlobBiomass",
+    "INaturalist",
     "Landsat",
     "Landsat1",
     "Landsat2",
diff --git a/torchgeo/datasets/inaturalist.py b/torchgeo/datasets/inaturalist.py
new file mode 100644
index 00000000000..1083be878b3
--- /dev/null
+++ b/torchgeo/datasets/inaturalist.py
@@ -0,0 +1,124 @@
+# Copyright (c) Microsoft Corporation. All rights reserved.
+# Licensed under the MIT License.
+
+"""Dataset for iNaturalist."""
+
+import glob
+import os
+import sys
+from typing import Any, Dict
+
+from rasterio.crs import CRS
+
+from .geo import GeoDataset
+from .utils import BoundingBox, disambiguate_timestamp
+
+
+class INaturalist(GeoDataset):
+    """Dataset for iNaturalist.
+
+    `iNaturalist <https://www.inaturalist.org/>`_ is a joint initiative of the
+    California Academy of Sciences and the National Geographic Society. It allows
+    citizen scientists to upload observations of organisms that can be downloaded by
+    scientists and researchers.
+
+    If you use an iNaturalist dataset in your research, please cite it according to:
+
+    * https://www.inaturalist.org/pages/help#cite
+
+    .. note::
+       This dataset requires the following additional library to be installed:
+
+       * `pandas <https://pandas.pydata.org/>`_ to load CSV files
+
+    .. versionadded:: 0.3
+    """
+
+    res = 0
+    _crs = CRS.from_epsg(4326)  # Lat/Lon
+
+    def __init__(self, root: str = "data") -> None:
+        """Initialize a new Dataset instance.
+
+        Args:
+            root: root directory where dataset can be found
+
+        Raises:
+            FileNotFoundError: if no files are found in ``root``
+            ImportError: if pandas is not installed
+        """
+        super().__init__()
+
+        self.root = root
+
+        files = sorted(glob.glob(os.path.join(root, "**.csv")))
+        if not files:
+            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")
+
+        try:
+            import pandas as pd  # noqa: F401
+        except ImportError:
+            raise ImportError(
+                "pandas is not installed and is required to use this dataset"
+            )
+
+        # Read CSV file
+        columns = ["observed_on", "time_observed_at", "latitude", "longitude"]
+        data = pd.read_csv(files[0], engine="c", usecols=columns)
+
+        # ``usecols`` only filters columns; pandas keeps the file's own column order.
+        # Reindex explicitly so the tuple unpacking below never depends on that order.
+        data = data[columns]
+
+        # Dataset contains many possible timestamps:
+        #
+        # * observed_on_string: no consistent format (can't use)
+        # * observed_on: day precision (better)
+        # * time_observed_at: second precision (best)
+        # * created_at: when observation was submitted (shouldn't use)
+        # * updated_at: when submission was updated (shouldn't use)
+        #
+        # The created_at/updated_at timestamps can be years after the actual submission,
+        # so they shouldn't be used, even if observed_on/time_observed_at are missing.
+
+        # Convert from pandas DataFrame to rtree Index
+        i = 0
+        for date, time, y, x in data.itertuples(index=False, name=None):
+            # Skip rows without lat/lon
+            if pd.isna(y) or pd.isna(x):
+                continue
+
+            if not pd.isna(time):
+                mint, maxt = disambiguate_timestamp(time, "%Y-%m-%d %H:%M:%S %z")
+            elif not pd.isna(date):
+                mint, maxt = disambiguate_timestamp(date, "%Y-%m-%d")
+            else:
+                mint, maxt = 0, sys.maxsize
+
+            coords = (x, x, y, y, mint, maxt)
+            self.index.insert(i, coords)
+            i += 1
+
+    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
+        """Retrieve metadata indexed by query.
+
+        Args:
+            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index
+
+        Returns:
+            sample of metadata at that index
+
+        Raises:
+            IndexError: if query is not found in the index
+        """
+        hits = self.index.intersection(tuple(query), objects=True)
+        bboxes = [hit.bbox for hit in hits]
+
+        if not bboxes:
+            raise IndexError(
+                f"query: {query} not found in index with bounds: {self.bounds}"
+            )
+
+        sample = {"crs": self.crs, "bbox": bboxes}
+
+        return sample