Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add iNaturalist dataset #532

Merged
merged 1 commit into from
May 15, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions docs/api/datasets.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,11 @@ GlobBiomass

.. autoclass:: GlobBiomass

iNaturalist
^^^^^^^^^^^

.. autoclass:: INaturalist

Landsat
^^^^^^^

Expand Down
58 changes: 58 additions & 0 deletions tests/data/inaturalist/data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
#!/usr/bin/env python3

# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import pandas as pd

# Name of the fake iNaturalist export that this script writes.
filename = "observations-012345.csv"

# iNaturalist lets the user pick which columns to export. The columns below
# are the defaults; a real export is not guaranteed to contain all of them.
size = 4


def _column(value):
    """Return a column that repeats ``value`` once per row (``size`` rows)."""
    return [value] * size


# Date and timestamp strings reused by several columns.
_DAY = "2022-05-07"
_STAMP = "2022-05-07 11:02:53 +0100"

data = {
    "id": _column(""),
    "observed_on_string": _column(""),
    # Rows deliberately differ in which observation date/time fields are set.
    "observed_on": [""] * 2 + [_DAY] * 2,
    "time_observed_at": [""] * 3 + [_STAMP],
    "time_zone": _column("Central Time (US & Canada)"),
    "user_id": _column(123),
    "user_login": _column("darwin"),
    "created_at": _column(_STAMP),
    "updated_at": _column(_STAMP),
    "quality_grade": _column("research"),
    "license": _column("CCO"),
    "url": _column("https://inaturalist.org/observations/123"),
    "image_url": _column(
        "https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg"
    ),
    "sound_url": _column("https://static.inaturalist.org/sounds/123.m4a?123"),
    "tag_list": _column("Chicago"),
    "description": _column(""),
    "num_identification_agreements": _column(1),
    "num_identification_disagreements": _column(0),
    "captive_cultivated": _column("false"),
    "oauth_application_id": _column(""),
    "place_guess": _column("Chicago"),
    "latitude": _column(41.881832),
    # The first row has no longitude.
    "longitude": [""] + [-87.623177] * (size - 1),
    "positional_accuracy": _column(5),
    "private_place_guess": _column(""),
    "private_latitude": _column(""),
    "private_longitude": _column(""),
    "public_positional_accuracy": _column(5),
    "geoprivacy": _column(""),
    "taxon_geoprivacy": _column(""),
    "coordinates_obscured": _column("false"),
    "positioning_method": _column("gps"),
    "positioning_device": _column("gps"),
    "species_guess": _column("Homo sapiens"),
    "scientific_name": _column("Homo sapiens"),
    "common_name": _column("human"),
    "iconic_taxon_name": _column("Animalia"),
    "taxon_id": _column(123),
}

# Write the table to disk without the pandas row index.
df = pd.DataFrame(data)
df.to_csv(filename, index=False)
5 changes: 5 additions & 0 deletions tests/data/inaturalist/observations-012345.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
id,observed_on_string,observed_on,time_observed_at,time_zone,user_id,user_login,created_at,updated_at,quality_grade,license,url,image_url,sound_url,tag_list,description,num_identification_agreements,num_identification_disagreements,captive_cultivated,oauth_application_id,place_guess,latitude,longitude,positional_accuracy,private_place_guess,private_latitude,private_longitude,public_positional_accuracy,geoprivacy,taxon_geoprivacy,coordinates_obscured,positioning_method,positioning_device,species_guess,scientific_name,common_name,iconic_taxon_name,taxon_id
,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
,,,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
,,2022-05-07,,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
,,2022-05-07,2022-05-07 11:02:53 +0100,Central Time (US & Canada),123,darwin,2022-05-07 11:02:53 +0100,2022-05-07 11:02:53 +0100,research,CCO,https://inaturalist.org/observations/123,https://inaturalist-open-data.s3.amazonaws.com/photos/123/medium.jpg,https://static.inaturalist.org/sounds/123.m4a?123,Chicago,,1,0,false,,Chicago,41.881832,-87.623177,5,,,,5,,,false,gps,gps,Homo sapiens,Homo sapiens,human,Animalia,123
72 changes: 72 additions & 0 deletions tests/datasets/test_inaturalist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,72 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

import builtins
import os
from pathlib import Path
from typing import Any

import pytest
from _pytest.monkeypatch import MonkeyPatch

from torchgeo.datasets import (
BoundingBox,
INaturalist,
IntersectionDataset,
UnionDataset,
)

pytest.importorskip("pandas", minversion="0.23.2")


class TestINaturalist:
    """Tests for the ``INaturalist`` dataset, run against the bundled CSV fixture."""

    @pytest.fixture(scope="class")
    def dataset(self) -> INaturalist:
        """Build the dataset once per test class from ``tests/data/inaturalist``."""
        return INaturalist(os.path.join("tests", "data", "inaturalist"))

    def test_getitem(self, dataset: INaturalist) -> None:
        """Indexing with the dataset's own bounds returns a dict sample."""
        sample = dataset[dataset.bounds]
        assert isinstance(sample, dict)

    def test_len(self, dataset: INaturalist) -> None:
        """The fixture is expected to yield exactly three indexed observations."""
        assert len(dataset) == 3

    def test_and(self, dataset: INaturalist) -> None:
        """``&`` between two datasets produces an ``IntersectionDataset``."""
        combined = dataset & dataset
        assert isinstance(combined, IntersectionDataset)

    def test_or(self, dataset: INaturalist) -> None:
        """``|`` between two datasets produces a ``UnionDataset``."""
        combined = dataset | dataset
        assert isinstance(combined, UnionDataset)

    def test_no_data(self, tmp_path: Path) -> None:
        """An empty root directory raises ``FileNotFoundError``."""
        with pytest.raises(FileNotFoundError, match="Dataset not found"):
            INaturalist(str(tmp_path))

    @pytest.fixture
    def mock_missing_module(self, monkeypatch: MonkeyPatch) -> None:
        """Patch ``builtins.__import__`` so that importing pandas fails."""
        real_import = builtins.__import__

        def fake_import(name: str, *args: Any, **kwargs: Any) -> Any:
            # Every module except pandas is delegated to the real import machinery.
            if name != "pandas":
                return real_import(name, *args, **kwargs)
            raise ImportError()

        monkeypatch.setattr(builtins, "__import__", fake_import)

    def test_mock_missing_module(
        self, dataset: INaturalist, mock_missing_module: None
    ) -> None:
        """Constructing the dataset without pandas raises a helpful ``ImportError``."""
        expected = "pandas is not installed and is required to use this dataset"
        with pytest.raises(ImportError, match=expected):
            INaturalist(dataset.root)

    def test_invalid_query(self, dataset: INaturalist) -> None:
        """A query that hits nothing in the index raises ``IndexError``."""
        empty_query = BoundingBox(0, 0, 0, 0, 0, 0)
        pattern = "query: .* not found in index with bounds:"
        with pytest.raises(IndexError, match=pattern):
            dataset[empty_query]
2 changes: 2 additions & 0 deletions torchgeo/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
from .gid15 import GID15
from .globbiomass import GlobBiomass
from .idtrees import IDTReeS
from .inaturalist import INaturalist
from .inria import InriaAerialImageLabeling
from .landcoverai import LandCoverAI
from .landsat import (
Expand Down Expand Up @@ -121,6 +122,7 @@
"EUDEM",
"GBIF",
"GlobBiomass",
"INaturalist",
"Landsat",
"Landsat1",
"Landsat2",
Expand Down
123 changes: 123 additions & 0 deletions torchgeo/datasets/inaturalist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
# Copyright (c) Microsoft Corporation. All rights reserved.
# Licensed under the MIT License.

"""Dataset for iNaturalist."""

import glob
import os
import sys
from typing import Any, Dict

from rasterio.crs import CRS

from .geo import GeoDataset
from .utils import BoundingBox, disambiguate_timestamp


class INaturalist(GeoDataset):
    """Dataset for iNaturalist.

    `iNaturalist <https://www.inaturalist.org/>`_ is a joint initiative of the
    California Academy of Sciences and the National Geographic Society. It allows
    citizen scientists to upload observations of organisms that can be downloaded by
    scientists and researchers.

    If you use an iNaturalist dataset in your research, please cite it according to:

    * https://www.inaturalist.org/pages/help#cite

    .. note::
       This dataset requires the following additional library to be installed:

       * `pandas <https://pypi.org/project/pandas/>`_ to load CSV files

    .. versionadded:: 0.3
    """

    # NOTE(review): presumably 0 because point observations have no raster
    # resolution -- confirm against how GeoDataset uses ``res``.
    res = 0
    _crs = CRS.from_epsg(4326)  # Lat/Lon

    def __init__(self, root: str = "data") -> None:
        """Initialize a new Dataset instance.

        Only a single CSV file is loaded: the first ``*.csv`` file directly inside
        ``root``, in sorted filename order. Rows without a latitude or longitude
        are skipped.

        Args:
            root: root directory where dataset can be found

        Raises:
            FileNotFoundError: if no files are found in ``root``
            ImportError: if pandas is not installed
        """
        super().__init__()

        self.root = root

        # glob returns matches in a filesystem-dependent order; sort them so that
        # the file picked below is the same on every platform and run.
        files = sorted(glob.glob(os.path.join(root, "**.csv")))
        if not files:
            raise FileNotFoundError(f"Dataset not found in `root={self.root}`")

        try:
            import pandas as pd
        except ImportError as err:
            raise ImportError(
                "pandas is not installed and is required to use this dataset"
            ) from err

        # Read CSV file
        columns = ["observed_on", "time_observed_at", "latitude", "longitude"]
        data = pd.read_csv(files[0], engine="c", usecols=columns)

        # ``usecols`` ignores the order of its elements (the frame keeps the file's
        # column order), so select the columns explicitly to guarantee the order
        # that the positional unpacking in the loop below relies on.
        data = data[columns]

        # Dataset contains many possible timestamps:
        #
        # * observed_on_string: no consistent format (can't use)
        # * observed_on: day precision (better)
        # * time_observed_at: second precision (best)
        # * created_at: when observation was submitted (shouldn't use)
        # * updated_at: when submission was updated (shouldn't use)
        #
        # The created_at/updated_at timestamps can be years after the actual submission,
        # so they shouldn't be used, even if observed_on/time_observed_at are missing.

        # Convert from pandas DataFrame to rtree Index
        i = 0
        for date, time, y, x in data.itertuples(index=False, name=None):
            # Skip rows without lat/lon
            if pd.isna(y) or pd.isna(x):
                continue

            # Prefer the most precise timestamp that is available for this row
            if not pd.isna(time):
                mint, maxt = disambiguate_timestamp(time, "%Y-%m-%d %H:%M:%S %z")
            elif not pd.isna(date):
                mint, maxt = disambiguate_timestamp(date, "%Y-%m-%d")
            else:
                # No usable timestamp: cover the widest time range
                mint, maxt = 0, sys.maxsize

            # A point observation is stored as a box with zero spatial extent
            coords = (x, x, y, y, mint, maxt)
            self.index.insert(i, coords)
            i += 1

    def __getitem__(self, query: BoundingBox) -> Dict[str, Any]:
        """Retrieve metadata indexed by query.

        Args:
            query: (minx, maxx, miny, maxy, mint, maxt) coordinates to index

        Returns:
            sample of metadata at that index: a dict with the dataset ``crs`` and,
            under ``bbox``, the list of bounding boxes of all index entries that
            intersect ``query``

        Raises:
            IndexError: if query is not found in the index
        """
        hits = self.index.intersection(tuple(query), objects=True)
        bboxes = [hit.bbox for hit in hits]

        if not bboxes:
            raise IndexError(
                f"query: {query} not found in index with bounds: {self.bounds}"
            )

        sample = {"crs": self.crs, "bbox": bboxes}

        return sample