Dataset object #296

Merged
merged 34 commits into from
Nov 3, 2020
Commits (34)
a7c3d03
Dataset object, heavily inspired by the RFC in #219
Oct 20, 2020
a710b0f
update top-level docs, adapt two loaders
Oct 20, 2020
8a71bee
major version change
Oct 20, 2020
c53fa77
integrate review comments
Oct 20, 2020
c9f0e5d
update dataset api
Oct 21, 2020
7b95a83
update all loaders to fit new API
Oct 21, 2020
4fe7371
remove outdated test
Oct 21, 2020
b5e8d3d
update tests, inherit dataset-specific load functions, docstring hack…
Oct 22, 2020
552e88a
remove data_home from Track docstrings
Oct 23, 2020
e4a3976
beta v0.3
Oct 23, 2020
0c27892
normalize dataset_dir to match module name, removes need for DATASET_DIR
Oct 26, 2020
19f2131
update test_full dataset; fix introduced bug in orchset
Oct 27, 2020
23560fe
fix bug in orchset download method
Oct 27, 2020
f150ecd
consolidate track.py and dataset.py into core.py
Oct 27, 2020
9e630a1
create datasets submodule
Oct 27, 2020
250e2d7
fix import bug in tests
Oct 27, 2020
35dc732
hack around git case sensitiveness
Oct 27, 2020
0c7c01a
hack back around git case sensitiveness
Oct 27, 2020
bf195e3
hack around git ignore case changes
Oct 27, 2020
ba6dbce
hack back around git ignoring case changes
Oct 27, 2020
a47085e
fix capitalization in tests paths
Oct 27, 2020
0557f10
fix more tests
Oct 27, 2020
ed9e9ea
fixing tests
Oct 27, 2020
ad15b56
last test maybe
Oct 27, 2020
5401313
initial merge with master
Oct 27, 2020
e551691
port beatport key to 0.3
Oct 27, 2020
639c6b5
test automodule for datasets
Oct 27, 2020
a5acb26
update datasets
Oct 27, 2020
90c76ee
format docstring
Oct 27, 2020
1c8d919
update contributing
Oct 30, 2020
a1236ce
fix merge conflicts
Oct 30, 2020
a559bf8
update dataset to new api
Oct 30, 2020
20cb153
update test location
Oct 30, 2020
b4e496c
merge with master
Nov 3, 2020
142 changes: 39 additions & 103 deletions CONTRIBUTING.md
@@ -138,10 +138,18 @@ import os

from mirdata import download_utils
from mirdata import jams_utils
from mirdata import track
from mirdata import core
from mirdata import utils

DATASET_DIR = 'Example'

# -- Add any relevant citations here
BIBTEX = """@article{article-minimal,
author = "L[eslie] B. Lamport",
title = "The Gnats and Gnus Document Preparation System",
journal = "G-Animal's Journal",
year = "1986"
}"""

# -- REMOTES is a dictionary containing all files that need to be downloaded.
# -- The keys should be descriptive (e.g. 'annotations', 'audio')
REMOTES = {
@@ -153,6 +161,14 @@ REMOTES = {
),
}
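
# -- A hypothetical sketch of what a single REMOTES entry can look like
# -- (the filename, url and checksum below are placeholders, not real values):
# REMOTES = {
#     'annotations': download_utils.RemoteFileMetadata(
#         filename='annotations.zip',
#         url='https://zenodo.org/record/xxxx/files/annotations.zip',
#         checksum='00000000000000000000000000000000',
#         destination_dir=None,
#     ),
# }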

# -- Include any information that should be printed when downloading
# -- remove this variable if you don't need to print anything during download
DOWNLOAD_INFO = """
Include any information you want to be printed when dataset.download() is called.
These can be instructions for how to download the dataset (e.g. request access on zenodo),
caveats about the download, etc.
"""

# -- change this to load any top-level metadata
## delete this function if you don't have global metadata
def _load_metadata(data_home):
@@ -175,32 +191,27 @@ DATA = utils.LargeData('example_index.json', _load_metadata)
# DATA = utils.LargeData('example_index.json') ## use this if your dataset has no metadata


class Track(track.Track):
class Track(core.Track):
"""Example track class
# -- YOU CAN AUTOMATICALLY GENERATE THIS DOCSTRING BY CALLING THE SCRIPT:
# -- `scripts/print_track_docstring.py my_dataset`
# -- note that you'll first need to have a test track (see "Adding tests to your dataset" below)

Args:
track_id (str): track id of the track
data_home (str): Local path where the dataset is stored.
If `None`, looks for the data in the default directory, `~/mir_datasets/Example`

Attributes:
track_id (str): track id
# -- Add any of the dataset specific attributes here

"""
def __init__(self, track_id, data_home=None):
def __init__(self, track_id, data_home):
if track_id not in DATA.index:
raise ValueError(
'{} is not a valid track ID in Example'.format(track_id))

self.track_id = track_id

if data_home is None:
data_home = utils.get_default_dataset_path(DATASET_DIR)

self._data_home = data_home
self._track_paths = DATA.index[track_id]

@@ -266,97 +277,34 @@ def load_audio(audio_path):
raise IOError("audio_path {} does not exist".format(audio_path))
return librosa.load(audio_path, sr=None, mono=True)

# -- the partial_download argument can be removed if `dataset.REMOTES` is missing/has only one value
# -- the force_overwrite argument can be removed if the dataset does not download anything
# -- (i.e. there is no `dataset.REMOTES`)
# -- the cleanup argument can be removed if the dataset has no tar or zip files in `dataset.REMOTES`.
def download(
data_home=None, partial_download=None, force_overwrite=False, cleanup=True
# -- this function is not necessary unless you need very custom download logic
# -- If you need it, it must have this signature.
def _download(
save_dir, remotes, partial_download, info_message, force_overwrite, cleanup
):
"""Download the dataset.

Args:
data_home (str):
Local path where the dataset is stored.
If `None`, looks for the data in the default directory, `~/mir_datasets`
save_dir (str):
The directory to download the data
remotes (dict or None):
A dictionary of RemoteFileMetadata tuples of data in zip format.
If None, there is no data to download
partial_download (list or None):
A list of keys to partially download the remote objects of the download dict.
If None, all data is downloaded
info_message (str or None):
A string of info to print when this function is called.
If None, no string is printed.
force_overwrite (bool):
Whether to overwrite the existing downloaded data
partial_download (list):
List indicating what to partially download. The list can include any of:
* 'TODO_KEYS_OF_REMOTES' TODO ADD DESCRIPTION
If `None`, all data is downloaded.
If True, existing files are overwritten by the downloaded files.
cleanup (bool):
Whether to delete the zip/tar file after extracting.

"""
if data_home is None:
data_home = utils.get_default_dataset_path(DATASET_DIR)

download_utils.downloader(
# -- everything will be downloaded & uncompressed inside `data_home`
data_home,
# -- by default all elements in REMOTES will be downloaded
remotes=REMOTES,
# -- we allow partial downloads of the datasets containing multiple remote files
# -- this is done by specifying a list of keys in partial_download (when using the library)
partial_download=partial_download,
# -- if you need to give the user any instructions, such as how to download
# -- a dataset which is not freely available, put them here
info_message=None,
force_overwrite=force_overwrite,
cleanup=cleanup,
)


# -- keep this function exactly as it is
def validate(data_home=None, silence=False):
"""Validate if the stored dataset is a valid version

Args:
data_home (str): Local path where the dataset is stored.
If `None`, looks for the data in the default directory, `~/mir_datasets`
Returns:
missing_files (list): List of file paths that are in the dataset index
but missing locally
invalid_checksums (list): List of file paths that exist in the dataset
index but have a different checksum compared to the reference checksum
"""
if data_home is None:
data_home = utils.get_default_dataset_path(DATASET_DIR)

missing_files, invalid_checksums = utils.validator(
DATA.index, data_home, silence=silence
)
return missing_files, invalid_checksums


# -- keep this function exactly as it is
def track_ids():
"""Return track ids

Returns:
(list): A list of track ids
"""
return list(DATA.index.keys())


# -- keep this function as it is
def load(data_home=None):
"""Load Example dataset

Args:
data_home (str): Local path where the dataset is stored.
If `None`, looks for the data in the default directory, `~/mir_datasets`
Returns:
(dict): {`track_id`: track data}
"""
if data_home is None:
data_home = utils.get_default_dataset_path(DATASET_DIR)

data = {}
for key in DATA.index.keys():
data[key] = Track(key, data_home=data_home)
return data
# see download_utils.downloader for basic usage - if you only need to call downloader
# once, you do not need this function at all.
# only write a custom function if you need it!
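# -- A hedged sketch (not part of the template) of what a custom _download body
# -- could look like: forward the arguments to download_utils.downloader, then
# -- run any dataset-specific post-processing.
#
#     download_utils.downloader(
#         save_dir,
#         remotes=remotes,
#         partial_download=partial_download,
#         info_message=info_message,
#         force_overwrite=force_overwrite,
#         cleanup=cleanup,
#     )
#     # e.g. reorganize extracted files here if the archive layout needs fixing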


# -- Write any necessary loader functions for loading the dataset's data
@@ -385,18 +333,6 @@ def load_annotation(annotation_path):
np.array(annotation))
return annotation_data


def cite():
"""Print the reference"""

cite_data = """
=========== MLA ===========
MLA format citation/s here
========== Bibtex ==========
Bibtex format citations/s here
"""
print(cite_data)

```


50 changes: 32 additions & 18 deletions README.md
@@ -24,14 +24,14 @@ pip install mirdata

Try it out!
```python
import mirdata.orchset
import mirdata
import random

mirdata.orchset.download() # download the dataset
mirdata.orchset.validate() # validate that all the expected files are there
orchset_data = mirdata.orchset.load() # (lazy) load the data index
orchset = mirdata.Dataset('orchset')
orchset.download() # download the dataset
orchset.validate() # validate that all the expected files are there

example_track = random.choice(list(orchset_data.items())) # choose a random example track
example_track = orchset.choice_track() # choose a random example track
print(example_track) # see the available data
```
See the Examples section below for more details, or the [documentation](https://mirdata.readthedocs.io/en/latest/) for more examples and the API reference.
@@ -91,31 +91,44 @@ We welcome contributions to this library, especially new datasets. Please see [C

### Download the Orchset Dataset
```python
import mirdata.orchset
import mirdata

mirdata.orchset.download()
orchset = mirdata.Dataset('orchset')
orchset.download()
```

### Validate the data
```python
import mirdata.orchset
import mirdata

mirdata.orchset.validate()
orchset = mirdata.Dataset('orchset')
orchset.validate()
```

### Load the Orchset Dataset
### Load data for a specific track
```python
import mirdata.orchset
import mirdata

orchset_data = mirdata.orchset.load()
orchset = mirdata.Dataset('orchset')
track = orchset.track('Beethoven-S3-I-ex1')
print(track)
```

### Load all tracks in the Orchset Dataset
```python
import mirdata

orchset = mirdata.Dataset('orchset')
orchset_data = orchset.load_tracks()
```

### See what data are available for a track
```python
import mirdata.orchset
import mirdata

orchset_ids = mirdata.orchset.track_ids()
orchset_data = mirdata.orchset.load()
orchset = mirdata.Dataset('orchset')
orchset_ids = orchset.track_ids()
orchset_data = orchset.load_tracks()

example_track = orchset_data[orchset_ids[0]]
print(example_track)
@@ -143,7 +156,7 @@ print(example_track)
### Evaluate a melody extraction algorithm on Orchset
```python
import mir_eval
import mirdata.orchset
import mirdata
import numpy as np
import sox

@@ -155,7 +168,8 @@ def very_bad_melody_extractor(audio_path):

# Evaluate on the full dataset
orchset_scores = {}
orchset_data = mirdata.orchset.load()
orchset = mirdata.Dataset('orchset')
orchset_data = orchset.load_tracks()
for track_id, track_data in orchset_data.items():
est_times, est_freqs = very_bad_melody_extractor(track_data.audio_path_mono)

@@ -182,4 +196,4 @@ for track_id, track_data in orchset_data.items():
By default, all datasets tracked by this library are stored in `~/mir_datasets`
(defined as `MIR_DATASETS_DIR` in `mirdata/__init__.py`).
Data can alternatively be stored in another location by specifying `data_home`
within a relevant function, e.g. `mirdata.orchset.download(data_home='my_custom_path')`
within a relevant function, e.g. `mirdata.Dataset('orchset', data_home='my_custom_path')`
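For example, a minimal sketch (the path below is a placeholder):
```python
import mirdata

# point the dataset at a custom storage location (placeholder path)
orchset = mirdata.Dataset('orchset', data_home='/path/to/my_custom_path')
orchset.download()  # files are downloaded to the custom path
orchset.validate()  # validation also looks in the custom path
```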