Dataset object #296

Merged
merged 34 commits into from
Nov 3, 2020
Commits (34)
a7c3d03
Dataset object, heavily inspired by the RFC in #219
Oct 20, 2020
a710b0f
update top-level docs, adapt two loaders
Oct 20, 2020
8a71bee
major version change
Oct 20, 2020
c53fa77
integrate review comments
Oct 20, 2020
c9f0e5d
update dataset api
Oct 21, 2020
7b95a83
update all loaders to fit new API
Oct 21, 2020
4fe7371
remove outdated test
Oct 21, 2020
b5e8d3d
update tests, inherit dataset-specific load functions, docstring hack…
Oct 22, 2020
552e88a
remove data_home from Track docstrings
Oct 23, 2020
e4a3976
beta v0.3
Oct 23, 2020
0c27892
normalize dataset_dir to match module name, removes need for DATASET_DIR
Oct 26, 2020
19f2131
update test_full dataset; fix introduced bug in orchset
Oct 27, 2020
23560fe
fix bug in orchset download method
Oct 27, 2020
f150ecd
consolidate track.py and dataset.py into core.py
Oct 27, 2020
9e630a1
create datasets submodule
Oct 27, 2020
250e2d7
fix import bug in tests
Oct 27, 2020
35dc732
hack around git case sensitiveness
Oct 27, 2020
0c7c01a
hack back around git case sensitiveness
Oct 27, 2020
bf195e3
hack around git ignore case changes
Oct 27, 2020
ba6dbce
hack back around git ignoring case changes
Oct 27, 2020
a47085e
fix capitalization in tests paths
Oct 27, 2020
0557f10
fix more tests
Oct 27, 2020
ed9e9ea
fixing tests
Oct 27, 2020
ad15b56
last test maybe
Oct 27, 2020
5401313
initial merge with master
Oct 27, 2020
e551691
port beatport key to 0.3
Oct 27, 2020
639c6b5
test automodule for datasets
Oct 27, 2020
a5acb26
update datasets
Oct 27, 2020
90c76ee
format docstring
Oct 27, 2020
1c8d919
update contributing
Oct 30, 2020
a1236ce
fix merge conflicts
Oct 30, 2020
a559bf8
update dataset to new api
Oct 30, 2020
20cb153
update test location
Oct 30, 2020
b4e496c
merge with master
Nov 3, 2020
142 changes: 39 additions & 103 deletions CONTRIBUTING.md
@@ -138,10 +138,18 @@ import os

from mirdata import download_utils
from mirdata import jams_utils
from mirdata import track
from mirdata import core
from mirdata import utils

DATASET_DIR = 'Example'

# -- Add any relevant citations here
BIBTEX = """@article{article-minimal,
author = "L[eslie] B. Lamport",
title = "The Gnats and Gnus Document Preparation System",
journal = "G-Animal's Journal",
year = "1986"
}"""

# -- REMOTES is a dictionary containing all files that need to be downloaded.
# -- The keys should be descriptive (e.g. 'annotations', 'audio')
REMOTES = {
@@ -153,6 +161,14 @@ REMOTES = {
),
}
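
# -- A hypothetical sketch of what a single REMOTES entry can look like
# -- (the filename, url and checksum below are placeholders, not real values):
# REMOTES = {
#     'annotations': download_utils.RemoteFileMetadata(
#         filename='annotations.zip',
#         url='https://zenodo.org/record/xxxx/files/annotations.zip',
#         checksum='00000000000000000000000000000000',
#         destination_dir=None,
#     ),
# }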

# -- Include any information that should be printed when downloading
# -- remove this variable if you don't need to print anything during download
DOWNLOAD_INFO = """
Include any information you want to be printed when dataset.download() is called.
These can be instructions for how to download the dataset (e.g. request access on zenodo),
caveats about the download, etc.
"""

# -- change this to load any top-level metadata
## delete this function if you don't have global metadata
def _load_metadata(data_home):
@@ -175,32 +191,27 @@ DATA = utils.LargeData('example_index.json', _load_metadata)
# DATA = utils.LargeData('example_index.json') ## use this if your dataset has no metadata


class Track(track.Track):
class Track(core.Track):
"""Example track class
# -- YOU CAN AUTOMATICALLY GENERATE THIS DOCSTRING BY CALLING THE SCRIPT:
# -- `scripts/print_track_docstring.py my_dataset`
# -- note that you'll first need to have a test track (see "Adding tests to your dataset" below)

Args:
track_id (str): track id of the track
data_home (str): Local path where the dataset is stored.
If `None`, looks for the data in the default directory, `~/mir_datasets/Example`

Attributes:
track_id (str): track id
# -- Add any of the dataset specific attributes here

"""
def __init__(self, track_id, data_home=None):
def __init__(self, track_id, data_home):
if track_id not in DATA.index:
raise ValueError(
'{} is not a valid track ID in Example'.format(track_id))

self.track_id = track_id

if data_home is None:
data_home = utils.get_default_dataset_path(DATASET_DIR)

self._data_home = data_home
self._track_paths = DATA.index[track_id]

@@ -266,97 +277,34 @@ def load_audio(audio_path):
raise IOError("audio_path {} does not exist".format(audio_path))
return librosa.load(audio_path, sr=None, mono=True)

# -- the partial_download argument can be removed if `dataset.REMOTES` is missing/has only one value
# -- the force_overwrite argument can be removed if the dataset does not download anything
# -- (i.e. there is no `dataset.REMOTES`)
# -- the cleanup argument can be removed if the dataset has no tar or zip files in `dataset.REMOTES`.
def download(
data_home=None, partial_download=None, force_overwrite=False, cleanup=True
# -- this function is not necessary unless you need very custom download logic
# -- If you need it, it must have this signature.
def _download(
save_dir, remotes, partial_download, info_message, force_overwrite, cleanup
):
"""Download the dataset.

Args:
data_home (str):
Local path where the dataset is stored.
If `None`, looks for the data in the default directory, `~/mir_datasets`
save_dir (str):
The directory to download the data
remotes (dict or None):
A dictionary of RemoteFileMetadata tuples of data in zip format.
If None, there is no data to download
partial_download (list or None):
A list of keys to partially download the remote objects of the download dict.
If None, all data is downloaded
info_message (str or None):
A string of info to print when this function is called.
If None, no string is printed.
force_overwrite (bool):
Whether to overwrite the existing downloaded data
partial_download (list):
List indicating what to partially download. The list can include any of:
* 'TODO_KEYS_OF_REMOTES' TODO ADD DESCRIPTION
If `None`, all data is downloaded.
If True, existing files are overwritten by the downloaded files.
cleanup (bool):
Whether to delete the zip/tar file after extracting.

"""
if data_home is None:
data_home = utils.get_default_dataset_path(DATASET_DIR)

download_utils.downloader(
# -- everything will be downloaded & uncompressed inside `data_home`
data_home,
# -- by default all elements in REMOTES will be downloaded
remotes=REMOTES,
# -- we allow partial downloads of the datasets containing multiple remote files
# -- this is done by specifying a list of keys in partial_download (when using the library)
partial_download=partial_download,
# -- if you need to give the user any instructions, such as how to download
# -- a dataset which is not freely available, put them here
info_message=None,
force_overwrite=force_overwrite,
cleanup=cleanup,
)


# -- keep this function exactly as it is
def validate(data_home=None, silence=False):
"""Validate if the stored dataset is a valid version

Args:
data_home (str): Local path where the dataset is stored.
If `None`, looks for the data in the default directory, `~/mir_datasets`
Returns:
missing_files (list): List of file paths that are in the dataset index
but missing locally
invalid_checksums (list): List of file paths that exist in the dataset
index but have a different checksum compared to the reference checksum
"""
if data_home is None:
data_home = utils.get_default_dataset_path(DATASET_DIR)

missing_files, invalid_checksums = utils.validator(
DATA.index, data_home, silence=silence
)
return missing_files, invalid_checksums


# -- keep this function exactly as it is
def track_ids():
"""Return track ids

Returns:
(list): A list of track ids
"""
return list(DATA.index.keys())


# -- keep this function as it is
def load(data_home=None):
"""Load Example dataset

Args:
data_home (str): Local path where the dataset is stored.
If `None`, looks for the data in the default directory, `~/mir_datasets`
Returns:
(dict): {`track_id`: track data}
"""
if data_home is None:
data_home = utils.get_default_dataset_path(DATASET_DIR)

data = {}
for key in DATA.index.keys():
data[key] = Track(key, data_home=data_home)
return data
# see download_utils.downloader for basic usage - if you only need to call downloader
# once, you do not need this function at all.
# only write a custom function if you need it!
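# -- A hedged sketch (not part of the template) of what a custom _download body
# -- could look like: forward the arguments to download_utils.downloader, then
# -- run any dataset-specific post-processing.
#
#     download_utils.downloader(
#         save_dir,
#         remotes=remotes,
#         partial_download=partial_download,
#         info_message=info_message,
#         force_overwrite=force_overwrite,
#         cleanup=cleanup,
#     )
#     # e.g. reorganize extracted files here if the archive layout needs fixing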


# -- Write any necessary loader functions for loading the dataset's data
@@ -385,18 +333,6 @@ def load_annotation(annotation_path):
np.array(annotation))
return annotation_data


def cite():
"""Print the reference"""

cite_data = """
=========== MLA ===========
MLA format citation/s here
========== Bibtex ==========
Bibtex format citations/s here
"""
print(cite_data)

```


50 changes: 32 additions & 18 deletions README.md
@@ -24,14 +24,14 @@ pip install mirdata

Try it out!
```python
import mirdata.orchset
import mirdata
import random

mirdata.orchset.download() # download the dataset
mirdata.orchset.validate() # validate that all the expected files are there
orchset_data = mirdata.orchset.load() # (lazy) load the data index
orchset = mirdata.Dataset('orchset')
orchset.download() # download the dataset
orchset.validate() # validate that all the expected files are there

example_track = random.choice(list(orchset_data.items())) # choose a random example track
example_track = orchset.choice_track() # choose a random example track
print(example_track) # see the available data
```
See the Examples section below for more details, or the [documentation](https://mirdata.readthedocs.io/en/latest/) for more examples and the API reference.
@@ -91,31 +91,44 @@ We welcome contributions to this library, especially new datasets. Please see [C

### Download the Orchset Dataset
```python
import mirdata.orchset
import mirdata

mirdata.orchset.download()
orchset = mirdata.Dataset('orchset')
orchset.download()
```

### Validate the data
```python
import mirdata.orchset
import mirdata

mirdata.orchset.validate()
orchset = mirdata.Dataset('orchset')
orchset.validate()
```

### Load the Orchset Dataset
### Load data for a specific track
```python
import mirdata.orchset
import mirdata

orchset_data = mirdata.orchset.load()
orchset = mirdata.Dataset('orchset')
track = orchset.track('Beethoven-S3-I-ex1')
print(track)
```

### Load all tracks in the Orchset Dataset
```python
import mirdata

orchset = mirdata.Dataset('orchset')
orchset_data = orchset.load_tracks()
```

### See what data are available for a track
```python
import mirdata.orchset
import mirdata

orchset_ids = mirdata.orchset.track_ids()
orchset_data = mirdata.orchset.load()
orchset = mirdata.Dataset('orchset')
orchset_ids = orchset.track_ids()
orchset_data = orchset.load_tracks()

example_track = orchset_data[orchset_ids[0]]
print(example_track)
@@ -143,7 +156,7 @@ print(example_track)
### Evaluate a melody extraction algorithm on Orchset
```python
import mir_eval
import mirdata.orchset
import mirdata
import numpy as np
import sox

@@ -155,7 +168,8 @@ def very_bad_melody_extractor(audio_path):

# Evaluate on the full dataset
orchset_scores = {}
orchset_data = mirdata.orchset.load()
orchset = mirdata.Dataset('orchset')
orchset_data = orchset.load_tracks()
for track_id, track_data in orchset_data.items():
est_times, est_freqs = very_bad_melody_extractor(track_data.audio_path_mono)

@@ -182,4 +196,4 @@ for track_id, track_data in orchset_data.items():
By default, all datasets tracked by this library are stored in `~/mir_datasets`
(defined as `MIR_DATASETS_DIR` in `mirdata/__init__.py`).
Data can alternatively be stored in another location by specifying `data_home`
within a relevant function, e.g. `mirdata.orchset.download(data_home='my_custom_path')`
within a relevant function, e.g. `mirdata.Dataset('orchset', data_home='my_custom_path')`
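For example, a minimal sketch (the path below is a placeholder):
```python
import mirdata

# point the dataset at a custom storage location (placeholder path)
orchset = mirdata.Dataset('orchset', data_home='/path/to/my_custom_path')
orchset.download()  # files are downloaded to the custom path
orchset.validate()  # validation also looks in the custom path
```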