Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[RFC] Dataset class. Cross-module download, duration, from_jams, load, to_jams, and validate #219

Closed
wants to merge 44 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
44 commits
Select commit Hold shift + click to select a range
3719649
create Dataset class
Apr 7, 2020
cf10e5b
create track2.py
Apr 7, 2020
277f9ad
implement to_jams in track2
Apr 7, 2020
87c1bb0
implement Dataset.download
Apr 7, 2020
1ee3711
rwc_classical
Apr 7, 2020
65beab0
orchset
Apr 7, 2020
dcf5dfe
change prototype of load_track to have paths second, metadata third
Apr 7, 2020
a08b816
update Track2 superclass
Apr 8, 2020
313f42e
address case where track_metadata is empty
Apr 8, 2020
9772995
implement duration, from_jams, to_jams
Apr 8, 2020
db178ff
remove Track constructors of orchset and rwcc!
Apr 8, 2020
bea3f77
GTZAN Genre
Apr 8, 2020
87fe788
create kwarg download_items in dataset.download
Apr 8, 2020
a16f0c9
define beats, chords, keys in from_jams
Apr 8, 2020
bebcf56
do not store track_metadata in Track superclass
Apr 8, 2020
978000a
call jams load in the parent class if "jams" in track_index
Apr 8, 2020
813d068
exclude jams et al. from Track __repr__
Apr 8, 2020
829f547
GuitarSet
Apr 8, 2020
43e1bde
sections of rwc classical
Apr 8, 2020
80f6dc2
implement sections and chords in to_jams
Apr 8, 2020
b555c34
implement dataset.choice()
Apr 8, 2020
5de0ae7
cite multiple papers if bibtex is dict
Apr 8, 2020
f7dcccb
print remote if isinstance(remote, str)
Apr 8, 2020
39aaec1
unify rwc_classical and rwc_jazz beat and structure parsers
Apr 8, 2020
e505f58
rwc_jazz
Apr 8, 2020
11f71ee
rwc_popular
Apr 8, 2020
cf30af6
format string remote according to data_home
Apr 8, 2020
3b81b33
encode title and artist in JAMS
Apr 8, 2020
0000c41
bugfix print remote
Apr 8, 2020
8db5b49
beatles.Track
Apr 8, 2020
75d668e
black -S .
Apr 8, 2020
873ff49
document beatles.artist
Apr 8, 2020
e7cc3b5
allow for missing fields in index
Apr 8, 2020
bb8df77
medley_solos_db
Apr 8, 2020
d4dffd7
salami
Apr 8, 2020
f298369
write failsafe mode (flag196), parse_track_id
Apr 8, 2020
3015442
implement parse_track_id in track2.py
Apr 8, 2020
12eb3b1
keep a list of previously printed remotes
Apr 8, 2020
b2b83cc
ikala
Apr 8, 2020
36bf88e
black -S .
Apr 8, 2020
d425283
duration should not be cached
Apr 8, 2020
07e55df
beatles.artist should be cached
Apr 8, 2020
b3ea8de
bugfix duration decorator
Apr 8, 2020
2678eef
experiment with to_jams(duration, self)
Apr 9, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 97 additions & 1 deletion mirdata/beatles.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@

from mirdata import download_utils
from mirdata import jams_utils
from mirdata import track
from mirdata import track, track2
from mirdata import utils

DATASET_DIR = 'Beatles'
Expand Down Expand Up @@ -362,3 +362,99 @@ def cite():
"""

print(cite_data)


# Dataset name as consumed by the cross-module Dataset class.
name = "Beatles"

# BibTeX citation for the Isophonics/OMRAS2 annotations.
bibtex = """@inproceedings{mauch2009beatles,
title={OMRAS2 metadata project 2009},
author={Mauch, Matthias and Cannam, Chris and Davies, Matthew and Dixon, Simon and Harte,
Christopher and Kolozali, Sefki and Tidhar, Dan and Sandler, Mark},
booktitle={12th International Society for Music Information Retrieval Conference},
year={2009},
series = {ISMIR}
}"""

# Remote resources. RemoteFileMetadata entries are downloadable archives;
# plain-string entries (here "audio") are instructions printed by
# Dataset.download, formatted with the local data_home path.
remotes = {
    "annotations": download_utils.RemoteFileMetadata(
        filename='The Beatles Annotations.tar.gz',
        url='http://isophonics.net/files/annotations/The%20Beatles%20Annotations.tar.gz',
        checksum='62425c552d37c6bb655a78e4603828cc',
        destination_dir='annotations',
    ),
    "audio": """
Unfortunately the audio files of the Beatles dataset are not available
for download. If you have the Beatles dataset, place the contents into
a folder called Beatles with the following structure:
> Beatles/
> annotations/
> audio/
and copy the Beatles folder to {}""",
}


class Track2(track2.Track2):
    """A track from the Beatles dataset.

    Each annotation property reads one file path from ``self.track_index``
    (populated by the Dataset loader), parses it into the corresponding
    mirdata annotation type, and caches the result on first access.
    """

    @utils.cached_property
    def artist(self):
        """string: artist name"""
        # Bugfix: `self` was missing from the signature. cached_property
        # calls the getter as a bound method, so `def artist():` raised
        # TypeError whenever the property was accessed.
        return "The Beatles"

    @utils.cached_property
    def beats(self):
        """BeatData: human-labeled beat data"""
        beat_times, beat_positions = [], []
        with open(self.track_index["beat"][0], 'r') as fhandle:
            # Beat files use inconsistent delimiters; sniff from a sample.
            dialect = csv.Sniffer().sniff(fhandle.read(1024))
            fhandle.seek(0)
            reader = csv.reader(fhandle, dialect)
            for line in reader:
                beat_times.append(float(line[0]))
                beat_positions.append(line[-1])
        beat_positions = _fix_newpoint(np.array(beat_positions))
        # After fixing New Point labels convert positions to int
        beat_positions = [int(b) for b in beat_positions]
        return utils.BeatData(np.array(beat_times), np.array(beat_positions))

    @utils.cached_property
    def chords(self):
        """ChordData: chord annotation"""
        start_times, end_times, chords = [], [], []
        with open(self.track_index["chords"][0], 'r') as f:
            # Chord files use inconsistent delimiters; sniff from a sample.
            dialect = csv.Sniffer().sniff(f.read(1024))
            f.seek(0)
            reader = csv.reader(f, dialect)
            for line in reader:
                start_times.append(float(line[0]))
                end_times.append(float(line[1]))
                chords.append(line[2])
        return utils.ChordData(np.array([start_times, end_times]).T, chords)

    @utils.cached_property
    def key(self):
        """KeyData: key annotation"""
        start_times, end_times, keys = [], [], []
        with open(self.track_index["keys"][0], 'r') as fhandle:
            reader = csv.reader(fhandle, delimiter='\t')
            for line in reader:
                # The annotation files contain other event rows too;
                # keep only rows labeled 'Key'.
                if line[2] == 'Key':
                    start_times.append(float(line[0]))
                    end_times.append(float(line[1]))
                    keys.append(line[3])
        return utils.KeyData(np.array(start_times), np.array(end_times), np.array(keys))

    @utils.cached_property
    def sections(self):
        """SectionData: human-labeled section annotations"""
        start_times, end_times, sections = [], [], []
        with open(self.track_index["sections"][0], 'r') as fhandle:
            reader = csv.reader(fhandle, delimiter='\t')
            for line in reader:
                start_times.append(float(line[0]))
                end_times.append(float(line[1]))
                sections.append(line[3])
        return utils.SectionData(np.array([start_times, end_times]).T, sections)

    @utils.cached_property
    def title(self):
        """string: song title"""
        # Section annotation files are named after the song title.
        return os.path.basename(self.track_index['sections'][0]).split('.')[0]
140 changes: 140 additions & 0 deletions mirdata/dataset.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,140 @@
import jams
import json
import os
import random
import re
import tqdm

from mirdata import download_utils
from mirdata import utils
from mirdata import track2


class Dataset(object):
    """Cross-module dataset wrapper.

    Wraps a mirdata loader module (beatles, orchset, ...) and exposes the
    operations shared by every dataset: per-track indexing, downloading
    remotes, loading all tracks, and validating local files.

    Parameters
    ----------
    module:
        A mirdata dataset module exposing ``name``, ``bibtex``, ``remotes``,
        and a ``Track2`` class.
    data_home (str or None):
        Local root folder for the dataset. Defaults to
        ``$HOME/mir_datasets/<Dataset-Name>``.
    """

    def __init__(self, module, data_home=None):
        self.name = module.name
        self.bibtex = module.bibtex
        self.remotes = module.remotes
        self.Track2 = module.Track2
        self.readme = module.__doc__
        self.module_path = module.__file__
        if data_home is None:
            self.data_home = self.dataset_default_path
        else:
            self.data_home = data_home

    # TODO: bikeshed the naming of the kwarg flag196
    def __getitem__(self, track_id, flag196="error"):
        """Build and return a Track2 for ``track_id``.

        ``flag196="pass"`` enables failsafe mode: a missing metadata file is
        tolerated and only information parsed from the track id is used.
        With the default ``"error"`` the FileNotFoundError propagates.
        """
        track_id_info = self.Track2.parse_track_id(track_id)
        # Resolve index paths relative to data_home; entries whose path is
        # None (missing fields in the index) are dropped from the copy.
        track_index = self.index[track_id].copy()
        for track_key in self.index[track_id]:
            if track_index[track_key][0] is not None:
                track_index[track_key][0] = os.path.join(
                    self.data_home, track_index[track_key][0]
                )
            else:
                del track_index[track_key]
        try:
            metadata = self.metadata
            if track_id in metadata:
                track_metadata = {**track_id_info, **metadata[track_id]}
            else:
                track_metadata = track_id_info
        except FileNotFoundError:
            # Failsafe mode: fall back to id-derived metadata only.
            if flag196 == "pass":
                track_metadata = track_id_info
            else:
                # Bare `raise` re-raises with the original traceback intact.
                raise
        return self.Track2(track_index, track_metadata, flag196=flag196)

    @property
    def dataset_default_path(self):
        """str: default local path, ``$HOME/mir_datasets/<Name>``
        (spaces in the name become dashes)."""
        mir_datasets_dir = os.path.join(os.getenv('HOME', '/tmp'), 'mir_datasets')
        dataset_dir = re.sub(' ', '-', self.name)
        return os.path.join(mir_datasets_dir, dataset_dir)

    @utils.cached_property
    def index(self):
        """dict: the track index, read from disk once and cached."""
        return self.load_index()

    @property
    def index_path(self):
        """str: path to the bundled ``indexes/<name>_index.json`` file,
        located next to this module."""
        json_name = re.sub('[^0-9a-z]+', '_', self.name.lower()) + "_index.json"
        cwd = os.path.dirname(os.path.realpath(__file__))
        return os.path.join(cwd, "indexes", json_name)

    @utils.cached_property
    def metadata(self):
        """Dataset-level metadata, delegated to ``Track2.load_metadata``.

        May raise FileNotFoundError when metadata files are absent;
        ``__getitem__`` handles that in failsafe mode.
        """
        return self.Track2.load_metadata(self.data_home)

    def choice(self):
        """Return a uniformly random track from the dataset."""
        return self[random.choice(self.track_ids())]

    def cite(self):
        """Print the BibTeX citation(s) for this dataset."""
        # TODO: use pybtex to convert to MLA
        print("========== BibTeX ==========")
        if isinstance(self.bibtex, str):
            print(self.bibtex)
        else:
            # Multiple papers: bibtex is a dict of entries; cite them all.
            print("\n".join(self.bibtex.values()))

    def download(self, force_overwrite=False, cleanup=False, download_items=None):
        """Download the dataset remotes into ``data_home``.

        Remotes that are plain strings describe data which cannot be
        fetched automatically; they are printed (formatted with data_home)
        instead of downloaded.
        """
        # TODO: perhaps find a shorter kwarg name than "download_items"?
        # exist_ok=True already makes this a no-op when the dir exists,
        # so no separate os.path.exists() guard is needed.
        os.makedirs(self.data_home, exist_ok=True)

        # By default, download all remotes
        if download_items is None:
            download_items = self.remotes

        # Remember messages already printed so the same notice is not
        # repeated when several remotes share one undownloadable-data
        # string, e.g. in ikala.
        printed_messages = set()
        for remote_key in download_items:
            remote = self.remotes[remote_key]
            if isinstance(remote, str):
                if remote not in printed_messages:
                    print(remote.format(self.data_home))
                    printed_messages.add(remote)
                continue

            # Dispatch on the archive type encoded in the remote URL.
            if ".zip" in remote.url:
                download_utils.download_zip_file(
                    remote, self.data_home, force_overwrite, cleanup
                )
            elif ".tar.gz" in remote.url:
                download_utils.download_tar_file(
                    remote, self.data_home, force_overwrite, cleanup
                )
            else:
                download_utils.download_from_remote(
                    remote, self.data_home, force_overwrite
                )

    # TODO: bikeshed the naming of the kwarg flag196
    def load(self, verbose=False, flag196="error"):
        """Load every track in the index.

        Returns a dict mapping track_id -> Track2. ``verbose`` enables a
        tqdm progress bar; ``flag196`` is forwarded to ``__getitem__``.
        """
        tracks = {}
        for track_id in tqdm.tqdm(self.index, disable=not verbose):
            tracks[track_id] = self.__getitem__(track_id, flag196=flag196)

        return tracks

    def load_index(self):
        """Read and return the JSON track index from ``index_path``."""
        with open(self.index_path) as f:
            return json.load(f)

    def track_ids(self):
        """Return all track ids as a list."""
        return list(self.index.keys())

    def validate(self, verbose=False):
        """Validate every track's local files.

        Returns a tuple ``(missing_files, invalid_checksums)``, each a flat
        list accumulated across all tracks.
        """
        missing_files = []
        invalid_checksums = []
        for track_id in tqdm.tqdm(self.track_ids(), disable=not verbose):
            track_validation = self[track_id].validate()
            missing_files += track_validation[0]
            invalid_checksums += track_validation[1]
        return missing_files, invalid_checksums
80 changes: 79 additions & 1 deletion mirdata/gtzan_genre.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@
22050 Hz mono 16-bit audio files in .wav format.
"""

import jams
import librosa
import os

import mirdata
from mirdata import download_utils
from mirdata import jams_utils
from mirdata import track
from mirdata import track, track2
from mirdata import utils


Expand Down Expand Up @@ -170,3 +172,79 @@ def cite():
}
"""
print(cite_data)


# Dataset name as consumed by the cross-module Dataset class.
name = "GTZAN Genre"

# BibTeX citation for the dataset.
bibtex = """@article{tzanetakis2002gtzan,
title={GTZAN genre collection},
author={Tzanetakis, George and Cook, P},
journal={Music Analysis, Retrieval and Synthesis for Audio Signals},
year={2002}
}"""

# Single downloadable remote: the tar.gz archive containing all audio.
remotes = {
    "audio": download_utils.RemoteFileMetadata(
        filename="genres.tar.gz",
        url="http://opihi.cs.uvic.ca/sound/genres.tar.gz",
        checksum="5b3d6dddb579ab49814ab86dba69e7c7",
        destination_dir="gtzan_genre",
    )
}


class Track2(track2.Track2):
    """GTZAN-Genre track.

    The only annotation is the genre label, which is encoded in the audio
    filename; ``jams`` serializes it together with the dataset-wide fixed
    metadata into a JAMS object.
    """

    @utils.cached_property
    def genre(self):
        """Musical genre among 10 categories"""
        # Audio files are named "<genre>.<number>.wav", so the genre is the
        # first dot-separated token of the basename.
        wav_name = os.path.split(self.track_index["audio"][0])[1]
        return wav_name.split(".")[0]

    @utils.cached_property
    def jams(self):
        """JAMS: JSON-Annotated Music Specification"""
        # Initialize top-level JAMS container
        jam = jams.JAMS()

        # Encode title, artist, and release
        # (no per-track metadata ships with GTZAN, hence the placeholders)
        jam.file_metadata.title = "Unknown track"
        jam.file_metadata.artist = "Unknown artist"
        jam.file_metadata.release = "Unknown album"

        # Encode duration in seconds
        jam.file_metadata.duration = 30.0

        # Encode JAMS curator
        curator = jams.Curator(name="George Tzanetakis", email="gtzan@cs.uvic.ca")

        # Store mirdata metadata as JAMS identifiers
        # NOTE(review): dumps the raw instance __dict__ into the Sandbox —
        # confirm every attribute stored there is JSON-serializable.
        jam.file_metadata.identifiers = jams.Sandbox(**self.__dict__)

        # Encode annotation rules
        annotation_rules = """Unfortunately the database was collected gradually and
very early on in my research so I have no titles.
The files were collected in 2000-2001 from a variety of sources including
personal CDs, radio, microphone recordings, in order to represent a variety of
recording conditions. Nevertheless I have been providing it to researchers upon
request mainly for comparison purposes etc. -- George Tzanetakis, 2005"""

        # Encode annotation metadata
        # (corpus/validation reference the module-level `name` and `remotes`)
        ann_meta = jams.AnnotationMetadata(
            annotator={"mirdata version": mirdata.__version__},
            version="1.0",
            corpus=name,
            annotation_tools="MARSYAS",
            annotation_rules=annotation_rules,
            validation=remotes,
            data_source="George Tzanetakis",
            curator=curator,
        )

        # Encode genre annotation
        genre_ann = jams.Annotation(
            namespace="tag_gtzan", time=0, duration=30.0, annotation_metadata=ann_meta
        )
        genre_ann.append(time=0, duration=30.0, confidence=0, value=self.genre)
        jam.annotations.append(genre_ann)

        return jam
Loading