From 753ef907a3a08430a9d6d86668a46b3e2365f9f4 Mon Sep 17 00:00:00 2001 From: Rachel Bittner Date: Tue, 3 Nov 2020 14:06:52 -0800 Subject: [PATCH] Dataset object (#296) * Dataset object, heavily inspired by the RFC in #219 * update top-level docs, adapt two loaders * update dataset api * update all loaders to fit new API * remove outdated test * update tests, inherit dataset-specific load functions, docstring hack, better error handling * remove data_home from Track docstrings * normalize dataset_dir to match module name, removes need for DATASET_DIR * update test_full dataset; fix introduced bug in orchset * fix bug in orchset download method #309 * consolidate track.py and dataset.py into core.py * create datasets submodule * fix import bug in tests * hack around git case sensitiveness * hack back around git case sensitiveness * hack around git ignore case changes * hack back around git ignoring case changes * fix capitalization in tests paths * port beatport key to 0.3 Co-authored-by: Rachel Bittner --- CONTRIBUTING.md | 152 ++---- README.md | 50 +- docs/source/example.rst | 44 +- docs/source/mirdata.rst | 127 +++-- mirdata/__init__.py | 44 +- mirdata/core.py | 348 ++++++++++++++ mirdata/dali.py | 368 -------------- .../datasets/__init__.py | 0 mirdata/{ => datasets}/beatles.py | 202 ++------ mirdata/{ => datasets}/beatport_key.py | 190 +++----- mirdata/datasets/dali.py | 277 +++++++++++ mirdata/{ => datasets}/giantsteps_key.py | 185 ++------ mirdata/datasets/giantsteps_tempo.py | 233 +++++++++ mirdata/{ => datasets}/groove_midi.py | 167 ++----- mirdata/datasets/gtzan_genre.py | 103 ++++ mirdata/{ => datasets}/guitarset.py | 231 +++------ mirdata/{ => datasets}/ikala.py | 193 ++------ .../{ => datasets}/indexes/beatles_index.json | 0 .../indexes/beatport_key_index.json | 0 .../{ => datasets}/indexes/dali_index.json | 0 .../indexes/giantsteps_key_index.json | 0 .../indexes/giantsteps_tempo_index.json | 0 .../indexes/groove_midi_index.json | 0 
.../indexes/gtzan_genre_index.json | 0 .../indexes/guitarset_index.json | 0 .../{ => datasets}/indexes/ikala_index.json | 0 .../{ => datasets}/indexes/maestro_index.json | 0 .../indexes/medley_solos_db_index.json | 0 .../indexes/medleydb_melody_index.json | 0 .../indexes/medleydb_pitch_index.json | 0 .../indexes/mridangam_stroke_index.json | 0 .../{ => datasets}/indexes/orchset_index.json | 0 .../indexes/rwc_classical_index.json | 0 .../indexes/rwc_jazz_index.json | 0 .../indexes/rwc_popular_index.json | 0 .../{ => datasets}/indexes/salami_index.json | 0 .../{ => datasets}/indexes/tinysol_index.json | 0 mirdata/{ => datasets}/maestro.py | 195 +++----- mirdata/{ => datasets}/medley_solos_db.py | 121 +---- mirdata/{ => datasets}/medleydb_melody.py | 184 ++----- mirdata/datasets/medleydb_pitch.py | 159 +++++++ mirdata/{ => datasets}/mridangam_stroke.py | 164 ++----- mirdata/{ => datasets}/orchset.py | 146 ++---- mirdata/datasets/rwc_classical.py | 292 ++++++++++++ mirdata/datasets/rwc_jazz.py | 222 +++++++++ mirdata/datasets/rwc_popular.py | 303 ++++++++++++ mirdata/{ => datasets}/salami.py | 251 +++------- mirdata/{ => datasets}/tinysol.py | 126 +---- mirdata/giantsteps_tempo.py | 332 ------------- mirdata/gtzan_genre.py | 197 -------- mirdata/jams_utils.py | 176 ++++--- mirdata/medleydb_pitch.py | 251 ---------- mirdata/rwc_classical.py | 448 ------------------ mirdata/rwc_jazz.py | 334 ------------- mirdata/rwc_popular.py | 435 ----------------- mirdata/track.py | 159 ------- mirdata/utils.py | 87 ++-- mirdata/version.py | 4 +- scripts/print_track_docstring.py | 78 ++- .../resources/download/Orchset_dataset_0.zip | Bin 0 -> 5385 bytes .../medleydb_melody_metadata.json | 14 - .../11_-_Do_You_Want_To_Know_A_Secret.txt | 0 .../11_-_Do_You_Want_To_Know_A_Secret.lab | 0 .../11_-_Do_You_Want_To_Know_A_Secret.lab | 0 .../11_-_Do_You_Want_To_Know_A_Secret.lab | 0 .../11_-_Do_You_Want_To_Know_A_Secret.wav | Bin .../4b196e6c99574dd49ad00d56e132712b.gz | Bin 
.../4b196e6c99574dd49ad00d56e132712b.mp3 | Bin .../{DALI => dali}/dali_metadata.json | 0 ...My Eyes feat. J. Little (Original Mix).mp3 | Bin ...My Eyes feat. J. Little (Original Mix).txt | 0 ...y Eyes feat. J. Little (Original Mix).json | 0 .../audio/28952.LOFI.mp3 | Bin .../annotations/jams/28952.LOFI.jams | 0 .../annotations_v2/jams/28952.LOFI.jams | 0 .../1_funk-groove1_138_beat_4-4.mid | Bin .../1_funk-groove1_138_beat_4-4.wav | Bin .../{Groove-MIDI => groove_midi}/info.csv | 0 .../genres/country/country.00000.wav | Bin .../annotation/03_BN3-119-G_solo.jams | 0 .../03_BN3-119-G_solo_hex_cln.wav | Bin .../03_BN3-119-G_solo_hex.wav | Bin .../audio_mono-mic/03_BN3-119-G_solo_mic.wav | Bin .../03_BN3-119-G_solo_mix.wav | Bin .../{iKala => ikala}/Lyrics/10161_chorus.lab | 0 .../{iKala => ikala}/Lyrics/10164_chorus.lab | 0 .../PitchLabel/10161_chorus.pv | 0 .../{iKala => ikala}/Wavfile/10161_chorus.wav | Bin .../{iKala => ikala}/id_mapping.txt | 0 ...Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi | Bin ..._Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav | Bin .../{MAESTRO => maestro}/maestro-v2.0.0.json | 0 .../annotation/Medley-solos-DB_metadata.csv | 0 ...3_d07b1fc0-567d-52c2-fef4-239f31c9d40e.wav | Bin .../audio/MusicDelta_Beethoven_MIX.wav | Bin .../medleydb_melody_metadata.json | 14 + .../melody1/MusicDelta_Beethoven_MELODY1.csv | 0 .../melody2/MusicDelta_Beethoven_MELODY2.csv | 0 .../melody3/MusicDelta_Beethoven_MELODY3.csv | 0 .../AClassicEducation_NightOwl_STEM_08.wav | Bin .../medleydb_pitch_metadata.json | 4 +- .../AClassicEducation_NightOwl_STEM_08.csv | 0 .../B/224030__akshaylaya__bheem-b-001.wav | Bin .../B/_readme_and_license.txt | 0 .../GT/Beethoven-S3-I-ex1.mel | 0 ...hset - Predominant Melodic Instruments.csv | 0 .../audio/mono/Beethoven-S3-I-ex1.wav | Bin .../audio/stereo/Beethoven-S3-I-ex1.wav | Bin .../AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT | 0 .../RM-C003.CHORUS.TXT | 0 .../audio/rwc-c-m01/3.wav | Bin .../metadata-master/rwc-c.csv | 0 
.../AIST.RWC-MDB-G-2001.BEAT/RM-G002.BEAT.TXT | 0 .../RM-G002.CHORUS.TXT | 0 .../audio/rwc-g-m01/2.wav | Bin .../metadata-master/rwc-g.csv | 0 .../AIST.RWC-MDB-J-2001.BEAT/RM-J004.BEAT.TXT | 0 .../RM-J004.CHORUS.TXT | 0 .../audio/rwc-j-m01/4.wav | Bin .../metadata-master/rwc-j.csv | 0 .../AIST.RWC-MDB-P-2001.BEAT/RM-P001.BEAT.TXT | 0 .../RWC_Pop_Chords/N001-M01-T01.lab | 0 .../RM-P001.CHORUS.TXT | 0 .../RM-P001.VOCA_INST.TXT | 0 .../audio/rwc-p-m01/1.wav | Bin .../metadata-master/rwc-p.csv | 0 .../{Salami => salami}/audio/2.mp3 | Bin .../1015/parsed/textfile2_functions.txt | 0 .../1015/parsed/textfile2_lowercase.txt | 0 .../1015/parsed/textfile2_uppercase.txt | 0 .../annotations/1015/textfile2.txt | 0 .../192/parsed/textfile1_functions.txt | 0 .../192/parsed/textfile1_lowercase.txt | 0 .../192/parsed/textfile1_uppercase.txt | 0 .../annotations/192/textfile1.txt | 0 .../2/parsed/textfile1_functions.txt | 0 .../2/parsed/textfile1_lowercase.txt | 0 .../2/parsed/textfile1_uppercase.txt | 0 .../2/parsed/textfile2_functions.txt | 0 .../2/parsed/textfile2_lowercase.txt | 0 .../2/parsed/textfile2_uppercase.txt | 0 .../annotations/2/textfile1.txt} | 0 .../annotations/2/textfile2.txt | 0 .../metadata/metadata.csv | 0 .../annotation/TinySOL_metadata.csv | 0 .../ordinario/Cb-ord-A2-mf-2c-N.wav | Bin .../Flute/ordinario/Fl-ord-C4-mf-N-T14d.wav | Bin tests/test_beatles.py | 158 +++--- tests/test_beatport_key.py | 80 ++-- tests/{test_track.py => test_core.py} | 62 ++- tests/test_dali.py | 171 +++---- tests/test_full_dataset.py | 43 +- tests/test_giantsteps_key.py | 61 +-- tests/test_giantsteps_tempo.py | 59 ++- tests/test_groove_midi.py | 103 ++-- tests/test_gtzan_genre.py | 20 +- tests/test_guitarset.py | 79 +-- tests/test_ikala.py | 67 +-- tests/test_jams_utils.py | 438 ++++++++--------- tests/test_loaders.py | 356 +++++++------- tests/test_maestro.py | 159 ++++--- tests/test_medley_solos_db.py | 32 +- tests/test_medleydb_melody.py | 93 ++-- tests/test_medleydb_pitch.py | 63 
+-- tests/test_mridangam_stroke.py | 30 +- tests/test_orchset.py | 234 +++++---- tests/test_rwc_classical.py | 85 ++-- tests/test_rwc_jazz.py | 99 ++-- tests/test_rwc_popular.py | 127 ++--- tests/test_salami.py | 179 +++---- tests/test_tinysol.py | 99 ++-- tests/test_utils.py | 67 ++- 172 files changed, 4411 insertions(+), 6163 deletions(-) create mode 100644 mirdata/core.py delete mode 100644 mirdata/dali.py rename tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/textfile1.txt => mirdata/datasets/__init__.py (100%) rename mirdata/{ => datasets}/beatles.py (60%) rename mirdata/{ => datasets}/beatport_key.py (65%) create mode 100644 mirdata/datasets/dali.py rename mirdata/{ => datasets}/giantsteps_key.py (58%) create mode 100644 mirdata/datasets/giantsteps_tempo.py rename mirdata/{ => datasets}/groove_midi.py (75%) create mode 100644 mirdata/datasets/gtzan_genre.py rename mirdata/{ => datasets}/guitarset.py (58%) rename mirdata/{ => datasets}/ikala.py (57%) rename mirdata/{ => datasets}/indexes/beatles_index.json (100%) rename mirdata/{ => datasets}/indexes/beatport_key_index.json (100%) rename mirdata/{ => datasets}/indexes/dali_index.json (100%) rename mirdata/{ => datasets}/indexes/giantsteps_key_index.json (100%) rename mirdata/{ => datasets}/indexes/giantsteps_tempo_index.json (100%) rename mirdata/{ => datasets}/indexes/groove_midi_index.json (100%) rename mirdata/{ => datasets}/indexes/gtzan_genre_index.json (100%) rename mirdata/{ => datasets}/indexes/guitarset_index.json (100%) rename mirdata/{ => datasets}/indexes/ikala_index.json (100%) rename mirdata/{ => datasets}/indexes/maestro_index.json (100%) rename mirdata/{ => datasets}/indexes/medley_solos_db_index.json (100%) rename mirdata/{ => datasets}/indexes/medleydb_melody_index.json (100%) rename mirdata/{ => datasets}/indexes/medleydb_pitch_index.json (100%) rename mirdata/{ => datasets}/indexes/mridangam_stroke_index.json (100%) rename mirdata/{ => 
datasets}/indexes/orchset_index.json (100%) rename mirdata/{ => datasets}/indexes/rwc_classical_index.json (100%) rename mirdata/{ => datasets}/indexes/rwc_jazz_index.json (100%) rename mirdata/{ => datasets}/indexes/rwc_popular_index.json (100%) rename mirdata/{ => datasets}/indexes/salami_index.json (100%) rename mirdata/{ => datasets}/indexes/tinysol_index.json (100%) rename mirdata/{ => datasets}/maestro.py (58%) rename mirdata/{ => datasets}/medley_solos_db.py (58%) rename mirdata/{ => datasets}/medleydb_melody.py (52%) create mode 100644 mirdata/datasets/medleydb_pitch.py rename mirdata/{ => datasets}/mridangam_stroke.py (50%) rename mirdata/{ => datasets}/orchset.py (75%) create mode 100644 mirdata/datasets/rwc_classical.py create mode 100644 mirdata/datasets/rwc_jazz.py create mode 100644 mirdata/datasets/rwc_popular.py rename mirdata/{ => datasets}/salami.py (53%) rename mirdata/{ => datasets}/tinysol.py (66%) delete mode 100644 mirdata/giantsteps_tempo.py delete mode 100644 mirdata/gtzan_genre.py delete mode 100644 mirdata/medleydb_pitch.py delete mode 100644 mirdata/rwc_classical.py delete mode 100644 mirdata/rwc_jazz.py delete mode 100644 mirdata/rwc_popular.py delete mode 100644 mirdata/track.py create mode 100644 tests/resources/download/Orchset_dataset_0.zip delete mode 100644 tests/resources/mir_datasets/MedleyDB-Melody/medleydb_melody_metadata.json rename tests/resources/mir_datasets/{Beatles => beatles}/annotations/beat/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.txt (100%) rename tests/resources/mir_datasets/{Beatles => beatles}/annotations/chordlab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab (100%) rename tests/resources/mir_datasets/{Beatles => beatles}/annotations/keylab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab (100%) rename tests/resources/mir_datasets/{Beatles => beatles}/annotations/seglab/The 
Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab (100%) rename tests/resources/mir_datasets/{Beatles => beatles}/audio/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.wav (100%) rename tests/resources/mir_datasets/{DALI => dali}/annotations/4b196e6c99574dd49ad00d56e132712b.gz (100%) rename tests/resources/mir_datasets/{DALI => dali}/audio/4b196e6c99574dd49ad00d56e132712b.mp3 (100%) rename tests/resources/mir_datasets/{DALI => dali}/dali_metadata.json (100%) rename tests/resources/mir_datasets/{GiantSteps_key => giantsteps_key}/audio/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).mp3 (100%) rename tests/resources/mir_datasets/{GiantSteps_key => giantsteps_key}/keys_gs+/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).txt (100%) rename tests/resources/mir_datasets/{GiantSteps_key => giantsteps_key}/meta/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).json (100%) rename tests/resources/mir_datasets/{GiantSteps_tempo => giantsteps_tempo}/audio/28952.LOFI.mp3 (100%) rename tests/resources/mir_datasets/{GiantSteps_tempo => giantsteps_tempo}/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams (100%) rename tests/resources/mir_datasets/{GiantSteps_tempo => giantsteps_tempo}/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations_v2/jams/28952.LOFI.jams (100%) rename tests/resources/mir_datasets/{Groove-MIDI => groove_midi}/drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid (100%) rename tests/resources/mir_datasets/{Groove-MIDI => groove_midi}/drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav (100%) rename tests/resources/mir_datasets/{Groove-MIDI => groove_midi}/info.csv (100%) rename tests/resources/mir_datasets/{GTZAN-Genre => gtzan_genre}/gtzan_genre/genres/country/country.00000.wav (100%) rename tests/resources/mir_datasets/{GuitarSet => guitarset}/annotation/03_BN3-119-G_solo.jams (100%) rename 
tests/resources/mir_datasets/{GuitarSet => guitarset}/audio_hex-pickup_debleeded/03_BN3-119-G_solo_hex_cln.wav (100%) rename tests/resources/mir_datasets/{GuitarSet => guitarset}/audio_hex-pickup_original/03_BN3-119-G_solo_hex.wav (100%) rename tests/resources/mir_datasets/{GuitarSet => guitarset}/audio_mono-mic/03_BN3-119-G_solo_mic.wav (100%) rename tests/resources/mir_datasets/{GuitarSet => guitarset}/audio_mono-pickup_mix/03_BN3-119-G_solo_mix.wav (100%) rename tests/resources/mir_datasets/{iKala => ikala}/Lyrics/10161_chorus.lab (100%) rename tests/resources/mir_datasets/{iKala => ikala}/Lyrics/10164_chorus.lab (100%) rename tests/resources/mir_datasets/{iKala => ikala}/PitchLabel/10161_chorus.pv (100%) rename tests/resources/mir_datasets/{iKala => ikala}/Wavfile/10161_chorus.wav (100%) rename tests/resources/mir_datasets/{iKala => ikala}/id_mapping.txt (100%) rename tests/resources/mir_datasets/{MAESTRO => maestro}/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi (100%) rename tests/resources/mir_datasets/{MAESTRO => maestro}/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav (100%) rename tests/resources/mir_datasets/{MAESTRO => maestro}/maestro-v2.0.0.json (100%) rename tests/resources/mir_datasets/{Medley-solos-DB => medley_solos_db}/annotation/Medley-solos-DB_metadata.csv (100%) rename tests/resources/mir_datasets/{Medley-solos-DB => medley_solos_db}/audio/Medley-solos-DB_validation-3_d07b1fc0-567d-52c2-fef4-239f31c9d40e.wav (100%) rename tests/resources/mir_datasets/{MedleyDB-Melody => medleydb_melody}/audio/MusicDelta_Beethoven_MIX.wav (100%) create mode 100644 tests/resources/mir_datasets/medleydb_melody/medleydb_melody_metadata.json rename tests/resources/mir_datasets/{MedleyDB-Melody => medleydb_melody}/melody1/MusicDelta_Beethoven_MELODY1.csv (100%) rename tests/resources/mir_datasets/{MedleyDB-Melody => medleydb_melody}/melody2/MusicDelta_Beethoven_MELODY2.csv (100%) rename tests/resources/mir_datasets/{MedleyDB-Melody 
=> medleydb_melody}/melody3/MusicDelta_Beethoven_MELODY3.csv (100%) rename tests/resources/mir_datasets/{MedleyDB-Pitch => medleydb_pitch}/audio/AClassicEducation_NightOwl_STEM_08.wav (100%) rename tests/resources/mir_datasets/{MedleyDB-Pitch => medleydb_pitch}/medleydb_pitch_metadata.json (62%) rename tests/resources/mir_datasets/{MedleyDB-Pitch => medleydb_pitch}/pitch/AClassicEducation_NightOwl_STEM_08.csv (100%) rename tests/resources/mir_datasets/{Mridangam-Stroke => mridangam_stroke}/mridangam_stroke_1.5/B/224030__akshaylaya__bheem-b-001.wav (100%) rename tests/resources/mir_datasets/{Mridangam-Stroke => mridangam_stroke}/mridangam_stroke_1.5/B/_readme_and_license.txt (100%) rename tests/resources/mir_datasets/{Orchset => orchset}/GT/Beethoven-S3-I-ex1.mel (100%) rename tests/resources/mir_datasets/{Orchset => orchset}/Orchset - Predominant Melodic Instruments.csv (100%) rename tests/resources/mir_datasets/{Orchset => orchset}/audio/mono/Beethoven-S3-I-ex1.wav (100%) rename tests/resources/mir_datasets/{Orchset => orchset}/audio/stereo/Beethoven-S3-I-ex1.wav (100%) rename tests/resources/mir_datasets/{RWC-Classical => rwc_classical}/annotations/AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT (100%) rename tests/resources/mir_datasets/{RWC-Classical => rwc_classical}/annotations/AIST.RWC-MDB-C-2001.CHORUS/RM-C003.CHORUS.TXT (100%) rename tests/resources/mir_datasets/{RWC-Classical => rwc_classical}/audio/rwc-c-m01/3.wav (100%) rename tests/resources/mir_datasets/{RWC-Classical => rwc_classical}/metadata-master/rwc-c.csv (100%) rename tests/resources/mir_datasets/{RWC-Genre => rwc_genre}/annotations/AIST.RWC-MDB-G-2001.BEAT/RM-G002.BEAT.TXT (100%) rename tests/resources/mir_datasets/{RWC-Genre => rwc_genre}/annotations/AIST.RWC-MDB-G-2001.CHORUS/RM-G002.CHORUS.TXT (100%) rename tests/resources/mir_datasets/{RWC-Genre => rwc_genre}/audio/rwc-g-m01/2.wav (100%) rename tests/resources/mir_datasets/{RWC-Genre => rwc_genre}/metadata-master/rwc-g.csv (100%) rename 
tests/resources/mir_datasets/{RWC-Jazz => rwc_jazz}/annotations/AIST.RWC-MDB-J-2001.BEAT/RM-J004.BEAT.TXT (100%) rename tests/resources/mir_datasets/{RWC-Jazz => rwc_jazz}/annotations/AIST.RWC-MDB-J-2001.CHORUS/RM-J004.CHORUS.TXT (100%) rename tests/resources/mir_datasets/{RWC-Jazz => rwc_jazz}/audio/rwc-j-m01/4.wav (100%) rename tests/resources/mir_datasets/{RWC-Jazz => rwc_jazz}/metadata-master/rwc-j.csv (100%) rename tests/resources/mir_datasets/{RWC-Popular => rwc_popular}/annotations/AIST.RWC-MDB-P-2001.BEAT/RM-P001.BEAT.TXT (100%) rename tests/resources/mir_datasets/{RWC-Popular => rwc_popular}/annotations/AIST.RWC-MDB-P-2001.CHORD/RWC_Pop_Chords/N001-M01-T01.lab (100%) rename tests/resources/mir_datasets/{RWC-Popular => rwc_popular}/annotations/AIST.RWC-MDB-P-2001.CHORUS/RM-P001.CHORUS.TXT (100%) rename tests/resources/mir_datasets/{RWC-Popular => rwc_popular}/annotations/AIST.RWC-MDB-P-2001.VOCA_INST/RM-P001.VOCA_INST.TXT (100%) rename tests/resources/mir_datasets/{RWC-Popular => rwc_popular}/audio/rwc-p-m01/1.wav (100%) rename tests/resources/mir_datasets/{RWC-Popular => rwc_popular}/metadata-master/rwc-p.csv (100%) rename tests/resources/mir_datasets/{Salami => salami}/audio/2.mp3 (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_functions.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_lowercase.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_uppercase.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/1015/textfile2.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_functions.txt (100%) rename tests/resources/mir_datasets/{Salami 
=> salami}/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_lowercase.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_uppercase.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/192/textfile1.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_functions.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_lowercase.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_uppercase.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_functions.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_lowercase.txt (100%) rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_uppercase.txt (100%) rename tests/resources/mir_datasets/{Salami/salami-data-public-hierarchy-corrections/annotations/2/textfile2.txt => salami/salami-data-public-hierarchy-corrections/annotations/2/textfile1.txt} (100%) create mode 100644 tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/textfile2.txt rename tests/resources/mir_datasets/{Salami => salami}/salami-data-public-hierarchy-corrections/metadata/metadata.csv (100%) rename tests/resources/mir_datasets/{TinySOL => tinysol}/annotation/TinySOL_metadata.csv (100%) rename tests/resources/mir_datasets/{TinySOL => tinysol}/audio/Strings/Contrabass/ordinario/Cb-ord-A2-mf-2c-N.wav (100%) rename tests/resources/mir_datasets/{TinySOL => 
tinysol}/audio/Winds/Flute/ordinario/Fl-ord-C4-mf-N-T14d.wav (100%) rename tests/{test_track.py => test_core.py} (85%) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 1700192b5..93ed09b48 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -22,12 +22,13 @@ Finally, run tox with `tox`. All tests should pass! To add a new dataset loader you should: 1. Create a script in `scripts/`, e.g. `make_my_dataset_index.py`, which generates an index file. (See below for what an index file is) -2. Run the script on the canonical version of the dataset and save the index in `mirdata/indexes/` e.g. `my_dataset_index.json`. (Also see below for what we mean by "canonical") -3. Create a module in mirdata, e.g. `mirdata/my_dataset.py` +2. Run the script on the canonical version of the dataset and save the index in `mirdata/datasets/indexes/` e.g. `my_dataset_index.json`. (Also see below for what we mean by "canonical") +3. Create a module in mirdata, e.g. `mirdata/datasets/my_dataset.py` 4. Create tests for your loader in `tests/`, e.g. `test_my_dataset.py` -5. Add your module to `docs/source/mirdata.rst` and `docs/source/datasets.rst` -6. Add the module to `mirdata/__init__.py` +5. Add your module to `docs/source/mirdata.rst` and `docs/source/datasets.rst` (you can check that this was done correctly by clicking on the readthedocs check when you open a Pull Request) +6. Add the module name to `DATASETS` in `mirdata/__init__.py` 7. Add the module to the list in the `README.md` file, section `Currently supported datasets` +8. Run `pytest -s tests/test_full_dataset.py --local --dataset my_dataset` and make sure the tests all pass. See the tests section below for details. If your dataset **is not fully downloadable** there are two extra steps you should follow: 1. Contacting the mirdata organizers by opening an issue or PR so we can discuss how to proceed with the closed dataset. 
@@ -138,10 +139,18 @@ import os from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = 'Example' + +# -- Add any relevant citations here +BIBTEX = """@article{article-minimal, + author = "L[eslie] B. Lamport", + title = "The Gnats and Gnus Document Preparation System", + journal = "G-Animal's Journal", + year = "1986" +}""" + # -- REMOTES is a dictionary containing all files that need to be downloaded. # -- The keys should be descriptive (e.g. 'annotations', 'audio') REMOTES = { @@ -153,6 +162,14 @@ REMOTES = { ), } +# -- Include any information that should be printed when downloading +# -- remove this variable if you don't need to print anything during download +DOWNLOAD_INFO = """ +Include any information you want to be printed when dataset.download() is called. +These can be instructions for how to download the dataset (e.g. request access on zenodo), +caveats about the download, etc +""" + # -- change this to load any top-level metadata ## delete this function if you don't have global metadata def _load_metadata(data_home): @@ -175,7 +192,7 @@ DATA = utils.LargeData('example_index.json', _load_metadata) # DATA = utils.LargeData('example_index.json') ## use this if your dataset has no metadata -class Track(track.Track): +class Track(core.Track): """Example track class # -- YOU CAN AUTOMATICALLY GENERATE THIS DOCSTRING BY CALLING THE SCRIPT: # -- `scripts/print_track_docstring.py my_dataset` @@ -183,24 +200,19 @@ class Track(track.Track): Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets/Example` Attributes: track_id (str): track id # -- Add any of the dataset specific attributes here """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: raise ValueError( '{} is not a valid track ID in Example'.format(track_id)) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] @@ -319,97 +331,34 @@ def load_audio(audio_path): raise IOError("audio_path {} does not exist".format(audio_path)) return librosa.load(audio_path, sr=None, mono=True) -# -- the partial_download argument can be removed if `dataset.REMOTES` is missing/has only one value -# -- the force_overwrite argument can be removed if the dataset does not download anything -# -- (i.e. there is no `dataset.REMOTES`) -# -- the cleanup argument can be removed if the dataset has no tar or zip files in `dataset.REMOTES`. -def download( - data_home=None, partial_download=None, force_overwrite=False, cleanup=True +# -- this function is not necessary unless you need very custom download logic +# -- If you need it, it must have this signature. +def _download( + save_dir, remotes, partial_download, info_message, force_overwrite, cleanup ): """Download the dataset. Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` + save_dir (str): + The directory to download the data + remotes (dict or None): + A dictionary of RemoteFileMetadata tuples of data in zip format. + If None, there is no data to download + partial_download (list or None): + A list of keys to partially download the remote objects of the download dict. + If None, all data is downloaded + info_message (str or None): + A string of info to print when this function is called. 
+ If None, no string is printed. force_overwrite (bool): - Whether to overwrite the existing downloaded data - partial_download (list): - List indicating what to partially download. The list can include any of: - * 'TODO_KEYS_OF_REMOTES' TODO ADD DESCRIPTION - If `None`, all data is downloaded. + If True, existing files are overwritten by the downloaded files. cleanup (bool): Whether to delete the zip/tar file after extracting. """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_utils.downloader( - # -- everything will be downloaded & uncompressed inside `data_home` - data_home, - # -- by default all elements in REMOTES will be downloaded - remotes=REMOTES, - # -- we allow partial downloads of the datasets containing multiple remote files - # -- this is done by specifying a list of keys in partial_download (when using the library) - partial_download=partial_download, - # -- if you need to give the user any instructions, such as how to download - # -- a dataset which is not freely availalbe, put them here - info_message=None, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -# -- keep this function exactly as it is -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -# -- keep this function exactly as it is -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -# -- keep this function as it is -def load(data_home=None): - """Load Example dataset - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - data = {} - for key in DATA.index.keys(): - data[key] = Track(key, data_home=data_home) - return data + # see download_utils.downloader for basic usage - if you only need to call downloader + # once, you do not need this function at all. + # only write a custom function if you need it! # -- Write any necessary loader functions for loading the dataset's data @@ -438,18 +387,6 @@ def load_annotation(annotation_path): np.array(annotation)) return annotation_data - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -MLA format citation/s here -========== Bibtex ========== -Bibtex format citations/s here -""" - print(cite_data) - ``` @@ -461,6 +398,7 @@ Bibtex format citations/s here c. If the dataset has a metadata file, reduce the length to a few lines to make it trival to test. 2. Test all of the dataset specific code, e.g. 
the public attributes of the Track object, the load functions and any other custom functions you wrote. See the ikala dataset tests (`tests/test_ikala.py`) for a reference. *Note that we have written automated tests for all loader's `cite`, `download`, `validate`, `load`, `track_ids` functions, as well as some basic edge cases of the `Track` object, so you don't need to write tests for these!* +3. Locally run `pytest -s tests/test_full_dataset.py --local --dataset my_dataset`. See below for more details. ## Running your tests locally diff --git a/README.md b/README.md index 2731208d1..9be1568b7 100644 --- a/README.md +++ b/README.md @@ -24,14 +24,14 @@ pip install mirdata Try it out! ```python -import mirdata.orchset +import mirdata import random -mirdata.orchset.download() # download the dataset -mirdata.orchset.validate() # validate that all the expected files are there -orchset_data = mirdata.orchset.load() # (lazy) load the data index +orchset = mirdata.Dataset('orchset') +orchset.download() # download the dataset +orchset.validate() # validate that all the expected files are there -example_track = random.choice(list(orchset_data.items())) # choose a random example track +example_track = orchset.choice_track() # choose a random example track print(example_track) # see the availalbe data ``` See the Examples section below for more details, or the [documentation](https://mirdata.readthedocs.io/en/latest/) for more examples and the API reference. @@ -92,31 +92,44 @@ We welcome contributions to this library, especially new datasets. 
Please see [C ### Download the Orchset Dataset ```python -import mirdata.orchset +import mirdata -mirdata.orchset.download() +orchset = mirdata.Dataset('orchset') +orchset.download() ``` ### Validate the data ```python -import mirdata.orchset +import mirdata -mirdata.orchset.validate() +orchset = mirdata.Dataset('orchset') +orchset.validate() ``` -### Load the Orchset Dataset +### Load data for a specific track ```python -import mirdata.orchset +import mirdata -orchset_data = mirdata.orchset.load() +orchset = mirdata.Dataset('orchset') +track = orchset.track('Beethoven-S3-I-ex1') +print(track) +``` + +### Load all tracks in the Orchset Dataset +```python +import mirdata + +orchset = mirdata.Dataset('orchset') +orchset_data = orchset.load_tracks() ``` ### See what data are available for a track ```python -import mirdata.orchset +import mirdata -orchset_ids = mirdata.orchset.track_ids() -orchset_data = mirdata.orchset.load() +orchset = mirdata.Dataset('orchset') +orchset_ids = orchset.track_ids() +orchset_data = orchset.load_tracks() example_track = orchset_data[orchset_ids[0]] print(example_track) @@ -144,7 +157,7 @@ print(example_track) ### Evaluate a melody extraction algorithm on Orchset ```python import mir_eval -import mirdata.orchset +import mirdata import numpy as np import sox @@ -156,7 +169,8 @@ def very_bad_melody_extractor(audio_path): # Evaluate on the full dataset orchset_scores = {} -orchset_data = mirdata.orchset.load() +orchset = mirdata.Dataset('orchset') +orchset_data = orchset.load_tracks() for track_id, track_data in orchset_data.items(): est_times, est_freqs = very_bad_melody_extractor(track_data.audio_path_mono) @@ -183,4 +197,4 @@ for track_id, track_data in orchset_data.items(): By default, all datasets tracked by this library are stored in `~/mir_datasets`, (defined as `MIR_DATASETS_DIR` in `mirdata/__init__.py`). Data can alternatively be stored in another location by specifying `data_home` -within a relevant function, e.g. 
`mirdata.orchset.download(data_home='my_custom_path')` +within a relevant function, e.g. `mirdata.Dataset('orchset', data_home='my_custom_path')` diff --git a/docs/source/example.rst b/docs/source/example.rst index 29bd8bb99..337c98717 100644 --- a/docs/source/example.rst +++ b/docs/source/example.rst @@ -22,10 +22,10 @@ Fortunately, we can download Orchset dataset directly. .. code-block:: python :linenos: - import mirdata.orchset + import mirdata + orchset = mirdata.Dataset("orchset") # Download the Orchset Dataset - mirdata.orchset.download() - # Orchset_dataset_0.zip?download=1: 1.00B [03:05, 185s/B] + orchset.download() Once downloading is done, you can find the the dataset folder. @@ -33,7 +33,7 @@ Once downloading is done, you can find the the dataset folder. .. code-block:: bash :linenos: - $ ls ~/mir_datasets/Orchset/ + $ ls ~/mir_datasets/orchset/ GT Orchset - Predominant Melodic Instruments.csv README.txt @@ -46,10 +46,12 @@ The ID's and annotation data can be loaded as below. 
:linenos: # Load the dataset - orchset_data = mirdata.orchset.load() - orchset_ids = mirdata.orchset.track_ids() + orchset_data = orchset.load_tracks() + orchset_ids = orchset.track_ids() - # todo: add __str__() method and print(orchset_data) + # Look at one random track + track = orchset.choice_track() + print(track) If we wanted to use Orchset to evaluate the performance of a melody extraction algorithm @@ -60,7 +62,7 @@ metadata, we could do the following: :linenos: import mir_eval - import mirdata.orchset + import mirdata import numpy as np import sox @@ -71,8 +73,9 @@ metadata, we could do the following: return time_stamps, melody_f0 # Evaluate on the full dataset + orchset = mirdata.Dataset("orchset") orchset_scores = {} - orchset_data = mirdata.orchset.load() + orchset_data = orchset.load_tracks() for track_id, track_data in orchset_data.items(): est_times, est_freqs = very_bad_melody_extractor(track_data.audio_path_mono) @@ -134,10 +137,13 @@ For example, to load the melody annotations from Orchset into memory, we can sim .. code-block:: python :linenos: - import mirdata.orchset + import mirdata + + # get the orchset dataset + orchset = mirdata.Dataset("orchset") - # Load a single track - track = mirdata.orchset.Track('Beethoven-S3-I-ex1') + # Load a specific track + track = orchset.track('Beethoven-S3-I-ex1') melody_annotation = track.melody print(melody_annotation) @@ -149,10 +155,13 @@ However, if your data lives somewhere else, accessing the annotation will return .. 
code-block:: python :linenos: - import mirdata.orchset + import mirdata + + # get the orchset dataset + orchset = mirdata.Dataset("orchset") # Load a single track, specifying the remote location - track = mirdata.orchset.Track('Beethoven-S3-I-ex1', data_home='gs://my_custom/remote_path') + track = orchset.track('Beethoven-S3-I-ex1', data_home='gs://my_custom/remote_path') melody_path = track.melody_path print(melody_path) @@ -186,15 +195,16 @@ The following is a simple example of a generator that can be used to create a te .. code-block:: python :linenos: - import mirdata.orchset + import mirdata import numpy as np import tensorflow as tf def orchset_generator(): # using the default data_home - track_ids = mirdata.orchset.track_ids() + orchset = mirdata.Dataset("orchset") + track_ids = orchset.track_ids() for track_id in track_ids: - track = mirdata.orchset.Track(track_id) + track = orchset.track(track_id) audio_signal, sample_rate = track.audio_mono yield { "audio": audio_signal.astype(np.float32), diff --git a/docs/source/mirdata.rst b/docs/source/mirdata.rst index 8d7d6c405..316fd058e 100644 --- a/docs/source/mirdata.rst +++ b/docs/source/mirdata.rst @@ -10,146 +10,145 @@ API documentation .. automodule:: mirdata :members: -Dataset Loaders ---------------- +Datasets +-------- -mirdata.beatles -^^^^^^^^^^^^^^^ +beatles +^^^^^^^ -.. automodule:: mirdata.beatles +.. automodule:: mirdata.datasets.beatles :members: -mirdata.beatport_key -^^^^^^^^^^^^^^^^^^^^ +beatport_key +^^^^^^^^^^^^ -.. automodule:: mirdata.beatport_key +.. automodule:: mirdata.datasets.beatport_key :members: -mirdata.dali -^^^^^^^^^^^^ +dali +^^^^ -.. automodule:: mirdata.dali +.. automodule:: mirdata.datasets.dali :members: -mirdata.giantsteps_tempo -^^^^^^^^^^^^^^^^^^^^^^^^ +giantsteps_tempo +^^^^^^^^^^^^^^^^ -.. automodule:: mirdata.giantsteps_tempo +.. automodule:: mirdata.datasets.giantsteps_tempo :members: -mirdata.giantsteps_key -^^^^^^^^^^^^^^^^^^^^^^ +giantsteps_key +^^^^^^^^^^^^^^ -.. 
automodule:: mirdata.giantsteps_key +.. automodule:: mirdata.datasets.giantsteps_key :members: -mirdata.groove_midi -^^^^^^^^^^^^^^^^^^^ +groove_midi +^^^^^^^^^^^ -.. automodule:: mirdata.groove_midi +.. automodule:: mirdata.datasets.groove_midi :members: -mirdata.gtzan_genre -^^^^^^^^^^^^^^^^^^^ +gtzan_genre +^^^^^^^^^^^ -.. automodule:: mirdata.gtzan_genre +.. automodule:: mirdata.datasets.gtzan_genre :members: -mirdata.guitarset -^^^^^^^^^^^^^^^^^ +guitarset +^^^^^^^^^ -.. automodule:: mirdata.guitarset +.. automodule:: mirdata.datasets.guitarset :members: -mirdata.ikala -^^^^^^^^^^^^^ +ikala +^^^^^ -.. automodule:: mirdata.ikala +.. automodule:: mirdata.datasets.ikala :members: -mirdata.maestro -^^^^^^^^^^^^^^^ +maestro +^^^^^^^ -.. automodule:: mirdata.maestro +.. automodule:: mirdata.datasets.maestro :members: -mirdata.medleydb\_melody -^^^^^^^^^^^^^^^^^^^^^^^^ +medleydb\_melody +^^^^^^^^^^^^^^^^ -.. automodule:: mirdata.medleydb_melody +.. automodule:: mirdata.datasets.medleydb_melody :members: -mirdata.medleydb\_pitch -^^^^^^^^^^^^^^^^^^^^^^^ +medleydb\_pitch +^^^^^^^^^^^^^^^ -.. automodule:: mirdata.medleydb_pitch +.. automodule:: mirdata.datasets.medleydb_pitch :members: -mirdata.medley_solos_db -^^^^^^^^^^^^^^^^^^^^^^^ +medley_solos_db +^^^^^^^^^^^^^^^ -.. automodule:: mirdata.medley_solos_db +.. automodule:: mirdata.datasets.medley_solos_db :members: -mirdata.mridangam_stroke -^^^^^^^^^^^^^^^^^^^^^^^^ +mridangam_stroke +^^^^^^^^^^^^^^^^ .. automodule:: mirdata.mridangam_stroke :members: -mirdata.orchset -^^^^^^^^^^^^^^^ +orchset +^^^^^^^ -.. automodule:: mirdata.orchset +.. automodule:: mirdata.datasets.orchset :members: +rwc_classical +^^^^^^^^^^^^^ -mirdata.rwc_classical -^^^^^^^^^^^^^^^^^^^^^ - -.. automodule:: mirdata.rwc_classical +.. automodule:: mirdata.datasets.rwc_classical :members: -mirdata.rwc_jazz -^^^^^^^^^^^^^^^^ +rwc_jazz +^^^^^^^^ -.. automodule:: mirdata.rwc_jazz +.. 
automodule:: mirdata.datasets.rwc_jazz :members: -mirdata.rwc_popular -^^^^^^^^^^^^^^^^^^^ +rwc_popular +^^^^^^^^^^^ -.. automodule:: mirdata.rwc_popular +.. automodule:: mirdata.datasets.rwc_popular :members: -mirdata.salami -^^^^^^^^^^^^^^ +salami +^^^^^^ -.. automodule:: mirdata.salami +.. automodule:: mirdata.datasets.salami :members: -mirdata.tinysol -^^^^^^^^^^^^^^^ +tinysol +^^^^^^^ -.. automodule:: mirdata.tinysol +.. automodule:: mirdata.datasets.tinysol :members: Utilities ---------- -mirdata.track +mirdata.core ^^^^^^^^^^^^^ -.. automodule:: mirdata.track +.. automodule:: mirdata.core :members: mirdata.utils diff --git a/mirdata/__init__.py b/mirdata/__init__.py index 1719cb983..2b8061005 100644 --- a/mirdata/__init__.py +++ b/mirdata/__init__.py @@ -3,25 +3,27 @@ from .version import version as __version__ -__all__ = [ - 'beatles', - 'beatport_key', - 'dali', - 'giantsteps_tempo', - 'giantsteps_key', - 'groove_midi', - 'gtzan_genre', - 'guitarset', - 'ikala', - 'maestro', - 'medley_solos_db', - 'medleydb_melody', - 'medleydb_pitch', - 'mridangam_stroke', - 'orchset', - 'rwc_classical', - 'rwc_jazz', - 'rwc_popular', - 'salami', - 'tinysol', +DATASETS = [ + "beatles", + "beatport_key", + "dali", + "giantsteps_key", + "giantsteps_tempo", + "groove_midi", + "gtzan_genre", + "guitarset", + "ikala", + "maestro", + "medley_solos_db", + "medleydb_melody", + "medleydb_pitch", + "mridangam_stroke", + "orchset", + "rwc_classical", + "rwc_jazz", + "rwc_popular", + "salami", + "tinysol", ] + +from .core import Dataset diff --git a/mirdata/core.py b/mirdata/core.py new file mode 100644 index 000000000..e042bb0b0 --- /dev/null +++ b/mirdata/core.py @@ -0,0 +1,348 @@ +# -*- coding: utf-8 -*- +"""core mirdata classes +""" +import importlib +import os +import random +import types +import numpy as np + +import mirdata +from mirdata import download_utils +from mirdata import utils + +MAX_STR_LEN = 100 +DATASETS = mirdata.DATASETS + + +class Dataset(object): + """mirdata 
Dataset object + + Usage example: + orchset = mirdata.Dataset('orchset') # get the orchset dataset + orchset.download() # download orchset + orchset.validate() # validate orchset + track = orchset.choice_track() # load a random track + print(track) # see what data a track contains + orchset.track_ids() # load all track ids + + Attributes: + name (str): the identifier of the dataset + bibtex (str): dataset citation/s in bibtex format + remotes (dict): data to be downloaded + index (dict): dataset file index + download_info (str): download instructions or caveats + track (mirdata.core.Track): function that inputs a track_id + readme (str): information about the dataset + data_home (str): path where mirdata will look for the dataset + + """ + + def __init__(self, dataset, data_home=None): + """Inits a dataset by name and data location""" + if dataset not in DATASETS: + raise ValueError( + "{} is not a valid dataset in mirdata. Valid datsets are:\n{}".format( + dataset, ",".join(DATASETS) + ) + ) + module = importlib.import_module("mirdata.datasets.{}".format(dataset)) + self.name = dataset + self.bibtex = getattr(module, "BIBTEX", None) + self._remotes = getattr(module, "REMOTES", None) + self._index = module.DATA.index + self._download_info = getattr(module, "DOWNLOAD_INFO", None) + self._track_object = getattr(module, "Track", None) + self._download_fn = getattr(module, "_download", download_utils.downloader) + self._readme_str = module.__doc__ + + if data_home is None: + self.data_home = self.default_path + else: + self.data_home = data_home + + # this is a hack to be able to have dataset-specific docstrings + self.track = lambda track_id: self._track(track_id) + self.track.__doc__ = self._track_object.__doc__ # set the docstring + + # inherit any public load functions from the module + for method_name in dir(module): + if method_name.startswith("load_"): + method = getattr(module, method_name) + setattr(self, method_name, method) + # getattr(self, 
method_name).__doc__ = method.__doc__ + + def __repr__(self): + repr_string = "The {} dataset\n".format(self.name) + repr_string += "-" * MAX_STR_LEN + repr_string += "\n" + repr_string += ( + "Call the .readme method for complete documentation of this dataset.\n" + ) + repr_string += "Call the .cite method for bibtex citations.\n" + repr_string += "-" * MAX_STR_LEN + repr_string += "\n" + if self._track_object is not None: + repr_string += self.track.__doc__ + repr_string += "-" * MAX_STR_LEN + repr_string += "\n" + + return repr_string + + @property + def default_path(self): + """Get the default path for the dataset + + Returns: + default_path (str): Local path to the dataset + """ + mir_datasets_dir = os.path.join(os.getenv("HOME", "/tmp"), "mir_datasets") + return os.path.join(mir_datasets_dir, self.name) + + def _track(self, track_id): + """Load a track by track_id. + Hidden helper function that gets called as a lambda. + + Args: + track_id (str): track id of the track + + Returns: + track (dataset.Track): an instance of this dataset's Track object + """ + if self._track_object is None: + raise NotImplementedError + else: + return self._track_object(track_id, self.data_home) + + def load_tracks(self): + """Load all tracks in the dataset + + Returns: + (dict): {`track_id`: track data} + + Raises: + NotImplementedError: If the dataset does not support Track objects + """ + return {track_id: self.track(track_id) for track_id in self.track_ids} + + def choice_track(self): + """Choose a random track + + Returns: + track (dataset.Track): a random Track object + """ + return self.track(random.choice(self.track_ids)) + + def readme(self): + """Print the dataset's readme. + """ + print(self._readme_str) + + def cite(self): + """Print the reference""" + print("========== BibTeX ==========") + print(self.bibtex) + + def download(self, partial_download=None, force_overwrite=False, cleanup=True): + """Download data to `save_dir` and optionally print a message. 
+ + Args: + partial_download (list or None): + A list of keys of remotes to partially download. + If None, all data is downloaded + force_overwrite (bool): + If True, existing files are overwritten by the downloaded files. + cleanup (bool): + Whether to delete any zip/tar files after extracting. + + Raises: + ValueError: if invalid keys are passed to partial_download + IOError: if a downloaded file's checksum is different from expected + + """ + self._download_fn( + self.data_home, + remotes=self._remotes, + partial_download=partial_download, + info_message=self._download_info, + force_overwrite=force_overwrite, + cleanup=cleanup, + ) + + @utils.cached_property + def track_ids(self): + """Return track ids + + Returns: + (list): A list of track ids + """ + return list(self._index.keys()) + + def validate(self, verbose=True): + """Validate if the stored dataset is a valid version + + Args: + verbose (bool): If False, don't print output + + Returns: + missing_files (list): List of file paths that are in the dataset index + but missing locally + invalid_checksums (list): List of file paths that file exists in the dataset + index but has a different checksum compare to the reference checksum + + """ + missing_files, invalid_checksums = utils.validator( + self._index, self.data_home, verbose=verbose + ) + return missing_files, invalid_checksums + + +class Track(object): + def __repr__(self): + properties = [v for v in dir(self.__class__) if not v.startswith("_")] + attributes = [ + v for v in dir(self) if not v.startswith("_") and v not in properties + ] + + repr_str = "Track(\n" + + for attr in attributes: + val = getattr(self, attr) + if isinstance(val, str): + if len(val) > MAX_STR_LEN: + val = "...{}".format(val[-MAX_STR_LEN:]) + val = '"{}"'.format(val) + repr_str += " {}={},\n".format(attr, val) + + for prop in properties: + val = getattr(self.__class__, prop) + if isinstance(val, types.FunctionType): + continue + + if val.__doc__ is None: + raise ValueError("{} 
has no documentation".format(prop)) + + val_type_str = val.__doc__.split(":")[0] + repr_str += " {}: {},\n".format(prop, val_type_str) + + repr_str += ")" + return repr_str + + def to_jams(self): + raise NotImplementedError + + +class MultiTrack(Track): + """MultiTrack class. + + A multitrack class is a collection of track objects and their associated audio + that can be mixed together. + A multitrack is iteslf a Track, and can have its own associated audio (such as + a mastered mix), its own metadata and its own annotations. + + """ + + def _check_mixable(self): + if not hasattr(self, "tracks") or not hasattr(self, "track_audio_property"): + raise NotImplementedError( + "This MultiTrack has no tracks/track_audio_property. Cannot perform mixing" + ) + + def get_target(self, track_keys, weights=None, average=True, enforce_length=True): + """Get target which is a linear mixture of tracks + + Args: + track_keys (list): list of track keys to mix together + weights (list or None): list of positive scalars to be used in the average + average (bool): if True, computes a weighted average of the tracks + if False, computes a weighted sum of the tracks + enforce_length (bool): If True, raises ValueError if the tracks are + not the same length. 
If False, pads audio with zeros to match the length + of the longest track + + Returns: + target (np.ndarray): target audio with shape (n_channels, n_samples) + + Raises: + ValueError: + if sample rates of the tracks are not equal + if enforce_length=True and lengths are not equal + + """ + self._check_mixable() + signals = [] + lengths = [] + sample_rates = [] + for k in track_keys: + audio, sample_rate = getattr(self.tracks[k], self.track_audio_property) + # ensure all signals are shape (n_channels, n_samples) + if len(audio.shape) == 1: + audio = audio[np.newaxis, :] + signals.append(audio) + lengths.append(audio.shape[1]) + sample_rates.append(sample_rate) + + if len(set(sample_rates)) > 1: + raise ValueError( + "Sample rates for tracks {} are not equal: {}".format( + track_keys, sample_rates + ) + ) + + max_length = np.max(lengths) + if any([l != max_length for l in lengths]): + if enforce_length: + raise ValueError( + "Track's {} audio are not the same length {}. Use enforce_length=False to pad with zeros.".format( + track_keys, lengths + ) + ) + else: + # pad signals to the max length + signals = [ + np.pad(signal, ((0, 0), (0, max_length - signal.shape[1]))) + for signal in signals + ] + + if weights is None: + weights = np.ones((len(track_keys),)) + + target = np.average(signals, axis=0, weights=weights) + if not average: + target *= np.sum(weights) + + return target + + def get_random_target(self, n_tracks=None, min_weight=0.3, max_weight=1.0): + """Get a random target by combining a random selection of tracks with random weights + + Args: + n_tracks (int or None): number of tracks to randomly mix. 
If None, uses all tracks + min_weight (float): minimum possible weight when mixing + max_weight (float): maximum possible weight when mixing + + Returns: + target (np.ndarray): mixture audio with shape (n_samples, n_channels) + tracks (list): list of keys of included tracks + weights (list): list of weights used to mix tracks + """ + self._check_mixable() + tracks = list(self.tracks.keys()) + if n_tracks is not None and n_tracks < len(tracks): + tracks = np.random.choice(tracks, n_tracks, replace=False) + + weights = np.random.uniform(low=min_weight, high=max_weight, size=len(tracks)) + target = self.get_target(tracks, weights=weights) + return target, tracks, weights + + def get_mix(self): + """Create a linear mixture given a subset of tracks. + + Args: + track_keys (list): list of track keys to mix together + + Returns: + target (np.ndarray): mixture audio with shape (n_samples, n_channels) + """ + self._check_mixable() + return self.get_target(list(self.tracks.keys())) diff --git a/mirdata/dali.py b/mirdata/dali.py deleted file mode 100644 index 1dda3fcfb..000000000 --- a/mirdata/dali.py +++ /dev/null @@ -1,368 +0,0 @@ -# -*- coding: utf-8 -*- -"""DALI Dataset Loader - -DALI contains 5358 audio files with their time-aligned vocal melody. -It also contains time-aligned lyrics at four levels of granularity: notes, -words, lines, and paragraphs. - -For each song, DALI also provides additional metadata: genre, language, musician, -album covers, or links to video clips. - -For more details, please visit: https://github.com/gabolsgabs/DALI -""" - -import json -import gzip -import librosa -import logging -import numpy as np -import os -import pickle - -from mirdata import download_utils -from mirdata import jams_utils -from mirdata import track -from mirdata import utils - -# this is the package, needed to load the annotations. -# DALI-dataset is only installed if the user explicitly declares -# they want dali when pip installing. 
-try: - import DALI -except ImportError as E: - logging.error( - 'In order to use dali you must have dali-dataset installed. ' - 'Please reinstall mirdata using `pip install \'mirdata[dali]\'' - ) - raise - -DATASET_DIR = 'DALI' - -REMOTES = { - 'metadata': download_utils.RemoteFileMetadata( - filename='dali_metadata.json', - url='https://raw.githubusercontent.com/gabolsgabs/DALI/master/code/DALI/files/dali_v1_metadata.json', - checksum='40af5059e7aa97f81b2654758094d24b', - destination_dir='.', - ) -} - - -def _load_metadata(data_home): - metadata_path = os.path.join(data_home, os.path.join('dali_metadata.json')) - if not os.path.exists(metadata_path): - logging.info('Metadata file {} not found.'.format(metadata_path)) - return None - with open(metadata_path, 'r') as fhandle: - metadata_index = json.load(fhandle) - - metadata_index['data_home'] = data_home - return metadata_index - - -DATA = utils.LargeData('dali_index.json', _load_metadata) - - -class Track(track.Track): - """DALI melody Track class - - Args: - track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Attributes: - album (str): the track's album - annotation_path (str): path to the track's annotation file - artist (str): the track's artist - audio_path (str): path to the track's audio file - audio_url (str): youtube ID - dataset_version (int): dataset annotation version - ground_truth (bool): True if the annotation is verified - language (str): sung language - release_date (str): year the track was released - scores_manual (int): TODO - scores_ncc (float): TODO - title (str): the track's title - track_id (str): the unique track id - url_working (bool): True if the youtube url was valid - - """ - - def __init__(self, track_id, data_home=None): - if track_id not in DATA.index: - raise ValueError('{} is not a valid track ID in DALI'.format(track_id)) - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - self.track_id = track_id - self._data_home = data_home - self._track_paths = DATA.index[track_id] - self.annotation_path = os.path.join( - self._data_home, self._track_paths['annot'][0] - ) - - metadata = DATA.metadata(data_home) - if metadata is not None and track_id in metadata: - self._track_metadata = metadata[track_id] - self._track_metadata['album'] = metadata[track_id]['metadata']['album'] - self._track_metadata['release_date'] = metadata[track_id]['metadata'][ - 'release_date' - ] - self._track_metadata['language'] = metadata[track_id]['metadata'][ - 'language' - ] - self.audio_url = self._track_metadata['audio']['url'] - self.url_working = self._track_metadata['audio']['working'] - self.ground_truth = self._track_metadata['ground-truth'] - self.artist = self._track_metadata['artist'] - self.title = self._track_metadata['title'] - self.dataset_version = self._track_metadata['dataset_version'] - self.scores_ncc = self._track_metadata['scores']['NCC'] - self.scores_manual = self._track_metadata['scores']['manual'] - self.album = 
self._track_metadata['album'] - self.release_date = self._track_metadata['release_date'] - self.language = self._track_metadata['language'] - self.audio_path = os.path.join( - self._data_home, self._track_paths['audio'][0] - ) - else: - self.audio_url = None - self.url_working = None - self.ground_truth = None - self.artist = None - self.title = None - self.dataset_version = None - self.scores_ncc = None - self.scores_manual = None - self.album = None - self.release_date = None - self.language = None - self.audio_path = None - - @utils.cached_property - def notes(self): - """NoteData: note-aligned lyrics""" - return load_annotations_granularity(self.annotation_path, 'notes') - - @utils.cached_property - def words(self): - """LyricData: word-aligned lyric""" - return load_annotations_granularity(self.annotation_path, 'words') - - @utils.cached_property - def lines(self): - """LyricData: line-aligned lyrics""" - return load_annotations_granularity(self.annotation_path, 'lines') - - @utils.cached_property - def paragraphs(self): - """LyricData: paragraph-aligned lyrics""" - return load_annotations_granularity(self.annotation_path, 'paragraphs') - - @utils.cached_property - def annotation_object(self): - """DALI.Annotations: DALI Annotations object""" - return load_annotations_class(self.annotation_path) - - @property - def audio(self): - """(np.ndarray, float): audio signal, sample rate""" - return load_audio(self.audio_path) - - def to_jams(self): - """Jams: the track's data in jams format""" - return jams_utils.jams_converter( - audio_path=self.audio_path, - lyrics_data=[ - (self.words, 'word-aligned lyrics'), - (self.lines, 'line-aligned lyrics'), - (self.paragraphs, 'paragraph-aligned lyrics'), - ], - note_data=[(self.notes, 'annotated vocal notes')], - metadata=self._track_metadata, - ) - - -def load_audio(audio_path): - """Load a DALI audio file. 
- - Args: - audio_path (str): path to audio file - - Returns: - y (np.ndarray): the mono audio signal - sr (float): The sample rate of the audio file - - """ - if not os.path.exists(audio_path): - raise IOError("audio_path {} does not exist".format(audio_path)) - return librosa.load(audio_path, sr=None, mono=True) - - -def download(data_home=None, force_overwrite=False): - """DALI is not available for downloading directly. - This function prints a helper message to download DALI - through zenodo.org. - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - """ - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - info_message = """ - To download this dataset, visit: - https://zenodo.org/record/2577915 and request access. - - Once downloaded, unzip the file DALI_v1.0.zip - and place the result in: - {save_path} - - Use the function dali_code.get_audio you can find at: - https://github.com/gabolsgabs/DALI for getting the audio and place them at: - {audio_path} - """.format( - save_path=os.path.join(data_home, 'annotations'), - force_overwrite=force_overwrite, - audio_path=os.path.join(data_home, 'audio'), - ) - - download_utils.downloader(data_home, remotes=REMOTES, info_message=info_message) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load DALI dataset - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - dali_data = {} - for key in track_ids(): - dali_data[key] = Track(key, data_home=data_home) - return dali_data - - -def load_annotations_granularity(annotations_path, granularity): - """Load annotations at the specified level of granularity - - Args: - annotations_path (str): path to a DALI annotation file - granularity (str): one of 'notes', 'words', 'lines', 'paragraphs' - - Returns: - NoteData for granularity='notes' or LyricData otherwise - - """ - if not os.path.exists(annotations_path): - raise IOError("annotations_path {} does not exist".format(annotations_path)) - - try: - with gzip.open(annotations_path, 'rb') as f: - output = pickle.load(f) - except Exception as e: - with gzip.open(annotations_path, 'r') as f: - output = pickle.load(f) - text = [] - notes = [] - begs = [] - ends = [] - for annot in output.annotations['annot'][granularity]: - notes.append(round(annot['freq'][0], 3)) - begs.append(round(annot['time'][0], 3)) - 
ends.append(round(annot['time'][1], 3)) - text.append(annot['text']) - if granularity == 'notes': - annotation = utils.NoteData(np.array([begs, ends]).T, np.array(notes), None) - else: - annotation = utils.LyricData( - np.array(begs), np.array(ends), np.array(text), None - ) - return annotation - - -def load_annotations_class(annotations_path): - """Load full annotations into the DALI class object - - Args: - annotations_path (str): path to a DALI annotation file - - Returns: - DALI annotations object - - """ - if not os.path.exists(annotations_path): - raise IOError("annotations_path {} does not exist".format(annotations_path)) - - try: - with gzip.open(annotations_path, 'rb') as f: - output = pickle.load(f) - except Exception as e: - with gzip.open(annotations_path, 'r') as f: - output = pickle.load(f) - return output - - -def cite(): - """Print the reference""" - - cite_data = """ - =========== MLA =========== - Meseguer-Brocal, Gabriel, et al. - "DALI: a large Dataset of synchronized Audio, LyrIcs and notes, automatically created using teacher-student machine - learning paradigm." - In Proceedings of the 19th International Society for Music Information Retrieval Conference (ISMIR). 2018. 
- - ========== Bibtex ========== - @inproceedings{Meseguer-Brocal_2018, - Title = {DALI: a large Dataset of synchronized Audio, LyrIcs and notes, automatically created using teacher-student - machine learning paradigm.}, - Author = {Meseguer-Brocal, Gabriel and Cohen-Hadria, Alice and Peeters, Geoffroy}, - Booktitle = {19th International Society for Music Information Retrieval Conference}, - Editor = {ISMIR}, Month = {September}, - Year = {2018}} - """ - print(cite_data) diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/textfile1.txt b/mirdata/datasets/__init__.py similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/textfile1.txt rename to mirdata/datasets/__init__.py diff --git a/mirdata/beatles.py b/mirdata/datasets/beatles.py similarity index 60% rename from mirdata/beatles.py rename to mirdata/datasets/beatles.py index 95126b22e..521902a11 100644 --- a/mirdata/beatles.py +++ b/mirdata/datasets/beatles.py @@ -8,35 +8,52 @@ """ import csv +import os import librosa import numpy as np -import os from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = 'Beatles' +BIBTEX = """@inproceedings{mauch2009beatles, + title={OMRAS2 metadata project 2009}, + author={Mauch, Matthias and Cannam, Chris and Davies, Matthew and Dixon, Simon and Harte, + Christopher and Kolozali, Sefki and Tidhar, Dan and Sandler, Mark}, + booktitle={12th International Society for Music Information Retrieval Conference}, + year={2009}, + series = {ISMIR} +}""" + + REMOTES = { - 'annotations': download_utils.RemoteFileMetadata( - filename='The Beatles Annotations.tar.gz', - url='http://isophonics.net/files/annotations/The%20Beatles%20Annotations.tar.gz', - checksum='62425c552d37c6bb655a78e4603828cc', - destination_dir='annotations', + "annotations": 
download_utils.RemoteFileMetadata( + filename="The Beatles Annotations.tar.gz", + url="http://isophonics.net/files/annotations/The%20Beatles%20Annotations.tar.gz", + checksum="62425c552d37c6bb655a78e4603828cc", + destination_dir="annotations", ) } -DATA = utils.LargeData('beatles_index.json') +DOWNLOAD_INFO = """ + Unfortunately the audio files of the Beatles dataset are not available + for download. If you have the Beatles dataset, place the contents into + a folder called Beatles with the following structure: + > Beatles/ + > annotations/ + > audio/ + and copy the Beatles folder to {data_home} +""" +DATA = utils.LargeData("beatles_index.json") -class Track(track.Track): + +class Track(core.Track): """Beatles track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: audio_path (str): track audio path @@ -49,30 +66,27 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: - raise ValueError('{} is not a valid track ID in Beatles'.format(track_id)) + raise ValueError("{} is not a valid track ID in Beatles".format(track_id)) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] self.beats_path = utils.none_path_join( - [self._data_home, self._track_paths['beat'][0]] + [self._data_home, self._track_paths["beat"][0]] ) - self.chords_path = os.path.join(self._data_home, self._track_paths['chords'][0]) + self.chords_path = os.path.join(self._data_home, self._track_paths["chords"][0]) self.keys_path = utils.none_path_join( - [self._data_home, self._track_paths['keys'][0]] + [self._data_home, self._track_paths["keys"][0]] ) self.sections_path = os.path.join( - self._data_home, self._track_paths['sections'][0] + 
self._data_home, self._track_paths["sections"][0] ) - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) - self.title = os.path.basename(self._track_paths['sections'][0]).split('.')[0] + self.title = os.path.basename(self._track_paths["sections"][0]).split(".")[0] @utils.cached_property def beats(self): @@ -107,7 +121,7 @@ def to_jams(self): section_data=[(self.sections, None)], chord_data=[(self.chords, None)], key_data=[(self.key, None)], - metadata={'artist': 'The Beatles', 'title': self.title}, + metadata={"artist": "The Beatles", "title": self.title}, ) @@ -127,98 +141,6 @@ def load_audio(audio_path): return librosa.load(audio_path, sr=None, mono=True) -def download(data_home=None, force_overwrite=False, cleanup=True): - """Download the Beatles Dataset (annotations). - The audio files are not provided due to copyright issues. - - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - cleanup (bool): - Whether to delete the zip/tar file after extracting. - - """ - - # use the default location: ~/mir_datasets/Beatles - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_message = """ - Unfortunately the audio files of the Beatles dataset are not available - for download. 
If you have the Beatles dataset, place the contents into - a folder called Beatles with the following structure: - > Beatles/ - > annotations/ - > audio/ - and copy the Beatles folder to {} - """.format( - data_home - ) - - download_utils.downloader( - data_home, - remotes=REMOTES, - info_message=download_message, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def validate(data_home=None, silence=False): - """Validate if a local version of this dataset is consistent - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths where the expected file exists locally - but has a different checksum than the reference - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Get the list of track IDs for this dataset - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load Beatles dataset - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - beatles_data = {} - for key in track_ids(): - beatles_data[key] = Track(key, data_home=data_home) - return beatles_data - - def load_beats(beats_path): """Load Beatles format beat data from a file @@ -236,7 +158,7 @@ def load_beats(beats_path): raise IOError("beats_path {} does not exist".format(beats_path)) beat_times, beat_positions = [], [] - with open(beats_path, 'r') as fhandle: + with open(beats_path, "r") as fhandle: dialect = csv.Sniffer().sniff(fhandle.read(1024)) fhandle.seek(0) reader = csv.reader(fhandle, dialect) @@ -270,7 +192,7 @@ def load_chords(chords_path): raise IOError("chords_path {} does not exist".format(chords_path)) start_times, end_times, chords = [], [], [] - with open(chords_path, 'r') as f: + with open(chords_path, "r") as f: dialect = csv.Sniffer().sniff(f.read(1024)) f.seek(0) reader = csv.reader(f, dialect) @@ -301,10 +223,10 @@ def load_key(keys_path): raise IOError("keys_path {} does not exist".format(keys_path)) start_times, end_times, keys = [], [], [] - with open(keys_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter='\t') + with open(keys_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter="\t") for line in reader: - if line[2] == 'Key': + if line[2] == "Key": start_times.append(float(line[0])) end_times.append(float(line[1])) keys.append(line[3]) @@ -331,8 +253,8 @@ def load_sections(sections_path): raise IOError("sections_path {} does not exist".format(sections_path)) start_times, end_times, sections = [], [], [] - with open(sections_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter='\t') + with open(sections_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter="\t") for line in reader: start_times.append(float(line[0])) end_times.append(float(line[1])) @@ 
-348,39 +270,15 @@ def _fix_newpoint(beat_positions): from neighboring beats. """ - while np.any(beat_positions == 'New Point'): - idxs = np.where(beat_positions == 'New Point')[0] + while np.any(beat_positions == "New Point"): + idxs = np.where(beat_positions == "New Point")[0] for i in idxs: if i < len(beat_positions) - 1: - if not beat_positions[i + 1] == 'New Point': + if not beat_positions[i + 1] == "New Point": beat_positions[i] = str(np.mod(int(beat_positions[i + 1]) - 1, 4)) if i == len(beat_positions) - 1: - if not beat_positions[i - 1] == 'New Point': + if not beat_positions[i - 1] == "New Point": beat_positions[i] = str(np.mod(int(beat_positions[i - 1]) + 1, 4)) - beat_positions[beat_positions == '0'] = '4' + beat_positions[beat_positions == "0"] = "4" return beat_positions - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== - -Mauch, Matthias, et al. -"OMRAS2 metadata project 2009." -10th International Society for Music Information Retrieval Conference (2009) - -========== Bibtex ========== -@inproceedings{mauch2009beatles, - title={OMRAS2 metadata project 2009}, - author={Mauch, Matthias and Cannam, Chris and Davies, Matthew and Dixon, Simon and Harte, - Christopher and Kolozali, Sefki and Tidhar, Dan and Sandler, Mark}, - booktitle={12th International Society for Music Information Retrieval Conference}, - year={2009}, - series = {ISMIR} -} - """ - - print(cite_data) diff --git a/mirdata/beatport_key.py b/mirdata/datasets/beatport_key.py similarity index 65% rename from mirdata/beatport_key.py rename to mirdata/datasets/beatport_key.py index 2d6a8517d..754e1406a 100644 --- a/mirdata/beatport_key.py +++ b/mirdata/datasets/beatport_key.py @@ -24,35 +24,46 @@ from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = 'beatport_key' +BIBTEX = """@phdthesis {3897, + title = {Tonality Estimation in Electronic Dance 
Music: A Computational and Musically Informed Examination}, + year = {2018}, + month = {03/2018}, + pages = {234}, + school = {Universitat Pompeu Fabra}, + address = {Barcelona}, + abstract = {This dissertation revolves around the task of computational key estimation in electronic dance music, upon which three interrelated operations are performed. First, I attempt to detect possible misconceptions within the task, which is typically accomplished with a tonal vocabulary overly centred in Western classical tonality, reduced to a binary major/minor model which might not accomodate popular music styles. Second, I present a study of tonal practises in electronic dance music, developed hand in hand with the curation of a corpus of over 2,000 audio excerpts, including various subgenres and degrees of complexity. Based on this corpus, I propose the creation of more open-ended key labels, accounting for other modal practises and ambivalent tonal configurations. Last, I describe my own key finding methods, adapting existing models to the musical idiosyncrasies and tonal distributions of electronic dance music, with new statistical key profiles derived from the newly created corpus.}, + keywords = {EDM, Electronic Dance Music, Key Estimation, mir, music information retrieval, tonality}, + url = {https://doi.org/10.5281/zenodo.1154586}, + author = {{\'A}ngel Faraldo} +}""" REMOTES = { - 'keys': download_utils.RemoteFileMetadata( - filename='keys.zip', - url='https://zenodo.org/record/1101082/files/keys.zip?download=1', - checksum='939abc05f36121badfac4087241ac172', - destination_dir='.', + "keys": download_utils.RemoteFileMetadata( + filename="keys.zip", + url="https://zenodo.org/record/1101082/files/keys.zip?download=1", + checksum="939abc05f36121badfac4087241ac172", + destination_dir=".", ), - 'metadata': download_utils.RemoteFileMetadata( - filename='original_metadata.zip', - url='https://zenodo.org/record/1101082/files/original_metadata.zip?download=1', - 
checksum='bb3e3ac1fe5dee7600ef2814accdf8f8', - destination_dir='.', + "metadata": download_utils.RemoteFileMetadata( + filename="original_metadata.zip", + url="https://zenodo.org/record/1101082/files/original_metadata.zip?download=1", + checksum="bb3e3ac1fe5dee7600ef2814accdf8f8", + destination_dir=".", ), - 'audio': download_utils.RemoteFileMetadata( - filename='audio.zip', - url='https://zenodo.org/record/1101082/files/audio.zip?download=1', - checksum='f490ee6c23578482d6fcfa11b82636a1', - destination_dir='.', + "audio": download_utils.RemoteFileMetadata( + filename="audio.zip", + url="https://zenodo.org/record/1101082/files/audio.zip?download=1", + checksum="f490ee6c23578482d6fcfa11b82636a1", + destination_dir=".", ), } -DATA = utils.LargeData('beatport_key_index.json') +DATA = utils.LargeData("beatport_key_index.json") -class Track(track.Track): +class Track(core.Track): """beatport_key track class Args: track_id (str): track id of the track @@ -66,27 +77,24 @@ class Track(track.Track): track_id (str): track id """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: raise ValueError( - '{} is not a valid track ID in beatport_key'.format(track_id) + "{} is not a valid track ID in beatport_key".format(track_id) ) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) - self.keys_path = os.path.join(self._data_home, self._track_paths['key'][0]) + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + self.keys_path = os.path.join(self._data_home, self._track_paths["key"][0]) self.metadata_path = ( - os.path.join(self._data_home, self._track_paths['meta'][0]) - if self._track_paths['meta'][0] is not None + os.path.join(self._data_home, self._track_paths["meta"][0]) + if 
self._track_paths["meta"][0] is not None else None ) - self.title = self.audio_path.replace(".mp3", '').split('/')[-1] + self.title = self.audio_path.replace(".mp3", "").split("/")[-1] @utils.cached_property def key(self): @@ -118,11 +126,11 @@ def to_jams(self): return jams_utils.jams_converter( audio_path=self.audio_path, metadata={ - 'artists': self.artists, - 'genres': self.genres, - 'tempo': self.tempo, - 'title': self.title, - 'key': self.key, + "artists": self.artists, + "genres": self.genres, + "tempo": self.tempo, + "title": self.title, + "key": self.key, }, ) @@ -160,88 +168,37 @@ def find_replace(directory, find, replace, pattern): f.write(s) -def download( - data_home=None, force_overwrite=False, cleanup=True, partial_download=None +def _download( + save_dir, remotes, partial_download, info_message, force_overwrite, cleanup ): - """Download the beatport_key Dataset (annotations). - The audio files are not provided due to copyright issues. + """Download the dataset. - This dataset annotations have characters that doesnt correspond with json format. In particular, "bpm": nan - doesn't correspond to json format. The function find_replace is used to fix this problem. - input file Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` + save_dir (str): + The directory to download the data + remotes (dict or None): + A dictionary of RemoteFileMetadata tuples of data in zip format. + If None, there is no data to download + partial_download (list or None): + A list of keys to partially download the remote objects of the download dict. + If None, all data is downloaded + info_message (str or None): + A string of info to print when this function is called. + If None, no string is printed. force_overwrite (bool): - Whether to overwrite the existing downloaded data + If True, existing files are overwritten by the downloaded files. 
cleanup (bool): Whether to delete the zip/tar file after extracting. - partial_download(list of str) - arguments can be 'audio' 'metadata' or/and 'keys' """ - - # use the default location: ~/mir_datasets/beatport_key - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_message = "" - download_utils.downloader( - data_home, - remotes=REMOTES, + save_dir, + remotes=remotes, partial_download=partial_download, - info_message=download_message, force_overwrite=force_overwrite, cleanup=cleanup, ) # removing nans from JSON files - find_replace(os.path.join(data_home, "meta"), ": nan", ": null", "*.json") - - -def validate(data_home=None, silence=False): - """Validate if a local version of this dataset is consistent - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths where the expected file exists locally - but has a different checksum than the reference - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Get the list of track IDs for this dataset - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load beatport_key dataset - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - beatles_data = {} - for key in track_ids(): - beatles_data[key] = Track(key, data_home=data_home) - return beatles_data + find_replace(os.path.join(save_dir, "meta"), ": nan", ": null", "*.json") def load_key(keys_path): @@ -325,30 +282,3 @@ def load_artist(metadata_path): meta = json.load(json_file) return [artist["name"] for artist in meta["artists"]] - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Ángel Faraldo (2017). -Tonality Estimation in Electronic Dance Music: A Computational and Musically Informed Examination. -PhD Thesis. Universitat Pompeu Fabra, Barcelona. -========== Bibtex ========== -@phdthesis {3897, - title = {Tonality Estimation in Electronic Dance Music: A Computational and Musically Informed Examination}, - year = {2018}, - month = {03/2018}, - pages = {234}, - school = {Universitat Pompeu Fabra}, - address = {Barcelona}, - abstract = {This dissertation revolves around the task of computational key estimation in electronic dance music, upon which three interrelated operations are performed. First, I attempt to detect possible misconceptions within the task, which is typically accomplished with a tonal vocabulary overly centred in Western classical tonality, reduced to a binary major/minor model which might not accomodate popular music styles. Second, I present a study of tonal practises in electronic dance music, developed hand in hand with the curation of a corpus of over 2,000 audio excerpts, including various subgenres and degrees of complexity. Based on this corpus, I propose the creation of more open-ended key labels, accounting for other modal practises and ambivalent tonal configurations. 
Last, I describe my own key finding methods, adapting existing models to the musical idiosyncrasies and tonal distributions of electronic dance music, with new statistical key profiles derived from the newly created corpus.}, - keywords = {EDM, Electronic Dance Music, Key Estimation, mir, music information retrieval, tonality}, - url = {https://doi.org/10.5281/zenodo.1154586}, - author = {{\'A}ngel Faraldo} -} - - """ - - print(cite_data) diff --git a/mirdata/datasets/dali.py b/mirdata/datasets/dali.py new file mode 100644 index 000000000..c218de9d0 --- /dev/null +++ b/mirdata/datasets/dali.py @@ -0,0 +1,277 @@ +# -*- coding: utf-8 -*- +"""DALI Dataset Loader + +DALI contains 5358 audio files with their time-aligned vocal melody. +It also contains time-aligned lyrics at four levels of granularity: notes, +words, lines, and paragraphs. + +For each song, DALI also provides additional metadata: genre, language, musician, +album covers, or links to video clips. + +For more details, please visit: https://github.com/gabolsgabs/DALI +""" + +import json +import gzip +import logging +import os +import pickle + +import librosa +import numpy as np + +from mirdata import download_utils +from mirdata import jams_utils +from mirdata import core +from mirdata import utils + +# this is the package, needed to load the annotations. +# DALI-dataset is only installed if the user explicitly declares +# they want dali when pip installing. +try: + import DALI +except ImportError as E: + logging.error( + "In order to use dali you must have dali-dataset installed. 
" + "Please reinstall mirdata using `pip install 'mirdata[dali]'" + ) + raise + +BIBTEX = """@inproceedings{Meseguer-Brocal_2018, + Title = {DALI: a large Dataset of synchronized Audio, LyrIcs and notes, automatically created using teacher-student + machine learning paradigm.}, + Author = {Meseguer-Brocal, Gabriel and Cohen-Hadria, Alice and Peeters, Geoffroy}, + Booktitle = {19th International Society for Music Information Retrieval Conference}, + Editor = {ISMIR}, Month = {September}, + Year = {2018} +}""" + +REMOTES = { + "metadata": download_utils.RemoteFileMetadata( + filename="dali_metadata.json", + url="https://raw.githubusercontent.com/gabolsgabs/DALI/master/code/DALI/files/dali_v1_metadata.json", + checksum="40af5059e7aa97f81b2654758094d24b", + destination_dir=".", + ) +} +DOWNLOAD_INFO = """ + To download this dataset, visit: + https://zenodo.org/record/2577915 and request access. + + Once downloaded, unzip the file DALI_v1.0.zip + and place the result in: + {data_home} + + Use the function dali_code.get_audio you can find at: + https://github.com/gabolsgabs/DALI for getting the audio and place them in: + {data_home}/audio +""" + + +def _load_metadata(data_home): + metadata_path = os.path.join(data_home, os.path.join("dali_metadata.json")) + if not os.path.exists(metadata_path): + logging.info("Metadata file {} not found.".format(metadata_path)) + return None + with open(metadata_path, "r") as fhandle: + metadata_index = json.load(fhandle) + + metadata_index["data_home"] = data_home + return metadata_index + + +DATA = utils.LargeData("dali_index.json", _load_metadata) + + +class Track(core.Track): + """DALI melody Track class + + Args: + track_id (str): track id of the track + + Attributes: + album (str): the track's album + annotation_path (str): path to the track's annotation file + artist (str): the track's artist + audio_path (str): path to the track's audio file + audio_url (str): youtube ID + dataset_version (int): dataset annotation version + 
ground_truth (bool): True if the annotation is verified + language (str): sung language + release_date (str): year the track was released + scores_manual (int): TODO + scores_ncc (float): TODO + title (str): the track's title + track_id (str): the unique track id + url_working (bool): True if the youtube url was valid + + """ + + def __init__(self, track_id, data_home): + if track_id not in DATA.index: + raise ValueError("{} is not a valid track ID in DALI".format(track_id)) + + self.track_id = track_id + self._data_home = data_home + self._track_paths = DATA.index[track_id] + self.annotation_path = os.path.join( + self._data_home, self._track_paths["annot"][0] + ) + + metadata = DATA.metadata(data_home) + if metadata is not None and track_id in metadata: + self._track_metadata = metadata[track_id] + self._track_metadata["album"] = metadata[track_id]["metadata"]["album"] + self._track_metadata["release_date"] = metadata[track_id]["metadata"][ + "release_date" + ] + self._track_metadata["language"] = metadata[track_id]["metadata"][ + "language" + ] + self.audio_url = self._track_metadata["audio"]["url"] + self.url_working = self._track_metadata["audio"]["working"] + self.ground_truth = self._track_metadata["ground-truth"] + self.artist = self._track_metadata["artist"] + self.title = self._track_metadata["title"] + self.dataset_version = self._track_metadata["dataset_version"] + self.scores_ncc = self._track_metadata["scores"]["NCC"] + self.scores_manual = self._track_metadata["scores"]["manual"] + self.album = self._track_metadata["album"] + self.release_date = self._track_metadata["release_date"] + self.language = self._track_metadata["language"] + self.audio_path = os.path.join( + self._data_home, self._track_paths["audio"][0] + ) + else: + self.audio_url = None + self.url_working = None + self.ground_truth = None + self.artist = None + self.title = None + self.dataset_version = None + self.scores_ncc = None + self.scores_manual = None + self.album = None + 
self.release_date = None + self.language = None + self.audio_path = None + + @utils.cached_property + def notes(self): + """NoteData: note-aligned lyrics""" + return load_annotations_granularity(self.annotation_path, "notes") + + @utils.cached_property + def words(self): + """LyricData: word-aligned lyric""" + return load_annotations_granularity(self.annotation_path, "words") + + @utils.cached_property + def lines(self): + """LyricData: line-aligned lyrics""" + return load_annotations_granularity(self.annotation_path, "lines") + + @utils.cached_property + def paragraphs(self): + """LyricData: paragraph-aligned lyrics""" + return load_annotations_granularity(self.annotation_path, "paragraphs") + + @utils.cached_property + def annotation_object(self): + """DALI.Annotations: DALI Annotations object""" + return load_annotations_class(self.annotation_path) + + @property + def audio(self): + """(np.ndarray, float): audio signal, sample rate""" + return load_audio(self.audio_path) + + def to_jams(self): + """Jams: the track's data in jams format""" + return jams_utils.jams_converter( + audio_path=self.audio_path, + lyrics_data=[ + (self.words, "word-aligned lyrics"), + (self.lines, "line-aligned lyrics"), + (self.paragraphs, "paragraph-aligned lyrics"), + ], + note_data=[(self.notes, "annotated vocal notes")], + metadata=self._track_metadata, + ) + + +def load_audio(audio_path): + """Load a DALI audio file. 
+ + Args: + audio_path (str): path to audio file + + Returns: + y (np.ndarray): the mono audio signal + sr (float): The sample rate of the audio file + + """ + if not os.path.exists(audio_path): + raise IOError("audio_path {} does not exist".format(audio_path)) + return librosa.load(audio_path, sr=None, mono=True) + + +def load_annotations_granularity(annotations_path, granularity): + """Load annotations at the specified level of granularity + + Args: + annotations_path (str): path to a DALI annotation file + granularity (str): one of 'notes', 'words', 'lines', 'paragraphs' + + Returns: + NoteData for granularity='notes' or LyricData otherwise + + """ + if not os.path.exists(annotations_path): + raise IOError("annotations_path {} does not exist".format(annotations_path)) + + try: + with gzip.open(annotations_path, "rb") as f: + output = pickle.load(f) + except Exception as e: + with gzip.open(annotations_path, "r") as f: + output = pickle.load(f) + text = [] + notes = [] + begs = [] + ends = [] + for annot in output.annotations["annot"][granularity]: + notes.append(round(annot["freq"][0], 3)) + begs.append(round(annot["time"][0], 3)) + ends.append(round(annot["time"][1], 3)) + text.append(annot["text"]) + if granularity == "notes": + annotation = utils.NoteData(np.array([begs, ends]).T, np.array(notes), None) + else: + annotation = utils.LyricData( + np.array(begs), np.array(ends), np.array(text), None + ) + return annotation + + +def load_annotations_class(annotations_path): + """Load full annotations into the DALI class object + + Args: + annotations_path (str): path to a DALI annotation file + + Returns: + DALI annotations object + + """ + if not os.path.exists(annotations_path): + raise IOError("annotations_path {} does not exist".format(annotations_path)) + + try: + with gzip.open(annotations_path, "rb") as f: + output = pickle.load(f) + except Exception as e: + with gzip.open(annotations_path, "r") as f: + output = pickle.load(f) + return output + diff --git 
a/mirdata/giantsteps_key.py b/mirdata/datasets/giantsteps_key.py similarity index 58% rename from mirdata/giantsteps_key.py rename to mirdata/datasets/giantsteps_key.py index 0e085b377..97f715b26 100644 --- a/mirdata/giantsteps_key.py +++ b/mirdata/datasets/giantsteps_key.py @@ -32,41 +32,45 @@ from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = 'GiantSteps_key' +BIBTEX = """@inproceedings{knees2015two, + title={Two data sets for tempo estimation and key detection in electronic dance music annotated from user corrections}, + author={Knees, Peter and Faraldo P{\'e}rez, {\'A}ngel and Boyer, Herrera and Vogl, Richard and B{\"o}ck, Sebastian and H{\"o}rschl{\"a}ger, Florian and Le Goff, Mickael and others}, + booktitle={Proceedings of the 16th International Society for Music Information Retrieval Conference (ISMIR); 2015 Oct 26-30; M{\'a}laga, Spain.[M{\'a}laga]: International Society for Music Information Retrieval, 2015. p. 
364-70.}, + year={2015}, + organization={International Society for Music Information Retrieval (ISMIR)} +}""" REMOTES = { - 'audio': download_utils.RemoteFileMetadata( - filename='audio.zip', - url='https://zenodo.org/record/1095691/files/audio.zip?download=1', - checksum='8ec9ade888d5a88ce435d7fda031929b', - destination_dir='.', + "audio": download_utils.RemoteFileMetadata( + filename="audio.zip", + url="https://zenodo.org/record/1095691/files/audio.zip?download=1", + checksum="8ec9ade888d5a88ce435d7fda031929b", + destination_dir=".", ), - 'keys': download_utils.RemoteFileMetadata( - filename='keys.zip', - url='https://zenodo.org/record/1095691/files/keys.zip?download=1', - checksum='775b7d17e009f5818544cf505b6a96fd', - destination_dir='.', + "keys": download_utils.RemoteFileMetadata( + filename="keys.zip", + url="https://zenodo.org/record/1095691/files/keys.zip?download=1", + checksum="775b7d17e009f5818544cf505b6a96fd", + destination_dir=".", ), - 'metadata': download_utils.RemoteFileMetadata( - filename='original_metadata.zip', - url='https://zenodo.org/record/1095691/files/original_metadata.zip?download=1', - checksum='54181e0f34c35d9720439750d0b08091', - destination_dir='.', + "metadata": download_utils.RemoteFileMetadata( + filename="original_metadata.zip", + url="https://zenodo.org/record/1095691/files/original_metadata.zip?download=1", + checksum="54181e0f34c35d9720439750d0b08091", + destination_dir=".", ), } -DATA = utils.LargeData('giantsteps_key_index.json') +DATA = utils.LargeData("giantsteps_key_index.json") -class Track(track.Track): +class Track(core.Track): """giantsteps_key track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: audio_path (str): track audio path @@ -77,27 +81,24 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: raise ValueError( - '{} is not a valid track ID in giantsteps_key'.format(track_id) + "{} is not a valid track ID in giantsteps_key".format(track_id) ) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) - self.keys_path = os.path.join(self._data_home, self._track_paths['key'][0]) + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + self.keys_path = os.path.join(self._data_home, self._track_paths["key"][0]) self.metadata_path = ( - os.path.join(self._data_home, self._track_paths['meta'][0]) - if self._track_paths['meta'][0] is not None + os.path.join(self._data_home, self._track_paths["meta"][0]) + if self._track_paths["meta"][0] is not None else None ) - self.title = self.audio_path.replace(".mp3", '').split('/')[-1] + self.title = self.audio_path.replace(".mp3", "").split("/")[-1] @utils.cached_property def key(self): @@ -129,11 +130,11 @@ def to_jams(self): return jams_utils.jams_converter( audio_path=self.audio_path, metadata={ - 'artists': self.artists, - 'genres': self.genres, - 'tempo': self.tempo, - 'title': self.title, - 'key': self.key, + "artists": self.artists, + "genres": self.genres, + "tempo": self.tempo, + "title": self.title, + "key": self.key, }, ) @@ -154,92 +155,6 @@ def load_audio(audio_path): return librosa.load(audio_path, sr=None, mono=True) -def download( - data_home=None, force_overwrite=False, cleanup=True, partial_download=None -): - """Download the giantsteps_key Dataset (annotations). 
- The audio files are not provided due to copyright issues. - - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - cleanup (bool): - Whether to delete the zip/tar file after extracting. - partial_download(list of str) - arguments can be 'audio' 'metadata' or/and 'keys' - """ - - # use the default location: ~/mir_datasets/giantsteps_key - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_message = "" - - download_utils.downloader( - data_home, - remotes=REMOTES, - partial_download=partial_download, - info_message=download_message, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def validate(data_home=None, silence=False): - """Validate if a local version of this dataset is consistent - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths where the expected file exists locally - but has a different checksum than the reference - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Get the list of track IDs for this dataset - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load giantsteps_key dataset - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - beatles_data = {} - for key in track_ids(): - beatles_data[key] = Track(key, data_home=data_home) - return beatles_data - - def load_key(keys_path): """Load giantsteps_key format key data from a file @@ -328,29 +243,3 @@ def load_artist(metadata_path): meta = json.load(json_file) return [artist["name"] for artist in meta["artists"]] - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== - -Peter Knees, Ángel Faraldo, Perfecto Herrera, Richard Vogl, -Sebastian Böck, Florian Hörschläger, Mickael Le Goff: "Two data -sets for tempo estimation and key detection in electronic dance -music annotated from user corrections," Proc. of the 16th -Conference of the International Society for Music Information -Retrieval (ISMIR'15), Oct. 2015, Malaga, Spain. - -========== Bibtex ========== -@inproceedings{knees2015two, - title={Two data sets for tempo estimation and key detection in electronic dance music annotated from user corrections}, - author={Knees, Peter and Faraldo P{\'e}rez, {\'A}ngel and Boyer, Herrera and Vogl, Richard and B{\"o}ck, Sebastian and H{\"o}rschl{\"a}ger, Florian and Le Goff, Mickael and others}, - booktitle={Proceedings of the 16th International Society for Music Information Retrieval Conference (ISMIR); 2015 Oct 26-30; M{\'a}laga, Spain.[M{\'a}laga]: International Society for Music Information Retrieval, 2015. p. 
364-70.}, - year={2015}, - organization={International Society for Music Information Retrieval (ISMIR)} -} - """ - - print(cite_data) diff --git a/mirdata/datasets/giantsteps_tempo.py b/mirdata/datasets/giantsteps_tempo.py new file mode 100644 index 000000000..4c5c31dc6 --- /dev/null +++ b/mirdata/datasets/giantsteps_tempo.py @@ -0,0 +1,233 @@ +# -*- coding: utf-8 -*- +"""giantsteps_tempo Dataset Loader + +name: GiantSteps (tempo+genre) + +contact: + * Richard Vogl + * Peter Knees + +description: collection of annotations for 664 2min(1) audio previews from + www.beatport.com + +references: +[1] Peter Knees, Ángel Faraldo, Perfecto Herrera, Richard Vogl, + Sebastian Böck, Florian Hörschläger, Mickael Le Goff: "Two data + sets for tempo estimation and key detection in electronic dance + music annotated from user corrections", Proc. of the 16th + Conference of the International Society for Music Information + Retrieval (ISMIR'15), Oct. 2015, Malaga, Spain. + +[2] Hendrik Schreiber, Meinard Müller: "A Crowdsourced Experiment + for Tempo Estimation of Electronic Dance Music", Proc. of the + 19th Conference of the International Society for Music + Information Retrieval (ISMIR'18), Sept. 2018, Paris, France. + +annotations: tempo (bpm), genre + +notes: +The audio files (664 files, size ~1gb) can be downloaded from http://www.beatport.com/ +using the bash script: + + https://github.com/GiantSteps/giantsteps-tempo-dataset/blob/master/audio_dl.sh + +To download the files manually use links of the following form: +http://geo-samples.beatport.com/lofi/ +e.g.: +http://geo-samples.beatport.com/lofi/5377710.LOFI.mp3 + +To convert the audio files to .wav use (bash + sox): + +./convert_audio.sh + +To retrieve the genre information, the JSON contained within the website was parsed. +The tempo annotation was extracted from forum entries of people correcting the bpm values (i.e. manual annotation of tempo). +For more information please contact creators. 
+ +[2] found some files without tempo. There are: + +3041381.LOFI.mp3 +3041383.LOFI.mp3 +1327052.LOFI.mp3 + +Their v2 tempo is denoted as 0.0 in tempo and mirex and has no annotation in the JAMS format. + +(1): Most of the audio files are 120 seconds long. Exceptions are: +name length +906760.LOFI.mp3 62 +1327052.LOFI.mp3 70 +4416506.LOFI.mp3 80 +1855660.LOFI.mp3 119 +3419452.LOFI.mp3 119 +3577631.LOFI.mp3 119 +""" + +import librosa +import os + +from mirdata import download_utils +from mirdata import core +from mirdata import utils +import numpy as np +import jams + + +BIBTEX = """@inproceedings{knees2015two, + title={Two data sets for tempo estimation and key detection in electronic dance music annotated from user corrections}, + author={Knees, Peter and Faraldo P{\'e}rez, {\'A}ngel and Boyer, Herrera and Vogl, Richard and B{\"o}ck, Sebastian and H{\"o}rschl{\"a}ger, Florian and Le Goff, Mickael and others}, + booktitle={Proceedings of the 16th International Society for Music Information Retrieval Conference (ISMIR); 2015 Oct 26-30; M{\'a}laga, Spain.[M{\'a}laga]: International Society for Music Information Retrieval, 2015. p. 
364-70.}, + year={2015}, + organization={International Society for Music Information Retrieval (ISMIR)}, +} +@inproceedings{SchreiberM18a_Tempo_ISMIR, + author={Hendrik Schreiber and Meinard M{\"u}ller}, + title={A Crowdsourced Experiment for Tempo Estimation of Electronic Dance Music}, + booktitle={Proceedings of the International Conference on Music Information Retrieval ({ISMIR})}, + address={Paris, France}, + year={2018}, + url-pdf={http://www.tagtraum.com/download/2018_schreiber_tempo_giantsteps.pdf}, +}""" + +DATA = utils.LargeData("giantsteps_tempo_index.json") + +REMOTES = { + "annotations": download_utils.RemoteFileMetadata( + filename="giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb.zip", + url="https://github.com/GiantSteps/giantsteps-tempo-dataset/archive/0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb.zip", + checksum="8fdafbaf505fe3f293bd912c92b72ac8", + destination_dir="", + ) +} +DOWNLOAD_INFO = """ + Unfortunately the audio files of the Giant Steps Tempo dataset are not available + for download. 
If you have the Giant Steps audio dataset, place the contents into + a folder called GiantSteps_tempo with the following structure: + > GiantSteps_tempo/ + > giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/ + > audio/ + and copy the folder to {data_home} +""" + + +class Track(core.Track): + """giantsteps_tempo track class + + Args: + track_id (str): track id of the track + + Attributes: + audio_path (str): track audio path + title (str): title of the track + track_id (str): track id + annotation_v1_path (str): track annotation v1 path + annotation_v2_path (str): track annotation v2 path + """ + + def __init__(self, track_id, data_home): + if track_id not in DATA.index: + raise ValueError( + "{} is not a valid track ID in giantsteps_tempo".format(track_id) + ) + + self.track_id = track_id + + self._data_home = data_home + self._track_paths = DATA.index[track_id] + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + self.annotation_v1_path = os.path.join( + self._data_home, self._track_paths["annotation_v1"][0] + ) + self.annotation_v2_path = os.path.join( + self._data_home, self._track_paths["annotation_v2"][0] + ) + + self.title = self.audio_path.replace(".mp3", "").split("/")[-1].split(".")[0] + + @utils.cached_property + def genre(self): + """genre: human-labeled metadata annotation""" + return load_genre(self.annotation_v1_path) + + @utils.cached_property + def tempo(self): + """TempoData: tempo annotation ordered by confidence""" + return load_tempo(self.annotation_v1_path) + + @utils.cached_property + def tempo_v2(self): + """TempoData: tempos annotation ordered by confidence""" + return load_tempo(self.annotation_v2_path) + + @property + def audio(self): + """(np.ndarray, float): audio signal, sample rate""" + return load_audio(self.audio_path) + + def to_jams(self): + """Jams: the track's data in jams format""" + return jams.load(self.annotation_v1_path) + + def to_jams_v2(self): + """Jams: the track's data in 
jams format""" + return jams.load(self.annotation_v2_path) + + +def load_audio(audio_path): + """Load a giantsteps_tempo audio file. + + Args: + audio_path (str): path to audio file + + Returns: + y (np.ndarray): the mono audio signal + sr (float): The sample rate of the audio file + """ + if not os.path.exists(audio_path): + raise IOError("audio_path {} does not exist".format(audio_path)) + return librosa.load(audio_path, sr=None, mono=True) + + +def load_genre(path): + """Load genre data from a file + + Args: + path (str): path to metadata annotation file + + Returns: + (str): loaded genre data + """ + if path is None: + return None + + with open(path) as json_file: + annotation = jams.load(json_file) + + return annotation.search(namespace="tag_open")[0]["data"][0].value + + +def load_tempo(tempo_path): + """Load giantsteps_tempo tempo data from a file ordered by confidence + + Args: + tempo_path (str): path to tempo annotation file + + Returns: + (list of utils.TempoData): loaded tempo data + """ + if tempo_path is None: + return None + + if not os.path.exists(tempo_path): + raise IOError("tempo_path {} does not exist".format(tempo_path)) + + with open(tempo_path) as json_file: + annotation = jams.load(json_file) + + tempo = annotation.search(namespace="tempo")[0]["data"] + + return utils.TempoData( + np.array([t.time for t in tempo]), + np.array([t.duration for t in tempo]), + np.array([t.value for t in tempo]), + np.array([t.confidence for t in tempo]), + ) diff --git a/mirdata/groove_midi.py b/mirdata/datasets/groove_midi.py similarity index 75% rename from mirdata/groove_midi.py rename to mirdata/datasets/groove_midi.py index 8501abe0c..c95cfc4a9 100644 --- a/mirdata/groove_midi.py +++ b/mirdata/datasets/groove_midi.py @@ -52,16 +52,21 @@ import numpy as np import pretty_midi -from mirdata import download_utils, jams_utils, track, utils +from mirdata import download_utils, jams_utils, core, utils -DATASET_DIR = 'Groove-MIDI' - +BIBTEX = 
"""@inproceedings{groove2019, + Author = {Jon Gillick and Adam Roberts and Jesse Engel and Douglas Eck + and David Bamman}, + Title = {Learning to Groove with Inverse Sequence Transformations}, + Booktitle = {International Conference on Machine Learning (ICML)}, + Year = {2019}, +}""" REMOTES = { - 'all': download_utils.RemoteFileMetadata( - filename='groove-v1-0.0.zip', - url='http://storage.googleapis.com/magentadata/datasets/groove/groove-v1.0.0.zip', - checksum='99db7e2a087761a913b2abfb19e86181', + "all": download_utils.RemoteFileMetadata( + filename="groove-v1-0.0.zip", + url="http://storage.googleapis.com/magentadata/datasets/groove/groove-v1.0.0.zip", + checksum="99db7e2a087761a913b2abfb19e86181", destination_dir=None, ) } @@ -202,34 +207,32 @@ def _load_metadata(data_home): split, ) = row metadata_index[str(track_id)] = { - 'drummer': str(drummer), - 'session': str(session), - 'track_id': str(track_id), - 'style': str(style), - 'tempo': int(bpm), - 'beat_type': str(beat_type), - 'time_signature': str(time_signature), - 'midi_filename': str(midi_filename), - 'audio_filename': str(audio_filename), - 'duration': float(duration), - 'split': str(split), + "drummer": str(drummer), + "session": str(session), + "track_id": str(track_id), + "style": str(style), + "tempo": int(bpm), + "beat_type": str(beat_type), + "time_signature": str(time_signature), + "midi_filename": str(midi_filename), + "audio_filename": str(audio_filename), + "duration": float(duration), + "split": str(split), } - metadata_index['data_home'] = data_home + metadata_index["data_home"] = data_home return metadata_index -DATA = utils.LargeData('groove_midi_index.json', _load_metadata) +DATA = utils.LargeData("groove_midi_index.json", _load_metadata) -class Track(track.Track): +class Track(core.Track): """Groove MIDI Track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. 
default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: drummer (str): Drummer id of the track (ex. 'drummer1') @@ -246,17 +249,14 @@ class Track(track.Track): 'train', 'valid' or 'test'. """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: raise ValueError( - '{} is not a valid track ID in Groove MIDI'.format(track_id) + "{} is not a valid track ID in Groove MIDI".format(track_id) ) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] @@ -317,9 +317,9 @@ def midi(self): def to_jams(self): # Initialize top-level JAMS container return jams_utils.jams_converter( - beat_data=[(self.beats, 'midi beats')], - tempo_data=[(self.tempo, 'midi tempo')], - event_data=[(self.drum_events, 'annotated drum patterns')], + beat_data=[(self.beats, "midi beats")], + tempo_data=[(self.tempo, "midi tempo")], + event_data=[(self.drum_events, "annotated drum patterns")], metadata=self._track_metadata, ) @@ -406,106 +406,43 @@ def load_drum_events(midi_path, midi=None): return utils.EventData(np.array(start_times), np.array(end_times), np.array(events)) -def download(data_home=None, force_overwrite=False, cleanup=True): - """Download Groove MIDI. +def _download( + save_dir, remotes, partial_download, info_message, force_overwrite, cleanup +): + """Download the dataset. Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` + save_dir (str): + The directory to download the data + remotes (dict or None): + A dictionary of RemoteFileMetadata tuples of data in zip format. + If None, there is no data to download + partial_download (list or None): + A list of keys to partially download the remote objects of the download dict. 
+ If None, all data is downloaded + info_message (str or None): + A string of info to print when this function is called. + If None, no string is printed. force_overwrite (bool): - Whether to overwrite the existing downloaded data + If True, existing files are overwritten by the downloaded files. cleanup (bool): Whether to delete the zip/tar file after extracting. """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - download_utils.downloader( - data_home, - remotes=REMOTES, + save_dir, + remotes=remotes, info_message=None, force_overwrite=force_overwrite, cleanup=cleanup, ) # files get downloaded to a folder called groove - move everything up a level - groove_dir = os.path.join(data_home, 'groove') - groove_files = glob.glob(os.path.join(groove_dir, '*')) + groove_dir = os.path.join(save_dir, "groove") + groove_files = glob.glob(os.path.join(groove_dir, "*")) for fpath in groove_files: - shutil.move(fpath, data_home) + shutil.move(fpath, save_dir) if os.path.exists(groove_dir): shutil.rmtree(groove_dir) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load Groove MIDI dataset - - Args: - data_home (str): Local path where Groove MIDI is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - groove_data = {} - for key in DATA.index.keys(): - groove_data[key] = Track(key, data_home=data_home) - return groove_data - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Jon Gillick, Adam Roberts, Jesse Engel, Douglas Eck, and David Bamman. -"Learning to Groove with Inverse Sequence Transformations." -International Conference on Machine Learning (ICML), 2019. 
-========== Bibtex ========== -@inproceedings{groove2019, - Author = {Jon Gillick and Adam Roberts and Jesse Engel and Douglas Eck - and David Bamman}, - Title = {Learning to Groove with Inverse Sequence Transformations}, - Booktitle = {International Conference on Machine Learning (ICML)}, - Year = {2019}, -} -""" - print(cite_data) diff --git a/mirdata/datasets/gtzan_genre.py b/mirdata/datasets/gtzan_genre.py new file mode 100644 index 000000000..abe7fb10f --- /dev/null +++ b/mirdata/datasets/gtzan_genre.py @@ -0,0 +1,103 @@ +# -*- coding: utf-8 -*- +"""GTZAN-Genre Dataset Loader + +This dataset was used for the well known paper in genre classification +"Musical genre classification of audio signals " by G. Tzanetakis and +P. Cook in IEEE Transactions on Audio and Speech Processing 2002. + +The dataset consists of 1000 audio tracks each 30 seconds long. It +contains 10 genres, each represented by 100 tracks. The tracks are all +22050 Hz mono 16-bit audio files in .wav format. +""" + +import librosa +import os + +from mirdata import download_utils +from mirdata import jams_utils +from mirdata import core +from mirdata import utils + + +BIBTEX = """@article{tzanetakis2002gtzan, + title={GTZAN genre collection}, + author={Tzanetakis, George and Cook, P}, + journal={Music Analysis, Retrieval and Synthesis for Audio Signals}, + year={2002} +}""" +REMOTES = { + "all": download_utils.RemoteFileMetadata( + filename="genres.tar.gz", + url="http://opihi.cs.uvic.ca/sound/genres.tar.gz", + checksum="5b3d6dddb579ab49814ab86dba69e7c7", + destination_dir="gtzan_genre", + ) +} + +DATA = utils.LargeData("gtzan_genre_index.json") + + +class Track(core.Track): + """gtzan_genre Track class + + Args: + track_id (str): track id of the track + + Attributes: + audio_path (str): path to the audio file + genre (str): annotated genre + track_id (str): track id + + """ + + def __init__(self, track_id, data_home): + if track_id not in DATA.index: + raise ValueError( + "{} is not a valid track 
ID in GTZAN-Genre".format(track_id) + ) + + self.track_id = track_id + + self._data_home = data_home + self._track_paths = DATA.index[track_id] + + self.genre = track_id.split(".")[0] + if self.genre == "hiphop": + self.genre = "hip-hop" + + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + + @property + def audio(self): + """(np.ndarray, float): audio signal, sample rate""" + return load_audio(self.audio_path) + + def to_jams(self): + """Jams: the track's data in jams format""" + return jams_utils.jams_converter( + tags_gtzan_data=[(self.genre, "gtzan-genre")], + metadata={ + "title": "Unknown track", + "artist": "Unknown artist", + "release": "Unknown album", + "duration": 30.0, + "curator": "George Tzanetakis", + }, + ) + + +def load_audio(audio_path): + """Load a GTZAN audio file. + + Args: + audio_path (str): path to audio file + + Returns: + y (np.ndarray): the mono audio signal + sr (float): The sample rate of the audio file + + """ + if not os.path.exists(audio_path): + raise IOError("audio_path {} does not exist".format(audio_path)) + audio, sr = librosa.load(audio_path, sr=22050, mono=True) + return audio, sr diff --git a/mirdata/guitarset.py b/mirdata/datasets/guitarset.py similarity index 58% rename from mirdata/guitarset.py rename to mirdata/datasets/guitarset.py index 5632853c3..09f449aa1 100644 --- a/mirdata/guitarset.py +++ b/mirdata/datasets/guitarset.py @@ -48,70 +48,72 @@ For more details, please visit: http://github.com/marl/guitarset/ """ - +import logging +import os import jams import librosa -import logging import numpy as np -import os from mirdata import download_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = 'GuitarSet' +BIBTEX = """@inproceedings{xi2018guitarset, +title={GuitarSet: A Dataset for Guitar Transcription}, +author={Xi, Qingyang and Bittner, Rachel M and Ye, Xuzhou and Pauwels, Johan and Bello, Juan P}, +booktitle={International Society of Music 
Information Retrieval (ISMIR)}, +year={2018} +}""" REMOTES = { - 'annotations': download_utils.RemoteFileMetadata( - filename='annotation.zip', - url='https://zenodo.org/record/3371780/files/annotation.zip?download=1', - checksum='b39b78e63d3446f2e54ddb7a54df9b10', - destination_dir='annotation', + "annotations": download_utils.RemoteFileMetadata( + filename="annotation.zip", + url="https://zenodo.org/record/3371780/files/annotation.zip?download=1", + checksum="b39b78e63d3446f2e54ddb7a54df9b10", + destination_dir="annotation", ), - 'audio_hex_debleeded': download_utils.RemoteFileMetadata( - filename='audio_hex-pickup_debleeded.zip', - url='https://zenodo.org/record/3371780/files/audio_hex-pickup_debleeded.zip?download=1', - checksum='c31d97279464c9a67e640cb9061fb0c6', - destination_dir='audio_hex-pickup_debleeded', + "audio_hex_debleeded": download_utils.RemoteFileMetadata( + filename="audio_hex-pickup_debleeded.zip", + url="https://zenodo.org/record/3371780/files/audio_hex-pickup_debleeded.zip?download=1", + checksum="c31d97279464c9a67e640cb9061fb0c6", + destination_dir="audio_hex-pickup_debleeded", ), - 'audio_hex_original': download_utils.RemoteFileMetadata( - filename='audio_hex-pickup_original.zip', - url='https://zenodo.org/record/3371780/files/audio_hex-pickup_original.zip?download=1', - checksum='f9911bf217cb40e9e68edf3726ef86cc', - destination_dir='audio_hex-pickup_original', + "audio_hex_original": download_utils.RemoteFileMetadata( + filename="audio_hex-pickup_original.zip", + url="https://zenodo.org/record/3371780/files/audio_hex-pickup_original.zip?download=1", + checksum="f9911bf217cb40e9e68edf3726ef86cc", + destination_dir="audio_hex-pickup_original", ), - 'audio_mic': download_utils.RemoteFileMetadata( - filename='audio_mono-mic.zip', - url='https://zenodo.org/record/3371780/files/audio_mono-mic.zip?download=1', - checksum='275966d6610ac34999b58426beb119c3', - destination_dir='audio_mono-mic', + "audio_mic": download_utils.RemoteFileMetadata( + 
filename="audio_mono-mic.zip", + url="https://zenodo.org/record/3371780/files/audio_mono-mic.zip?download=1", + checksum="275966d6610ac34999b58426beb119c3", + destination_dir="audio_mono-mic", ), - 'audio_mix': download_utils.RemoteFileMetadata( - filename='audio_mono-pickup_mix.zip', - url='https://zenodo.org/record/3371780/files/audio_mono-pickup_mix.zip?download=1', - checksum='aecce79f425a44e2055e46f680e10f6a', - destination_dir='audio_mono-pickup_mix', + "audio_mix": download_utils.RemoteFileMetadata( + filename="audio_mono-pickup_mix.zip", + url="https://zenodo.org/record/3371780/files/audio_mono-pickup_mix.zip?download=1", + checksum="aecce79f425a44e2055e46f680e10f6a", + destination_dir="audio_mono-pickup_mix", ), } _STYLE_DICT = { - 'Jazz': 'Jazz', - 'BN': 'Bossa Nova', - 'Rock': 'Rock', - 'SS': 'Singer-Songwriter', - 'Funk': 'Funk', + "Jazz": "Jazz", + "BN": "Bossa Nova", + "Rock": "Rock", + "SS": "Singer-Songwriter", + "Funk": "Funk", } -_GUITAR_STRINGS = ['E', 'A', 'D', 'G', 'B', 'e'] -DATA = utils.LargeData('guitarset_index.json') +_GUITAR_STRINGS = ["E", "A", "D", "G", "B", "e"] +DATA = utils.LargeData("guitarset_index.json") -class Track(track.Track): +class Track(core.Track): """guitarset Track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. 
default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: audio_hex_cln_path (str): path to the debleeded hex wave file @@ -130,34 +132,31 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: - raise ValueError('{} is not a valid track ID in GuitarSet'.format(track_id)) + raise ValueError("{} is not a valid track ID in GuitarSet".format(track_id)) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] self.audio_hex_cln_path = os.path.join( - self._data_home, self._track_paths['audio_hex_cln'][0] + self._data_home, self._track_paths["audio_hex_cln"][0] ) self.audio_hex_path = os.path.join( - self._data_home, self._track_paths['audio_hex'][0] + self._data_home, self._track_paths["audio_hex"][0] ) self.audio_mic_path = os.path.join( - self._data_home, self._track_paths['audio_mic'][0] + self._data_home, self._track_paths["audio_mic"][0] ) self.audio_mix_path = os.path.join( - self._data_home, self._track_paths['audio_mix'][0] + self._data_home, self._track_paths["audio_mix"][0] ) - self.jams_path = os.path.join(self._data_home, self._track_paths['jams'][0]) + self.jams_path = os.path.join(self._data_home, self._track_paths["jams"][0]) - title_list = track_id.split('_') # [PID, S-T-K, mode, rec_mode] - style, tempo, _ = title_list[1].split('-') # [style, tempo, key] + title_list = track_id.split("_") # [PID, S-T-K, mode, rec_mode] + style, tempo, _ = title_list[1].split("-") # [style, tempo, key] self.player_id = title_list[0] self.mode = title_list[2] self.tempo = float(tempo) @@ -171,18 +170,18 @@ def beats(self): @utils.cached_property def leadsheet_chords(self): """ChordData: the track's chords as written in the leadsheet""" - if self.mode == 'solo': + if self.mode == "solo": logging.info( - 'Chord 
annotations for solo excerpts are the same with the comp excerpt.' + "Chord annotations for solo excerpts are the same with the comp excerpt." ) return load_chords(self.jams_path, leadsheet_version=True) @utils.cached_property def inferred_chords(self): """ChordData: the track's chords inferred from played transcription""" - if self.mode == 'solo': + if self.mode == "solo": logging.info( - 'Chord annotations for solo excerpts are the same with the comp excerpt.' + "Chord annotations for solo excerpts are the same with the comp excerpt." ) return load_chords(self.jams_path, leadsheet_version=False) @@ -286,98 +285,13 @@ def load_multitrack_audio(audio_path): return librosa.load(audio_path, sr=None, mono=False) -def download( - data_home=None, partial_download=None, force_overwrite=False, cleanup=True -): - """Download GuitarSet. - - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - partial_download (list): - List indicating what to partially download. The list can include any of: - * `'annotations'` the annotation files - * `'audio_hex_original'` original 6 channel wave file from hexaphonic pickup - * `'audio_hex_debleeded'` hex wave files with interference removal applied - * `'audio_mic'` monophonic recording from reference microphone - * `'audio_mix'` monophonic mixture of original 6 channel file - If `None`, all data is downloaded. - cleanup (bool): - Whether to delete the zip/tar file after extracting. 
- """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_utils.downloader( - data_home, - remotes=REMOTES, - partial_download=partial_download, - info_message=None, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load GuitarSet - - Args: - data_home (str): Local path where GuitarSet is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - guitarset_data = {} - for key in DATA.index.keys(): - guitarset_data[key] = Track(key, data_home=data_home) - return guitarset_data - - def load_beats(jams_path): if not os.path.exists(jams_path): raise IOError("jams_path {} does not exist".format(jams_path)) jam = jams.load(jams_path) - anno = jam.search(namespace='beat_position')[0] + anno = jam.search(namespace="beat_position")[0] times, values = anno.to_event_values() - positions = [int(v['position']) for v in values] + positions = [int(v["position"]) for v in values] return utils.BeatData(times, positions) @@ -396,9 +310,9 @@ def load_chords(jams_path, leadsheet_version=True): raise IOError("jams_path {} does not exist".format(jams_path)) jam = jams.load(jams_path) if leadsheet_version: - anno = jam.search(namespace='chord')[0] + anno = jam.search(namespace="chord")[0] else: - anno = jam.search(namespace='chord')[1] + anno = jam.search(namespace="chord")[1] intervals, values = anno.to_interval_values() return utils.ChordData(intervals, values) @@ -407,7 +321,7 @@ def load_key_mode(jams_path): if not os.path.exists(jams_path): raise IOError("jams_path {} does not exist".format(jams_path)) jam = jams.load(jams_path) - anno = jam.search(namespace='key_mode')[0] + anno = jam.search(namespace="key_mode")[0] intervals, values = anno.to_interval_values() return utils.KeyData(intervals[:, 0], intervals[:, 1], values) @@ -423,10 +337,10 @@ def load_pitch_contour(jams_path, string_num): if not os.path.exists(jams_path): raise IOError("jams_path {} does not exist".format(jams_path)) jam = jams.load(jams_path) - anno_arr = jam.search(namespace='pitch_contour') + anno_arr = jam.search(namespace="pitch_contour") anno = anno_arr.search(data_source=str(string_num))[0] times, values = anno.to_event_values() - 
frequencies = [v['frequency'] for v in values] + frequencies = [v["frequency"] for v in values] return utils.F0Data(times, frequencies, np.ones_like(times)) @@ -441,26 +355,7 @@ def load_note_ann(jams_path, string_num): if not os.path.exists(jams_path): raise IOError("jams_path {} does not exist".format(jams_path)) jam = jams.load(jams_path) - anno_arr = jam.search(namespace='note_midi') + anno_arr = jam.search(namespace="note_midi") anno = anno_arr.search(data_source=str(string_num))[0] intervals, values = anno.to_interval_values() return utils.NoteData(intervals, values, np.ones_like(values)) - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Xi, Qingyang, et al. -"GuitarSet: A Dataset for Guitar Transcription." -In Proceedings of the 19th International Society for Music Information Retrieval Conference (ISMIR). 2018. -========== Bibtex ========== -@inproceedings{xi2018guitarset, - title={GuitarSet: A Dataset for Guitar Transcription}, - author={Xi, Qingyang and Bittner, Rachel M and Ye, Xuzhou and Pauwels, Johan and Bello, Juan P}, - booktitle={International Society of Music Information Retrieval (ISMIR)}, - year={2018} -} -""" - print(cite_data) diff --git a/mirdata/ikala.py b/mirdata/datasets/ikala.py similarity index 57% rename from mirdata/ikala.py rename to mirdata/datasets/ikala.py index 30bde48e9..f1a598d23 100644 --- a/mirdata/ikala.py +++ b/mirdata/datasets/ikala.py @@ -19,57 +19,69 @@ from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = 'iKala' +BIBTEX = """@inproceedings{chan2015vocal, + title={Vocal activity informed singing voice separation with the iKala dataset}, + author={Chan, Tak-Shing and Yeh, Tzu-Chun and Fan, Zhe-Cheng and Chen, Hung-Wei and Su, Li and Yang, Yi-Hsuan and Jang, Roger}, + booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, + 
pages={718--722}, + year={2015}, + organization={IEEE} +}""" TIME_STEP = 0.032 # seconds REMOTES = { - 'metadata': download_utils.RemoteFileMetadata( - filename='id_mapping.txt', - url='http://mac.citi.sinica.edu.tw/ikala/id_mapping.txt', - checksum='81097b587804ce93e56c7a331ba06abc', + "metadata": download_utils.RemoteFileMetadata( + filename="id_mapping.txt", + url="http://mac.citi.sinica.edu.tw/ikala/id_mapping.txt", + checksum="81097b587804ce93e56c7a331ba06abc", destination_dir=None, ) } +DOWNLOAD_INFO = """ + Unfortunately the iKala dataset is not available for download. + If you have the iKala dataset, place the contents into a folder called + iKala with the following structure: + > iKala/ + > Lyrics/ + > PitchLabel/ + > Wavfile/ + and copy the iKala folder to {data_home} +""" def _load_metadata(data_home): - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - id_map_path = os.path.join(data_home, 'id_mapping.txt') + id_map_path = os.path.join(data_home, "id_mapping.txt") if not os.path.exists(id_map_path): logging.info( - 'Metadata file {} not found.'.format(id_map_path) - + 'You can download the metadata file for ikala by running ikala.download' + "Metadata file {} not found.".format(id_map_path) + + "You can download the metadata file for ikala by running ikala.download" ) return None - with open(id_map_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter='\t') + with open(id_map_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter="\t") singer_map = {} for line in reader: - if line[0] == 'singer': + if line[0] == "singer": continue singer_map[line[1]] = line[0] - singer_map['data_home'] = data_home + singer_map["data_home"] = data_home return singer_map -DATA = utils.LargeData('ikala_index.json', _load_metadata) +DATA = utils.LargeData("ikala_index.json", _load_metadata) -class Track(track.Track): +class Track(core.Track): """ikala Track class Args: track_id (str): track id of the track - data_home 
(str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: audio_path (str): path to the track's audio file @@ -82,25 +94,22 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: - raise ValueError('{} is not a valid track ID in iKala'.format(track_id)) + raise ValueError("{} is not a valid track ID in iKala".format(track_id)) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - metadata = DATA.metadata(data_home) self._data_home = data_home self._track_paths = DATA.index[track_id] - self.f0_path = os.path.join(self._data_home, self._track_paths['pitch'][0]) - self.lyrics_path = os.path.join(self._data_home, self._track_paths['lyrics'][0]) + self.f0_path = os.path.join(self._data_home, self._track_paths["pitch"][0]) + self.lyrics_path = os.path.join(self._data_home, self._track_paths["lyrics"][0]) - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) - self.song_id = track_id.split('_')[0] - self.section = track_id.split('_')[1] + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + self.song_id = track_id.split("_")[0] + self.section = track_id.split("_")[1] if metadata is not None and self.song_id in metadata: self.singer_id = metadata[self.song_id] @@ -139,10 +148,10 @@ def to_jams(self): f0_data=[(self.f0, None)], lyrics_data=[(self.lyrics, None)], metadata={ - 'section': self.section, - 'singer_id': self.singer_id, - 'track_id': self.track_id, - 'song_id': self.song_id, + "section": self.section, + "singer_id": self.singer_id, + "track_id": self.track_id, + "song_id": self.song_id, }, ) @@ -204,93 +213,6 @@ def load_mix_audio(audio_path): return 2.0 * mixed_audio, sr -def download(data_home=None, force_overwrite=False): - """Download iKala Dataset. 
However, iKala dataset is not available for - download anymore. This function prints a helper message to organize - pre-downloaded iKala dataset. - - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_message = """ - Unfortunately the iKala dataset is not available for download. - If you have the iKala dataset, place the contents into a folder called - {ikala_dir} with the following structure: - > {ikala_dir}/ - > Lyrics/ - > PitchLabel/ - > Wavfile/ - and copy the {ikala_dir} folder to {save_path} - """.format( - ikala_dir=DATASET_DIR, save_path=data_home - ) - - download_utils.downloader( - data_home, - remotes=REMOTES, - info_message=download_message, - force_overwrite=force_overwrite, - ) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load iKala dataset - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - ikala_data = {} - for key in track_ids(): - ikala_data[key] = Track(key, data_home=data_home) - return ikala_data - - def load_f0(f0_path): if not os.path.exists(f0_path): raise IOError("f0_path {} does not exist".format(f0_path)) @@ -310,8 +232,8 @@ def load_lyrics(lyrics_path): raise IOError("lyrics_path {} does not exist".format(lyrics_path)) # input: start time (ms), end time (ms), lyric, [pronunciation] - with open(lyrics_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter=' ') + with open(lyrics_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter=" ") start_times = [] end_times = [] lyrics = [] @@ -321,8 +243,8 @@ def load_lyrics(lyrics_path): end_times.append(float(line[1]) / 1000.0) lyrics.append(line[2]) if len(line) > 2: - pronunciation = ' '.join(line[3:]) - pronunciations.append(pronunciation if pronunciation != '' else None) + pronunciation = " ".join(line[3:]) + pronunciations.append(pronunciation if pronunciation != "" else None) else: pronunciations.append(None) @@ -333,24 +255,3 @@ def load_lyrics(lyrics_path): np.array(pronunciations), ) return lyrics_data - - -def cite(): - """Print the reference""" - cite_data = """ -=========== MLA =========== -Chan, Tak-Shing, et al. -"Vocal activity informed singing voice separation with the iKala dataset." -2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP). IEEE, 2015. 
- -========== Bibtex ========== -@inproceedings{chan2015vocal, - title={Vocal activity informed singing voice separation with the iKala dataset}, - author={Chan, Tak-Shing and Yeh, Tzu-Chun and Fan, Zhe-Cheng and Chen, Hung-Wei and Su, Li and Yang, Yi-Hsuan and Jang, Roger}, - booktitle={2015 IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP)}, - pages={718--722}, - year={2015}, - organization={IEEE} -} -""" - print(cite_data) diff --git a/mirdata/indexes/beatles_index.json b/mirdata/datasets/indexes/beatles_index.json similarity index 100% rename from mirdata/indexes/beatles_index.json rename to mirdata/datasets/indexes/beatles_index.json diff --git a/mirdata/indexes/beatport_key_index.json b/mirdata/datasets/indexes/beatport_key_index.json similarity index 100% rename from mirdata/indexes/beatport_key_index.json rename to mirdata/datasets/indexes/beatport_key_index.json diff --git a/mirdata/indexes/dali_index.json b/mirdata/datasets/indexes/dali_index.json similarity index 100% rename from mirdata/indexes/dali_index.json rename to mirdata/datasets/indexes/dali_index.json diff --git a/mirdata/indexes/giantsteps_key_index.json b/mirdata/datasets/indexes/giantsteps_key_index.json similarity index 100% rename from mirdata/indexes/giantsteps_key_index.json rename to mirdata/datasets/indexes/giantsteps_key_index.json diff --git a/mirdata/indexes/giantsteps_tempo_index.json b/mirdata/datasets/indexes/giantsteps_tempo_index.json similarity index 100% rename from mirdata/indexes/giantsteps_tempo_index.json rename to mirdata/datasets/indexes/giantsteps_tempo_index.json diff --git a/mirdata/indexes/groove_midi_index.json b/mirdata/datasets/indexes/groove_midi_index.json similarity index 100% rename from mirdata/indexes/groove_midi_index.json rename to mirdata/datasets/indexes/groove_midi_index.json diff --git a/mirdata/indexes/gtzan_genre_index.json b/mirdata/datasets/indexes/gtzan_genre_index.json similarity index 100% rename from 
mirdata/indexes/gtzan_genre_index.json rename to mirdata/datasets/indexes/gtzan_genre_index.json diff --git a/mirdata/indexes/guitarset_index.json b/mirdata/datasets/indexes/guitarset_index.json similarity index 100% rename from mirdata/indexes/guitarset_index.json rename to mirdata/datasets/indexes/guitarset_index.json diff --git a/mirdata/indexes/ikala_index.json b/mirdata/datasets/indexes/ikala_index.json similarity index 100% rename from mirdata/indexes/ikala_index.json rename to mirdata/datasets/indexes/ikala_index.json diff --git a/mirdata/indexes/maestro_index.json b/mirdata/datasets/indexes/maestro_index.json similarity index 100% rename from mirdata/indexes/maestro_index.json rename to mirdata/datasets/indexes/maestro_index.json diff --git a/mirdata/indexes/medley_solos_db_index.json b/mirdata/datasets/indexes/medley_solos_db_index.json similarity index 100% rename from mirdata/indexes/medley_solos_db_index.json rename to mirdata/datasets/indexes/medley_solos_db_index.json diff --git a/mirdata/indexes/medleydb_melody_index.json b/mirdata/datasets/indexes/medleydb_melody_index.json similarity index 100% rename from mirdata/indexes/medleydb_melody_index.json rename to mirdata/datasets/indexes/medleydb_melody_index.json diff --git a/mirdata/indexes/medleydb_pitch_index.json b/mirdata/datasets/indexes/medleydb_pitch_index.json similarity index 100% rename from mirdata/indexes/medleydb_pitch_index.json rename to mirdata/datasets/indexes/medleydb_pitch_index.json diff --git a/mirdata/indexes/mridangam_stroke_index.json b/mirdata/datasets/indexes/mridangam_stroke_index.json similarity index 100% rename from mirdata/indexes/mridangam_stroke_index.json rename to mirdata/datasets/indexes/mridangam_stroke_index.json diff --git a/mirdata/indexes/orchset_index.json b/mirdata/datasets/indexes/orchset_index.json similarity index 100% rename from mirdata/indexes/orchset_index.json rename to mirdata/datasets/indexes/orchset_index.json diff --git 
a/mirdata/indexes/rwc_classical_index.json b/mirdata/datasets/indexes/rwc_classical_index.json similarity index 100% rename from mirdata/indexes/rwc_classical_index.json rename to mirdata/datasets/indexes/rwc_classical_index.json diff --git a/mirdata/indexes/rwc_jazz_index.json b/mirdata/datasets/indexes/rwc_jazz_index.json similarity index 100% rename from mirdata/indexes/rwc_jazz_index.json rename to mirdata/datasets/indexes/rwc_jazz_index.json diff --git a/mirdata/indexes/rwc_popular_index.json b/mirdata/datasets/indexes/rwc_popular_index.json similarity index 100% rename from mirdata/indexes/rwc_popular_index.json rename to mirdata/datasets/indexes/rwc_popular_index.json diff --git a/mirdata/indexes/salami_index.json b/mirdata/datasets/indexes/salami_index.json similarity index 100% rename from mirdata/indexes/salami_index.json rename to mirdata/datasets/indexes/salami_index.json diff --git a/mirdata/indexes/tinysol_index.json b/mirdata/datasets/indexes/tinysol_index.json similarity index 100% rename from mirdata/indexes/tinysol_index.json rename to mirdata/datasets/indexes/tinysol_index.json diff --git a/mirdata/maestro.py b/mirdata/datasets/maestro.py similarity index 58% rename from mirdata/maestro.py rename to mirdata/datasets/maestro.py index 1162e1c05..b1341be2d 100644 --- a/mirdata/maestro.py +++ b/mirdata/datasets/maestro.py @@ -39,63 +39,70 @@ from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = 'MAESTRO' + +BIBTEX = """@inproceedings{ + hawthorne2018enabling, + title={Enabling Factorized Piano Music Modeling and Generation with the {MAESTRO} Dataset}, + author={Curtis Hawthorne and Andriy Stasyuk and Adam Roberts and Ian Simon and Cheng-Zhi Anna Huang and Sander Dieleman and Erich Elsen and Jesse Engel and Douglas Eck}, + booktitle={International Conference on Learning Representations}, + year={2019}, + 
url={https://openreview.net/forum?id=r1lYRjC9F7}, +} +""" REMOTES = { - 'all': download_utils.RemoteFileMetadata( - filename='maestro-v2.0.0.zip', - url='https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0.zip', - checksum='7a6c23536ebcf3f50b1f00ac253886a7', - destination_dir='', + "all": download_utils.RemoteFileMetadata( + filename="maestro-v2.0.0.zip", + url="https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0.zip", + checksum="7a6c23536ebcf3f50b1f00ac253886a7", + destination_dir="", ), - 'midi': download_utils.RemoteFileMetadata( - filename='maestro-v2.0.0-midi.zip', - url='https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip', - checksum='8a45cc678a8b23cd7bad048b1e9034c5', - destination_dir='', + "midi": download_utils.RemoteFileMetadata( + filename="maestro-v2.0.0-midi.zip", + url="https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0-midi.zip", + checksum="8a45cc678a8b23cd7bad048b1e9034c5", + destination_dir="", ), - 'metadata': download_utils.RemoteFileMetadata( - filename='maestro-v2.0.0.json', - url='https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0.json', - checksum='576172af1cdc4efddcf0be7d260d48f7', - destination_dir='maestro-v2.0.0', + "metadata": download_utils.RemoteFileMetadata( + filename="maestro-v2.0.0.json", + url="https://storage.googleapis.com/magentadata/datasets/maestro/v2.0.0/maestro-v2.0.0.json", + checksum="576172af1cdc4efddcf0be7d260d48f7", + destination_dir="maestro-v2.0.0", ), } def _load_metadata(data_home): - metadata_path = os.path.join(data_home, 'maestro-v2.0.0.json') + metadata_path = os.path.join(data_home, "maestro-v2.0.0.json") if not os.path.exists(metadata_path): logging.info("Metadata file {} not found.".format(metadata_path)) return None # load metadata however makes sense for your dataset - with open(metadata_path, 'r') as fhandle: + with open(metadata_path, "r") as 
fhandle: raw_metadata = json.load(fhandle) metadata = {} for mdata in raw_metadata: - track_id = mdata['midi_filename'].split('.')[0] + track_id = mdata["midi_filename"].split(".")[0] metadata[track_id] = mdata - metadata['data_home'] = data_home + metadata["data_home"] = data_home return metadata -DATA = utils.LargeData('maestro_index.json', _load_metadata) +DATA = utils.LargeData("maestro_index.json", _load_metadata) -class Track(track.Track): +class Track(core.Track): """MAESTRO Track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: audio_path (str): Path to the track's audio file @@ -111,28 +118,25 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: - raise ValueError('{} is not a valid track ID in MAESTRO'.format(track_id)) + raise ValueError("{} is not a valid track ID in MAESTRO".format(track_id)) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) - self.midi_path = os.path.join(self._data_home, self._track_paths['midi'][0]) + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + self.midi_path = os.path.join(self._data_home, self._track_paths["midi"][0]) self._metadata = DATA.metadata(data_home) if self._metadata is not None and track_id in self._metadata: - self.canonical_composer = self._metadata[track_id]['canonical_composer'] - self.canonical_title = self._metadata[track_id]['canonical_title'] - self.split = self._metadata[track_id]['split'] - self.year = self._metadata[track_id]['year'] - self.duration = self._metadata[track_id]['duration'] + 
self.canonical_composer = self._metadata[track_id]["canonical_composer"] + self.canonical_title = self._metadata[track_id]["canonical_title"] + self.split = self._metadata[track_id]["split"] + self.year = self._metadata[track_id]["year"] + self.duration = self._metadata[track_id]["duration"] else: self.canonical_composer = None self.canonical_title = None @@ -221,39 +225,40 @@ def load_audio(audio_path): return librosa.load(audio_path, sr=None, mono=True) -def download( - data_home=None, partial_download=None, force_overwrite=False, cleanup=True +def _download( + save_dir, remotes, partial_download, info_message, force_overwrite, cleanup ): """Download the dataset. - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - partial_download (list): + save_dir (str): + The directory to download the data + remotes (dict or None): + A dictionary of RemoteFileMetadata tuples of data in zip format. + If None, there is no data to download + partial_download (list or None): List indicating what to partially download. The list can include any of: * 'all': audio, midi and metadata * 'midi': midi and metadata only - * 'metadata': metadata only - If `None`, all data is downloaded. + * 'metadata': metadata only + If None, all data is downloaded + info_message (str or None): + A string of info to print when this function is called. + If None, no string is printed. + force_overwrite (bool): + If True, existing files are overwritten by the downloaded files. cleanup (bool): Whether to delete the zip/tar file after extracting. 
""" - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - # in MAESTRO "metadata" is contained in "midi" is contained in "all" - if partial_download is None or 'all' in partial_download: - partial_download = ['all'] - elif 'midi' in partial_download: - partial_download = ['midi'] + if partial_download is None or "all" in partial_download: + partial_download = ["all"] + elif "midi" in partial_download: + partial_download = ["midi"] download_utils.downloader( - data_home, - remotes=REMOTES, + save_dir, + remotes=remotes, partial_download=partial_download, force_overwrite=force_overwrite, cleanup=cleanup, @@ -261,81 +266,11 @@ def download( # files get downloaded to a folder called maestro-v2.0.0 # move everything up a level - maestro_dir = os.path.join(data_home, 'maestro-v2.0.0') - maestro_files = glob.glob(os.path.join(maestro_dir, '*')) + maestro_dir = os.path.join(save_dir, "maestro-v2.0.0") + maestro_files = glob.glob(os.path.join(maestro_dir, "*")) for fpath in maestro_files: - shutil.move(fpath, data_home) + shutil.move(fpath, save_dir) if os.path.exists(maestro_dir): shutil.rmtree(maestro_dir) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load MAESTRO dataset - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - data = {} - for key in DATA.index.keys(): - data[key] = Track(key, data_home=data_home) - return data - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Curtis Hawthorne, Andriy Stasyuk, Adam Roberts, Ian Simon, Cheng-Zhi Anna Huang, - Sander Dieleman, Erich Elsen, Jesse Engel, and Douglas Eck. "Enabling - Factorized Piano Music Modeling and Generation with the MAESTRO Dataset." - In International Conference on Learning Representations, 2019. 
-========== Bibtex ========== -@inproceedings{ - hawthorne2018enabling, - title={Enabling Factorized Piano Music Modeling and Generation with the {MAESTRO} Dataset}, - author={Curtis Hawthorne and Andriy Stasyuk and Adam Roberts and Ian Simon and Cheng-Zhi Anna Huang and Sander Dieleman and Erich Elsen and Jesse Engel and Douglas Eck}, - booktitle={International Conference on Learning Representations}, - year={2019}, - url={https://openreview.net/forum?id=r1lYRjC9F7}, -} -""" - print(cite_data) diff --git a/mirdata/medley_solos_db.py b/mirdata/datasets/medley_solos_db.py similarity index 58% rename from mirdata/medley_solos_db.py rename to mirdata/datasets/medley_solos_db.py index 573228664..fcc93335c 100644 --- a/mirdata/medley_solos_db.py +++ b/mirdata/datasets/medley_solos_db.py @@ -30,18 +30,23 @@ from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = "Medley-solos-DB" +BIBTEX = """@inproceedings{lostanlen2019ismir, + title={Deep Convolutional Networks in the Pitch Spiral for Musical Instrument Recognition}, + author={Lostanlen, Vincent and Cella, Carmine Emanuele}, + booktitle={International Society of Music Information Retrieval (ISMIR)}, + year={2016} +}""" REMOTES = { - 'annotations': download_utils.RemoteFileMetadata( + "annotations": download_utils.RemoteFileMetadata( filename="Medley-solos-DB_metadata.csv", url="https://zenodo.org/record/3464194/files/Medley-solos-DB_metadata.csv?download=1", checksum="fda6a589c56785f2195c9227809c521a", destination_dir="annotation", ), - 'audio': download_utils.RemoteFileMetadata( + "audio": download_utils.RemoteFileMetadata( filename="Medley-solos-DB.tar.gz", url="https://zenodo.org/record/3464194/files/Medley-solos-DB.tar.gz?download=1", checksum="f5facf398793ef5c1f80c013afdf3e5f", @@ -80,13 +85,11 @@ def _load_metadata(data_home): DATA = utils.LargeData("medley_solos_db_index.json", _load_metadata) -class 
Track(track.Track): +class Track(core.Track): """medley_solos_db Track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: audio_path (str): path to the track's audio file @@ -98,7 +101,7 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: raise ValueError( "{} is not a valid track ID in Medley-solos-DB".format(track_id) @@ -106,9 +109,6 @@ def __init__(self, track_id, data_home=None): self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] @@ -157,102 +157,3 @@ def load_audio(audio_path): raise IOError("audio_path {} does not exist".format(audio_path)) return librosa.load(audio_path, sr=22050, mono=True) - - -def download( - data_home=None, partial_download=None, force_overwrite=False, cleanup=True -): - """Download Medley-solos-DB. - - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - partial_download (list): - List indicating what to partially download. The list can include any of: - * `'annotations'` the annotation files - * `'audio'` the audio files - If `None`, all data is downloaded. - cleanup (bool): - Whether to delete the zip/tar file after extracting. 
- """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_utils.downloader( - data_home, - remotes=REMOTES, - partial_download=partial_download, - info_message=None, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def load(data_home=None): - """Load Medley-solos-DB - Args: - data_home (str): Local path where Medley-solos-DB is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - medley_solos_db_data = {} - for key in DATA.index.keys(): - medley_solos_db_data[key] = Track(key, data_home=data_home) - return medley_solos_db_data - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Lostanlen, Vincent and Cella, Carmine Emanuele. -"Deep Convolutional Networks in the Pitch Spiral for Musical Instrument Recognition." -In Proceedings of the 16th International Society for Music Information Retrieval Conference (ISMIR). 2016. 
-========== Bibtex ========== -@inproceedings{lostanlen2019ismir, - title={Deep Convolutional Networks in the Pitch Spiral for Musical Instrument Recognition}, - author={Lostanlen, Vincent and Cella, Carmine Emanuele}, - booktitle={International Society of Music Information Retrieval (ISMIR)}, - year={2016} -} -""" - print(cite_data) diff --git a/mirdata/medleydb_melody.py b/mirdata/datasets/medleydb_melody.py similarity index 52% rename from mirdata/medleydb_melody.py rename to mirdata/datasets/medleydb_melody.py index 0f10928bc..fd53f8a69 100644 --- a/mirdata/medleydb_melody.py +++ b/mirdata/datasets/medleydb_melody.py @@ -20,36 +20,49 @@ from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = 'MedleyDB-Melody' +BIBTEX = """@inproceedings{bittner2014medleydb, + Author = {Bittner, Rachel M and Salamon, Justin and Tierney, Mike and Mauch, Matthias and Cannam, Chris and Bello, Juan P}, + Booktitle = {International Society of Music Information Retrieval (ISMIR)}, + Month = {October}, + Title = {Medley{DB}: A Multitrack Dataset for Annotation-Intensive {MIR} Research}, + Year = {2014} +}""" +DOWNLOAD_INFO = """ + To download this dataset, visit: + https://zenodo.org/record/2628782#.XKZdABNKh24 + and request access. 
+ + Once downloaded, unzip the file MedleyDB-Melody.zip + and copy the result to: + {data_home} +""" def _load_metadata(data_home): - metadata_path = os.path.join(data_home, 'medleydb_melody_metadata.json') + metadata_path = os.path.join(data_home, "medleydb_melody_metadata.json") if not os.path.exists(metadata_path): - logging.info('Metadata file {} not found.'.format(metadata_path)) + logging.info("Metadata file {} not found.".format(metadata_path)) return None - with open(metadata_path, 'r') as fhandle: + with open(metadata_path, "r") as fhandle: metadata = json.load(fhandle) - metadata['data_home'] = data_home + metadata["data_home"] = data_home return metadata -DATA = utils.LargeData('medleydb_melody_index.json', _load_metadata) +DATA = utils.LargeData("medleydb_melody_index.json", _load_metadata) -class Track(track.Track): +class Track(core.Track): """medleydb_melody Track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: artist (str): artist @@ -66,27 +79,24 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: raise ValueError( - '{} is not a valid track ID in MedleyDB-Melody'.format(track_id) + "{} is not a valid track ID in medleydb_melody".format(track_id) ) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] self.melody1_path = os.path.join( - self._data_home, self._track_paths['melody1'][0] + self._data_home, self._track_paths["melody1"][0] ) self.melody2_path = os.path.join( - self._data_home, self._track_paths['melody2'][0] + self._data_home, self._track_paths["melody2"][0] ) self.melody3_path = os.path.join( - self._data_home, self._track_paths['melody3'][0] + 
self._data_home, self._track_paths["melody3"][0] ) metadata = DATA.metadata(data_home) @@ -94,21 +104,21 @@ def __init__(self, track_id, data_home=None): self._track_metadata = metadata[track_id] else: self._track_metadata = { - 'artist': None, - 'title': None, - 'genre': None, - 'is_excerpt': None, - 'is_instrumental': None, - 'n_sources': None, + "artist": None, + "title": None, + "genre": None, + "is_excerpt": None, + "is_instrumental": None, + "n_sources": None, } - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) - self.artist = self._track_metadata['artist'] - self.title = self._track_metadata['title'] - self.genre = self._track_metadata['genre'] - self.is_excerpt = self._track_metadata['is_excerpt'] - self.is_instrumental = self._track_metadata['is_instrumental'] - self.n_sources = self._track_metadata['n_sources'] + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + self.artist = self._track_metadata["artist"] + self.title = self._track_metadata["title"] + self.genre = self._track_metadata["genre"] + self.is_excerpt = self._track_metadata["is_excerpt"] + self.is_instrumental = self._track_metadata["is_instrumental"] + self.n_sources = self._track_metadata["n_sources"] @utils.cached_property def melody1(self): @@ -135,7 +145,7 @@ def to_jams(self): # jams does not support multipitch, so we skip melody3 return jams_utils.jams_converter( audio_path=self.audio_path, - f0_data=[(self.melody1, 'melody1'), (self.melody2, 'melody2')], + f0_data=[(self.melody1, "melody1"), (self.melody2, "melody2")], metadata=self._track_metadata, ) @@ -157,95 +167,14 @@ def load_audio(audio_path): return librosa.load(audio_path, sr=None, mono=True) -def download(data_home=None): - """MedleyDB is not available for downloading directly. - This function prints a helper message to download MedleyDB - through zenodo.org. - - Args: - data_home (str): - Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - """ - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - info_message = """ - To download this dataset, visit: - https://zenodo.org/record/2628782#.XKZdABNKh24 - and request access. - - Once downloaded, unzip the file MedleyDB-Melody.zip - and copy the result to: - {data_home} - """.format( - data_home=data_home - ) - - download_utils.downloader(data_home, info_message=info_message) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load MedleyDB melody dataset - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - """ - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - medleydb_melody_data = {} - for key in track_ids(): - medleydb_melody_data[key] = Track(key, data_home=data_home) - return medleydb_melody_data - - def load_melody(melody_path): if not os.path.exists(melody_path): raise IOError("melody_path {} does not exist".format(melody_path)) times = [] freqs = [] - with open(melody_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter=',') + with open(melody_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter=",") for line in reader: times.append(float(line[0])) freqs.append(float(line[1])) @@ -264,8 +193,8 @@ def load_melody3(melody_path): times = [] freqs_list = [] conf_list = [] - with open(melody_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter=',') + with open(melody_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter=",") for line in reader: times.append(float(line[0])) freqs_list.append([float(v) for v in line[1:]]) @@ -274,24 +203,3 @@ def load_melody3(melody_path): times = np.array(times) melody_data = utils.MultipitchData(times, freqs_list, conf_list) return melody_data - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Bittner, Rachel, et al. -"MedleyDB: A multitrack dataset for annotation-intensive MIR research." -In Proceedings of the 15th International Society for Music Information Retrieval Conference (ISMIR). 2014. 
- -========== Bibtex ========== -@inproceedings{bittner2014medleydb, - Author = {Bittner, Rachel M and Salamon, Justin and Tierney, Mike and Mauch, Matthias and Cannam, Chris and Bello, Juan P}, - Booktitle = {International Society of Music Information Retrieval (ISMIR)}, - Month = {October}, - Title = {Medley{DB}: A Multitrack Dataset for Annotation-Intensive {MIR} Research}, - Year = {2014} -} -""" - print(cite_data) diff --git a/mirdata/datasets/medleydb_pitch.py b/mirdata/datasets/medleydb_pitch.py new file mode 100644 index 000000000..30ffb995d --- /dev/null +++ b/mirdata/datasets/medleydb_pitch.py @@ -0,0 +1,159 @@ +# -*- coding: utf-8 -*- +"""MedleyDB pitch Dataset Loader + +MedleyDB is a dataset of annotated, royalty-free multitrack recordings. +MedleyDB was curated primarily to support research on melody extraction, +addressing important shortcomings of existing collections. For each song +we provide melody f0 annotations as well as instrument activations for +evaluating automatic instrument recognition. + +For more details, please visit: https://medleydb.weebly.com + +""" + +import csv +import json +import librosa +import logging +import numpy as np +import os + +from mirdata import download_utils +from mirdata import jams_utils +from mirdata import core +from mirdata import utils + +BIBTEX = """@inproceedings{bittner2014medleydb, + Author = {Bittner, Rachel M and Salamon, Justin and Tierney, Mike and Mauch, Matthias and Cannam, Chris and Bello, Juan P}, + Booktitle = {International Society of Music Information Retrieval (ISMIR)}, + Month = {October}, + Title = {Medley{DB}: A Multitrack Dataset for Annotation-Intensive {MIR} Research}, + Year = {2014} +}""" +DOWNLOAD_INFO = """ + To download this dataset, visit: + https://zenodo.org/record/2620624#.XKZc7hNKh24 + and request access. 
+ + Once downloaded, unzip the file MedleyDB-Pitch.zip + and copy the result to: + {data_home} +""" + + +def _load_metadata(data_home): + metadata_path = os.path.join(data_home, "medleydb_pitch_metadata.json") + + if not os.path.exists(metadata_path): + logging.info("Metadata file {} not found.".format(metadata_path)) + return None + + with open(metadata_path, "r") as fhandle: + metadata = json.load(fhandle) + + metadata["data_home"] = data_home + return metadata + + +DATA = utils.LargeData("medleydb_pitch_index.json", _load_metadata) + + +class Track(core.Track): + """medleydb_pitch Track class + + Args: + track_id (str): track id of the track + + Attributes: + artist (str): artist + audio_path (str): path to the audio file + genre (str): genre + instrument (str): instrument of the track + pitch_path (str): path to the pitch annotation file + title (str): title + track_id (str): track id + + """ + + def __init__(self, track_id, data_home): + if track_id not in DATA.index: + raise ValueError( + "{} is not a valid track ID in MedleyDB-Pitch".format(track_id) + ) + + self.track_id = track_id + + self._data_home = data_home + self._track_paths = DATA.index[track_id] + self.pitch_path = os.path.join(self._data_home, self._track_paths["pitch"][0]) + + metadata = DATA.metadata(data_home) + if metadata is not None and track_id in metadata: + self._track_metadata = metadata[track_id] + else: + self._track_metadata = { + "instrument": None, + "artist": None, + "title": None, + "genre": None, + } + + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + self.instrument = self._track_metadata["instrument"] + self.artist = self._track_metadata["artist"] + self.title = self._track_metadata["title"] + self.genre = self._track_metadata["genre"] + + @utils.cached_property + def pitch(self): + """F0Data: The human-annotated pitch""" + return load_pitch(self.pitch_path) + + @property + def audio(self): + """(np.ndarray, float): audio signal, sample rate""" 
+ return load_audio(self.audio_path) + + def to_jams(self): + """Jams: the track's data in jams format""" + return jams_utils.jams_converter( + audio_path=self.audio_path, + f0_data=[(self.pitch, "annotated pitch")], + metadata=self._track_metadata, + ) + + +def load_audio(audio_path): + """Load a MedleyDB audio file. + + Args: + audio_path (str): path to audio file + + Returns: + y (np.ndarray): the mono audio signal + sr (float): The sample rate of the audio file + + """ + if not os.path.exists(audio_path): + raise IOError("audio_path {} does not exist".format(audio_path)) + + return librosa.load(audio_path, sr=None, mono=True) + + +def load_pitch(pitch_path): + if not os.path.exists(pitch_path): + raise IOError("pitch_path {} does not exist".format(pitch_path)) + + times = [] + freqs = [] + with open(pitch_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter=",") + for line in reader: + times.append(float(line[0])) + freqs.append(float(line[1])) + + times = np.array(times) + freqs = np.array(freqs) + confidence = (freqs > 0).astype(float) + pitch_data = utils.F0Data(times, freqs, confidence) + return pitch_data diff --git a/mirdata/mridangam_stroke.py b/mirdata/datasets/mridangam_stroke.py similarity index 50% rename from mirdata/mridangam_stroke.py rename to mirdata/datasets/mridangam_stroke.py index cdc0ee162..5214988c1 100644 --- a/mirdata/mridangam_stroke.py +++ b/mirdata/datasets/mridangam_stroke.py @@ -40,38 +40,52 @@ from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils - -DATASET_DIR = 'Mridangam-Stroke' - +BIBTEX = """@article{Anantapadmanabhan2013, + author = {Anantapadmanabhan, Akshay and Bellur, Ashwin and Murthy, Hema A.}, + doi = {10.1109/ICASSP.2013.6637633}, + isbn = {9781479903566}, + issn = {15206149}, + journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings}, + keywords = {Hidden Markov 
models, Modal Analysis, Mridangam, Non-negative Matrix Factorization, + automatic transcription}, + pages = {181--185}, + title = {{Modal analysis and transcription of strokes of the mridangam using non-negative matrix factorization}}, + year = {2013} +}""" REMOTES = { - 'remote_data': download_utils.RemoteFileMetadata( - filename='mridangam_stroke_1.5.zip', - url='https://zenodo.org/record/4068196/files/mridangam_stroke_1.5.zip?download=1', - checksum='39af55b2476b94c7946bec24331ec01a', # the md5 checksum + "remote_data": download_utils.RemoteFileMetadata( + filename="mridangam_stroke_1.5.zip", + url="https://zenodo.org/record/4068196/files/mridangam_stroke_1.5.zip?download=1", + checksum="39af55b2476b94c7946bec24331ec01a", # the md5 checksum destination_dir=None, # relative path for where to unzip the data, or None ), } -DATA = utils.LargeData( - 'mridangam_stroke_index.json' -) +DATA = utils.LargeData("mridangam_stroke_index.json") STROKE_DICT = { - 'bheem', 'cha', 'dheem', 'dhin', 'num', 'ta', 'tha', 'tham', 'thi', 'thom' + "bheem", + "cha", + "dheem", + "dhin", + "num", + "ta", + "tha", + "tham", + "thi", + "thom", } -TONIC_DICT = { - 'B', 'C', 'C#', 'D', 'D#', 'E' -} +TONIC_DICT = {"B", "C", "C#", "D", "D#", "E"} -class Track(track.Track): +class Track(core.Track): """Mridangam Stroke track class Args: track_id (str): track id of the track @@ -84,27 +98,28 @@ class Track(track.Track): tonic (str): tonic of the stroke in the Track """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: - raise ValueError('{} is not a valid track ID in Example'.format(track_id)) + raise ValueError("{} is not a valid track ID in Example".format(track_id)) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) 
+ self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) # Parse stroke name annotation from audio file name - self.stroke_name = self.audio_path.split('__')[2].split('-')[0] - assert self.stroke_name in STROKE_DICT, "Stroke {} not in stroke dictionary".format(self.stroke_name) + self.stroke_name = self.audio_path.split("__")[2].split("-")[0] + assert ( + self.stroke_name in STROKE_DICT + ), "Stroke {} not in stroke dictionary".format(self.stroke_name) # Parse tonic annotation from audio file name self.tonic = os.path.basename(os.path.dirname(self.audio_path)) - assert self.tonic in TONIC_DICT, "Tonic {} not in tonic dictionary".format(self.tonic) + assert self.tonic in TONIC_DICT, "Tonic {} not in tonic dictionary".format( + self.tonic + ) @property def audio(self): @@ -115,10 +130,8 @@ def to_jams(self): """Jams: the track's data in jams format""" return jams_utils.jams_converter( audio_path=self.audio_path, - tags_open_data=[(self.stroke_name, 'stroke_name')], - metadata={ - 'tonic': self.tonic - }, + tags_open_data=[(self.stroke_name, "stroke_name")], + metadata={"tonic": self.tonic}, ) @@ -133,96 +146,3 @@ def load_audio(audio_path): if not os.path.exists(audio_path): raise IOError("audio_path {} does not exist".format(audio_path)) return librosa.load(audio_path, sr=44100, mono=True) - - -def download(data_home=None, force_overwrite=False, cleanup=True): - """Download the Mridangam Stroke Dataset. - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - cleanup (bool): - Whether to delete the zip/tar file after extracting. 
- """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_utils.downloader( - data_home, - remotes=REMOTES, - info_message=None, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load Mridangam Stroke dataset - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - data = {} - for key in DATA.index.keys(): - data[key] = Track(key, data_home=data_home) - return data - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Anantapadmanabhan, A., Bellur, A., & Murthy, H. A. 
-"Modal analysis and transcription of strokes of the mridangam using non-negative matrix factorization" (2013) -IEEE International Conference on Acoustics, Speech and Signal Processing (ICASSP 2013) -========== Bibtex ========== -@article{Anantapadmanabhan2013, - author = {Anantapadmanabhan, Akshay and Bellur, Ashwin and Murthy, Hema A.}, - doi = {10.1109/ICASSP.2013.6637633}, - isbn = {9781479903566}, - issn = {15206149}, - journal = {ICASSP, IEEE International Conference on Acoustics, Speech and Signal Processing - Proceedings}, - keywords = {Hidden Markov models, Modal Analysis, Mridangam, Non-negative Matrix Factorization, - automatic transcription}, - pages = {181--185}, - title = {{Modal analysis and transcription of strokes of the mridangam using non-negative matrix factorization}}, - year = {2013} -} -""" - print(cite_data) diff --git a/mirdata/orchset.py b/mirdata/datasets/orchset.py similarity index 75% rename from mirdata/orchset.py rename to mirdata/datasets/orchset.py index 816caa60c..ef9899737 100644 --- a/mirdata/orchset.py +++ b/mirdata/datasets/orchset.py @@ -12,18 +12,27 @@ import csv import glob -import librosa import logging -import numpy as np import os import shutil +import librosa +import numpy as np from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils - +BIBTEX = """@article{bosch2016evaluation, + title={Evaluation and combination of pitch estimation methods for melody extraction in symphonic classical music}, + author={Bosch, Juan J and Marxer, Ricard and G{\'o}mez, Emilia}, + journal={Journal of New Music Research}, + volume={45}, + number={2}, + pages={101--117}, + year={2016}, + publisher={Taylor \\& Francis} +}""" REMOTES = { "all": download_utils.RemoteFileMetadata( filename="Orchset_dataset_0.zip", @@ -33,8 +42,6 @@ ) } -DATASET_DIR = "Orchset" - def _load_metadata(data_home): @@ -99,13 +106,11 @@ def _load_metadata(data_home): DATA = 
utils.LargeData("orchset_index.json", _load_metadata) -class Track(track.Track): +class Track(core.Track): """orchset Track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: alternating_melody (bool): True if the melody alternates between instruments @@ -126,15 +131,12 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: - raise ValueError("{} is not a valid track ID in Orchset".format(track_id)) + raise ValueError("{} is not a valid track ID in orchset".format(track_id)) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] self.melody_path = os.path.join(self._data_home, self._track_paths["melody"][0]) @@ -236,93 +238,43 @@ def load_audio_stereo(audio_path): return librosa.load(audio_path, sr=None, mono=False) -def download(data_home=None, force_overwrite=False, cleanup=True): - """Download ORCHSET Dataset. +def _download( + save_dir, remotes, partial_download, info_message, force_overwrite, cleanup +): + """Download the dataset. Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` + save_dir (str): + The directory to download the data + remotes (dict or None): + A dictionary of RemoteFileMetadata tuples of data in zip format. + If None, there is no data to download + partial_download (list or None): + A list of keys to partially download the remote objects of the download dict. + If None, all data is downloaded + info_message (str or None): + A string of info to print when this function is called. + If None, no string is printed. 
force_overwrite (bool): - Whether to overwrite the existing downloaded data + If True, existing files are overwritten by the downloaded files. cleanup (bool): Whether to delete the zip/tar file after extracting. """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - download_utils.downloader( - data_home, - remotes=REMOTES, + save_dir, + remotes=remotes, info_message=None, force_overwrite=force_overwrite, cleanup=cleanup, ) - # files get downloaded to a folder called Orchset - move everything up a level - duplicated_orchset_dir = os.path.join(data_home, "Orchset") + duplicated_orchset_dir = os.path.join(save_dir, "Orchset") orchset_files = glob.glob(os.path.join(duplicated_orchset_dir, "*")) - for fpath in orchset_files: - shutil.move(fpath, data_home) - + shutil.move(fpath, save_dir) if os.path.exists(duplicated_orchset_dir): - os.removedirs(duplicated_orchset_dir) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - dataset_path (str): ORCHSET dataset local path - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load ORCHSET dataset - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - orchset_data = {} - for key in track_ids(): - orchset_data[key] = Track(key, data_home=data_home) - return orchset_data + shutil.rmtree(duplicated_orchset_dir) def load_melody(melody_path): @@ -342,27 +294,3 @@ def load_melody(melody_path): melody_data = utils.F0Data(np.array(times), np.array(freqs), np.array(confidence)) return melody_data - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Bosch, J., Marxer, R., Gomez, E., "Evaluation and Combination of -Pitch Estimation Methods for Melody Extraction in Symphonic -Classical Music", Journal of New Music Research (2016) - -========== Bibtex ========== -@article{bosch2016evaluation, - title={Evaluation and combination of pitch estimation methods for melody extraction in symphonic classical music}, - author={Bosch, Juan J and Marxer, Ricard and G{\'o}mez, Emilia}, - journal={Journal of New Music Research}, - volume={45}, - number={2}, - pages={101--117}, - year={2016}, - publisher={Taylor \\& Francis} -} -""" - - print(cite_data) diff --git a/mirdata/datasets/rwc_classical.py b/mirdata/datasets/rwc_classical.py new file mode 100644 index 000000000..1f2e066a2 --- /dev/null +++ b/mirdata/datasets/rwc_classical.py @@ -0,0 +1,292 @@ +# -*- coding: utf-8 -*- +"""RWC Classical Dataset Loader + + The Classical Music Database consists of 50 pieces: +* Symphonies: 4 pieces +* Concerti: 2 pieces +* Orchestral music: 4 pieces +* Chamber music: 10 pieces +* Solo performances: 24 pieces +* Vocal performances: 6 pieces + +For more details, please visit: https://staff.aist.go.jp/m.goto/RWC-MDB/rwc-mdb-c.html +""" +import csv +import logging +import os + +import librosa +import numpy as np + +from mirdata import download_utils +from mirdata import jams_utils +from mirdata 
import core +from mirdata import utils + +BIBTEX = """@inproceedings{goto2002rwc, + title={RWC Music Database: Popular, Classical and Jazz Music Databases.}, + author={Goto, Masataka and Hashiguchi, Hiroki and Nishimura, Takuichi and Oka, Ryuichi}, + booktitle={3rd International Society for Music Information Retrieval Conference}, + year={2002}, + series={ISMIR}, +}""" +REMOTES = { + "annotations_beat": download_utils.RemoteFileMetadata( + filename="AIST.RWC-MDB-C-2001.BEAT.zip", + url="https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-C-2001.BEAT.zip", + checksum="e8ee05854833cbf5eb7280663f71c29b", + destination_dir="annotations", + ), + "annotations_sections": download_utils.RemoteFileMetadata( + filename="AIST.RWC-MDB-C-2001.CHORUS.zip", + url="https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-C-2001.CHORUS.zip", + checksum="f77bd527510376f59f5a2eed8fd7feb3", + destination_dir="annotations", + ), + "metadata": download_utils.RemoteFileMetadata( + filename="rwc-c.csv", + url="https://github.com/magdalenafuentes/metadata/archive/master.zip", + checksum="7dbe87fedbaaa1f348625a2af1d78030", + destination_dir=None, + ), +} +DATASET_INFO = """ + Unfortunately the audio files of the RWC-Classical dataset are not available + for download. If you have the RWC-Classical dataset, place the contents into a + folder called RWC-Classical with the following structure: + > RWC-Classical/ + > annotations/ + > audio/rwc-c-m0i with i in [1 .. 
6] + > metadata-master/ + and copy the RWC-Classical folder to {data_home} +""" + + +def _load_metadata(data_home): + + metadata_path = os.path.join(data_home, "metadata-master", "rwc-c.csv") + + if not os.path.exists(metadata_path): + logging.info( + "Metadata file {} not found.".format(metadata_path) + + "You can download the metadata file by running download()" + ) + return None + + with open(metadata_path, "r") as fhandle: + dialect = csv.Sniffer().sniff(fhandle.read(1024)) + fhandle.seek(0) + reader = csv.reader(fhandle, dialect) + raw_data = [] + for line in reader: + if line[0] != "Piece No.": + raw_data.append(line) + + metadata_index = {} + for line in raw_data: + if line[0] == "Piece No.": + continue + p = "00" + line[0].split(".")[1][1:] + track_id = "RM-C{}".format(p[len(p) - 3 :]) + + metadata_index[track_id] = { + "piece_number": line[0], + "suffix": line[1], + "track_number": line[2], + "title": line[3], + "composer": line[4], + "artist": line[5], + "duration": _duration_to_sec(line[6]), + "category": line[7], + } + + metadata_index["data_home"] = data_home + + return metadata_index + + +DATA = utils.LargeData("rwc_classical_index.json", _load_metadata) + + +class Track(core.Track): + """rwc_classical Track class + + Args: + track_id (str): track id of the track + + Attributes: + artist (str): the track's artist + audio_path (str): path of the audio file + beats_path (str): path of the beat annotation file + category (str): One of 'Symphony', 'Concerto', 'Orchestral', + 'Solo', 'Chamber', 'Vocal', or blank. + composer (str): Composer of this Track. + duration (float): Duration of the track in seconds + piece_number (str): Piece number of this Track, [1-50] + sections_path (str): path of the section annotation file + suffix (str): string within M01-M06 + title (str): Title of The track. 
+ track_id (str): track id + track_number (str): CD track number of this Track + + """ + + def __init__(self, track_id, data_home): + if track_id not in DATA.index: + raise ValueError( + "{} is not a valid track ID in rwc_classical".format(track_id) + ) + + self.track_id = track_id + self._data_home = data_home + self._track_paths = DATA.index[track_id] + self.sections_path = os.path.join( + self._data_home, self._track_paths["sections"][0] + ) + self.beats_path = os.path.join(self._data_home, self._track_paths["beats"][0]) + + metadata = DATA.metadata(data_home) + if metadata is not None and track_id in metadata: + self._track_metadata = metadata[track_id] + else: + self._track_metadata = { + "piece_number": None, + "suffix": None, + "track_number": None, + "title": None, + "composer": None, + "artist": None, + "duration": None, + "category": None, + } + + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + + self.piece_number = self._track_metadata["piece_number"] + self.suffix = self._track_metadata["suffix"] + self.track_number = self._track_metadata["track_number"] + self.title = self._track_metadata["title"] + self.composer = self._track_metadata["composer"] + self.artist = self._track_metadata["artist"] + self.duration = self._track_metadata["duration"] + self.category = self._track_metadata["category"] + + @utils.cached_property + def sections(self): + """SectionData: human labeled section annotations""" + return load_sections(self.sections_path) + + @utils.cached_property + def beats(self): + """BeatData: human labeled beat annotations""" + return load_beats(self.beats_path) + + @property + def audio(self): + """(np.ndarray, float): audio signal, sample rate""" + return load_audio(self.audio_path) + + def to_jams(self): + """Jams: the track's data in jams format""" + return jams_utils.jams_converter( + audio_path=self.audio_path, + beat_data=[(self.beats, None)], + section_data=[(self.sections, None)], + 
metadata=self._track_metadata, + ) + + +def load_audio(audio_path): + """Load a RWC audio file. + + Args: + audio_path (str): path to audio file + + Returns: + y (np.ndarray): the mono audio signal + sr (float): The sample rate of the audio file + + """ + if not os.path.exists(audio_path): + raise IOError("audio_path {} does not exist".format(audio_path)) + + return librosa.load(audio_path, sr=None, mono=True) + + +def load_sections(sections_path): + if not os.path.exists(sections_path): + raise IOError("sections_path {} does not exist".format(sections_path)) + + begs = [] # timestamps of section beginnings + ends = [] # timestamps of section endings + secs = [] # section labels + + with open(sections_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter="\t") + for line in reader: + begs.append(float(line[0]) / 100.0) + ends.append(float(line[1]) / 100.0) + secs.append(line[2]) + + return utils.SectionData(np.array([begs, ends]).T, secs) + + +def _position_in_bar(beat_positions, beat_times): + """ + Mapping to beat position in bar (e.g. 1, 2, 3, 4). 
+ """ + # Remove -1 + _beat_positions = np.delete(beat_positions, np.where(beat_positions == -1)) + beat_times_corrected = np.delete(beat_times, np.where(beat_positions == -1)) + + # Create corrected array with downbeat positions + beat_positions_corrected = np.zeros((len(_beat_positions),)) + downbeat_positions = np.where(_beat_positions == np.max(_beat_positions))[0] + _beat_positions[downbeat_positions] = 1 + beat_positions_corrected[downbeat_positions] = 1 + + # Propagate positions + for b in range(0, len(_beat_positions)): + if _beat_positions[b] > _beat_positions[b - 1]: + beat_positions_corrected[b] = beat_positions_corrected[b - 1] + 1 + + if not downbeat_positions[0] == 0: + timesig_next_bar = beat_positions_corrected[downbeat_positions[1] - 1] + for b in range(1, downbeat_positions[0] + 1): + beat_positions_corrected[downbeat_positions[0] - b] = ( + timesig_next_bar - b + 1 + ) + + return beat_positions_corrected, beat_times_corrected + + +def load_beats(beats_path): + if not os.path.exists(beats_path): + raise IOError("beats_path {} does not exist".format(beats_path)) + + beat_times = [] # timestamps of beat interval beginnings + beat_positions = [] # beat position inside the bar + + with open(beats_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter="\t") + for line in reader: + beat_times.append(float(line[0]) / 100.0) + beat_positions.append(int(line[2])) + beat_positions, beat_times = _position_in_bar( + np.array(beat_positions), np.array(beat_times) + ) + + return utils.BeatData(beat_times, beat_positions.astype(int)) + + +def _duration_to_sec(duration): + if type(duration) == str: + if ":" in duration: + if len(duration.split(":")) <= 2: + minutes, secs = duration.split(":") + else: + minutes, secs, _ = duration.split( + ":" + ) # mistake in annotation in RM-J044 + total_secs = float(minutes) * 60 + float(secs) + return total_secs diff --git a/mirdata/datasets/rwc_jazz.py b/mirdata/datasets/rwc_jazz.py new file mode 100644 index 
000000000..0c19f6dc4 --- /dev/null +++ b/mirdata/datasets/rwc_jazz.py @@ -0,0 +1,222 @@ +# -*- coding: utf-8 -*- +"""RWC Jazz Dataset Loader. + +The Jazz Music Database consists of 50 pieces: + +* Instrumentation variations: 35 pieces (5 pieces × 7 instrumentations). +The instrumentation-variation pieces were recorded to obtain different versions +of the same piece; i.e., different arrangements performed by different player +instrumentations. Five standard-style jazz pieces were originally composed +and then performed in modern-jazz style using the following seven instrumentations: +1. Piano solo +2. Guitar solo +3. Duo: Vibraphone + Piano, Flute + Piano, and Piano + Bass +4. Piano trio: Piano + Bass + Drums +5. Piano trio + Trumpet or Tenor saxophone +6. Octet: Piano trio + Guitar + Alto saxophone + Baritone saxophone + Tenor saxophone × 2 +7. Piano trio + Vibraphone or Flute + +* Style variations: 9 pieces +The style-variation pieces were recorded to represent various styles of jazz. +They include four well-known public-domain pieces and consist of +1. Vocal jazz: 2 pieces (including "Aura Lee") +2. Big band jazz: 2 pieces (including "The Entertainer") +3. Modal jazz: 2 pieces +4. Funky jazz: 2 pieces (including "Silent Night") +5. Free jazz: 1 piece (including "Joyful, Joyful, We Adore Thee") +Fusion (crossover): 6 pieces +The fusion pieces were recorded to obtain music that combines elements of jazz +with other styles such as popular, rock, and latin. They include music with an +eighth-note feel, music with a sixteenth-note feel, and Latin jazz music. 
+ +For more details, please visit: https://staff.aist.go.jp/m.goto/RWC-MDB/rwc-mdb-j.html +""" +import csv +import logging +import os + +import librosa + +from mirdata import download_utils +from mirdata import jams_utils +from mirdata import core +from mirdata import utils + +# these functions are identical for all rwc datasets +from mirdata.datasets.rwc_classical import ( + load_beats, + load_sections, + load_audio, + _duration_to_sec, +) + +BIBTEX = """@inproceedings{goto2002rwc, + title={RWC Music Database: Popular, Classical and Jazz Music Databases.}, + author={Goto, Masataka and Hashiguchi, Hiroki and Nishimura, Takuichi and Oka, Ryuichi}, + booktitle={3rd International Society for Music Information Retrieval Conference}, + year={2002}, + series={ISMIR}, +}""" +REMOTES = { + "metadata": download_utils.RemoteFileMetadata( + filename="rwc-j.csv", + url="https://github.com/magdalenafuentes/metadata/archive/master.zip", + checksum="7dbe87fedbaaa1f348625a2af1d78030", + destination_dir=None, + ), + "annotations_beat": download_utils.RemoteFileMetadata( + filename="AIST.RWC-MDB-J-2001.BEAT.zip", + url="https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-J-2001.BEAT.zip", + checksum="b483853da05d0fff3992879f7729bcb4", + destination_dir="annotations", + ), + "annotations_sections": download_utils.RemoteFileMetadata( + filename="AIST.RWC-MDB-J-2001.CHORUS.zip", + url="https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-J-2001.CHORUS.zip", + checksum="44afcf7f193d7e48a7d99e7a6f3ed39d", + destination_dir="annotations", + ), +} +DOWNLOAD_INFO = """ + Unfortunately the audio files of the RWC-Jazz dataset are not available + for download. If you have the RWC-Jazz dataset, place the contents into a + folder called RWC-Jazz with the following structure: + > RWC-Jazz/ + > annotations/ + > audio/rwc-j-m0i with i in [1 .. 
4] + > metadata-master/ + and copy the RWC-Jazz folder to {data_home} +""" + + +def _load_metadata(data_home): + + metadata_path = os.path.join(data_home, "metadata-master", "rwc-j.csv") + + if not os.path.exists(metadata_path): + logging.info( + "Metadata file {} not found.".format(metadata_path) + + "You can download the metadata file by running download()" + ) + return None + + with open(metadata_path, "r") as fhandle: + dialect = csv.Sniffer().sniff(fhandle.read(1024)) + fhandle.seek(0) + reader = csv.reader(fhandle, dialect) + raw_data = [] + for line in reader: + if line[0] != "Piece No.": + raw_data.append(line) + + metadata_index = {} + for line in raw_data: + if line[0] == "Piece No.": + continue + p = "00" + line[0].split(".")[1][1:] + track_id = "RM-J{}".format(p[len(p) - 3 :]) + + metadata_index[track_id] = { + "piece_number": line[0], + "suffix": line[1], + "track_number": line[2], + "title": line[3], + "artist": line[4], + "duration": _duration_to_sec(line[5]), + "variation": line[6], + "instruments": line[7], + } + + metadata_index["data_home"] = data_home + + return metadata_index + + +DATA = utils.LargeData("rwc_jazz_index.json", _load_metadata) + + +class Track(core.Track): + """rwc_jazz Track class + + Args: + track_id (str): track id of the track + + Attributes: + artist (str): Artist name + audio_path (str): path of the audio file + beats_path (str): path of the beat annotation file + duration (float): Duration of the track in seconds + instruments (str): list of used instruments. + piece_number (str): Piece number of this Track, [1-50] + sections_path (str): path of the section annotation file + suffix (str): M01-M04 + title (str): Title of The track. 
+ track_id (str): track id + track_number (str): CD track number of this Track + variation (str): TODO + + """ + + def __init__(self, track_id, data_home): + if track_id not in DATA.index: + raise ValueError("{} is not a valid track ID in RWC-Jazz".format(track_id)) + + self.track_id = track_id + self._data_home = data_home + + self._track_paths = DATA.index[track_id] + self.sections_path = os.path.join( + self._data_home, self._track_paths["sections"][0] + ) + self.beats_path = os.path.join(self._data_home, self._track_paths["beats"][0]) + + metadata = DATA.metadata(data_home) + if metadata is not None and track_id in metadata: + self._track_metadata = metadata[track_id] + else: + self._track_metadata = { + "piece_number": None, + "suffix": None, + "track_number": None, + "title": None, + "artist": None, + "duration": None, + "variation": None, + "instruments": None, + } + + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + + self.piece_number = self._track_metadata["piece_number"] + self.suffix = self._track_metadata["suffix"] + self.track_number = self._track_metadata["track_number"] + self.title = self._track_metadata["title"] + self.artist = self._track_metadata["artist"] + self.duration = self._track_metadata["duration"] + self.variation = self._track_metadata["variation"] + self.instruments = self._track_metadata["instruments"] + + @utils.cached_property + def sections(self): + """SectionData: human-labeled section data""" + return load_sections(self.sections_path) + + @utils.cached_property + def beats(self): + """BeatData: human-labeled beat data""" + return load_beats(self.beats_path) + + @property + def audio(self): + """(np.ndarray, float): audio signal, sample rate""" + return load_audio(self.audio_path) + + def to_jams(self): + """Jams: the track's data in jams format""" + return jams_utils.jams_converter( + audio_path=self.audio_path, + beat_data=[(self.beats, None)], + section_data=[(self.sections, None)], + 
metadata=self._track_metadata, + ) + diff --git a/mirdata/datasets/rwc_popular.py b/mirdata/datasets/rwc_popular.py new file mode 100644 index 000000000..da1b0cc77 --- /dev/null +++ b/mirdata/datasets/rwc_popular.py @@ -0,0 +1,303 @@ +# -*- coding: utf-8 -*- +"""RWC Popular Dataset Loader + +The Popular Music Database consists of 100 songs — 20 songs with English lyrics +performed in the style of popular music typical of songs on the American hit +charts in the 1980s, and 80 songs with Japanese lyrics performed in the style of +modern Japanese popular music typical of songs on the Japanese hit charts in +the 1990s. + +For more details, please visit: https://staff.aist.go.jp/m.goto/RWC-MDB/rwc-mdb-p.html +""" +import csv +import logging +import os + +import librosa +import numpy as np + +from mirdata import download_utils +from mirdata import jams_utils +from mirdata import core +from mirdata import utils + +# these functions are identical for all rwc datasets +from mirdata.datasets.rwc_classical import ( + load_beats, + load_sections, + load_audio, + _duration_to_sec, +) + +BIBTEX = """@inproceedings{goto2002rwc, + title={RWC Music Database: Popular, Classical and Jazz Music Databases.}, + author={Goto, Masataka and Hashiguchi, Hiroki and Nishimura, Takuichi and Oka, Ryuichi}, + booktitle={3rd International Society for Music Information Retrieval Conference}, + year={2002}, + series={ISMIR}, + note={Cite this if using audio, beat or section annotations}, +} +@inproceedings{cho2011feature, + title={A feature smoothing method for chord recognition using recurrence plots}, + author={Cho, Taemin and Bello, Juan P}, + booktitle={12th International Society for Music Information Retrieval Conference}, + year={2011}, + series={ISMIR}, + note={Cite this if using chord annotations}, +} +@inproceedings{mauch2011timbre, + title={Timbre and Melody Features for the Recognition of Vocal Activity and Instrumental Solos in Polyphonic Music.}, + author={Mauch, Matthias and Fujihara, 
Hiromasa and Yoshii, Kazuyoshi and Goto, Masataka}, + booktitle={ISMIR}, + year={2011}, + series={ISMIR}, + note={Cite this if using vocal-instrumental activity annotations}, +}""" +REMOTES = { + "metadata": download_utils.RemoteFileMetadata( + filename="rwc-p.csv", + url="https://github.com/magdalenafuentes/metadata/archive/master.zip", + checksum="7dbe87fedbaaa1f348625a2af1d78030", + destination_dir=None, + ), + "annotations_beat": download_utils.RemoteFileMetadata( + filename="AIST.RWC-MDB-P-2001.BEAT.zip", + url="https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-P-2001.BEAT.zip", + checksum="3858aa989535bd7196b3cd07b512b5b6", + destination_dir="annotations", + ), + "annotations_sections": download_utils.RemoteFileMetadata( + filename="AIST.RWC-MDB-P-2001.CHORUS.zip", + url="https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-P-2001.CHORUS.zip", + checksum="f76b3a32701fbd9bf78baa608f692a77", + destination_dir="annotations", + ), + "annotations_chords": download_utils.RemoteFileMetadata( + filename="AIST.RWC-MDB-P-2001.CHORD.zip", + url="https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-P-2001.CHORD.zip", + checksum="68379c88bc8ec3f1907b32a3579197c5", + destination_dir="annotations", + ), + "annotations_vocal_act": download_utils.RemoteFileMetadata( + filename="AIST.RWC-MDB-P-2001.VOCA_INST.zip", + url="https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-P-2001.VOCA_INST.zip", + checksum="47ded648a496407ef49dba9c8bf80e87", + destination_dir="annotations", + ), +} +DOWNLOAD_INFO = """ + Unfortunately the audio files of the RWC-Popular dataset are not available + for download. If you have the RWC-Popular dataset, place the contents into a + folder called RWC-Popular with the following structure: + > RWC-Popular/ + > annotations/ + > audio/rwc-p-m0i with i in [1 .. 
7] + > metadata-master/ + and copy the RWC-Popular folder to {data_home} +""" + + +def _load_metadata(data_home): + + metadata_path = os.path.join(data_home, "metadata-master", "rwc-p.csv") + + if not os.path.exists(metadata_path): + logging.info( + "Metadata file {} not found.".format(metadata_path) + + "You can download the metadata file by running download()" + ) + return None + + with open(metadata_path, "r") as fhandle: + dialect = csv.Sniffer().sniff(fhandle.read(1024)) + fhandle.seek(0) + reader = csv.reader(fhandle, dialect) + raw_data = [] + for line in reader: + if line[0] != "Piece No.": + raw_data.append(line) + + metadata_index = {} + for line in raw_data: + if line[0] == "Piece No.": + continue + p = "00" + line[0].split(".")[1][1:] + track_id = "RM-P{}".format(p[len(p) - 3 :]) + + metadata_index[track_id] = { + "piece_number": line[0], + "suffix": line[1], + "track_number": line[2], + "title": line[3], + "artist": line[4], + "singer_information": line[5], + "duration": _duration_to_sec(line[6]), + "tempo": line[7], + "instruments": line[8], + "drum_information": line[9], + } + + metadata_index["data_home"] = data_home + + return metadata_index + + +DATA = utils.LargeData("rwc_popular_index.json", _load_metadata) + + +class Track(core.Track): + """rwc_popular Track class + + Args: + track_id (str): track id of the track + + Attributes: + artist (str): artist + audio_path (str): path of the audio file + beats_path (str): path of the beat annotation file + chords_path (str): path of the chord annotation file + drum_information (str): If the drum is 'Drum sequences', 'Live drums', + or 'Drum loops' + duration (float): Duration of the track in seconds + instruments (str): List of used instruments + piece_number (str): Piece number, [1-50] + sections_path (str): path of the section annotation file + singer_information (str): TODO + suffix (str): M01-M04 + tempo (str): Tempo of the track in BPM + title (str): title + track_id (str): track id + track_number 
(str): CD track number + voca_inst_path (str): path of the vocal/instrumental annotation file + + """ + + def __init__(self, track_id, data_home): + if track_id not in DATA.index: + raise ValueError( + "{} is not a valid track ID in RWC-Popular".format(track_id) + ) + + self.track_id = track_id + self._data_home = data_home + + self._track_paths = DATA.index[track_id] + self.sections_path = os.path.join( + self._data_home, self._track_paths["sections"][0] + ) + self.beats_path = os.path.join(self._data_home, self._track_paths["beats"][0]) + self.chords_path = os.path.join(self._data_home, self._track_paths["chords"][0]) + self.voca_inst_path = os.path.join( + self._data_home, self._track_paths["voca_inst"][0] + ) + + metadata = DATA.metadata(data_home) + if metadata is not None and track_id in metadata: + self._track_metadata = metadata[track_id] + else: + # annotations with missing metadata + self._track_metadata = { + "piece_number": None, + "suffix": None, + "track_number": None, + "title": None, + "artist": None, + "singer_information": None, + "duration": None, + "tempo": None, + "instruments": None, + "drum_information": None, + } + + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + + self.piece_number = self._track_metadata["piece_number"] + self.suffix = self._track_metadata["suffix"] + self.track_number = self._track_metadata["track_number"] + self.title = self._track_metadata["title"] + self.artist = self._track_metadata["artist"] + self.singer_information = self._track_metadata["singer_information"] + self.duration = self._track_metadata["duration"] + self.tempo = self._track_metadata["tempo"] + self.instruments = self._track_metadata["instruments"] + self.drum_information = self._track_metadata["drum_information"] + + @utils.cached_property + def sections(self): + """SectionData: human-labeled section annotation""" + return load_sections(self.sections_path) + + @utils.cached_property + def beats(self): + """BeatData: 
human-labeled beat annotation""" + return load_beats(self.beats_path) + + @utils.cached_property + def chords(self): + """ChordData: human-labeled chord annotation""" + return load_chords(self.chords_path) + + @utils.cached_property + def vocal_instrument_activity(self): + """EventData: human-labeled vocal/instrument activity""" + return load_voca_inst(self.voca_inst_path) + + @property + def audio(self): + """(np.ndarray, float): audio signal, sample rate""" + return load_audio(self.audio_path) + + def to_jams(self): + """Jams: the track's data in jams format""" + return jams_utils.jams_converter( + audio_path=self.audio_path, + beat_data=[(self.beats, None)], + section_data=[(self.sections, None)], + chord_data=[(self.chords, None)], + metadata=self._track_metadata, + ) + + +def load_chords(chords_path): + if not os.path.exists(chords_path): + raise IOError("chords_path {} does not exist".format(chords_path)) + + begs = [] # timestamps of chord beginnings + ends = [] # timestamps of chord endings + chords = [] # chord labels + + if os.path.exists(chords_path): + with open(chords_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter="\t") + for line in reader: + begs.append(float(line[0])) + ends.append(float(line[1])) + chords.append(line[2]) + + return utils.ChordData(np.array([begs, ends]).T, chords) + + +def load_voca_inst(voca_inst_path): + if not os.path.exists(voca_inst_path): + raise IOError("voca_inst_path {} does not exist".format(voca_inst_path)) + + begs = [] # timestamps of vocal-instrument activity beginnings + ends = [] # timestamps of vocal-instrument activity endings + events = [] # vocal-instrument activity labels + + with open(voca_inst_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter="\t") + raw_data = [] + for line in reader: + if line[0] != "Piece No.": + raw_data.append(line) + + for i in range(len(raw_data)): + # Parsing vocal-instrument activity as intervals (beg, end, event) + if raw_data[i] != raw_data[-1]: + 
begs.append(float(raw_data[i][0])) + ends.append(float(raw_data[i + 1][0])) + events.append(raw_data[i][1]) + + return utils.EventData(np.array(begs), np.array(ends), np.array(events)) + diff --git a/mirdata/salami.py b/mirdata/datasets/salami.py similarity index 53% rename from mirdata/salami.py rename to mirdata/datasets/salami.py index 4aeb6c4a7..f39b67d7c 100644 --- a/mirdata/salami.py +++ b/mirdata/datasets/salami.py @@ -12,26 +12,42 @@ For more details, please visit: https://github.com/DDMAL/salami-data-public """ import csv -import librosa import logging -import numpy as np import os +import librosa +import numpy as np + from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils -DATASET_DIR = 'Salami' - +BIBTEX = """@inproceedings{smith2011salami, + title={Design and creation of a large-scale database of structural annotations.}, + author={Smith, Jordan Bennett Louis and Burgoyne, John Ashley and + Fujinaga, Ichiro and De Roure, David and Downie, J Stephen}, + booktitle={12th International Society for Music Information Retrieval Conference}, + year={2011}, + series = {ISMIR}, +}""" REMOTES = { - 'annotations': download_utils.RemoteFileMetadata( - filename='salami-data-public-hierarchy-corrections.zip', - url='https://github.com/bmcfee/salami-data-public/archive/hierarchy-corrections.zip', - checksum='194add2601c09a7279a7433288de81fd', + "annotations": download_utils.RemoteFileMetadata( + filename="salami-data-public-hierarchy-corrections.zip", + url="https://github.com/bmcfee/salami-data-public/archive/hierarchy-corrections.zip", + checksum="194add2601c09a7279a7433288de81fd", destination_dir=None, ) } +DOWNLOAD_INFO = """ + Unfortunately the audio files of the Salami dataset are not available + for download. 
If you have the Salami dataset, place the contents into a + folder called Salami with the following structure: + > Salami/ + > salami-data-public-hierarchy-corrections/ + > audio/ + and copy the Salami folder to {} +""" def _load_metadata(data_home): @@ -39,19 +55,19 @@ def _load_metadata(data_home): metadata_path = os.path.join( data_home, os.path.join( - 'salami-data-public-hierarchy-corrections', 'metadata', 'metadata.csv' + "salami-data-public-hierarchy-corrections", "metadata", "metadata.csv" ), ) if not os.path.exists(metadata_path): - logging.info('Metadata file {} not found.'.format(metadata_path)) + logging.info("Metadata file {} not found.".format(metadata_path)) return None - with open(metadata_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter=',') + with open(metadata_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter=",") raw_data = [] for line in reader: if line != []: - if line[0] == 'SONG_ID': + if line[0] == "SONG_ID": continue raw_data.append(line) @@ -59,36 +75,34 @@ def _load_metadata(data_home): for line in raw_data: track_id = line[0] duration = None - if line[5] != '': + if line[5] != "": duration = float(line[5]) metadata_index[track_id] = { - 'source': line[1], - 'annotator_1_id': line[2], - 'annotator_2_id': line[3], - 'duration': duration, - 'title': line[7], - 'artist': line[8], - 'annotator_1_time': line[10], - 'annotator_2_time': line[11], - 'class': line[14], - 'genre': line[15], + "source": line[1], + "annotator_1_id": line[2], + "annotator_2_id": line[3], + "duration": duration, + "title": line[7], + "artist": line[8], + "annotator_1_time": line[10], + "annotator_2_time": line[11], + "class": line[14], + "genre": line[15], } - metadata_index['data_home'] = data_home + metadata_index["data_home"] = data_home return metadata_index -DATA = utils.LargeData('salami_index.json', _load_metadata) +DATA = utils.LargeData("salami_index.json", _load_metadata) -class Track(track.Track): +class Track(core.Track): 
"""salami Track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: annotator_1_id (str): number that identifies annotator 1 @@ -109,28 +123,25 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: - raise ValueError('{} is not a valid track ID in Salami'.format(track_id)) + raise ValueError("{} is not a valid track ID in Salami".format(track_id)) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] self.sections_annotator1_uppercase_path = utils.none_path_join( - [self._data_home, self._track_paths['annotator_1_uppercase'][0]] + [self._data_home, self._track_paths["annotator_1_uppercase"][0]] ) self.sections_annotator1_lowercase_path = utils.none_path_join( - [self._data_home, self._track_paths['annotator_1_lowercase'][0]] + [self._data_home, self._track_paths["annotator_1_lowercase"][0]] ) self.sections_annotator2_uppercase_path = utils.none_path_join( - [self._data_home, self._track_paths['annotator_2_uppercase'][0]] + [self._data_home, self._track_paths["annotator_2_uppercase"][0]] ) self.sections_annotator2_lowercase_path = utils.none_path_join( - [self._data_home, self._track_paths['annotator_2_lowercase'][0]] + [self._data_home, self._track_paths["annotator_2_lowercase"][0]] ) metadata = DATA.metadata(data_home) @@ -139,28 +150,28 @@ def __init__(self, track_id, data_home=None): else: # annotations with missing metadata self._track_metadata = { - 'source': None, - 'annotator_1_id': None, - 'annotator_2_id': None, - 'duration': None, - 'title': None, - 'artist': None, - 'annotator_1_time': None, - 'annotator_2_time': None, - 'class': None, - 'genre': None, + "source": None, + 
"annotator_1_id": None, + "annotator_2_id": None, + "duration": None, + "title": None, + "artist": None, + "annotator_1_time": None, + "annotator_2_time": None, + "class": None, + "genre": None, } - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) - self.source = self._track_metadata['source'] - self.annotator_1_id = self._track_metadata['annotator_1_id'] - self.annotator_2_id = self._track_metadata['annotator_2_id'] - self.duration = self._track_metadata['duration'] - self.title = self._track_metadata['title'] - self.artist = self._track_metadata['artist'] - self.annotator_1_time = self._track_metadata['annotator_1_time'] - self.annotator_2_time = self._track_metadata['annotator_2_time'] - self.broad_genre = self._track_metadata['class'] - self.genre = self._track_metadata['genre'] + self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) + self.source = self._track_metadata["source"] + self.annotator_1_id = self._track_metadata["annotator_1_id"] + self.annotator_2_id = self._track_metadata["annotator_2_id"] + self.duration = self._track_metadata["duration"] + self.title = self._track_metadata["title"] + self.artist = self._track_metadata["artist"] + self.annotator_1_time = self._track_metadata["annotator_1_time"] + self.annotator_2_time = self._track_metadata["annotator_2_time"] + self.broad_genre = self._track_metadata["class"] + self.genre = self._track_metadata["genre"] @utils.cached_property def sections_annotator_1_uppercase(self): @@ -205,14 +216,14 @@ def to_jams(self): (self.sections_annotator_1_uppercase, 0), (self.sections_annotator_1_lowercase, 1), ], - 'annotator_1', + "annotator_1", ), ( [ (self.sections_annotator_2_uppercase, 0), (self.sections_annotator_2_lowercase, 1), ], - 'annotator_2', + "annotator_2", ), ], metadata=self._track_metadata, @@ -236,96 +247,6 @@ def load_audio(audio_path): return librosa.load(audio_path, sr=None, mono=True) -def download(data_home=None, force_overwrite=False, 
cleanup=True): - """Download SALAMI Dataset (annotations). - The audio files are not provided. - - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - cleanup (bool): - Whether to delete the zip/tar file after extracting. - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - info_message = """ - Unfortunately the audio files of the Salami dataset are not available - for download. If you have the Salami dataset, place the contents into a - folder called Salami with the following structure: - > Salami/ - > salami-data-public-hierarchy-corrections/ - > audio/ - and copy the Salami folder to {} - """.format( - data_home - ) - - download_utils.downloader( - data_home, - remotes=REMOTES, - info_message=info_message, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load SALAMI dataset - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - salami_data = {} - for key in track_ids(): - salami_data[key] = Track(key, data_home=data_home) - return salami_data - - def load_sections(sections_path): if sections_path is None: return None @@ -335,8 +256,8 @@ def load_sections(sections_path): times = [] secs = [] - with open(sections_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter='\t') + with open(sections_path, "r") as fhandle: + reader = csv.reader(fhandle, delimiter="\t") for line in reader: times.append(float(line[0])) secs.append(line[1]) @@ -350,25 +271,3 @@ def load_sections(sections_path): np.array([times_revised[:-1], times_revised[1:]]).T, list(secs_revised[:-1]) ) - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Smith, Jordan Bennett Louis, et al., -"Design and creation of a large-scale database of structural annotations", -12th International Society for Music Information Retrieval Conference (2011) - -========== Bibtex ========== -@inproceedings{smith2011salami, - title={Design and creation of a large-scale database of structural annotations.}, - author={Smith, Jordan Bennett Louis and Burgoyne, John Ashley and - Fujinaga, Ichiro and De Roure, David and Downie, J Stephen}, - booktitle={12th International Society for Music Information Retrieval Conference}, - year={2011}, - series = {ISMIR}, -} -""" - - print(cite_data) diff --git a/mirdata/tinysol.py b/mirdata/datasets/tinysol.py similarity index 66% rename from mirdata/tinysol.py rename to mirdata/datasets/tinysol.py index 106e49202..167ac3668 100644 --- a/mirdata/tinysol.py +++ b/mirdata/datasets/tinysol.py @@ -48,19 +48,25 @@ from mirdata import download_utils from mirdata import jams_utils -from mirdata import track +from mirdata import core from mirdata import utils 
-DATASET_DIR = "TinySOL" - +BIBTEX = """@inproceedings{cella2020preprint, + author={Cella, Carmine Emanuele and Ghisi, Daniele and Lostanlen, Vincent and + Lévy, Fabien and Fineberg, Joshua and Maresz, Yan}, + title={{OrchideaSOL}: {A} dataset of extended + instrumental techniques for computer-aided orchestration}, + bootktitle={Under review}, + year={2020} +}""" REMOTES = { - 'audio': download_utils.RemoteFileMetadata( + "audio": download_utils.RemoteFileMetadata( filename="TinySOL.tar.gz", url="https://zenodo.org/record/3685367/files/TinySOL.tar.gz?download=1", checksum="36030a7fe389da86c3419e5ee48e3b7f", destination_dir="audio", ), - 'annotations': download_utils.RemoteFileMetadata( + "annotations": download_utils.RemoteFileMetadata( filename="TinySOL_metadata.csv", url="https://zenodo.org/record/3685367/files/TinySOL_metadata.csv?download=1", checksum="a86c9bb115f69e61f2f25872e397fc4a", @@ -109,13 +115,11 @@ def _load_metadata(data_home): DATA = utils.LargeData("tinysol_index.json", _load_metadata) -class Track(track.Track): +class Track(core.Track): """tinysol Track class Args: track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` Attributes: audio_path (str): path of the audio file @@ -136,15 +140,12 @@ class Track(track.Track): """ - def __init__(self, track_id, data_home=None): + def __init__(self, track_id, data_home): if track_id not in DATA.index: raise ValueError("{} is not a valid track ID in TinySOL".format(track_id)) self.track_id = track_id - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home self._track_paths = DATA.index[track_id] @@ -215,104 +216,3 @@ def load_audio(audio_path): return librosa.load(audio_path, sr=None, mono=True) - -def download( - data_home=None, partial_download=None, force_overwrite=False, cleanup=True -): - """Download TinySOL. 
- - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - partial_download (list): - List indicating what to partially download. The list can include any of: - * `'annotations'` the annotation files - * `'audio'` the audio files - If `None`, all data is downloaded. - - cleanup (bool): - Whether to delete the zip/tar file after extracting. - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_utils.downloader( - data_home, - remotes=REMOTES, - partial_download=partial_download, - info_message=None, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, os.path.join(data_home, "audio"), silence=silence - ) - return missing_files, invalid_checksums - - -def load(data_home=None): - """Load TinySOL - Args: - data_home (str): Local path where TinySOL is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - tinysol_data = {} - for key in DATA.index.keys(): - tinysol_data[key] = Track(key, data_home=data_home) - return tinysol_data - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Cella, Carmine Emanuele, et al., "OrchideaSOL: A dataset of extended -instrumental techniques for computer-aided orchestration". Under review, 2020. - -========== Bibtex ========== -@inproceedings{cella2020preprint, -author={Cella, Carmine Emanuele and Ghisi, Daniele and Lostanlen, Vincent and -Lévy, Fabien and Fineberg, Joshua and Maresz, Yan}, -title={{OrchideaSOL}: {A} dataset of extended -instrumental techniques for computer-aided orchestration}, -bootktitle={Under review}, -year={2020}} -""" - print(cite_data) diff --git a/mirdata/giantsteps_tempo.py b/mirdata/giantsteps_tempo.py deleted file mode 100644 index cc218d98e..000000000 --- a/mirdata/giantsteps_tempo.py +++ /dev/null @@ -1,332 +0,0 @@ -# -*- coding: utf-8 -*- -"""giantsteps_tempo Dataset Loader - -name: GiantSteps (tempo+genre) - -contact: Richard Vogl - Peter Knees - -description: collection of annotations for 664 2min(1) audio previews from - www.beatport.com - -references: [1] Peter Knees, Ángel Faraldo, Perfecto Herrera, Richard Vogl, - Sebastian Böck, Florian Hörschläger, Mickael Le Goff: "Two data - sets for tempo estimation and key detection in electronic dance - music annotated from user corrections", Proc. of the 16th - Conference of the International Society for Music Information - Retrieval (ISMIR'15), Oct. 2015, Malaga, Spain. - - [2] Hendrik Schreiber, Meinard Müller: "A Crowdsourced Experiment - for Tempo Estimation of Electronic Dance Music", Proc. of the - 19th Conference of the International Society for Music - Information Retrieval (ISMIR'18), Sept. 
2018, Paris, France. - -annotations: tempo (bpm), genre - -notes: -========================================================================= -The audio files (664 files, size ~1gb) can be downloaded from http://www.beatport.com/ -using the bash script: - - https://github.com/GiantSteps/giantsteps-tempo-dataset/blob/master/audio_dl.sh - -To download the files manually use links of the following form: -http://geo-samples.beatport.com/lofi/ -e.g.: -http://geo-samples.beatport.com/lofi/5377710.LOFI.mp3 - -To convert the audio files to .wav use (bash + sox): - -./convert_audio.sh - -To retrieve the genre information, the JSON contained within the website was parsed. -The tempo annotation was extracted from forum entries of people correcting the bpm values (i.e. manual annotation of tempo). -For more information please contact creators. - -[2] found some files without tempo. There are: - -3041381.LOFI.mp3 -3041383.LOFI.mp3 -1327052.LOFI.mp3 - -Their v2 tempo is denoted as 0.0 in tempo and mirex and has no annotation in the JAMS format. - -(1): Most of the audio files are 120 seconds long. 
Exceptions are: -name length -906760.LOFI.mp3 62 -1327052.LOFI.mp3 70 -4416506.LOFI.mp3 80 -1855660.LOFI.mp3 119 -3419452.LOFI.mp3 119 -3577631.LOFI.mp3 119 -""" - - -import librosa -import os - -from mirdata import download_utils -from mirdata import track -from mirdata import utils -import numpy as np -import jams - -DATASET_DIR = 'GiantSteps_tempo' - -DATA = utils.LargeData('giantsteps_tempo_index.json') - -REMOTES = { - 'annotations': download_utils.RemoteFileMetadata( - filename='giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb.zip', - url='https://github.com/GiantSteps/giantsteps-tempo-dataset/archive/0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb.zip', - checksum='8fdafbaf505fe3f293bd912c92b72ac8', - destination_dir='', - ) -} - - -class Track(track.Track): - """giantsteps_tempo track class - Args: - track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - Attributes: - audio_path (str): track audio path - title (str): title of the track - track_id (str): track id - annotation_v1_path (str): track annotation v1 path - annotation_v2_path (str): track annotation v2 path - """ - - def __init__(self, track_id, data_home=None): - if track_id not in DATA.index: - raise ValueError( - '{} is not a valid track ID in giantsteps_tempo'.format(track_id) - ) - - self.track_id = track_id - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - self._data_home = data_home - self._track_paths = DATA.index[track_id] - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) - self.annotation_v1_path = os.path.join( - self._data_home, self._track_paths['annotation_v1'][0] - ) - self.annotation_v2_path = os.path.join( - self._data_home, self._track_paths['annotation_v2'][0] - ) - - self.title = self.audio_path.replace(".mp3", '').split('/')[-1].split('.')[0] - - @utils.cached_property - def 
genre(self): - """genre: human-labeled metadata annotation""" - return load_genre(self.annotation_v1_path) - - @utils.cached_property - def tempo(self): - """TempoData: tempo annotation ordered by confidence""" - return load_tempo(self.annotation_v1_path) - - @utils.cached_property - def tempo_v2(self): - """TempoData: tempos annotation ordered by confidence""" - return load_tempo(self.annotation_v2_path) - - @property - def audio(self): - """(np.ndarray, float): audio signal, sample rate""" - return load_audio(self.audio_path) - - def to_jams(self): - """Jams: the track's data in jams format""" - return jams.load(self.annotation_v1_path) - - def to_jams_v2(self): - """Jams: the track's data in jams format""" - return jams.load(self.annotation_v2_path) - - -def load_audio(audio_path): - """Load a giantsteps_tempo audio file. - Args: - audio_path (str): path to audio file - Returns: - y (np.ndarray): the mono audio signal - sr (float): The sample rate of the audio file - """ - if not os.path.exists(audio_path): - raise IOError("audio_path {} does not exist".format(audio_path)) - return librosa.load(audio_path, sr=None, mono=True) - - -def download(data_home=None, force_overwrite=False, cleanup=True): - """Download the giantsteps_tempo Dataset (annotations). - The audio files are not provided due to copyright issues. - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - cleanup (bool): - Whether to delete the zip/tar file after extracting. - partial_download(list of str) - arguments can be 'audio' 'metadata' or/and 'tempos' - """ - - # use the default location: ~/mir_datasets/giantsteps_tempo - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_message = """ - Unfortunately the audio files of the Giant Steps Tempo dataset are not available - for download. 
If you have the Giant Steps audio dataset, place the contents into - a folder called GiantSteps_tempo with the following structure: - > GiantSteps_tempo/ - > giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/ - > audio/ - and copy the folder to {} - """.format( - data_home - ) - - download_utils.downloader( - data_home, - remotes=REMOTES, - info_message=download_message, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def validate(data_home=None, silence=False): - """Validate if a local version of this dataset is consistent - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths where the expected file exists locally - but has a different checksum than the reference - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Get the list of track IDs for this dataset - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load giantsteps_tempo dataset - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - giantsteps_key_data = {} - for tempo in track_ids(): - giantsteps_key_data[tempo] = Track(tempo, data_home=data_home) - return giantsteps_key_data - - -def load_genre(path): - """Load genre data from a file - Args: - path (str): path to metadata annotation file - Returns: - (str): loaded genre data - """ - if path is None: - return None - - with open(path) as json_file: - annotation = jams.load(json_file) - - return annotation.search(namespace='tag_open')[0]['data'][0].value - - -def load_tempo(tempo_path): - """Load giantsteps_tempo tempo data from a file ordered by confidence - Args: - tempo_path (str): path to tempo annotation file - Returns: - (list of utils.TempoData): loaded tempo data - """ - if tempo_path is None: - return None - - if not os.path.exists(tempo_path): - raise IOError("tempo_path {} does not exist".format(tempo_path)) - - with open(tempo_path) as json_file: - annotation = jams.load(json_file) - - tempo = annotation.search(namespace='tempo')[0]['data'] - - return utils.TempoData( - np.array([t.time for t in tempo]), - np.array([t.duration for t in tempo]), - np.array([t.value for t in tempo]), - np.array([t.confidence for t in tempo]), - ) - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Peter Knees, Ángel Faraldo, Perfecto Herrera, Richard Vogl, -Sebastian Böck, Florian Hörschläger, Mickael Le Goff: "Two data -sets for tempo estimation and key detection in electronic dance -music annotated from user corrections," Proc. of the 16th -Conference of the International Society for Music Information -Retrieval (ISMIR'15), Oct. 2015, Malaga, Spain. 
-========== Bibtex ========== -@inproceedings{knees2015two, - title={Two data sets for tempo estimation and key detection in electronic dance music annotated from user corrections}, - author={Knees, Peter and Faraldo P{\'e}rez, {\'A}ngel and Boyer, Herrera and Vogl, Richard and B{\"o}ck, Sebastian and H{\"o}rschl{\"a}ger, Florian and Le Goff, Mickael and others}, - booktitle={Proceedings of the 16th International Society for Music Information Retrieval Conference (ISMIR); 2015 Oct 26-30; M{\'a}laga, Spain.[M{\'a}laga]: International Society for Music Information Retrieval, 2015. p. 364-70.}, - year={2015}, - organization={International Society for Music Information Retrieval (ISMIR)} -} -=========== MLA =========== -Hendrik Schreiber, Meinard Müller: "A Crowdsourced Experiment -for Tempo Estimation of Electronic Dance Music", Proc. of the -19th Conference of the International Society for Music -Information Retrieval (ISMIR'18), Sept. 2018, Paris, France. -========== Bibtex ========== -@inproceedings{SchreiberM18a_Tempo_ISMIR, -author = {Hendrik Schreiber and Meinard M{\"u}ller}, -title = {A Crowdsourced Experiment for Tempo Estimation of Electronic Dance Music}, -booktitle = {Proceedings of the International Conference on Music Information Retrieval ({ISMIR})}, -address = {Paris, France}, -year = {2018}, -url-pdf = {http://www.tagtraum.com/download/2018_schreiber_tempo_giantsteps.pdf} -} - - - - - """ - print(cite_data) diff --git a/mirdata/gtzan_genre.py b/mirdata/gtzan_genre.py deleted file mode 100644 index 6a5564253..000000000 --- a/mirdata/gtzan_genre.py +++ /dev/null @@ -1,197 +0,0 @@ -# -*- coding: utf-8 -*- -"""GTZAN-Genre Dataset Loader - -This dataset was used for the well known paper in genre classification -"Musical genre classification of audio signals " by G. Tzanetakis and -P. Cook in IEEE Transactions on Audio and Speech Processing 2002. - -The dataset consists of 1000 audio tracks each 30 seconds long. 
It -contains 10 genres, each represented by 100 tracks. The tracks are all -22050 Hz mono 16-bit audio files in .wav format. -""" - -import librosa -import os - -from mirdata import download_utils -from mirdata import jams_utils -from mirdata import track -from mirdata import utils - - -DATASET_DIR = "GTZAN-Genre" - -REMOTES = { - 'all': download_utils.RemoteFileMetadata( - filename="genres.tar.gz", - url="http://opihi.cs.uvic.ca/sound/genres.tar.gz", - checksum="5b3d6dddb579ab49814ab86dba69e7c7", - destination_dir="gtzan_genre", - ) -} - -DATA = utils.LargeData("gtzan_genre_index.json") - - -class Track(track.Track): - """gtzan_genre Track class - - Args: - track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Attributes: - audio_path (str): path to the audio file - genre (str): annotated genre - track_id (str): track id - - """ - - def __init__(self, track_id, data_home=None): - if track_id not in DATA.index: - raise ValueError( - "{} is not a valid track ID in GTZAN-Genre".format(track_id) - ) - - self.track_id = track_id - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - self._data_home = data_home - self._track_paths = DATA.index[track_id] - - self.genre = track_id.split(".")[0] - if self.genre == 'hiphop': - self.genre = 'hip-hop' - - self.audio_path = os.path.join(self._data_home, self._track_paths["audio"][0]) - - @property - def audio(self): - """(np.ndarray, float): audio signal, sample rate""" - return load_audio(self.audio_path) - - def to_jams(self): - """Jams: the track's data in jams format""" - return jams_utils.jams_converter( - tags_gtzan_data=[(self.genre, 'gtzan-genre')], - metadata={ - 'title': "Unknown track", - 'artist': "Unknown artist", - 'release': "Unknown album", - 'duration': 30.0, - 'curator': 'George Tzanetakis', - }, - ) - - -def load_audio(audio_path): - """Load a 
GTZAN audio file. - - Args: - audio_path (str): path to audio file - - Returns: - y (np.ndarray): the mono audio signal - sr (float): The sample rate of the audio file - - """ - if not os.path.exists(audio_path): - raise IOError("audio_path {} does not exist".format(audio_path)) - audio, sr = librosa.load(audio_path, sr=22050, mono=True) - return audio, sr - - -def load(data_home=None): - """Load GTZAN-Genre - - Args: - data_home (str): Local path where GTZAN-Genre is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - data = {} - for key in DATA.index.keys(): - data[key] = Track(key, data_home=data_home) - return data - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def download(data_home=None, force_overwrite=False, cleanup=True): - """Download the GTZAN-Genre dataset. - - Args: - data_home (str): - Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - cleanup (bool): - Whether to delete the zip/tar file after extracting. - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - download_utils.downloader( - data_home, - remotes=REMOTES, - info_message=None, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Tzanetakis, George et al. -"GTZAN genre collection". -Music Analysis, Retrieval and Synthesis for Audio Signals. (2002). -========== Bibtex ========== -@article{tzanetakis2002gtzan, - title={GTZAN genre collection}, - author={Tzanetakis, George and Cook, P}, - journal={Music Analysis, Retrieval and Synthesis for Audio Signals}, - year={2002} -} -""" - print(cite_data) diff --git a/mirdata/jams_utils.py b/mirdata/jams_utils.py index 900aed460..330f70d43 100644 --- a/mirdata/jams_utils.py +++ b/mirdata/jams_utils.py @@ -82,23 +82,19 @@ def jams_converter( duration = librosa.get_duration(filename=audio_path) else: raise OSError( - 'jams conversion failed because the audio file ' - + 'for this track cannot be found, and it is required' - + 'to compute duration.' + "jams conversion failed because the audio file " + + "for this track cannot be found, and it is required " + + "to compute duration." ) # metadata if metadata is not None: for key in metadata: - if ( - key == 'duration' - and duration is not None - and metadata[key] != duration - ): + if key == "duration" and duration is not None and metadata[key] != duration: print( - 'Warning: duration provided in metadata does not' - + 'match the duration computed from the audio file.' - + 'Using the duration provided by the metadata.' + "Warning: duration provided in metadata does not" + + "match the duration computed from the audio file." 
+ + "Using the duration provided by the metadata." ) if metadata[key] is None: @@ -115,153 +111,153 @@ def jams_converter( # beats if beat_data is not None: if not isinstance(beat_data, list): - raise TypeError('beat_data should be a list of tuples') + raise TypeError("beat_data should be a list of tuples") for beats in beat_data: if not isinstance(beats, tuple): raise TypeError( - 'beat_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(beats)) + "beat_data should be a list of tuples, " + + "but contains a {} element".format(type(beats)) ) jam.annotations.append(beats_to_jams(beats)) # sections if section_data is not None: if not isinstance(section_data, list): - raise TypeError('section_data should be a list of tuples') + raise TypeError("section_data should be a list of tuples") for sections in section_data: if not isinstance(sections, tuple): raise TypeError( - 'section_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(sections)) + "section_data should be a list of tuples, " + + "but contains a {} element".format(type(sections)) ) jam.annotations.append(sections_to_jams(sections)) # multi-sections (sections with multiple levels) if multi_section_data is not None: if not isinstance(multi_section_data, list): - raise TypeError('multi_section_data should be a list of tuples') + raise TypeError("multi_section_data should be a list of tuples") for sections in multi_section_data: if not isinstance(sections, tuple): raise TypeError( - 'multi_section_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(sections)) + "multi_section_data should be a list of tuples, " + + "but contains a {} element".format(type(sections)) ) if not ( isinstance(sections[0], list) and isinstance(sections[0][0], tuple) ): raise TypeError( - 'tuples in multi_section_data should contain a ' - + 'list of tuples, indicating annotations in the different ' - + 'levels, e.g. 
([(segments0, level0), ' - + '(segments1, level1)], annotator)' + "tuples in multi_section_data should contain a " + + "list of tuples, indicating annotations in the different " + + "levels, e.g. ([(segments0, level0), " + + "(segments1, level1)], annotator)" ) jam.annotations.append(multi_sections_to_jams(sections)) # tempo if tempo_data is not None: if type(tempo_data) != list: - raise TypeError('tempo_data should be a list of tuples') + raise TypeError("tempo_data should be a list of tuples") for tempo in tempo_data: if type(tempo) != tuple: raise TypeError( - 'tempo_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(tempo)) + "tempo_data should be a list of tuples, " + + "but contains a {} element".format(type(tempo)) ) jam.annotations.append(tempos_to_jams(tempo)) # events if event_data is not None: if type(event_data) != list: - raise TypeError('event_data should be a list of tuples') + raise TypeError("event_data should be a list of tuples") for events in event_data: if type(events) != tuple: raise TypeError( - 'event_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(events)) + "event_data should be a list of tuples, " + + "but contains a {} element".format(type(events)) ) jam.annotations.append(events_to_jams(events)) # chords if chord_data is not None: if not isinstance(chord_data, list): - raise TypeError('chord_data should be a list of tuples') + raise TypeError("chord_data should be a list of tuples") for chords in chord_data: if not isinstance(chords, tuple): raise TypeError( - 'chord_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(chords)) + "chord_data should be a list of tuples, " + + "but contains a {} element".format(type(chords)) ) jam.annotations.append(chords_to_jams(chords)) # notes if note_data is not None: if not isinstance(note_data, list): - raise TypeError('note_data should be a list of tuples') + raise TypeError("note_data should be a list of tuples") 
for notes in note_data: if not isinstance(notes, tuple): raise TypeError( - 'note_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(notes)) + "note_data should be a list of tuples, " + + "but contains a {} element".format(type(notes)) ) jam.annotations.append(notes_to_jams(notes)) # keys if key_data is not None: if not isinstance(key_data, list): - raise TypeError('key_data should be a list of tuples') + raise TypeError("key_data should be a list of tuples") for keys in key_data: if not isinstance(keys, tuple): raise TypeError( - 'key_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(keys)) + "key_data should be a list of tuples, " + + "but contains a {} element".format(type(keys)) ) jam.annotations.append(keys_to_jams(keys)) # f0 if f0_data is not None: if not isinstance(f0_data, list): - raise TypeError('f0_data should be a list of tuples') + raise TypeError("f0_data should be a list of tuples") for f0s in f0_data: if not isinstance(f0s, tuple): raise TypeError( - 'f0_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(f0s)) + "f0_data should be a list of tuples, " + + "but contains a {} element".format(type(f0s)) ) jam.annotations.append(f0s_to_jams(f0s)) # lyrics if lyrics_data is not None: if not isinstance(lyrics_data, list): - raise TypeError('lyrics_data should be a list of tuples') + raise TypeError("lyrics_data should be a list of tuples") for lyrics in lyrics_data: if not isinstance(lyrics, tuple): raise TypeError( - 'lyrics_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(lyrics)) + "lyrics_data should be a list of tuples, " + + "but contains a {} element".format(type(lyrics)) ) jam.annotations.append(lyrics_to_jams(lyrics)) # tags if tags_gtzan_data is not None: if not isinstance(tags_gtzan_data, list): - raise TypeError('tags_gtzan_data should be a list of tuples') + raise TypeError("tags_gtzan_data should be a list of tuples") for tag in 
tags_gtzan_data: if not isinstance(tag, tuple): raise TypeError( - 'tags_gtzan_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(tag)) + "tags_gtzan_data should be a list of tuples, " + + "but contains a {} element".format(type(tag)) ) jam.annotations.append(tag_gtzan_to_jams(tag)) # tag open if tags_open_data is not None: if not isinstance(tags_open_data, list): - raise TypeError('tags_open_data should be a list of tuples') + raise TypeError("tags_open_data should be a list of tuples") for tag in tags_open_data: if not isinstance(tag, tuple): raise TypeError( - 'tags_open_data should be a list of tuples, ' - + 'but contains a {} element'.format(type(tag)) + "tags_open_data should be a list of tuples, " + + "but contains a {} element".format(type(tag)) ) jam.annotations.append(tag_open_to_jams(tag)) @@ -283,11 +279,11 @@ def beats_to_jams(beats): jannot_beat: JAM beat annotation object. """ - jannot_beat = jams.Annotation(namespace='beat') - jannot_beat.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_beat = jams.Annotation(namespace="beat") + jannot_beat.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if beats[0] is not None: if not isinstance(beats[0], utils.BeatData): - raise TypeError('Type should be BeatData.') + raise TypeError("Type should be BeatData.") for t, p in zip(beats[0].beat_times, beats[0].beat_positions): jannot_beat.append(time=t, duration=0.0, value=p) if beats[1] is not None: @@ -309,11 +305,11 @@ def sections_to_jams(sections): ------- jannot_seg: JAM segment_open annotation object. 
""" - jannot_seg = jams.Annotation(namespace='segment_open') - jannot_seg.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_seg = jams.Annotation(namespace="segment_open") + jannot_seg.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if sections[0] is not None: if not isinstance(sections[0], utils.SectionData): - raise TypeError('Type should be SectionData.') + raise TypeError("Type should be SectionData.") for inter, seg in zip(sections[0].intervals, sections[0].labels): jannot_seg.append(time=inter[0], duration=inter[1] - inter[0], value=seg) if sections[1] is not None: @@ -335,11 +331,11 @@ def chords_to_jams(chords): ------- jannot_chord: JAM chord annotation object. """ - jannot_chord = jams.Annotation(namespace='chord') - jannot_chord.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_chord = jams.Annotation(namespace="chord") + jannot_chord.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if chords[0] is not None: if not isinstance(chords[0], utils.ChordData): - raise TypeError('Type should be ChordData.') + raise TypeError("Type should be ChordData.") for beg, end, ch in zip( chords[0].intervals[:, 0], chords[0].intervals[:, 1], chords[0].labels ): @@ -363,11 +359,11 @@ def notes_to_jams(notes): ------- jannot_notes: JAM note_midi annotation object. 
""" - jannot_note = jams.Annotation(namespace='note_hz') - jannot_note.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_note = jams.Annotation(namespace="note_hz") + jannot_note.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if notes[0] is not None: if not isinstance(notes[0], utils.NoteData): - raise TypeError('Type should be NoteData.') + raise TypeError("Type should be NoteData.") for beg, end, n in zip( notes[0].intervals[:, 0], notes[0].intervals[:, 1], notes[0].notes ): @@ -391,11 +387,11 @@ def keys_to_jams(keys): ------- jannot_key: JAM key_mode annotation object. """ - jannot_key = jams.Annotation(namespace='key_mode') - jannot_key.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_key = jams.Annotation(namespace="key_mode") + jannot_key.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if keys[0] is not None: if not isinstance(keys[0], utils.KeyData): - raise TypeError('Type should be KeyData.') + raise TypeError("Type should be KeyData.") for beg, end, key in zip(keys[0].start_times, keys[0].end_times, keys[0].keys): jannot_key.append(time=beg, duration=end - beg, value=key) if keys[1] is not None: @@ -419,20 +415,20 @@ def multi_sections_to_jams(multi_sections): jannot_multi: JAM multi_segment annotation object. 
""" # sections with multiple annotators and multiple level annotations - jannot_multi = jams.Annotation(namespace='multi_segment') - jannot_multi.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_multi = jams.Annotation(namespace="multi_segment") + jannot_multi.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") jannot_multi.annotation_metadata = jams.AnnotationMetadata( - annotator={'name': multi_sections[1]} + annotator={"name": multi_sections[1]} ) for sections in multi_sections[0]: if sections[0] is not None: if not isinstance(sections[0], utils.SectionData): - raise TypeError('Type should be SectionData.') + raise TypeError("Type should be SectionData.") for inter, seg in zip(sections[0].intervals, sections[0].labels): jannot_multi.append( time=inter[0], duration=inter[1] - inter[0], - value={'label': seg, 'level': sections[1]}, + value={"label": seg, "level": sections[1]}, ) return jannot_multi @@ -451,11 +447,11 @@ def tempos_to_jams(tempos): ------- jannot_tempo: JAM tempo annotation object. """ - jannot_tempo = jams.Annotation(namespace='tempo') - jannot_tempo.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_tempo = jams.Annotation(namespace="tempo") + jannot_tempo.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if tempos[0] is not None: if not isinstance(tempos[0], float) and not isinstance(tempos[0], int): - raise TypeError('Type should be float or int.') + raise TypeError("Type should be float or int.") jannot_tempo.append(time=0, duration=0, confidence=1, value=tempos[0]) if tempos[1] is not None: jannot_tempo.sandbox = jams.Sandbox(name=tempos[1]) @@ -476,11 +472,11 @@ def events_to_jams(events): ------- jannot_events: JAM tag_open annotation object. 
""" - jannot_events = jams.Annotation(namespace='tag_open') - jannot_events.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_events = jams.Annotation(namespace="tag_open") + jannot_events.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if events[0] is not None: if type(events[0]) != utils.EventData: - raise TypeError('Type should be EventData.') + raise TypeError("Type should be EventData.") for beg, end, label in zip( events[0].start_times, events[0].end_times, events[0].event ): @@ -504,16 +500,16 @@ def f0s_to_jams(f0s): ------- jannot_f0: JAM pitch_contour annotation object. """ - jannot_f0 = jams.Annotation(namespace='pitch_contour') - jannot_f0.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_f0 = jams.Annotation(namespace="pitch_contour") + jannot_f0.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if f0s[0] is not None: if not isinstance(f0s[0], utils.F0Data): - raise TypeError('Type should be F0Data.') + raise TypeError("Type should be F0Data.") for t, f, c in zip(f0s[0].times, f0s[0].frequencies, f0s[0].confidence): jannot_f0.append( time=t, duration=0.0, - value={'index': 0, 'frequency': f, 'voiced': f > 0}, + value={"index": 0, "frequency": f, "voiced": f > 0}, confidence=c, ) if f0s[1] is not None: @@ -535,11 +531,11 @@ def lyrics_to_jams(lyrics): ------- jannot_lyric: JAM lyric annotation object. 
""" - jannot_lyric = jams.Annotation(namespace='lyrics') - jannot_lyric.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_lyric = jams.Annotation(namespace="lyrics") + jannot_lyric.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if lyrics[0] is not None: if not isinstance(lyrics[0], utils.LyricData): - raise TypeError('Type should be LyricData.') + raise TypeError("Type should be LyricData.") for beg, end, lyric in zip( lyrics[0].start_times, lyrics[0].end_times, lyrics[0].lyrics ): @@ -563,13 +559,13 @@ def tag_gtzan_to_jams(tags): ------- jannot_tag_gtzan: JAM tag_gtzan annotation object. """ - jannot_tag_gtzan = jams.Annotation(namespace='tag_gtzan') + jannot_tag_gtzan = jams.Annotation(namespace="tag_gtzan") jannot_tag_gtzan.annotation_metadata = jams.AnnotationMetadata( - data_source='mirdata' + data_source="mirdata" ) if tags[0] is not None: if not isinstance(tags[0], str): - raise TypeError('Type should be str.') + raise TypeError("Type should be str.") jannot_tag_gtzan.append(time=0.0, duration=0.0, value=tags[0]) if tags[1] is not None: jannot_tag_gtzan.sandbox = jams.Sandbox(name=tags[1]) @@ -590,11 +586,11 @@ def tag_open_to_jams(tags): ------- jannot_tag_open: JAM tag_open annotation object. 
""" - jannot_tag_open = jams.Annotation(namespace='tag_open') - jannot_tag_open.annotation_metadata = jams.AnnotationMetadata(data_source='mirdata') + jannot_tag_open = jams.Annotation(namespace="tag_open") + jannot_tag_open.annotation_metadata = jams.AnnotationMetadata(data_source="mirdata") if tags[0] is not None: if not isinstance(tags[0], str): - raise TypeError('Type should be str.') + raise TypeError("Type should be str.") jannot_tag_open.append(time=0.0, duration=0.0, value=tags[0]) if tags[1] is not None: jannot_tag_open.sandbox = jams.Sandbox(name=tags[1]) diff --git a/mirdata/medleydb_pitch.py b/mirdata/medleydb_pitch.py deleted file mode 100644 index 2b583962a..000000000 --- a/mirdata/medleydb_pitch.py +++ /dev/null @@ -1,251 +0,0 @@ -# -*- coding: utf-8 -*- -"""MedleyDB pitch Dataset Loader - -MedleyDB is a dataset of annotated, royalty-free multitrack recordings. -MedleyDB was curated primarily to support research on melody extraction, -addressing important shortcomings of existing collections. For each song -we provide melody f0 annotations as well as instrument activations for -evaluating automatic instrument recognition. 
- -For more details, please visit: https://medleydb.weebly.com - -""" - -import csv -import json -import librosa -import logging -import numpy as np -import os - -from mirdata import download_utils -from mirdata import jams_utils -from mirdata import track -from mirdata import utils - -DATASET_DIR = 'MedleyDB-Pitch' - - -def _load_metadata(data_home): - metadata_path = os.path.join(data_home, 'medleydb_pitch_metadata.json') - - if not os.path.exists(metadata_path): - logging.info('Metadata file {} not found.'.format(metadata_path)) - return None - - with open(metadata_path, 'r') as fhandle: - metadata = json.load(fhandle) - - metadata['data_home'] = data_home - return metadata - - -DATA = utils.LargeData('medleydb_pitch_index.json', _load_metadata) - - -class Track(track.Track): - """medleydb_pitch Track class - - Args: - track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Attributes: - artist (str): artist - audio_path (str): path to the audio file - genre (str): genre - instrument (str): instrument of the track - pitch_path (str): path to the pitch annotation file - title (str): title - track_id (str): track id - - """ - - def __init__(self, track_id, data_home=None): - if track_id not in DATA.index: - raise ValueError( - '{} is not a valid track ID in MedleyDB-Pitch'.format(track_id) - ) - - self.track_id = track_id - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - self._data_home = data_home - self._track_paths = DATA.index[track_id] - self.pitch_path = os.path.join(self._data_home, self._track_paths['pitch'][0]) - - metadata = DATA.metadata(data_home) - if metadata is not None and track_id in metadata: - self._track_metadata = metadata[track_id] - else: - self._track_metadata = { - 'instrument': None, - 'artist': None, - 'title': None, - 'genre': None, - } - - self.audio_path = 
os.path.join(self._data_home, self._track_paths['audio'][0]) - self.instrument = self._track_metadata['instrument'] - self.artist = self._track_metadata['artist'] - self.title = self._track_metadata['title'] - self.genre = self._track_metadata['genre'] - - @utils.cached_property - def pitch(self): - """F0Data: The human-annotated pitch""" - return load_pitch(self.pitch_path) - - @property - def audio(self): - """(np.ndarray, float): audio signal, sample rate""" - return load_audio(self.audio_path) - - def to_jams(self): - """Jams: the track's data in jams format""" - return jams_utils.jams_converter( - audio_path=self.audio_path, - f0_data=[(self.pitch, 'annotated pitch')], - metadata=self._track_metadata, - ) - - -def load_audio(audio_path): - """Load a MedleyDB audio file. - - Args: - audio_path (str): path to audio file - - Returns: - y (np.ndarray): the mono audio signal - sr (float): The sample rate of the audio file - - """ - if not os.path.exists(audio_path): - raise IOError("audio_path {} does not exist".format(audio_path)) - - return librosa.load(audio_path, sr=None, mono=True) - - -def download(data_home=None): - """MedleyDB is not available for downloading directly. - This function prints a helper message to download MedleyDB - through zenodo.org. - - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - """ - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - info_message = """ - To download this dataset, visit: - https://zenodo.org/record/2620624#.XKZc7hNKh24 - and request access. 
- - Once downloaded, unzip the file MedleyDB-Pitch.zip - and copy the result to: - {data_home} - """.format( - data_home=data_home - ) - - download_utils.downloader(data_home, info_message=info_message) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load MedleyDB pitch dataset - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - medleydb_pitch_data = {} - for key in track_ids(): - medleydb_pitch_data[key] = Track(key, data_home=data_home) - return medleydb_pitch_data - - -def load_pitch(pitch_path): - if not os.path.exists(pitch_path): - raise IOError("pitch_path {} does not exist".format(pitch_path)) - - times = [] - freqs = [] - with open(pitch_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter=',') - for line in reader: - times.append(float(line[0])) - freqs.append(float(line[1])) - - times = np.array(times) - freqs = np.array(freqs) - confidence = (freqs > 0).astype(float) - pitch_data = utils.F0Data(times, freqs, confidence) - return pitch_data - - -def cite(): - """Print the reference""" - - cite_data = """ -=========== MLA =========== -Bittner, Rachel, et al. -"MedleyDB: A multitrack dataset for annotation-intensive MIR research." -In Proceedings of the 15th International Society for Music Information Retrieval Conference (ISMIR). 2014. 
- -========== Bibtex ========== -@inproceedings{bittner2014medleydb, - Author = {Bittner, Rachel M and Salamon, Justin and Tierney, Mike and Mauch, Matthias and Cannam, Chris and Bello, Juan P}, - Booktitle = {International Society of Music Information Retrieval (ISMIR)}, - Month = {October}, - Title = {Medley{DB}: A Multitrack Dataset for Annotation-Intensive {MIR} Research}, - Year = {2014} -} -""" - - print(cite_data) diff --git a/mirdata/rwc_classical.py b/mirdata/rwc_classical.py deleted file mode 100644 index ffb237758..000000000 --- a/mirdata/rwc_classical.py +++ /dev/null @@ -1,448 +0,0 @@ -# -*- coding: utf-8 -*- -"""RWC Classical Dataset Loader - - The Classical Music Database consists of 50 pieces: -* Symphonies: 4 pieces -* Concerti: 2 pieces -* Orchestral music: 4 pieces -* Chamber music: 10 pieces -* Solo performances: 24 pieces -* Vocal performances: 6 pieces - -For more details, please visit: https://staff.aist.go.jp/m.goto/RWC-MDB/rwc-mdb-c.html -""" -import csv -import librosa -import logging -import numpy as np -import os - -from mirdata import download_utils -from mirdata import jams_utils -from mirdata import track -from mirdata import utils - -DATASET_DIR = 'RWC-Classical' - -REMOTES = { - 'annotations_beat': download_utils.RemoteFileMetadata( - filename='AIST.RWC-MDB-C-2001.BEAT.zip', - url='https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-C-2001.BEAT.zip', - checksum='e8ee05854833cbf5eb7280663f71c29b', - destination_dir='annotations', - ), - 'annotations_sections': download_utils.RemoteFileMetadata( - filename='AIST.RWC-MDB-C-2001.CHORUS.zip', - url='https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-C-2001.CHORUS.zip', - checksum='f77bd527510376f59f5a2eed8fd7feb3', - destination_dir='annotations', - ), - 'metadata': download_utils.RemoteFileMetadata( - filename='rwc-c.csv', - url='https://github.com/magdalenafuentes/metadata/archive/master.zip', - checksum='7dbe87fedbaaa1f348625a2af1d78030', - 
destination_dir=None, - ), -} - - -def _load_metadata(data_home): - - metadata_path = os.path.join(data_home, 'metadata-master', 'rwc-c.csv') - - if not os.path.exists(metadata_path): - logging.info( - 'Metadata file {} not found.'.format(metadata_path) - + 'You can download the metadata file by running download()' - ) - return None - - with open(metadata_path, 'r') as fhandle: - dialect = csv.Sniffer().sniff(fhandle.read(1024)) - fhandle.seek(0) - reader = csv.reader(fhandle, dialect) - raw_data = [] - for line in reader: - if line[0] != 'Piece No.': - raw_data.append(line) - - metadata_index = {} - for line in raw_data: - if line[0] == 'Piece No.': - continue - p = '00' + line[0].split('.')[1][1:] - track_id = 'RM-C{}'.format(p[len(p) - 3 :]) - - metadata_index[track_id] = { - 'piece_number': line[0], - 'suffix': line[1], - 'track_number': line[2], - 'title': line[3], - 'composer': line[4], - 'artist': line[5], - 'duration': _duration_to_sec(line[6]), - 'category': line[7], - } - - metadata_index['data_home'] = data_home - - return metadata_index - - -DATA = utils.LargeData('rwc_classical_index.json', _load_metadata) - - -class Track(track.Track): - """rwc_classical Track class - - Args: - track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Attributes: - artist (str): the track's artist - audio_path (str): path of the audio file - beats_path (str): path of the beat annotation file - category (str): One of 'Symphony', 'Concerto', 'Orchestral', - 'Solo', 'Chamber', 'Vocal', or blank. - composer (str): Composer of this Track. - duration (float): Duration of the track in seconds - piece_number (str): Piece number of this Track, [1-50] - sections_path (str): path of the section annotation file - suffix (str): string within M01-M06 - title (str): Title of The track. 
- track_id (str): track id - track_number (str): CD track number of this Track - - """ - - def __init__(self, track_id, data_home=None): - if track_id not in DATA.index: - raise ValueError( - '{} is not a valid track ID in RWC-Classical'.format(track_id) - ) - - self.track_id = track_id - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home - - self._track_paths = DATA.index[track_id] - self.sections_path = os.path.join( - self._data_home, self._track_paths['sections'][0] - ) - self.beats_path = os.path.join(self._data_home, self._track_paths['beats'][0]) - - metadata = DATA.metadata(data_home) - if metadata is not None and track_id in metadata: - self._track_metadata = metadata[track_id] - else: - self._track_metadata = { - 'piece_number': None, - 'suffix': None, - 'track_number': None, - 'title': None, - 'composer': None, - 'artist': None, - 'duration': None, - 'category': None, - } - - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) - - self.piece_number = self._track_metadata['piece_number'] - self.suffix = self._track_metadata['suffix'] - self.track_number = self._track_metadata['track_number'] - self.title = self._track_metadata['title'] - self.composer = self._track_metadata['composer'] - self.artist = self._track_metadata['artist'] - self.duration = self._track_metadata['duration'] - self.category = self._track_metadata['category'] - - @utils.cached_property - def sections(self): - """SectionData: human labeled section annotations""" - return load_sections(self.sections_path) - - @utils.cached_property - def beats(self): - """BeatData: human labeled beat annotations""" - return load_beats(self.beats_path) - - @property - def audio(self): - """(np.ndarray, float): audio signal, sample rate""" - return load_audio(self.audio_path) - - def to_jams(self): - """Jams: the track's data in jams format""" - return jams_utils.jams_converter( - audio_path=self.audio_path, - 
beat_data=[(self.beats, None)], - section_data=[(self.sections, None)], - metadata=self._track_metadata, - ) - - -def load_audio(audio_path): - """Load a RWC audio file. - - Args: - audio_path (str): path to audio file - - Returns: - y (np.ndarray): the mono audio signal - sr (float): The sample rate of the audio file - - """ - if not os.path.exists(audio_path): - raise IOError("audio_path {} does not exist".format(audio_path)) - - return librosa.load(audio_path, sr=None, mono=True) - - -def download( - data_home=None, partial_download=None, force_overwrite=False, cleanup=True -): - """Download the RWC Classical (annotations and metadata). - The audio files are not provided due to copyright issues. - - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - partial_download (list): - List indicating what to partially download. The list can include any of - * `'annotations_beat'` the beat annotation files - * `'annotations_sections'` the sections annotation files - * `'metadata'` the metadata files - If `None`, all data is downloaded. - cleanup (bool): - Whether to delete the zip/tar file after extracting. - - """ - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - info_message = """ - Unfortunately the audio files of the RWC-Classical dataset are not available - for download. If you have the RWC-Classical dataset, place the contents into a - folder called RWC-Classical with the following structure: - > RWC-Classical/ - > annotations/ - > audio/rwc-c-m0i with i in [1 .. 
6] - > metadata-master/ - and copy the RWC-Classical folder to {} - """.format( - data_home - ) - - download_utils.downloader( - data_home, - remotes=REMOTES, - partial_download=partial_download, - info_message=info_message, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load RWC-Classical dataset - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - rwc_classical_data = {} - for key in track_ids(): - rwc_classical_data[key] = Track(key, data_home=data_home) - return rwc_classical_data - - -def load_sections(sections_path): - if not os.path.exists(sections_path): - raise IOError("sections_path {} does not exist".format(sections_path)) - - begs = [] # timestamps of section beginnings - ends = [] # timestamps of section endings - secs = [] # section labels - - with open(sections_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter='\t') - for line in reader: - begs.append(float(line[0]) / 100.0) - ends.append(float(line[1]) / 100.0) - secs.append(line[2]) - - return utils.SectionData(np.array([begs, ends]).T, secs) - - -def _position_in_bar(beat_positions, beat_times): - """ - Mapping to beat position in bar (e.g. 1, 2, 3, 4). 
- """ - # Remove -1 - _beat_positions = np.delete(beat_positions, np.where(beat_positions == -1)) - beat_times_corrected = np.delete(beat_times, np.where(beat_positions == -1)) - - # Create corrected array with downbeat positions - beat_positions_corrected = np.zeros((len(_beat_positions),)) - downbeat_positions = np.where(_beat_positions == np.max(_beat_positions))[0] - _beat_positions[downbeat_positions] = 1 - beat_positions_corrected[downbeat_positions] = 1 - - # Propagate positions - for b in range(0, len(_beat_positions)): - if _beat_positions[b] > _beat_positions[b - 1]: - beat_positions_corrected[b] = beat_positions_corrected[b - 1] + 1 - - if not downbeat_positions[0] == 0: - timesig_next_bar = beat_positions_corrected[downbeat_positions[1] - 1] - for b in range(1, downbeat_positions[0] + 1): - beat_positions_corrected[downbeat_positions[0] - b] = ( - timesig_next_bar - b + 1 - ) - - return beat_positions_corrected, beat_times_corrected - - -def load_beats(beats_path): - if not os.path.exists(beats_path): - raise IOError("beats_path {} does not exist".format(beats_path)) - - beat_times = [] # timestamps of beat interval beginnings - beat_positions = [] # beat position inside the bar - - with open(beats_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter='\t') - for line in reader: - beat_times.append(float(line[0]) / 100.0) - beat_positions.append(int(line[2])) - beat_positions, beat_times = _position_in_bar( - np.array(beat_positions), np.array(beat_times) - ) - - return utils.BeatData(beat_times, beat_positions.astype(int)) - - -def _duration_to_sec(duration): - if type(duration) == str: - if ':' in duration: - if len(duration.split(':')) <= 2: - minutes, secs = duration.split(':') - else: - minutes, secs, _ = duration.split( - ':' - ) # mistake in annotation in RM-J044 - total_secs = float(minutes) * 60 + float(secs) - return total_secs - - -def _load_metadata(data_home): - metadata_path = os.path.join(data_home, 'metadata-master', 
'rwc-c.csv') - - if not os.path.exists(metadata_path): - logging.info( - 'Metadata file {} not found.'.format(metadata_path) - + 'You can download the metadata file by running download()' - ) - return None - - with open(metadata_path, 'r') as fhandle: - dialect = csv.Sniffer().sniff(fhandle.read(1024)) - fhandle.seek(0) - reader = csv.reader(fhandle, dialect) - raw_data = [] - for line in reader: - if line[0] != 'Piece No.': - raw_data.append(line) - - metadata_index = {} - for line in raw_data: - if line[0] == 'Piece No.': - continue - p = '00' + line[0].split('.')[1][1:] - track_id = 'RM-C{}'.format(p[len(p) - 3 :]) - - metadata_index[track_id] = { - 'piece_number': line[0], - 'suffix': line[1], - 'track_number': line[2], - 'title': line[3], - 'composer': line[4], - 'artist': line[5], - 'duration': _duration_to_sec(line[6]), - 'category': line[7], - } - - metadata_index['data_home'] = data_home - - return metadata_index - - -def cite(): - cite_data = """ -=========== MLA =========== - -Goto, Masataka, et al., -"RWC Music Database: Popular, Classical and Jazz Music Databases.", -3rd International Society for Music Information Retrieval Conference (2002) - -========== Bibtex ========== - -@inproceedings{goto2002rwc, - title={RWC Music Database: Popular, Classical and Jazz Music Databases.}, - author={Goto, Masataka and Hashiguchi, Hiroki and Nishimura, Takuichi and Oka, Ryuichi}, - booktitle={3rd International Society for Music Information Retrieval Conference}, - year={2002}, - series={ISMIR}, -} - -""" - - print(cite_data) diff --git a/mirdata/rwc_jazz.py b/mirdata/rwc_jazz.py deleted file mode 100644 index 56fe7edf7..000000000 --- a/mirdata/rwc_jazz.py +++ /dev/null @@ -1,334 +0,0 @@ -# -*- coding: utf-8 -*- -"""RWC Jazz Dataset Loader. - -The Jazz Music Database consists of 50 pieces: - -* Instrumentation variations: 35 pieces (5 pieces × 7 instrumentations). 
-The instrumentation-variation pieces were recorded to obtain different versions -of the same piece; i.e., different arrangements performed by different player -instrumentations. Five standard-style jazz pieces were originally composed -and then performed in modern-jazz style using the following seven instrumentations: -1. Piano solo -2. Guitar solo -3. Duo: Vibraphone + Piano, Flute + Piano, and Piano + Bass -4. Piano trio: Piano + Bass + Drums -5. Piano trio + Trumpet or Tenor saxophone -6. Octet: Piano trio + Guitar + Alto saxophone + Baritone saxophone + Tenor saxophone × 2 -7. Piano trio + Vibraphone or Flute - -* Style variations: 9 pieces -The style-variation pieces were recorded to represent various styles of jazz. -They include four well-known public-domain pieces and consist of -1. Vocal jazz: 2 pieces (including "Aura Lee") -2. Big band jazz: 2 pieces (including "The Entertainer") -3. Modal jazz: 2 pieces -4. Funky jazz: 2 pieces (including "Silent Night") -5. Free jazz: 1 piece (including "Joyful, Joyful, We Adore Thee") -Fusion (crossover): 6 pieces -The fusion pieces were recorded to obtain music that combines elements of jazz -with other styles such as popular, rock, and latin. They include music with an -eighth-note feel, music with a sixteenth-note feel, and Latin jazz music. 
- -For more details, please visit: https://staff.aist.go.jp/m.goto/RWC-MDB/rwc-mdb-j.html -""" -import csv -import librosa -import logging -import os - -from mirdata import download_utils -from mirdata import jams_utils -from mirdata import track -from mirdata import utils - -# these functions are identical for all rwc datasets -from mirdata.rwc_classical import ( - load_beats, - load_sections, - load_audio, - _duration_to_sec, -) - - -REMOTES = { - 'metadata': download_utils.RemoteFileMetadata( - filename='rwc-j.csv', - url='https://github.com/magdalenafuentes/metadata/archive/master.zip', - checksum='7dbe87fedbaaa1f348625a2af1d78030', - destination_dir=None, - ), - 'annotations_beat': download_utils.RemoteFileMetadata( - filename='AIST.RWC-MDB-J-2001.BEAT.zip', - url='https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-J-2001.BEAT.zip', - checksum='b483853da05d0fff3992879f7729bcb4', - destination_dir='annotations', - ), - 'annotations_sections': download_utils.RemoteFileMetadata( - filename='AIST.RWC-MDB-J-2001.CHORUS.zip', - url='https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-J-2001.CHORUS.zip', - checksum='44afcf7f193d7e48a7d99e7a6f3ed39d', - destination_dir='annotations', - ), -} - -DATASET_DIR = 'RWC-Jazz' - - -def _load_metadata(data_home): - - metadata_path = os.path.join(data_home, 'metadata-master', 'rwc-j.csv') - - if not os.path.exists(metadata_path): - logging.info( - 'Metadata file {} not found.'.format(metadata_path) - + 'You can download the metadata file by running download()' - ) - return None - - with open(metadata_path, 'r') as fhandle: - dialect = csv.Sniffer().sniff(fhandle.read(1024)) - fhandle.seek(0) - reader = csv.reader(fhandle, dialect) - raw_data = [] - for line in reader: - if line[0] != 'Piece No.': - raw_data.append(line) - - metadata_index = {} - for line in raw_data: - if line[0] == 'Piece No.': - continue - p = '00' + line[0].split('.')[1][1:] - track_id = 'RM-J{}'.format(p[len(p) - 3 :]) - - 
metadata_index[track_id] = { - 'piece_number': line[0], - 'suffix': line[1], - 'track_number': line[2], - 'title': line[3], - 'artist': line[4], - 'duration': _duration_to_sec(line[5]), - 'variation': line[6], - 'instruments': line[7], - } - - metadata_index['data_home'] = data_home - - return metadata_index - - -DATA = utils.LargeData('rwc_jazz_index.json', _load_metadata) - - -class Track(track.Track): - """rwc_jazz Track class - - Args: - track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Attributes: - artist (str): Artist name - audio_path (str): path of the audio file - beats_path (str): path of the beat annotation file - duration (float): Duration of the track in seconds - instruments (str): list of used instruments. - piece_number (str): Piece number of this Track, [1-50] - sections_path (str): path of the section annotation file - suffix (str): M01-M04 - title (str): Title of The track. 
- track_id (str): track id - track_number (str): CD track number of this Track - variation (str): TODO - - """ - - def __init__(self, track_id, data_home=None): - if track_id not in DATA.index: - raise ValueError('{} is not a valid track ID in RWC-Jazz'.format(track_id)) - - self.track_id = track_id - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home - - self._track_paths = DATA.index[track_id] - self.sections_path = os.path.join( - self._data_home, self._track_paths['sections'][0] - ) - self.beats_path = os.path.join(self._data_home, self._track_paths['beats'][0]) - - metadata = DATA.metadata(data_home) - if metadata is not None and track_id in metadata: - self._track_metadata = metadata[track_id] - else: - self._track_metadata = { - 'piece_number': None, - 'suffix': None, - 'track_number': None, - 'title': None, - 'artist': None, - 'duration': None, - 'variation': None, - 'instruments': None, - } - - self.audio_path = os.path.join(self._data_home, self._track_paths['audio'][0]) - - self.piece_number = self._track_metadata['piece_number'] - self.suffix = self._track_metadata['suffix'] - self.track_number = self._track_metadata['track_number'] - self.title = self._track_metadata['title'] - self.artist = self._track_metadata['artist'] - self.duration = self._track_metadata['duration'] - self.variation = self._track_metadata['variation'] - self.instruments = self._track_metadata['instruments'] - - @utils.cached_property - def sections(self): - """SectionData: human-labeled section data""" - return load_sections(self.sections_path) - - @utils.cached_property - def beats(self): - """BeatData: human-labeled beat data""" - return load_beats(self.beats_path) - - @property - def audio(self): - """(np.ndarray, float): audio signal, sample rate""" - return load_audio(self.audio_path) - - def to_jams(self): - """Jams: the track's data in jams format""" - return jams_utils.jams_converter( - 
audio_path=self.audio_path, - beat_data=[(self.beats, None)], - section_data=[(self.sections, None)], - metadata=self._track_metadata, - ) - - -def download( - data_home=None, partial_download=None, force_overwrite=False, cleanup=True -): - """Download the RWC Jazz (annotations and metadata). - The audio files are not provided due to copyright issues. - - Args: - data_home (str): - Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - partial_download (list): - List indicating what to partially download. The list can include any of: - * `'annotations_beat'` the beat annotation files - * `'annotations_sections'` the sections annotation files - * `'metadata'` the metadata files - If `None`, all data is downloaded. - cleanup (bool): - Whether to delete the zip/tar file after extracting. - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - info_message = """ - Unfortunately the audio files of the RWC-Jazz dataset are not available - for download. If you have the RWC-Jazz dataset, place the contents into a - folder called RWC-Jazz with the following structure: - > RWC-Jazz/ - > annotations/ - > audio/rwc-j-m0i with i in [1 .. 4] - > metadata-master/ - and copy the RWC-Jazz folder to {} - """.format( - data_home - ) - - download_utils.downloader( - data_home, - remotes=REMOTES, - partial_download=partial_download, - info_message=info_message, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load RWC-Jazz dataset - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - rwc_jazz_data = {} - for key in track_ids(): - rwc_jazz_data[key] = Track(key, data_home=data_home) - return rwc_jazz_data - - -def cite(): - cite_data = """ -=========== MLA =========== - -Goto, Masataka, et al., -"RWC Music Database: Popular, Classical and Jazz Music Databases.", -3rd International Society for Music Information Retrieval Conference (2002) - -========== Bibtex ========== - -@inproceedings{goto2002rwc, - title={RWC Music Database: Popular, Classical and Jazz Music Databases.}, - author={Goto, Masataka and Hashiguchi, Hiroki and Nishimura, Takuichi and Oka, Ryuichi}, - booktitle={3rd International Society for Music Information Retrieval Conference}, - year={2002}, - series={ISMIR}, -} - -""" - - print(cite_data) diff --git a/mirdata/rwc_popular.py b/mirdata/rwc_popular.py deleted file mode 100644 index 812221968..000000000 --- a/mirdata/rwc_popular.py +++ /dev/null @@ -1,435 +0,0 @@ -# -*- coding: utf-8 -*- -"""RWC Popular Dataset 
Loader - -The Popular Music Database consists of 100 songs — 20 songs with English lyrics -performed in the style of popular music typical of songs on the American hit -charts in the 1980s, and 80 songs with Japanese lyrics performed in the style of -modern Japanese popular music typical of songs on the Japanese hit charts in -the 1990s. - -For more details, please visit: https://staff.aist.go.jp/m.goto/RWC-MDB/rwc-mdb-p.html -""" -import csv -import librosa -import logging -import numpy as np -import os - -from mirdata import download_utils -from mirdata import jams_utils -from mirdata import track -from mirdata import utils - -# these functions are identical for all rwc datasets -from mirdata.rwc_classical import ( - load_beats, - load_sections, - load_audio, - _duration_to_sec, -) - -REMOTES = { - 'metadata': download_utils.RemoteFileMetadata( - filename='rwc-p.csv', - url='https://github.com/magdalenafuentes/metadata/archive/master.zip', - checksum='7dbe87fedbaaa1f348625a2af1d78030', - destination_dir=None, - ), - 'annotations_beat': download_utils.RemoteFileMetadata( - filename='AIST.RWC-MDB-P-2001.BEAT.zip', - url='https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-P-2001.BEAT.zip', - checksum='3858aa989535bd7196b3cd07b512b5b6', - destination_dir='annotations', - ), - 'annotations_sections': download_utils.RemoteFileMetadata( - filename='AIST.RWC-MDB-P-2001.CHORUS.zip', - url='https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-P-2001.CHORUS.zip', - checksum='f76b3a32701fbd9bf78baa608f692a77', - destination_dir='annotations', - ), - 'annotations_chords': download_utils.RemoteFileMetadata( - filename='AIST.RWC-MDB-P-2001.CHORD.zip', - url='https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-P-2001.CHORD.zip', - checksum='68379c88bc8ec3f1907b32a3579197c5', - destination_dir='annotations', - ), - 'annotations_vocal_act': download_utils.RemoteFileMetadata( - filename='AIST.RWC-MDB-P-2001.VOCA_INST.zip', - 
url='https://staff.aist.go.jp/m.goto/RWC-MDB/AIST-Annotation/AIST.RWC-MDB-P-2001.VOCA_INST.zip', - checksum='47ded648a496407ef49dba9c8bf80e87', - destination_dir='annotations', - ), -} - -DATASET_DIR = 'RWC-Popular' - - -def _load_metadata(data_home): - - metadata_path = os.path.join(data_home, 'metadata-master', 'rwc-p.csv') - - if not os.path.exists(metadata_path): - logging.info( - 'Metadata file {} not found.'.format(metadata_path) - + 'You can download the metadata file by running download()' - ) - return None - - with open(metadata_path, 'r') as fhandle: - dialect = csv.Sniffer().sniff(fhandle.read(1024)) - fhandle.seek(0) - reader = csv.reader(fhandle, dialect) - raw_data = [] - for line in reader: - if line[0] != 'Piece No.': - raw_data.append(line) - - metadata_index = {} - for line in raw_data: - if line[0] == 'Piece No.': - continue - p = '00' + line[0].split('.')[1][1:] - track_id = 'RM-P{}'.format(p[len(p) - 3 :]) - - metadata_index[track_id] = { - 'piece_number': line[0], - 'suffix': line[1], - 'track_number': line[2], - 'title': line[3], - 'artist': line[4], - 'singer_information': line[5], - 'duration': _duration_to_sec(line[6]), - 'tempo': line[7], - 'instruments': line[8], - 'drum_information': line[9], - } - - metadata_index['data_home'] = data_home - - return metadata_index - - -DATA = utils.LargeData('rwc_popular_index.json', _load_metadata) - - -class Track(track.Track): - """rwc_popular Track class - - Args: - track_id (str): track id of the track - data_home (str): Local path where the dataset is stored. 
default=None - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Attributes: - artist (str): artist - audio_path (str): path of the audio file - beats_path (str): path of the beat annotation file - chords_path (str): path of the chord annotation file - drum_information (str): If the drum is 'Drum sequences', 'Live drums', - or 'Drum loops' - duration (float): Duration of the track in seconds - instruments (str): List of used instruments - piece_number (str): Piece number, [1-50] - sections_path (str): path of the section annotation file - singer_information (str): TODO - suffix (str): M01-M04 - tempo (str): Tempo of the track in BPM - title (str): title - track_id (str): track id - track_number (str): CD track number - voca_inst_path (str): path of the vocal/instrumental annotation file - - """ - - def __init__(self, track_id, data_home=None): - if track_id not in DATA.index: - raise ValueError( - '{} is not a valid track ID in RWC-Popular'.format(track_id) - ) - - self.track_id = track_id - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - self._data_home = data_home - - self._track_paths = DATA.index[track_id] - self.sections_path = os.path.join( - self._data_home, self._track_paths['sections'][0] - ) - self.beats_path = os.path.join(self._data_home, self._track_paths['beats'][0]) - self.chords_path = os.path.join(self._data_home, self._track_paths['chords'][0]) - self.voca_inst_path = os.path.join( - self._data_home, self._track_paths['voca_inst'][0] - ) - - metadata = DATA.metadata(data_home) - if metadata is not None and track_id in metadata: - self._track_metadata = metadata[track_id] - else: - # annotations with missing metadata - self._track_metadata = { - 'piece_number': None, - 'suffix': None, - 'track_number': None, - 'title': None, - 'artist': None, - 'singer_information': None, - 'duration': None, - 'tempo': None, - 'instruments': None, - 'drum_information': None, - } - - self.audio_path = 
os.path.join(self._data_home, self._track_paths['audio'][0]) - - self.piece_number = self._track_metadata['piece_number'] - self.suffix = self._track_metadata['suffix'] - self.track_number = self._track_metadata['track_number'] - self.title = self._track_metadata['title'] - self.artist = self._track_metadata['artist'] - self.singer_information = self._track_metadata['singer_information'] - self.duration = self._track_metadata['duration'] - self.tempo = self._track_metadata['tempo'] - self.instruments = self._track_metadata['instruments'] - self.drum_information = self._track_metadata['drum_information'] - - @utils.cached_property - def sections(self): - """SectionData: human-labeled section annotation""" - return load_sections(self.sections_path) - - @utils.cached_property - def beats(self): - """BeatData: human-labeled beat annotation""" - return load_beats(self.beats_path) - - @utils.cached_property - def chords(self): - """ChordData: human-labeled chord annotation""" - return load_chords(self.chords_path) - - @utils.cached_property - def vocal_instrument_activity(self): - """EventData: human-labeled vocal/instrument activity""" - return load_voca_inst(self.voca_inst_path) - - @property - def audio(self): - """(np.ndarray, float): audio signal, sample rate""" - return load_audio(self.audio_path) - - def to_jams(self): - """Jams: the track's data in jams format""" - return jams_utils.jams_converter( - audio_path=self.audio_path, - beat_data=[(self.beats, None)], - section_data=[(self.sections, None)], - chord_data=[(self.chords, None)], - metadata=self._track_metadata, - ) - - -def download( - data_home=None, partial_download=None, force_overwrite=False, cleanup=True -): - """Download the RWC Popular (annotations and metadata). - The audio files are not provided due to copyright issues. - - Args: - data_home (str): - Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - force_overwrite (bool): - Whether to overwrite the existing downloaded data - partial_download (list): - List indicating what to partially download. The list can include any of: - * `'annotations_beat'` the beat annotation files - * `'annotations_sections'` the sections annotation files - * `'annotations_chords'` the chords annotation files - * `'annotations_vocal_act'` the vocal activity annotation files - * `'metadata'` the metadata files - If `None`, all data is downloaded. - cleanup (bool): - Whether to delete the zip/tar file after extracting. - - """ - - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - info_message = """ - Unfortunately the audio files of the RWC-Popular dataset are not available - for download. If you have the RWC-Popular dataset, place the contents into a - folder called RWC-Popular with the following structure: - > RWC-Popular/ - > annotations/ - > audio/rwc-p-m0i with i in [1 .. 7] - > metadata-master/ - and copy the RWC-Popular folder to {} - """.format( - data_home - ) - - download_utils.downloader( - data_home, - remotes=REMOTES, - partial_download=partial_download, - info_message=info_message, - force_overwrite=force_overwrite, - cleanup=cleanup, - ) - - -def validate(data_home=None, silence=False): - """Validate if the stored dataset is a valid version - - Args: - data_home (str): Local path where the dataset is stored. 
- If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - missing_files (list): List of file paths that are in the dataset index - but missing locally - invalid_checksums (list): List of file paths that file exists in the dataset - index but has a different checksum compare to the reference checksum - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - missing_files, invalid_checksums = utils.validator( - DATA.index, data_home, silence=silence - ) - return missing_files, invalid_checksums - - -def track_ids(): - """Return track ids - - Returns: - (list): A list of track ids - """ - return list(DATA.index.keys()) - - -def load(data_home=None): - """Load RWC-Genre dataset - - Args: - data_home (str): Local path where the dataset is stored. - If `None`, looks for the data in the default directory, `~/mir_datasets` - - Returns: - (dict): {`track_id`: track data} - - """ - if data_home is None: - data_home = utils.get_default_dataset_path(DATASET_DIR) - - rwc_popular_data = {} - for key in track_ids(): - rwc_popular_data[key] = Track(key, data_home=data_home) - return rwc_popular_data - - -def load_chords(chords_path): - if not os.path.exists(chords_path): - raise IOError("chords_path {} does not exist".format(chords_path)) - - begs = [] # timestamps of chord beginnings - ends = [] # timestamps of chord endings - chords = [] # chord labels - - if os.path.exists(chords_path): - with open(chords_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter='\t') - for line in reader: - begs.append(float(line[0])) - ends.append(float(line[1])) - chords.append(line[2]) - - return utils.ChordData(np.array([begs, ends]).T, chords) - - -def load_voca_inst(voca_inst_path): - if not os.path.exists(voca_inst_path): - raise IOError("voca_inst_path {} does not exist".format(voca_inst_path)) - - begs = [] # timestamps of vocal-instrument activity beginnings - ends = [] # timestamps of vocal-instrument activity 
endings - events = [] # vocal-instrument activity labels - - with open(voca_inst_path, 'r') as fhandle: - reader = csv.reader(fhandle, delimiter='\t') - raw_data = [] - for line in reader: - if line[0] != 'Piece No.': - raw_data.append(line) - - for i in range(len(raw_data)): - # Parsing vocal-instrument activity as intervals (beg, end, event) - if raw_data[i] != raw_data[-1]: - begs.append(float(raw_data[i][0])) - ends.append(float(raw_data[i + 1][0])) - events.append(raw_data[i][1]) - - return utils.EventData(np.array(begs), np.array(ends), np.array(events)) - - -def cite(): - cite_data = """ -=========== MLA =========== - -If using beat and section annotations please cite: - -Goto, Masataka, et al., -"RWC Music Database: Popular, Classical and Jazz Music Databases.", -3rd International Society for Music Information Retrieval Conference (2002) - -If using chord annotations please cite: - -Cho, Taemin, and Juan P. Bello., -"A feature smoothing method for chord recognition using recurrence plots.", -12th International Society for Music Information Retrieval Conference (2011) - -If using vocal-instrument activity annotations please cite: - -Mauch, Matthias, et al., -"Timbre and Melody Features for the Recognition of Vocal Activity and Instrumental Solos in Polyphonic Music.", -12th International Society for Music Information Retrieval Conference (2011) - -========== Bibtex ========== - -If using beat and section annotations please cite: - -@inproceedings{goto2002rwc, - title={RWC Music Database: Popular, Classical and Jazz Music Databases.}, - author={Goto, Masataka and Hashiguchi, Hiroki and Nishimura, Takuichi and Oka, Ryuichi}, - booktitle={3rd International Society for Music Information Retrieval Conference}, - year={2002}, - series={ISMIR}, -} - -If using chord annotations please cite: - -@inproceedings{cho2011feature, - title={A feature smoothing method for chord recognition using recurrence plots}, - author={Cho, Taemin and Bello, Juan P}, - booktitle={12th 
International Society for Music Information Retrieval Conference}, - year={2011}, - series={ISMIR}, -} - -If using vocal-instrument activity annotations please cite: - -@inproceedings{mauch2011timbre, - title={Timbre and Melody Features for the Recognition of Vocal Activity and Instrumental Solos in Polyphonic Music.}, - author={Mauch, Matthias and Fujihara, Hiromasa and Yoshii, Kazuyoshi and Goto, Masataka}, - booktitle={ISMIR}, - year={2011}, - series={ISMIR}, -} - -""" - print(cite_data) diff --git a/mirdata/track.py b/mirdata/track.py deleted file mode 100644 index 51d5e4e24..000000000 --- a/mirdata/track.py +++ /dev/null @@ -1,159 +0,0 @@ -# -*- coding: utf-8 -*- -"""track object utility functions -""" -import types - -import numpy as np - -MAX_STR_LEN = 100 - - -class Track(object): - def __repr__(self): - properties = [v for v in dir(self.__class__) if not v.startswith("_")] - attributes = [ - v for v in dir(self) if not v.startswith("_") and v not in properties - ] - - repr_str = "Track(\n" - - for attr in attributes: - val = getattr(self, attr) - if isinstance(val, str): - if len(val) > MAX_STR_LEN: - val = "...{}".format(val[-MAX_STR_LEN:]) - val = '"{}"'.format(val) - repr_str += " {}={},\n".format(attr, val) - - for prop in properties: - val = getattr(self.__class__, prop) - if isinstance(val, types.FunctionType): - continue - - if val.__doc__ is None: - raise ValueError("{} has no documentation".format(prop)) - - val_type_str = val.__doc__.split(":")[0] - repr_str += " {}: {},\n".format(prop, val_type_str) - - repr_str += ")" - return repr_str - - def to_jams(self): - raise NotImplementedError - - -class MultiTrack(Track): - """MultiTrack class. - - A multitrack class is a collection of track objects and their associated audio - that can be mixed together. - A multitrack is iteslf a Track, and can have its own associated audio (such as - a mastered mix), its own metadata and its own annotations. 
- - """ - - def _check_mixable(self): - if not hasattr(self, "tracks") or not hasattr(self, "track_audio_property"): - raise NotImplementedError( - "This MultiTrack has no tracks/track_audio_property. Cannot perform mixing" - ) - - def get_target(self, track_keys, weights=None, average=True, enforce_length=True): - """Get target which is a linear mixture of tracks - - Args: - track_keys (list): list of track keys to mix together - weights (list or None): list of positive scalars to be used in the average - average (bool): if True, computes a weighted average of the tracks - if False, computes a weighted sum of the tracks - enforce_length (bool): If True, raises ValueError if the tracks are - not the same length. If False, pads audio with zeros to match the length - of the longest track - - Returns: - target (np.ndarray): target audio with shape (n_channels, n_samples) - - Raises: - ValueError: - if sample rates of the tracks are not equal - if enforce_length=True and lengths are not equal - - """ - self._check_mixable() - signals = [] - lengths = [] - sample_rates = [] - for k in track_keys: - audio, sample_rate = getattr(self.tracks[k], self.track_audio_property) - # ensure all signals are shape (n_channels, n_samples) - if len(audio.shape) == 1: - audio = audio[np.newaxis, :] - signals.append(audio) - lengths.append(audio.shape[1]) - sample_rates.append(sample_rate) - - if len(set(sample_rates)) > 1: - raise ValueError( - "Sample rates for tracks {} are not equal: {}".format( - track_keys, sample_rates - ) - ) - - max_length = np.max(lengths) - if any([l != max_length for l in lengths]): - if enforce_length: - raise ValueError( - "Track's {} audio are not the same length {}. 
Use enforce_length=False to pad with zeros.".format( - track_keys, lengths - ) - ) - else: - # pad signals to the max length - signals = [ - np.pad(signal, ((0, 0), (0, max_length - signal.shape[1]))) - for signal in signals - ] - - if weights is None: - weights = np.ones((len(track_keys),)) - - target = np.average(signals, axis=0, weights=weights) - if not average: - target *= np.sum(weights) - - return target - - def get_random_target(self, n_tracks=None, min_weight=0.3, max_weight=1.0): - """Get a random target by combining a random selection of tracks with random weights - - Args: - n_tracks (int or None): number of tracks to randomly mix. If None, uses all tracks - min_weight (float): minimum possible weight when mixing - max_weight (float): maximum possible weight when mixing - - Returns: - target (np.ndarray): mixture audio with shape (n_samples, n_channels) - tracks (list): list of keys of included tracks - weights (list): list of weights used to mix tracks - """ - self._check_mixable() - tracks = list(self.tracks.keys()) - if n_tracks is not None and n_tracks < len(tracks): - tracks = np.random.choice(tracks, n_tracks, replace=False) - - weights = np.random.uniform(low=min_weight, high=max_weight, size=len(tracks)) - target = self.get_target(tracks, weights=weights) - return target, tracks, weights - - def get_mix(self): - """Create a linear mixture given a subset of tracks. 
- - Args: - track_keys (list): list of track keys to mix together - - Returns: - target (np.ndarray): mixture audio with shape (n_samples, n_channels) - """ - self._check_mixable() - return self.get_target(list(self.tracks.keys())) diff --git a/mirdata/utils.py b/mirdata/utils.py index 950a71cbc..8732c258c 100644 --- a/mirdata/utils.py +++ b/mirdata/utils.py @@ -2,7 +2,6 @@ """Utility functions for mirdata Attributes: - MIR_DATASETS_DIR (str): home folder for MIR datasets NoteData (namedtuple): `intervals`, `notes`, `confidence` @@ -29,9 +28,7 @@ import hashlib import os import json - - -MIR_DATASETS_DIR = os.path.join(os.getenv('HOME', '/tmp'), 'mir_datasets') +import tqdm def md5(file_path): @@ -45,8 +42,8 @@ def md5(file_path): """ hash_md5 = hashlib.md5() - with open(file_path, 'rb') as fhandle: - for chunk in iter(lambda: fhandle.read(4096), b''): + with open(file_path, "rb") as fhandle: + for chunk in iter(lambda: fhandle.read(4096), b""): hash_md5.update(chunk) return hash_md5.hexdigest() @@ -68,23 +65,24 @@ def none_path_join(partial_path_list): return os.path.join(*partial_path_list) -def log_message(message, silence=False): +def log_message(message, verbose=True): """Helper function to log message Args: message (str): message to log - silence (bool): if true, the message is not logged + verbose (bool): if false, the message is not logged """ - if not silence: + if verbose: print(message) -def check_index(dataset_index, data_home): +def check_index(dataset_index, data_home, verbose=True): """check index to find out missing files and files with invalid checksum Args: dataset_index (list): dataset indices data_home (str): Local home path that the dataset is being stored + verbose (bool): if true, prints validation status while running Returns: missing_files (list): List of file paths that are in the dataset index @@ -97,7 +95,7 @@ def check_index(dataset_index, data_home): invalid_checksums = {} # loop over track ids - for track_id, track in 
dataset_index.items(): + for track_id, track in tqdm.tqdm(dataset_index.items(), disable=not verbose): # loop over each data file for this track id for key in track.keys(): filepath = track[key][0] @@ -118,7 +116,7 @@ def check_index(dataset_index, data_home): return missing_files, invalid_checksums -def validator(dataset_index, data_home, silence=False): +def validator(dataset_index, data_home, verbose=True): """Checks the existence and validity of files stored locally with respect to the paths and file checksums stored in the reference index. Logs invalid checksums and missing files. @@ -126,8 +124,8 @@ def validator(dataset_index, data_home, silence=False): Args: dataset_index (list): dataset indices data_home (str): Local home path that the dataset is being stored - silence (bool): if False (default), prints missing and invalid files - to stdout. Otherwise, this function is equivalent to check_index. + verbose (bool): if True (default), prints missing and invalid files + to stdout. Otherwise, this function is equivalent to check_index. Returns: missing_files (list): List of file paths that are in the dataset index @@ -136,78 +134,65 @@ def validator(dataset_index, data_home, silence=False): dataset index but has a different checksum compare to the reference checksum. 
""" - missing_files, invalid_checksums = check_index(dataset_index, data_home) + missing_files, invalid_checksums = check_index(dataset_index, data_home, verbose) # print path of any missing files has_any_missing_file = False - for track_id in missing_files.keys(): + for track_id in missing_files: if len(missing_files[track_id]) > 0: - log_message('Files missing for {}:'.format(track_id), silence) + log_message("Files missing for {}:".format(track_id), verbose) for fpath in missing_files[track_id]: - log_message(fpath, silence) - log_message('-' * 20, silence) + log_message(fpath, verbose) + log_message("-" * 20, verbose) has_any_missing_file = True # print path of any invalid checksums has_any_invalid_checksum = False - for track_id in invalid_checksums.keys(): + for track_id in invalid_checksums: if len(invalid_checksums[track_id]) > 0: - log_message('Invalid checksums for {}:'.format(track_id), silence) + log_message("Invalid checksums for {}:".format(track_id), verbose) for fpath in invalid_checksums[track_id]: - log_message(fpath, silence) - log_message('-' * 20, silence) + log_message(fpath, verbose) + log_message("-" * 20, verbose) has_any_invalid_checksum = True if not (has_any_missing_file or has_any_invalid_checksum): log_message( - 'Success: the dataset is complete and all files are valid.', silence + "Success: the dataset is complete and all files are valid.", verbose ) - log_message('-' * 20, silence) + log_message("-" * 20, verbose) return missing_files, invalid_checksums -NoteData = namedtuple('NoteData', ['intervals', 'notes', 'confidence']) +NoteData = namedtuple("NoteData", ["intervals", "notes", "confidence"]) -F0Data = namedtuple('F0Data', ['times', 'frequencies', 'confidence']) +F0Data = namedtuple("F0Data", ["times", "frequencies", "confidence"]) MultipitchData = namedtuple( - 'MultipitchData', ['times', 'frequency_list', 'confidence_list'] + "MultipitchData", ["times", "frequency_list", "confidence_list"] ) LyricData = namedtuple( - 
'LyricData', ['start_times', 'end_times', 'lyrics', 'pronunciations'] + "LyricData", ["start_times", "end_times", "lyrics", "pronunciations"] ) -SectionData = namedtuple('SectionData', ['intervals', 'labels']) - -BeatData = namedtuple('BeatData', ['beat_times', 'beat_positions']) - -ChordData = namedtuple('ChordData', ['intervals', 'labels']) - -KeyData = namedtuple('KeyData', ['start_times', 'end_times', 'keys']) +SectionData = namedtuple("SectionData", ["intervals", "labels"]) -TempoData = namedtuple('TempoData', ['time', 'duration', 'value', 'confidence']) +BeatData = namedtuple("BeatData", ["beat_times", "beat_positions"]) -EventData = namedtuple('EventData', ['start_times', 'end_times', 'event']) +ChordData = namedtuple("ChordData", ["intervals", "labels"]) +KeyData = namedtuple("KeyData", ["start_times", "end_times", "keys"]) -def get_default_dataset_path(dataset_name): - """Get the default path for a dataset given it's name +TempoData = namedtuple("TempoData", ["time", "duration", "value", "confidence"]) - Args: - dataset_name (str or None) - The name of the dataset folder, e.g. 
'Orchset' - - Returns: - save_path (str): Local path to the dataset - """ - return os.path.join(MIR_DATASETS_DIR, dataset_name) +EventData = namedtuple("EventData", ["start_times", "end_times", "event"]) def load_json_index(filename): - CWD = os.path.dirname(os.path.realpath(__file__)) - with open(os.path.join(CWD, 'indexes', filename)) as f: + working_dir = os.path.dirname(os.path.realpath(__file__)) + with open(os.path.join(working_dir, "datasets/indexes", filename)) as f: return json.load(f) @@ -257,6 +242,6 @@ def metadata(self, data_home): if self.metadata_load_fn is None: raise NotImplementedError - if self._metadata is None or self._metadata['data_home'] != data_home: + if self._metadata is None or self._metadata["data_home"] != data_home: self._metadata = self.metadata_load_fn(data_home) return self._metadata diff --git a/mirdata/version.py b/mirdata/version.py index 979cdda19..7d120044b 100644 --- a/mirdata/version.py +++ b/mirdata/version.py @@ -2,5 +2,5 @@ # -*- coding: utf-8 -*- """Version info""" -short_version = "0.2" -version = "0.2.2" +short_version = "0.3" +version = "0.3.0b0" diff --git a/scripts/print_track_docstring.py b/scripts/print_track_docstring.py index 984057ecf..5d3bdf8b4 100644 --- a/scripts/print_track_docstring.py +++ b/scripts/print_track_docstring.py @@ -9,21 +9,21 @@ TEST_TRACKIDS = { - 'beatles': '0111', - 'dali': '4b196e6c99574dd49ad00d56e132712b', - 'gtzan_genre': 'country.00000', - 'guitarset': '03_BN3-119-G_solo', - 'ikala': '10161_chorus', - 'maestro': '2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1', - 'medley_solos_db': 'd07b1fc0-567d-52c2-fef4-239f31c9d40e', - 'medleydb_melody': 'MusicDelta_Beethoven', - 'medleydb_pitch': 'AClassicEducation_NightOwl_STEM_08', - 'orchset': 'Beethoven-S3-I-ex1', - 'rwc_classical': 'RM-C003', - 'rwc_jazz': 'RM-J004', - 'rwc_popular': 'RM-P001', - 'salami': '2', - 'tinysol': 'Fl-ord-C4-mf-N-T14d', + "beatles": "0111", + "dali": "4b196e6c99574dd49ad00d56e132712b", + "gtzan_genre": 
"country.00000", + "guitarset": "03_BN3-119-G_solo", + "ikala": "10161_chorus", + "maestro": "2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1", + "medley_solos_db": "d07b1fc0-567d-52c2-fef4-239f31c9d40e", + "medleydb_melody": "MusicDelta_Beethoven", + "medleydb_pitch": "AClassicEducation_NightOwl_STEM_08", + "orchset": "Beethoven-S3-I-ex1", + "rwc_classical": "RM-C003", + "rwc_jazz": "RM-J004", + "rwc_popular": "RM-P001", + "salami": "2", + "tinysol": "Fl-ord-C4-mf-N-T14d", } @@ -33,7 +33,7 @@ def get_attributes_and_properties(class_instance): cached_properties = [] functions = [] for val in dir(class_instance.__class__): - if val.startswith('_'): + if val.startswith("_"): continue attr = getattr(class_instance.__class__, val) @@ -50,15 +50,15 @@ def get_attributes_and_properties(class_instance): itertools.chain.from_iterable([properties, cached_properties, functions]) ) for val in dir(class_instance): - if val.startswith('_'): + if val.startswith("_"): continue if val not in non_attributes: attributes.append(val) return { - 'attributes': sorted(attributes), - 'properties': sorted(properties), - 'cached_properties': sorted(cached_properties), - 'functions': sorted(functions), + "attributes": sorted(attributes), + "properties": sorted(properties), + "cached_properties": sorted(cached_properties), + "functions": sorted(functions), } @@ -72,43 +72,39 @@ def main(args): print("Please add a test track to the dictionary at the top of this script.") return - data_home = "tests/resources/mir_datasets/{}".format(dataset.DATASET_DIR) + data_home = "tests/resources/mir_datasets/{}".format(dataset.name) print(data_home) track = dataset.Track(track_id, data_home=data_home) data = get_attributes_and_properties(track) print('"""{} Track class'.format(args.dataset)) - print('') - print('Args:') - print(' track_id (str): track id of the track') - print(' data_home (str): Local path where the dataset is stored. 
default=None') - print( - ' If `None`, looks for the data in the default directory, `~/mir_datasets`' - ) - print('') - - if len(data['attributes']) > 0: - print('Attributes:') - for attr in data['attributes']: - if attr == 'track_id': + print("") + print("Args:") + print(" track_id (str): track id of the track") + print("") + + if len(data["attributes"]) > 0: + print("Attributes:") + for attr in data["attributes"]: + if attr == "track_id": print( - ' {} ({}): track id'.format( + " {} ({}): track id".format( attr, type(getattr(track, attr)).__name__ ) ) else: print( - ' {} ({}): TODO'.format( + " {} ({}): TODO".format( attr, type(getattr(track, attr)).__name__ ) ) - print('') + print("") print('"""') -if __name__ == '__main__': - PARSER = argparse.ArgumentParser(description='Print an empty docstring') - PARSER.add_argument('dataset', type=str, help='dataset module name.') +if __name__ == "__main__": + PARSER = argparse.ArgumentParser(description="Print an empty docstring") + PARSER.add_argument("dataset", type=str, help="dataset module name.") main(PARSER.parse_args()) diff --git a/tests/resources/download/Orchset_dataset_0.zip b/tests/resources/download/Orchset_dataset_0.zip new file mode 100644 index 0000000000000000000000000000000000000000..d232713def1b07e9263a06bf75310828d3e490af GIT binary patch literal 5385 zcmbtX3piA37v2nm&Bve0hT9LrAPWP>O4>{+HI%y7;#6;w7DRZO6SD}%x7?4AA?@5`zJq3X z?7GRHX743bCATGHU&w4JeOg^!Z~eBl;YMhxZJy&frGq7HJ-_^#J^0!5U;_@inCBsH zh=(MMg|)?FB3Z&heRmp6YX!7_x_-h#Tf2CrXoR-SP}gTAxmx_Z7ZHPlHJ*(&k1f)d znmQYwId@h@*({-*fxG))L(wkR0mfP`+Dy z*mm=JR%(BZW?8a$7};(}r#v%`oSW$}8d<;SbgXG^P0z32aLg20R`=L&(Axm~s7l(- zMb@uveKzO?ToqS4`)YX+fB6|L{t^=tvi+^p9GM55wYF7)d{;v1?@BPA%QG+b&y(#F z8i{4Te-vzPU(*<-8LK~XyR6k$-zBnSld8LPvzP@T)lm4f8znc)=ILlHU$3M)kNQRG z#IFK(34=r8*UZ0!8-u@x>QF^HS_l$51_sAVzw~a7DEnF_u9)4SE4Ns%xV0-UxCquEo~r)*H`L%K*0rmPmYoe1^wlpZUt>uW5TxY!o4f$5fvLfrG{G?a7Q>`*aIi4i zLL%F0A{^9k;Le|csdRL4ly-4*bawn=VQ%w9%qG(EDDNpb+)D99JUNTRj}1v1Rzc*~ 
z3ipoU^iPEI86`^->^s461O$a6@9FEp2LOkdI36g5Pbf}<-@+(Pn^;X0$2s^ES?~fo zhRqzHVGNK63dqZY;xPeL07Epx3+&TE)nTASQBY2S6c3*XEKeB22VP+3iUn#GTltbG ztXyE%G(r(C{Bwv?Yyxag+Q0-cpv-l^~UK7MjA z$V(h*V>ufTvW=}`{VEf0lST)VhTFcC==pqTAK5VGHn{t1zHm#?E2q@oTZSohSE{8 z3+D-`zsX%eUBS|yO7ZdX@OJX1DOgZFeJCC-3TEB`H2**^sy8h_!zCbyO_olWC?DVj zb`1HvVl3vB62y-Wh8=jcOG|>ez{*lpFuEScq}0+WyW(@M=da5fQ*SbO^>Ab<4ewm- zzwACfjuCZSKvqqjo}mBAK%7)(WoxS9TrFoRb5b$hl@X&uw{_FKz3^GYUZs)rfaj|! zmK2Ws@8d{oAFD9VWDQh&0&i+fv|imQ(`&P?r$1R_tlQUaoO!9yCbWkK)IKYNygLrQ zGq^YRh_u16GK3?-g}HMN&Me98%?rrA;Cpf}He{3AZF+LA!%s)NpU2Y#1qxc_&$(wD+dC*&CGXHM?E2Gn(Jh%YOT&xyK;4H=34CpkB=YyF4Sw7q*pAAY=axU^x%L784!vQEjfSDT*>`zIf2mW0$ zyWe+?_J4g>uq-%r_vde7mj`{WjVoUs(_MK>blc7Z`z`U6 z$Er29>`YKsN)vd`)yuoCMh-Q|N_Q7*BUrU}n(3dT4YR&%na=2bn`Ro^`pC z@+*c{{HdDq1_yeDkYaCwG|a5;WehsjTz>c{F79OpdfpkZVK@WkY;FJoOg^))) z0Yq%?1}YpemXSP;6P_7qa!%Z_JsPNT#8^fyDBu}tbPhB(mCk|ALP5_|vvU%QXlg`l z1J#ZggFcMHo<>&x191R8S|tL_ju?Z!kB@6+r*Xxlts$ue90jryEx&;5M2tb_!*fzQ zcb&yWgkq0?)(k-BA;zG~2ynga+)wil0YSKq?9>92 z6)~1k2kZ~&-xF?TJZCxm1UsF;q(zKnv|&Eh#SG^zr>P)!K$rj~FJcUO%oGRzY7Qg8 p11$i#8R8hsUBno4ya?Bu&20)>68Yhd0zs1ShXxmRC=3oke*;b5X~O^j literal 0 HcmV?d00001 diff --git a/tests/resources/mir_datasets/MedleyDB-Melody/medleydb_melody_metadata.json b/tests/resources/mir_datasets/MedleyDB-Melody/medleydb_melody_metadata.json deleted file mode 100644 index 29a501685..000000000 --- a/tests/resources/mir_datasets/MedleyDB-Melody/medleydb_melody_metadata.json +++ /dev/null @@ -1,14 +0,0 @@ -{ - "MusicDelta_Beethoven": { - "audio_path": "MedleyDB-Melody/audio/MusicDelta_Beethoven_MIX.wav", - "melody1_path": "MedleyDB-Melody/melody1/MusicDelta_Beethoven_MELODY1.csv", - "melody2_path": "MedleyDB-Melody/melody2/MusicDelta_Beethoven_MELODY2.csv", - "melody3_path": "MedleyDB-Melody/melody3/MusicDelta_Beethoven_MELODY3.csv", - "artist": "MusicDelta", - "title": "Beethoven", - "genre": "Classical", - "is_excerpt": true, - "is_instrumental": true, - "n_sources": 18 - } -} \ No newline at end of file diff --git 
a/tests/resources/mir_datasets/Beatles/annotations/beat/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.txt b/tests/resources/mir_datasets/beatles/annotations/beat/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.txt similarity index 100% rename from tests/resources/mir_datasets/Beatles/annotations/beat/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.txt rename to tests/resources/mir_datasets/beatles/annotations/beat/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.txt diff --git a/tests/resources/mir_datasets/Beatles/annotations/chordlab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab b/tests/resources/mir_datasets/beatles/annotations/chordlab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab similarity index 100% rename from tests/resources/mir_datasets/Beatles/annotations/chordlab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab rename to tests/resources/mir_datasets/beatles/annotations/chordlab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab diff --git a/tests/resources/mir_datasets/Beatles/annotations/keylab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab b/tests/resources/mir_datasets/beatles/annotations/keylab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab similarity index 100% rename from tests/resources/mir_datasets/Beatles/annotations/keylab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab rename to tests/resources/mir_datasets/beatles/annotations/keylab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab diff --git a/tests/resources/mir_datasets/Beatles/annotations/seglab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab b/tests/resources/mir_datasets/beatles/annotations/seglab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab similarity index 100% 
rename from tests/resources/mir_datasets/Beatles/annotations/seglab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab rename to tests/resources/mir_datasets/beatles/annotations/seglab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab diff --git a/tests/resources/mir_datasets/Beatles/audio/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.wav b/tests/resources/mir_datasets/beatles/audio/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.wav similarity index 100% rename from tests/resources/mir_datasets/Beatles/audio/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.wav rename to tests/resources/mir_datasets/beatles/audio/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.wav diff --git a/tests/resources/mir_datasets/DALI/annotations/4b196e6c99574dd49ad00d56e132712b.gz b/tests/resources/mir_datasets/dali/annotations/4b196e6c99574dd49ad00d56e132712b.gz similarity index 100% rename from tests/resources/mir_datasets/DALI/annotations/4b196e6c99574dd49ad00d56e132712b.gz rename to tests/resources/mir_datasets/dali/annotations/4b196e6c99574dd49ad00d56e132712b.gz diff --git a/tests/resources/mir_datasets/DALI/audio/4b196e6c99574dd49ad00d56e132712b.mp3 b/tests/resources/mir_datasets/dali/audio/4b196e6c99574dd49ad00d56e132712b.mp3 similarity index 100% rename from tests/resources/mir_datasets/DALI/audio/4b196e6c99574dd49ad00d56e132712b.mp3 rename to tests/resources/mir_datasets/dali/audio/4b196e6c99574dd49ad00d56e132712b.mp3 diff --git a/tests/resources/mir_datasets/DALI/dali_metadata.json b/tests/resources/mir_datasets/dali/dali_metadata.json similarity index 100% rename from tests/resources/mir_datasets/DALI/dali_metadata.json rename to tests/resources/mir_datasets/dali/dali_metadata.json diff --git a/tests/resources/mir_datasets/GiantSteps_key/audio/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).mp3 b/tests/resources/mir_datasets/giantsteps_key/audio/10089 Jason Sparks - Close My Eyes feat. 
J. Little (Original Mix).mp3 similarity index 100% rename from tests/resources/mir_datasets/GiantSteps_key/audio/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).mp3 rename to tests/resources/mir_datasets/giantsteps_key/audio/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).mp3 diff --git a/tests/resources/mir_datasets/GiantSteps_key/keys_gs+/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).txt b/tests/resources/mir_datasets/giantsteps_key/keys_gs+/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).txt similarity index 100% rename from tests/resources/mir_datasets/GiantSteps_key/keys_gs+/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).txt rename to tests/resources/mir_datasets/giantsteps_key/keys_gs+/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).txt diff --git a/tests/resources/mir_datasets/GiantSteps_key/meta/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).json b/tests/resources/mir_datasets/giantsteps_key/meta/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).json similarity index 100% rename from tests/resources/mir_datasets/GiantSteps_key/meta/10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix).json rename to tests/resources/mir_datasets/giantsteps_key/meta/10089 Jason Sparks - Close My Eyes feat. J. 
Little (Original Mix).json diff --git a/tests/resources/mir_datasets/GiantSteps_tempo/audio/28952.LOFI.mp3 b/tests/resources/mir_datasets/giantsteps_tempo/audio/28952.LOFI.mp3 similarity index 100% rename from tests/resources/mir_datasets/GiantSteps_tempo/audio/28952.LOFI.mp3 rename to tests/resources/mir_datasets/giantsteps_tempo/audio/28952.LOFI.mp3 diff --git a/tests/resources/mir_datasets/GiantSteps_tempo/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams b/tests/resources/mir_datasets/giantsteps_tempo/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams similarity index 100% rename from tests/resources/mir_datasets/GiantSteps_tempo/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams rename to tests/resources/mir_datasets/giantsteps_tempo/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams diff --git a/tests/resources/mir_datasets/GiantSteps_tempo/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations_v2/jams/28952.LOFI.jams b/tests/resources/mir_datasets/giantsteps_tempo/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations_v2/jams/28952.LOFI.jams similarity index 100% rename from tests/resources/mir_datasets/GiantSteps_tempo/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations_v2/jams/28952.LOFI.jams rename to tests/resources/mir_datasets/giantsteps_tempo/giantsteps-tempo-dataset-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations_v2/jams/28952.LOFI.jams diff --git a/tests/resources/mir_datasets/Groove-MIDI/drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid b/tests/resources/mir_datasets/groove_midi/drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid similarity index 100% rename from tests/resources/mir_datasets/Groove-MIDI/drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid rename to 
tests/resources/mir_datasets/groove_midi/drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid diff --git a/tests/resources/mir_datasets/Groove-MIDI/drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav b/tests/resources/mir_datasets/groove_midi/drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav similarity index 100% rename from tests/resources/mir_datasets/Groove-MIDI/drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav rename to tests/resources/mir_datasets/groove_midi/drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav diff --git a/tests/resources/mir_datasets/Groove-MIDI/info.csv b/tests/resources/mir_datasets/groove_midi/info.csv similarity index 100% rename from tests/resources/mir_datasets/Groove-MIDI/info.csv rename to tests/resources/mir_datasets/groove_midi/info.csv diff --git a/tests/resources/mir_datasets/GTZAN-Genre/gtzan_genre/genres/country/country.00000.wav b/tests/resources/mir_datasets/gtzan_genre/gtzan_genre/genres/country/country.00000.wav similarity index 100% rename from tests/resources/mir_datasets/GTZAN-Genre/gtzan_genre/genres/country/country.00000.wav rename to tests/resources/mir_datasets/gtzan_genre/gtzan_genre/genres/country/country.00000.wav diff --git a/tests/resources/mir_datasets/GuitarSet/annotation/03_BN3-119-G_solo.jams b/tests/resources/mir_datasets/guitarset/annotation/03_BN3-119-G_solo.jams similarity index 100% rename from tests/resources/mir_datasets/GuitarSet/annotation/03_BN3-119-G_solo.jams rename to tests/resources/mir_datasets/guitarset/annotation/03_BN3-119-G_solo.jams diff --git a/tests/resources/mir_datasets/GuitarSet/audio_hex-pickup_debleeded/03_BN3-119-G_solo_hex_cln.wav b/tests/resources/mir_datasets/guitarset/audio_hex-pickup_debleeded/03_BN3-119-G_solo_hex_cln.wav similarity index 100% rename from tests/resources/mir_datasets/GuitarSet/audio_hex-pickup_debleeded/03_BN3-119-G_solo_hex_cln.wav rename to tests/resources/mir_datasets/guitarset/audio_hex-pickup_debleeded/03_BN3-119-G_solo_hex_cln.wav diff 
--git a/tests/resources/mir_datasets/GuitarSet/audio_hex-pickup_original/03_BN3-119-G_solo_hex.wav b/tests/resources/mir_datasets/guitarset/audio_hex-pickup_original/03_BN3-119-G_solo_hex.wav similarity index 100% rename from tests/resources/mir_datasets/GuitarSet/audio_hex-pickup_original/03_BN3-119-G_solo_hex.wav rename to tests/resources/mir_datasets/guitarset/audio_hex-pickup_original/03_BN3-119-G_solo_hex.wav diff --git a/tests/resources/mir_datasets/GuitarSet/audio_mono-mic/03_BN3-119-G_solo_mic.wav b/tests/resources/mir_datasets/guitarset/audio_mono-mic/03_BN3-119-G_solo_mic.wav similarity index 100% rename from tests/resources/mir_datasets/GuitarSet/audio_mono-mic/03_BN3-119-G_solo_mic.wav rename to tests/resources/mir_datasets/guitarset/audio_mono-mic/03_BN3-119-G_solo_mic.wav diff --git a/tests/resources/mir_datasets/GuitarSet/audio_mono-pickup_mix/03_BN3-119-G_solo_mix.wav b/tests/resources/mir_datasets/guitarset/audio_mono-pickup_mix/03_BN3-119-G_solo_mix.wav similarity index 100% rename from tests/resources/mir_datasets/GuitarSet/audio_mono-pickup_mix/03_BN3-119-G_solo_mix.wav rename to tests/resources/mir_datasets/guitarset/audio_mono-pickup_mix/03_BN3-119-G_solo_mix.wav diff --git a/tests/resources/mir_datasets/iKala/Lyrics/10161_chorus.lab b/tests/resources/mir_datasets/ikala/Lyrics/10161_chorus.lab similarity index 100% rename from tests/resources/mir_datasets/iKala/Lyrics/10161_chorus.lab rename to tests/resources/mir_datasets/ikala/Lyrics/10161_chorus.lab diff --git a/tests/resources/mir_datasets/iKala/Lyrics/10164_chorus.lab b/tests/resources/mir_datasets/ikala/Lyrics/10164_chorus.lab similarity index 100% rename from tests/resources/mir_datasets/iKala/Lyrics/10164_chorus.lab rename to tests/resources/mir_datasets/ikala/Lyrics/10164_chorus.lab diff --git a/tests/resources/mir_datasets/iKala/PitchLabel/10161_chorus.pv b/tests/resources/mir_datasets/ikala/PitchLabel/10161_chorus.pv similarity index 100% rename from 
tests/resources/mir_datasets/iKala/PitchLabel/10161_chorus.pv rename to tests/resources/mir_datasets/ikala/PitchLabel/10161_chorus.pv diff --git a/tests/resources/mir_datasets/iKala/Wavfile/10161_chorus.wav b/tests/resources/mir_datasets/ikala/Wavfile/10161_chorus.wav similarity index 100% rename from tests/resources/mir_datasets/iKala/Wavfile/10161_chorus.wav rename to tests/resources/mir_datasets/ikala/Wavfile/10161_chorus.wav diff --git a/tests/resources/mir_datasets/iKala/id_mapping.txt b/tests/resources/mir_datasets/ikala/id_mapping.txt similarity index 100% rename from tests/resources/mir_datasets/iKala/id_mapping.txt rename to tests/resources/mir_datasets/ikala/id_mapping.txt diff --git a/tests/resources/mir_datasets/MAESTRO/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi b/tests/resources/mir_datasets/maestro/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi similarity index 100% rename from tests/resources/mir_datasets/MAESTRO/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi rename to tests/resources/mir_datasets/maestro/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi diff --git a/tests/resources/mir_datasets/MAESTRO/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav b/tests/resources/mir_datasets/maestro/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav similarity index 100% rename from tests/resources/mir_datasets/MAESTRO/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav rename to tests/resources/mir_datasets/maestro/2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav diff --git a/tests/resources/mir_datasets/MAESTRO/maestro-v2.0.0.json b/tests/resources/mir_datasets/maestro/maestro-v2.0.0.json similarity index 100% rename from tests/resources/mir_datasets/MAESTRO/maestro-v2.0.0.json rename to tests/resources/mir_datasets/maestro/maestro-v2.0.0.json diff --git 
a/tests/resources/mir_datasets/Medley-solos-DB/annotation/Medley-solos-DB_metadata.csv b/tests/resources/mir_datasets/medley_solos_db/annotation/Medley-solos-DB_metadata.csv similarity index 100% rename from tests/resources/mir_datasets/Medley-solos-DB/annotation/Medley-solos-DB_metadata.csv rename to tests/resources/mir_datasets/medley_solos_db/annotation/Medley-solos-DB_metadata.csv diff --git a/tests/resources/mir_datasets/Medley-solos-DB/audio/Medley-solos-DB_validation-3_d07b1fc0-567d-52c2-fef4-239f31c9d40e.wav b/tests/resources/mir_datasets/medley_solos_db/audio/Medley-solos-DB_validation-3_d07b1fc0-567d-52c2-fef4-239f31c9d40e.wav similarity index 100% rename from tests/resources/mir_datasets/Medley-solos-DB/audio/Medley-solos-DB_validation-3_d07b1fc0-567d-52c2-fef4-239f31c9d40e.wav rename to tests/resources/mir_datasets/medley_solos_db/audio/Medley-solos-DB_validation-3_d07b1fc0-567d-52c2-fef4-239f31c9d40e.wav diff --git a/tests/resources/mir_datasets/MedleyDB-Melody/audio/MusicDelta_Beethoven_MIX.wav b/tests/resources/mir_datasets/medleydb_melody/audio/MusicDelta_Beethoven_MIX.wav similarity index 100% rename from tests/resources/mir_datasets/MedleyDB-Melody/audio/MusicDelta_Beethoven_MIX.wav rename to tests/resources/mir_datasets/medleydb_melody/audio/MusicDelta_Beethoven_MIX.wav diff --git a/tests/resources/mir_datasets/medleydb_melody/medleydb_melody_metadata.json b/tests/resources/mir_datasets/medleydb_melody/medleydb_melody_metadata.json new file mode 100644 index 000000000..e9b84a724 --- /dev/null +++ b/tests/resources/mir_datasets/medleydb_melody/medleydb_melody_metadata.json @@ -0,0 +1,14 @@ +{ + "MusicDelta_Beethoven": { + "audio_path": "medleydb_melody/audio/MusicDelta_Beethoven_MIX.wav", + "melody1_path": "medleydb_melody/melody1/MusicDelta_Beethoven_MELODY1.csv", + "melody2_path": "medleydb_melody/melody2/MusicDelta_Beethoven_MELODY2.csv", + "melody3_path": "medleydb_melody/melody3/MusicDelta_Beethoven_MELODY3.csv", + "artist": "MusicDelta", + 
"title": "Beethoven", + "genre": "Classical", + "is_excerpt": true, + "is_instrumental": true, + "n_sources": 18 + } +} \ No newline at end of file diff --git a/tests/resources/mir_datasets/MedleyDB-Melody/melody1/MusicDelta_Beethoven_MELODY1.csv b/tests/resources/mir_datasets/medleydb_melody/melody1/MusicDelta_Beethoven_MELODY1.csv similarity index 100% rename from tests/resources/mir_datasets/MedleyDB-Melody/melody1/MusicDelta_Beethoven_MELODY1.csv rename to tests/resources/mir_datasets/medleydb_melody/melody1/MusicDelta_Beethoven_MELODY1.csv diff --git a/tests/resources/mir_datasets/MedleyDB-Melody/melody2/MusicDelta_Beethoven_MELODY2.csv b/tests/resources/mir_datasets/medleydb_melody/melody2/MusicDelta_Beethoven_MELODY2.csv similarity index 100% rename from tests/resources/mir_datasets/MedleyDB-Melody/melody2/MusicDelta_Beethoven_MELODY2.csv rename to tests/resources/mir_datasets/medleydb_melody/melody2/MusicDelta_Beethoven_MELODY2.csv diff --git a/tests/resources/mir_datasets/MedleyDB-Melody/melody3/MusicDelta_Beethoven_MELODY3.csv b/tests/resources/mir_datasets/medleydb_melody/melody3/MusicDelta_Beethoven_MELODY3.csv similarity index 100% rename from tests/resources/mir_datasets/MedleyDB-Melody/melody3/MusicDelta_Beethoven_MELODY3.csv rename to tests/resources/mir_datasets/medleydb_melody/melody3/MusicDelta_Beethoven_MELODY3.csv diff --git a/tests/resources/mir_datasets/MedleyDB-Pitch/audio/AClassicEducation_NightOwl_STEM_08.wav b/tests/resources/mir_datasets/medleydb_pitch/audio/AClassicEducation_NightOwl_STEM_08.wav similarity index 100% rename from tests/resources/mir_datasets/MedleyDB-Pitch/audio/AClassicEducation_NightOwl_STEM_08.wav rename to tests/resources/mir_datasets/medleydb_pitch/audio/AClassicEducation_NightOwl_STEM_08.wav diff --git a/tests/resources/mir_datasets/MedleyDB-Pitch/medleydb_pitch_metadata.json b/tests/resources/mir_datasets/medleydb_pitch/medleydb_pitch_metadata.json similarity index 62% rename from 
tests/resources/mir_datasets/MedleyDB-Pitch/medleydb_pitch_metadata.json rename to tests/resources/mir_datasets/medleydb_pitch/medleydb_pitch_metadata.json index 43b601038..020297e05 100644 --- a/tests/resources/mir_datasets/MedleyDB-Pitch/medleydb_pitch_metadata.json +++ b/tests/resources/mir_datasets/medleydb_pitch/medleydb_pitch_metadata.json @@ -1,7 +1,7 @@ { "AClassicEducation_NightOwl_STEM_08": { - "audio_path": "MedleyDB-Pitch/audio/AClassicEducation_NightOwl_STEM_08.wav", - "pitch_path": "MedleyDB-Pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv", + "audio_path": "medleydb_pitch/audio/AClassicEducation_NightOwl_STEM_08.wav", + "pitch_path": "medleydb_pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv", "instrument": "male singer", "artist": "AClassicEducation", "title": "NightOwl", diff --git a/tests/resources/mir_datasets/MedleyDB-Pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv b/tests/resources/mir_datasets/medleydb_pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv similarity index 100% rename from tests/resources/mir_datasets/MedleyDB-Pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv rename to tests/resources/mir_datasets/medleydb_pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv diff --git a/tests/resources/mir_datasets/Mridangam-Stroke/mridangam_stroke_1.5/B/224030__akshaylaya__bheem-b-001.wav b/tests/resources/mir_datasets/mridangam_stroke/mridangam_stroke_1.5/B/224030__akshaylaya__bheem-b-001.wav similarity index 100% rename from tests/resources/mir_datasets/Mridangam-Stroke/mridangam_stroke_1.5/B/224030__akshaylaya__bheem-b-001.wav rename to tests/resources/mir_datasets/mridangam_stroke/mridangam_stroke_1.5/B/224030__akshaylaya__bheem-b-001.wav diff --git a/tests/resources/mir_datasets/Mridangam-Stroke/mridangam_stroke_1.5/B/_readme_and_license.txt b/tests/resources/mir_datasets/mridangam_stroke/mridangam_stroke_1.5/B/_readme_and_license.txt similarity index 100% rename from 
tests/resources/mir_datasets/Mridangam-Stroke/mridangam_stroke_1.5/B/_readme_and_license.txt rename to tests/resources/mir_datasets/mridangam_stroke/mridangam_stroke_1.5/B/_readme_and_license.txt diff --git a/tests/resources/mir_datasets/Orchset/GT/Beethoven-S3-I-ex1.mel b/tests/resources/mir_datasets/orchset/GT/Beethoven-S3-I-ex1.mel similarity index 100% rename from tests/resources/mir_datasets/Orchset/GT/Beethoven-S3-I-ex1.mel rename to tests/resources/mir_datasets/orchset/GT/Beethoven-S3-I-ex1.mel diff --git a/tests/resources/mir_datasets/Orchset/Orchset - Predominant Melodic Instruments.csv b/tests/resources/mir_datasets/orchset/Orchset - Predominant Melodic Instruments.csv similarity index 100% rename from tests/resources/mir_datasets/Orchset/Orchset - Predominant Melodic Instruments.csv rename to tests/resources/mir_datasets/orchset/Orchset - Predominant Melodic Instruments.csv diff --git a/tests/resources/mir_datasets/Orchset/audio/mono/Beethoven-S3-I-ex1.wav b/tests/resources/mir_datasets/orchset/audio/mono/Beethoven-S3-I-ex1.wav similarity index 100% rename from tests/resources/mir_datasets/Orchset/audio/mono/Beethoven-S3-I-ex1.wav rename to tests/resources/mir_datasets/orchset/audio/mono/Beethoven-S3-I-ex1.wav diff --git a/tests/resources/mir_datasets/Orchset/audio/stereo/Beethoven-S3-I-ex1.wav b/tests/resources/mir_datasets/orchset/audio/stereo/Beethoven-S3-I-ex1.wav similarity index 100% rename from tests/resources/mir_datasets/Orchset/audio/stereo/Beethoven-S3-I-ex1.wav rename to tests/resources/mir_datasets/orchset/audio/stereo/Beethoven-S3-I-ex1.wav diff --git a/tests/resources/mir_datasets/RWC-Classical/annotations/AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT b/tests/resources/mir_datasets/rwc_classical/annotations/AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT similarity index 100% rename from tests/resources/mir_datasets/RWC-Classical/annotations/AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT rename to 
tests/resources/mir_datasets/rwc_classical/annotations/AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT diff --git a/tests/resources/mir_datasets/RWC-Classical/annotations/AIST.RWC-MDB-C-2001.CHORUS/RM-C003.CHORUS.TXT b/tests/resources/mir_datasets/rwc_classical/annotations/AIST.RWC-MDB-C-2001.CHORUS/RM-C003.CHORUS.TXT similarity index 100% rename from tests/resources/mir_datasets/RWC-Classical/annotations/AIST.RWC-MDB-C-2001.CHORUS/RM-C003.CHORUS.TXT rename to tests/resources/mir_datasets/rwc_classical/annotations/AIST.RWC-MDB-C-2001.CHORUS/RM-C003.CHORUS.TXT diff --git a/tests/resources/mir_datasets/RWC-Classical/audio/rwc-c-m01/3.wav b/tests/resources/mir_datasets/rwc_classical/audio/rwc-c-m01/3.wav similarity index 100% rename from tests/resources/mir_datasets/RWC-Classical/audio/rwc-c-m01/3.wav rename to tests/resources/mir_datasets/rwc_classical/audio/rwc-c-m01/3.wav diff --git a/tests/resources/mir_datasets/RWC-Classical/metadata-master/rwc-c.csv b/tests/resources/mir_datasets/rwc_classical/metadata-master/rwc-c.csv similarity index 100% rename from tests/resources/mir_datasets/RWC-Classical/metadata-master/rwc-c.csv rename to tests/resources/mir_datasets/rwc_classical/metadata-master/rwc-c.csv diff --git a/tests/resources/mir_datasets/RWC-Genre/annotations/AIST.RWC-MDB-G-2001.BEAT/RM-G002.BEAT.TXT b/tests/resources/mir_datasets/rwc_genre/annotations/AIST.RWC-MDB-G-2001.BEAT/RM-G002.BEAT.TXT similarity index 100% rename from tests/resources/mir_datasets/RWC-Genre/annotations/AIST.RWC-MDB-G-2001.BEAT/RM-G002.BEAT.TXT rename to tests/resources/mir_datasets/rwc_genre/annotations/AIST.RWC-MDB-G-2001.BEAT/RM-G002.BEAT.TXT diff --git a/tests/resources/mir_datasets/RWC-Genre/annotations/AIST.RWC-MDB-G-2001.CHORUS/RM-G002.CHORUS.TXT b/tests/resources/mir_datasets/rwc_genre/annotations/AIST.RWC-MDB-G-2001.CHORUS/RM-G002.CHORUS.TXT similarity index 100% rename from tests/resources/mir_datasets/RWC-Genre/annotations/AIST.RWC-MDB-G-2001.CHORUS/RM-G002.CHORUS.TXT rename to 
tests/resources/mir_datasets/rwc_genre/annotations/AIST.RWC-MDB-G-2001.CHORUS/RM-G002.CHORUS.TXT diff --git a/tests/resources/mir_datasets/RWC-Genre/audio/rwc-g-m01/2.wav b/tests/resources/mir_datasets/rwc_genre/audio/rwc-g-m01/2.wav similarity index 100% rename from tests/resources/mir_datasets/RWC-Genre/audio/rwc-g-m01/2.wav rename to tests/resources/mir_datasets/rwc_genre/audio/rwc-g-m01/2.wav diff --git a/tests/resources/mir_datasets/RWC-Genre/metadata-master/rwc-g.csv b/tests/resources/mir_datasets/rwc_genre/metadata-master/rwc-g.csv similarity index 100% rename from tests/resources/mir_datasets/RWC-Genre/metadata-master/rwc-g.csv rename to tests/resources/mir_datasets/rwc_genre/metadata-master/rwc-g.csv diff --git a/tests/resources/mir_datasets/RWC-Jazz/annotations/AIST.RWC-MDB-J-2001.BEAT/RM-J004.BEAT.TXT b/tests/resources/mir_datasets/rwc_jazz/annotations/AIST.RWC-MDB-J-2001.BEAT/RM-J004.BEAT.TXT similarity index 100% rename from tests/resources/mir_datasets/RWC-Jazz/annotations/AIST.RWC-MDB-J-2001.BEAT/RM-J004.BEAT.TXT rename to tests/resources/mir_datasets/rwc_jazz/annotations/AIST.RWC-MDB-J-2001.BEAT/RM-J004.BEAT.TXT diff --git a/tests/resources/mir_datasets/RWC-Jazz/annotations/AIST.RWC-MDB-J-2001.CHORUS/RM-J004.CHORUS.TXT b/tests/resources/mir_datasets/rwc_jazz/annotations/AIST.RWC-MDB-J-2001.CHORUS/RM-J004.CHORUS.TXT similarity index 100% rename from tests/resources/mir_datasets/RWC-Jazz/annotations/AIST.RWC-MDB-J-2001.CHORUS/RM-J004.CHORUS.TXT rename to tests/resources/mir_datasets/rwc_jazz/annotations/AIST.RWC-MDB-J-2001.CHORUS/RM-J004.CHORUS.TXT diff --git a/tests/resources/mir_datasets/RWC-Jazz/audio/rwc-j-m01/4.wav b/tests/resources/mir_datasets/rwc_jazz/audio/rwc-j-m01/4.wav similarity index 100% rename from tests/resources/mir_datasets/RWC-Jazz/audio/rwc-j-m01/4.wav rename to tests/resources/mir_datasets/rwc_jazz/audio/rwc-j-m01/4.wav diff --git a/tests/resources/mir_datasets/RWC-Jazz/metadata-master/rwc-j.csv 
b/tests/resources/mir_datasets/rwc_jazz/metadata-master/rwc-j.csv similarity index 100% rename from tests/resources/mir_datasets/RWC-Jazz/metadata-master/rwc-j.csv rename to tests/resources/mir_datasets/rwc_jazz/metadata-master/rwc-j.csv diff --git a/tests/resources/mir_datasets/RWC-Popular/annotations/AIST.RWC-MDB-P-2001.BEAT/RM-P001.BEAT.TXT b/tests/resources/mir_datasets/rwc_popular/annotations/AIST.RWC-MDB-P-2001.BEAT/RM-P001.BEAT.TXT similarity index 100% rename from tests/resources/mir_datasets/RWC-Popular/annotations/AIST.RWC-MDB-P-2001.BEAT/RM-P001.BEAT.TXT rename to tests/resources/mir_datasets/rwc_popular/annotations/AIST.RWC-MDB-P-2001.BEAT/RM-P001.BEAT.TXT diff --git a/tests/resources/mir_datasets/RWC-Popular/annotations/AIST.RWC-MDB-P-2001.CHORD/RWC_Pop_Chords/N001-M01-T01.lab b/tests/resources/mir_datasets/rwc_popular/annotations/AIST.RWC-MDB-P-2001.CHORD/RWC_Pop_Chords/N001-M01-T01.lab similarity index 100% rename from tests/resources/mir_datasets/RWC-Popular/annotations/AIST.RWC-MDB-P-2001.CHORD/RWC_Pop_Chords/N001-M01-T01.lab rename to tests/resources/mir_datasets/rwc_popular/annotations/AIST.RWC-MDB-P-2001.CHORD/RWC_Pop_Chords/N001-M01-T01.lab diff --git a/tests/resources/mir_datasets/RWC-Popular/annotations/AIST.RWC-MDB-P-2001.CHORUS/RM-P001.CHORUS.TXT b/tests/resources/mir_datasets/rwc_popular/annotations/AIST.RWC-MDB-P-2001.CHORUS/RM-P001.CHORUS.TXT similarity index 100% rename from tests/resources/mir_datasets/RWC-Popular/annotations/AIST.RWC-MDB-P-2001.CHORUS/RM-P001.CHORUS.TXT rename to tests/resources/mir_datasets/rwc_popular/annotations/AIST.RWC-MDB-P-2001.CHORUS/RM-P001.CHORUS.TXT diff --git a/tests/resources/mir_datasets/RWC-Popular/annotations/AIST.RWC-MDB-P-2001.VOCA_INST/RM-P001.VOCA_INST.TXT b/tests/resources/mir_datasets/rwc_popular/annotations/AIST.RWC-MDB-P-2001.VOCA_INST/RM-P001.VOCA_INST.TXT similarity index 100% rename from tests/resources/mir_datasets/RWC-Popular/annotations/AIST.RWC-MDB-P-2001.VOCA_INST/RM-P001.VOCA_INST.TXT 
rename to tests/resources/mir_datasets/rwc_popular/annotations/AIST.RWC-MDB-P-2001.VOCA_INST/RM-P001.VOCA_INST.TXT diff --git a/tests/resources/mir_datasets/RWC-Popular/audio/rwc-p-m01/1.wav b/tests/resources/mir_datasets/rwc_popular/audio/rwc-p-m01/1.wav similarity index 100% rename from tests/resources/mir_datasets/RWC-Popular/audio/rwc-p-m01/1.wav rename to tests/resources/mir_datasets/rwc_popular/audio/rwc-p-m01/1.wav diff --git a/tests/resources/mir_datasets/RWC-Popular/metadata-master/rwc-p.csv b/tests/resources/mir_datasets/rwc_popular/metadata-master/rwc-p.csv similarity index 100% rename from tests/resources/mir_datasets/RWC-Popular/metadata-master/rwc-p.csv rename to tests/resources/mir_datasets/rwc_popular/metadata-master/rwc-p.csv diff --git a/tests/resources/mir_datasets/Salami/audio/2.mp3 b/tests/resources/mir_datasets/salami/audio/2.mp3 similarity index 100% rename from tests/resources/mir_datasets/Salami/audio/2.mp3 rename to tests/resources/mir_datasets/salami/audio/2.mp3 diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_functions.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_functions.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_functions.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_functions.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_lowercase.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_lowercase.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_lowercase.txt rename to 
tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_lowercase.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_uppercase.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_uppercase.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_uppercase.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_uppercase.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/1015/textfile2.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/1015/textfile2.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/1015/textfile2.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/1015/textfile2.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_functions.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_functions.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_functions.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_functions.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_lowercase.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_lowercase.txt 
similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_lowercase.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_lowercase.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_uppercase.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_uppercase.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_uppercase.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_uppercase.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/192/textfile1.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/192/textfile1.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/192/textfile1.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/192/textfile1.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_functions.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_functions.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_functions.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_functions.txt diff --git 
a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_lowercase.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_lowercase.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_lowercase.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_lowercase.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_uppercase.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_uppercase.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_uppercase.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_uppercase.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_functions.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_functions.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_functions.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_functions.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_lowercase.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_lowercase.txt similarity index 100% rename from 
tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_lowercase.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_lowercase.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_uppercase.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_uppercase.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_uppercase.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_uppercase.txt diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/textfile2.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/textfile1.txt similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/annotations/2/textfile2.txt rename to tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/textfile1.txt diff --git a/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/textfile2.txt b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/annotations/2/textfile2.txt new file mode 100644 index 000000000..e69de29bb diff --git a/tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/metadata/metadata.csv b/tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/metadata/metadata.csv similarity index 100% rename from tests/resources/mir_datasets/Salami/salami-data-public-hierarchy-corrections/metadata/metadata.csv rename to 
tests/resources/mir_datasets/salami/salami-data-public-hierarchy-corrections/metadata/metadata.csv diff --git a/tests/resources/mir_datasets/TinySOL/annotation/TinySOL_metadata.csv b/tests/resources/mir_datasets/tinysol/annotation/TinySOL_metadata.csv similarity index 100% rename from tests/resources/mir_datasets/TinySOL/annotation/TinySOL_metadata.csv rename to tests/resources/mir_datasets/tinysol/annotation/TinySOL_metadata.csv diff --git a/tests/resources/mir_datasets/TinySOL/audio/Strings/Contrabass/ordinario/Cb-ord-A2-mf-2c-N.wav b/tests/resources/mir_datasets/tinysol/audio/Strings/Contrabass/ordinario/Cb-ord-A2-mf-2c-N.wav similarity index 100% rename from tests/resources/mir_datasets/TinySOL/audio/Strings/Contrabass/ordinario/Cb-ord-A2-mf-2c-N.wav rename to tests/resources/mir_datasets/tinysol/audio/Strings/Contrabass/ordinario/Cb-ord-A2-mf-2c-N.wav diff --git a/tests/resources/mir_datasets/TinySOL/audio/Winds/Flute/ordinario/Fl-ord-C4-mf-N-T14d.wav b/tests/resources/mir_datasets/tinysol/audio/Winds/Flute/ordinario/Fl-ord-C4-mf-N-T14d.wav similarity index 100% rename from tests/resources/mir_datasets/TinySOL/audio/Winds/Flute/ordinario/Fl-ord-C4-mf-N-T14d.wav rename to tests/resources/mir_datasets/tinysol/audio/Winds/Flute/ordinario/Fl-ord-C4-mf-N-T14d.wav diff --git a/tests/test_beatles.py b/tests/test_beatles.py index 869c85309..8a5acdbfa 100644 --- a/tests/test_beatles.py +++ b/tests/test_beatles.py @@ -2,59 +2,60 @@ import numpy as np -from mirdata import beatles, utils +from mirdata.datasets import beatles +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = '0111' - data_home = 'tests/resources/mir_datasets/Beatles' + default_trackid = "0111" + data_home = "tests/resources/mir_datasets/beatles" track = beatles.Track(default_trackid, data_home=data_home) expected_attributes = { - 'audio_path': 'tests/resources/mir_datasets/Beatles/' - + 
'audio/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.wav', - 'beats_path': 'tests/resources/mir_datasets/Beatles/' - + 'annotations/beat/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.txt', - 'chords_path': 'tests/resources/mir_datasets/Beatles/' - + 'annotations/chordlab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab', - 'keys_path': 'tests/resources/mir_datasets/Beatles/' - + 'annotations/keylab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab', - 'sections_path': 'tests/resources/mir_datasets/Beatles/' - + 'annotations/seglab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab', - 'title': '11_-_Do_You_Want_To_Know_A_Secret', - 'track_id': '0111', + "audio_path": "tests/resources/mir_datasets/beatles/" + + "audio/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.wav", + "beats_path": "tests/resources/mir_datasets/beatles/" + + "annotations/beat/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.txt", + "chords_path": "tests/resources/mir_datasets/beatles/" + + "annotations/chordlab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab", + "keys_path": "tests/resources/mir_datasets/beatles/" + + "annotations/keylab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab", + "sections_path": "tests/resources/mir_datasets/beatles/" + + "annotations/seglab/The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab", + "title": "11_-_Do_You_Want_To_Know_A_Secret", + "track_id": "0111", } expected_property_types = { - 'beats': utils.BeatData, - 'chords': utils.ChordData, - 'key': utils.KeyData, - 'sections': utils.SectionData, + "beats": utils.BeatData, + "chords": utils.ChordData, + "key": utils.KeyData, + "sections": utils.SectionData, } run_track_tests(track, expected_attributes, expected_property_types) audio, sr = track.audio - assert sr == 44100, 'sample rate {} is not 44100'.format(sr) - assert 
audio.shape == (44100 * 2,), 'audio shape {} was not (88200,)'.format( + assert sr == 44100, "sample rate {} is not 44100".format(sr) + assert audio.shape == (44100 * 2,), "audio shape {} was not (88200,)".format( audio.shape ) - track = beatles.Track('10212') - assert track.beats is None, 'expected track.beats to be None, got {}'.format( + track = beatles.Track("10212", data_home=data_home) + assert track.beats is None, "expected track.beats to be None, got {}".format( track.beats ) - assert track.key is None, 'expected track.key to be None, got {}'.format(track.key) + assert track.key is None, "expected track.key to be None, got {}".format(track.key) def test_to_jams(): - data_home = 'tests/resources/mir_datasets/Beatles' - track = beatles.Track('0111', data_home=data_home) + data_home = "tests/resources/mir_datasets/beatles" + track = beatles.Track("0111", data_home=data_home) jam = track.to_jams() - beats = jam.search(namespace='beat')[0]['data'] + beats = jam.search(namespace="beat")[0]["data"] assert [beat.time for beat in beats] == [ 13.249, 13.959, @@ -63,7 +64,7 @@ def test_to_jams(): 15.453, 15.929, 16.428, - ], 'beat times do not match expected' + ], "beat times do not match expected" assert [beat.duration for beat in beats] == [ 0.0, 0.0, @@ -72,7 +73,7 @@ def test_to_jams(): 0.0, 0.0, 0.0, - ], 'beat durations do not match expected' + ], "beat durations do not match expected" assert [beat.value for beat in beats] == [ 2, 3, @@ -81,7 +82,7 @@ def test_to_jams(): 2, 3, 4, - ], 'beat values do not match expected' + ], "beat values do not match expected" assert [beat.confidence for beat in beats] == [ None, None, @@ -90,96 +91,96 @@ def test_to_jams(): None, None, None, - ], 'beat confidence does not match expected' + ], "beat confidence does not match expected" - segments = jam.search(namespace='segment')[0]['data'] + segments = jam.search(namespace="segment")[0]["data"] assert [segment.time for segment in segments] == [ 0.0, 0.465, - ], 'segment time 
does not match expected' + ], "segment time does not match expected" assert [segment.duration for segment in segments] == [ 0.465, 14.466, - ], 'segment duration does not match expected' + ], "segment duration does not match expected" assert [segment.value for segment in segments] == [ - 'silence', - 'intro', - ], 'segment value does not match expected' + "silence", + "intro", + ], "segment value does not match expected" assert [segment.confidence for segment in segments] == [ None, None, - ], 'segment confidence does not match expected' + ], "segment confidence does not match expected" - chords = jam.search(namespace='chord')[0]['data'] + chords = jam.search(namespace="chord")[0]["data"] assert [chord.time for chord in chords] == [ 0.0, 4.586464, 6.98973, - ], 'chord time does not match expected' + ], "chord time does not match expected" assert [chord.duration for chord in chords] == [ 0.497838, 2.4032659999999995, 2.995374, - ], 'chord duration does not match expected' + ], "chord duration does not match expected" assert [chord.value for chord in chords] == [ - 'N', - 'E:min', - 'G', - ], 'chord value does not match expected' + "N", + "E:min", + "G", + ], "chord value does not match expected" assert [chord.confidence for chord in chords] == [ None, None, None, - ], 'chord confidence does not match expected' + ], "chord confidence does not match expected" - keys = jam.search(namespace='key')[0]['data'] - assert [key.time for key in keys] == [0.0], 'key time does not match expected' + keys = jam.search(namespace="key")[0]["data"] + assert [key.time for key in keys] == [0.0], "key time does not match expected" assert [key.duration for key in keys] == [ 119.333 - ], 'key duration does not match expected' - assert [key.value for key in keys] == ['E'], 'key value does not match expected' + ], "key duration does not match expected" + assert [key.value for key in keys] == ["E"], "key value does not match expected" assert [key.confidence for key in keys] == [ None - ], 
'key confidence does not match expected' + ], "key confidence does not match expected" assert ( - jam['file_metadata']['title'] == '11_-_Do_You_Want_To_Know_A_Secret' - ), 'title does not match expected' + jam["file_metadata"]["title"] == "11_-_Do_You_Want_To_Know_A_Secret" + ), "title does not match expected" assert ( - jam['file_metadata']['artist'] == 'The Beatles' - ), 'artist does not match expected' + jam["file_metadata"]["artist"] == "The Beatles" + ), "artist does not match expected" def test_load_beats(): beats_path = ( - 'tests/resources/mir_datasets/Beatles/annotations/beat/' - + 'The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.txt' + "tests/resources/mir_datasets/beatles/annotations/beat/" + + "The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.txt" ) beat_data = beatles.load_beats(beats_path) - assert type(beat_data) == utils.BeatData, 'beat_data is not type utils.BeatData' + assert type(beat_data) == utils.BeatData, "beat_data is not type utils.BeatData" assert ( type(beat_data.beat_times) == np.ndarray - ), 'beat_data.beat_times is not an np.ndarray' + ), "beat_data.beat_times is not an np.ndarray" assert ( type(beat_data.beat_positions) == np.ndarray - ), 'beat_data.beat_positions is not an np.ndarray' + ), "beat_data.beat_positions is not an np.ndarray" assert np.array_equal( beat_data.beat_times, np.array([13.249, 13.959, 14.416, 14.965, 15.453, 15.929, 16.428]), - ), 'beat_data.beat_times different than expected' + ), "beat_data.beat_times different than expected" assert np.array_equal( beat_data.beat_positions, np.array([2, 3, 4, 1, 2, 3, 4]) - ), 'beat_data.beat_positions different from expected' + ), "beat_data.beat_positions different from expected" - assert beatles.load_beats(None) is None, 'load_beats(None) should return None' + assert beatles.load_beats(None) is None, "load_beats(None) should return None" def test_load_chords(): chords_path = ( - 
'tests/resources/mir_datasets/Beatles/annotations/chordlab/' - + 'The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab' + "tests/resources/mir_datasets/beatles/annotations/chordlab/" + + "The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab" ) chord_data = beatles.load_chords(chords_path) @@ -193,15 +194,15 @@ def test_load_chords(): assert np.array_equal( chord_data.intervals[:, 1], np.array([0.497838, 6.989730, 9.985104]) ) - assert np.array_equal(chord_data.labels, np.array(['N', 'E:min', 'G'])) + assert np.array_equal(chord_data.labels, np.array(["N", "E:min", "G"])) assert beatles.load_chords(None) is None def test_load_key(): key_path = ( - 'tests/resources/mir_datasets/Beatles/annotations/keylab/' - + 'The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab' + "tests/resources/mir_datasets/beatles/annotations/keylab/" + + "The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab" ) key_data = beatles.load_key(key_path) @@ -210,15 +211,15 @@ def test_load_key(): assert np.array_equal(key_data.start_times, np.array([0.000])) assert np.array_equal(key_data.end_times, np.array([119.333])) - assert np.array_equal(key_data.keys, np.array(['E'])) + assert np.array_equal(key_data.keys, np.array(["E"])) assert beatles.load_key(None) is None def test_load_sections(): sections_path = ( - 'tests/resources/mir_datasets/Beatles/annotations/seglab/' - + 'The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab' + "tests/resources/mir_datasets/beatles/annotations/seglab/" + + "The Beatles/01_-_Please_Please_Me/11_-_Do_You_Want_To_Know_A_Secret.lab" ) section_data = beatles.load_sections(sections_path) @@ -228,20 +229,21 @@ def test_load_sections(): assert np.array_equal(section_data.intervals[:, 0], np.array([0.000000, 0.465])) assert np.array_equal(section_data.intervals[:, 1], np.array([0.465, 14.931])) - assert np.array_equal(section_data.labels, np.array(['silence', 'intro'])) + assert 
np.array_equal(section_data.labels, np.array(["silence", "intro"])) assert beatles.load_sections(None) is None def test_fix_newpoint(): - beat_positions1 = np.array(['4', '1', '2', 'New Point', '4']) + beat_positions1 = np.array(["4", "1", "2", "New Point", "4"]) new_beat_positions1 = beatles._fix_newpoint(beat_positions1) - assert np.array_equal(new_beat_positions1, np.array(['4', '1', '2', '3', '4'])) + assert np.array_equal(new_beat_positions1, np.array(["4", "1", "2", "3", "4"])) - beat_positions2 = np.array(['1', '2', 'New Point']) + beat_positions2 = np.array(["1", "2", "New Point"]) new_beat_positions2 = beatles._fix_newpoint(beat_positions2) - assert np.array_equal(new_beat_positions2, np.array(['1', '2', '3'])) + assert np.array_equal(new_beat_positions2, np.array(["1", "2", "3"])) - beat_positions3 = np.array(['New Point', '2', '3']) + beat_positions3 = np.array(["New Point", "2", "3"]) new_beat_positions3 = beatles._fix_newpoint(beat_positions3) - assert np.array_equal(new_beat_positions3, np.array(['1', '2', '3'])) + assert np.array_equal(new_beat_positions3, np.array(["1", "2", "3"])) + diff --git a/tests/test_beatport_key.py b/tests/test_beatport_key.py index 21f1b2de6..ddca63728 100644 --- a/tests/test_beatport_key.py +++ b/tests/test_beatport_key.py @@ -2,72 +2,60 @@ import numpy as np -from mirdata import beatport_key, utils +from mirdata.datasets import beatport_key +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = '1' - data_home = 'tests/resources/mir_datasets/beatport_key' + default_trackid = "1" + data_home = "tests/resources/mir_datasets/beatport_key" track = beatport_key.Track(default_trackid, data_home=data_home) expected_attributes = { - 'audio_path': 'tests/resources/mir_datasets/beatport_key/audio/100066 Lindstrom - Monsteer (Original Mix).mp3', - 'keys_path': 'tests/resources/mir_datasets/beatport_key/keys/100066 Lindstrom - Monsteer (Original Mix).txt', - 'metadata_path': 
'tests/resources/mir_datasets/beatport_key/meta/100066 Lindstrom - Monsteer (Original Mix).json', - 'title': '100066 Lindstrom - Monsteer (Original Mix)', - 'track_id': '1', + "audio_path": "tests/resources/mir_datasets/beatport_key/audio/100066 Lindstrom - Monsteer (Original Mix).mp3", + "keys_path": "tests/resources/mir_datasets/beatport_key/keys/100066 Lindstrom - Monsteer (Original Mix).txt", + "metadata_path": "tests/resources/mir_datasets/beatport_key/meta/100066 Lindstrom - Monsteer (Original Mix).json", + "title": "100066 Lindstrom - Monsteer (Original Mix)", + "track_id": "1", } expected_property_types = { - 'key': list, - 'genres': dict, - 'artists': list, - 'tempo': int + "key": list, + "genres": dict, + "artists": list, + "tempo": int, } run_track_tests(track, expected_attributes, expected_property_types) audio, sr = track.audio - assert sr == 44100, 'sample rate {} is not 44100'.format(sr) - assert audio.shape == (5292000,), 'audio shape {} was not (5292000,)'.format( + assert sr == 44100, "sample rate {} is not 44100".format(sr) + assert audio.shape == (5292000,), "audio shape {} was not (5292000,)".format( audio.shape ) def test_to_jams(): - data_home = 'tests/resources/mir_datasets/beatport_key' - track = beatport_key.Track('1', data_home=data_home) + data_home = "tests/resources/mir_datasets/beatport_key" + track = beatport_key.Track("1", data_home=data_home) jam = track.to_jams() - assert ( - jam['sandbox']['key'] == ['D minor'] - ), 'key does not match expected' + assert jam["sandbox"]["key"] == ["D minor"], "key does not match expected" assert ( - jam['file_metadata']['title'] == '100066 Lindstrom - Monsteer (Original Mix)' - ), 'title does not match expected' + jam["file_metadata"]["title"] == "100066 Lindstrom - Monsteer (Original Mix)" + ), "title does not match expected" sand_box = { - "artists": [ - "Lindstrom" - ], - "genres": { - "genres": [ - "Electronica / Downtempo" - ], - "sub_genres": [] - }, + "artists": ["Lindstrom"], + "genres": 
{"genres": ["Electronica / Downtempo"], "sub_genres": []}, "tempo": 115, - "key": ["D minor"] + "key": ["D minor"], } - assert ( - dict(jam['sandbox']) == sand_box - ), 'sandbox does not match expected' + assert dict(jam["sandbox"]) == sand_box, "sandbox does not match expected" def test_load_key(): - key_path = ( - 'tests/resources/mir_datasets/beatport_key/keys/100066 Lindstrom - Monsteer (Original Mix).txt' - ) + key_path = "tests/resources/mir_datasets/beatport_key/keys/100066 Lindstrom - Monsteer (Original Mix).txt" key_data = beatport_key.load_key(key_path) assert type(key_data) == list @@ -78,11 +66,9 @@ def test_load_key(): def test_load_meta(): - meta_path = ( - 'tests/resources/mir_datasets/beatport_key/meta/100066 Lindstrom - Monsteer (Original Mix).json' - ) - genres = {'genres': ['Electronica / Downtempo'], 'sub_genres': []} - artists = ['Lindstrom'] + meta_path = "tests/resources/mir_datasets/beatport_key/meta/100066 Lindstrom - Monsteer (Original Mix).json" + genres = {"genres": ["Electronica / Downtempo"], "sub_genres": []} + artists = ["Lindstrom"] tempo = 115 assert type(beatport_key.load_genre(meta_path)) == dict @@ -99,9 +85,13 @@ def test_load_meta(): def test_find_replace(): - with open('tests/resources/mir_datasets/beatport_key/find_replace.json', 'w') as the_file: + with open( + "tests/resources/mir_datasets/beatport_key/find_replace.json", "w" + ) as the_file: the_file.write('{"probando": nan}') - beatport_key.find_replace('tests/resources/mir_datasets/beatport_key', ": nan", ": null", "*.json") - f = open('tests/resources/mir_datasets/beatport_key/find_replace.json', "r") + beatport_key.find_replace( + "tests/resources/mir_datasets/beatport_key", ": nan", ": null", "*.json" + ) + f = open("tests/resources/mir_datasets/beatport_key/find_replace.json", "r") content = f.read() assert content == '{"probando": null}' diff --git a/tests/test_track.py b/tests/test_core.py similarity index 85% rename from tests/test_track.py rename to 
tests/test_core.py index db5b52c8c..48989bd8d 100644 --- a/tests/test_track.py +++ b/tests/test_core.py @@ -4,7 +4,8 @@ import pytest import numpy as np -from mirdata import track +import mirdata +from mirdata import core if sys.version_info.major == 3: builtin_module_name = "builtins" @@ -13,7 +14,7 @@ def test_track_repr(): - class TestTrack(track.Track): + class TestTrack(core.Track): def __init__(self): self.a = "asdf" self.b = 1.2345678 @@ -47,7 +48,7 @@ def h(self): with pytest.raises(NotImplementedError): test_track.to_jams() - class NoDocsTrack(track.Track): + class NoDocsTrack(core.Track): @property def no_doc(self): return "whee!" @@ -57,8 +58,37 @@ def no_doc(self): bad_track.__repr__() +def test_dataset(): + dataset = mirdata.Dataset("guitarset") + assert isinstance(dataset, core.Dataset) + + dataset = mirdata.Dataset("rwc_jazz") + assert isinstance(dataset, core.Dataset) + + dataset = mirdata.Dataset("ikala") + assert isinstance(dataset, core.Dataset) + + print(dataset) # test that repr doesn't fail + + +def test_dataset_errors(): + with pytest.raises(ValueError): + core.Dataset("not_a_dataset") + + d = core.Dataset("orchset") + d._track_object = None + with pytest.raises(NotImplementedError): + d.track("asdf") + + with pytest.raises(NotImplementedError): + d.load_tracks() + + with pytest.raises(NotImplementedError): + d.choice_track() + + def test_multitrack_basic(): - class TestTrack(track.Track): + class TestTrack(core.Track): def __init__(self, key): self.key = key @@ -66,7 +96,7 @@ def __init__(self, key): def f(self): return np.random.uniform(-1, 1, (2, 100)), 1000 - class TestMultiTrack1(track.MultiTrack): + class TestMultiTrack1(core.MultiTrack): def __init__(self, mtrack_id, data_home): self.mtrack_id = mtrack_id self._data_home = data_home @@ -85,7 +115,7 @@ def __init__(self, mtrack_id, data_home): with pytest.raises(NotImplementedError): mtrack.get_mix() - class TestMultiTrack2(track.MultiTrack): + class TestMultiTrack2(core.MultiTrack): 
def __init__(self, mtrack_id, data_home): self.mtrack_id = mtrack_id self._data_home = data_home @@ -103,7 +133,7 @@ def to_jams(self): def test_multitrack_mixing(): - class TestTrack(track.Track): + class TestTrack(core.Track): def __init__(self, key): self.key = key @@ -111,7 +141,7 @@ def __init__(self, key): def f(self): return np.random.uniform(-1, 1, (2, 100)), 1000 - class TestMultiTrack(track.MultiTrack): + class TestMultiTrack(core.MultiTrack): def __init__(self, mtrack_id, data_home): self.mtrack_id = mtrack_id self._data_home = data_home @@ -179,7 +209,7 @@ def __init__(self, mtrack_id, data_home): def test_multitrack_unequal_len(): - class TestTrack(track.Track): + class TestTrack(core.Track): def __init__(self, key): self.key = key @@ -187,7 +217,7 @@ def __init__(self, key): def f(self): return np.random.uniform(-1, 1, (2, np.random.randint(50, 100))), 1000 - class TestMultiTrack(track.MultiTrack): + class TestMultiTrack(core.MultiTrack): def __init__(self, mtrack_id, data_home): self.mtrack_id = mtrack_id self._data_home = data_home @@ -209,7 +239,7 @@ def __init__(self, mtrack_id, data_home): def test_multitrack_unequal_sr(): - class TestTrack(track.Track): + class TestTrack(core.Track): def __init__(self, key): self.key = key @@ -217,7 +247,7 @@ def __init__(self, key): def f(self): return np.random.uniform(-1, 1, (2, 100)), np.random.randint(10, 1000) - class TestMultiTrack(track.MultiTrack): + class TestMultiTrack(core.MultiTrack): def __init__(self, mtrack_id, data_home): self.mtrack_id = mtrack_id self._data_home = data_home @@ -232,7 +262,7 @@ def __init__(self, mtrack_id, data_home): def test_multitrack_mono(): ### no first channel - audio shapes (100,) - class TestTrack(track.Track): + class TestTrack(core.Track): def __init__(self, key): self.key = key @@ -240,7 +270,7 @@ def __init__(self, key): def f(self): return np.random.uniform(-1, 1, (100)), 1000 - class TestMultiTrack(track.MultiTrack): + class TestMultiTrack(core.MultiTrack): def 
__init__(self, mtrack_id, data_home): self.mtrack_id = mtrack_id self._data_home = data_home @@ -258,7 +288,7 @@ def __init__(self, mtrack_id, data_home): assert np.max(np.abs(target1)) <= 2 ### one channel mono shape (1, 100) - class TestTrack1(track.Track): + class TestTrack1(core.Track): def __init__(self, key): self.key = key @@ -266,7 +296,7 @@ def __init__(self, key): def f(self): return np.random.uniform(-1, 1, (1, 100)), 1000 - class TestMultiTrack1(track.MultiTrack): + class TestMultiTrack1(core.MultiTrack): def __init__(self, mtrack_id, data_home): self.mtrack_id = mtrack_id self._data_home = data_home diff --git a/tests/test_dali.py b/tests/test_dali.py index 43f41b0db..a1c7968ea 100644 --- a/tests/test_dali.py +++ b/tests/test_dali.py @@ -1,47 +1,48 @@ import numpy as np import DALI -from mirdata import dali, utils +from mirdata.datasets import dali +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = '4b196e6c99574dd49ad00d56e132712b' - data_home = 'tests/resources/mir_datasets/DALI' + default_trackid = "4b196e6c99574dd49ad00d56e132712b" + data_home = "tests/resources/mir_datasets/dali" track = dali.Track(default_trackid, data_home=data_home) expected_attributes = { - 'album': 'Mezmerize', - 'annotation_path': 'tests/resources/mir_datasets/DALI/' - + 'annotations/4b196e6c99574dd49ad00d56e132712b.gz', - 'artist': 'System Of A Down', - 'audio_path': 'tests/resources/mir_datasets/DALI/' - + 'audio/4b196e6c99574dd49ad00d56e132712b.mp3', - 'audio_url': 'zUzd9KyIDrM', - 'dataset_version': 1, - 'ground_truth': False, - 'language': 'english', - 'release_date': '2005', - 'scores_manual': 0, - 'scores_ncc': 0.9644769596900552, - 'title': 'B.Y.O.B.', - 'track_id': '4b196e6c99574dd49ad00d56e132712b', - 'url_working': True, + "album": "Mezmerize", + "annotation_path": "tests/resources/mir_datasets/dali/" + + "annotations/4b196e6c99574dd49ad00d56e132712b.gz", + "artist": "System Of A Down", + "audio_path": 
"tests/resources/mir_datasets/dali/" + + "audio/4b196e6c99574dd49ad00d56e132712b.mp3", + "audio_url": "zUzd9KyIDrM", + "dataset_version": 1, + "ground_truth": False, + "language": "english", + "release_date": "2005", + "scores_manual": 0, + "scores_ncc": 0.9644769596900552, + "title": "B.Y.O.B.", + "track_id": "4b196e6c99574dd49ad00d56e132712b", + "url_working": True, } expected_property_types = { - 'notes': utils.NoteData, - 'words': utils.LyricData, - 'lines': utils.LyricData, - 'paragraphs': utils.LyricData, - 'annotation_object': DALI.Annotations, + "notes": utils.NoteData, + "words": utils.LyricData, + "lines": utils.LyricData, + "paragraphs": utils.LyricData, + "annotation_object": DALI.Annotations, } run_track_tests(track, expected_attributes, expected_property_types) - path_save = '/home/mfuentes/astre/code/repositories/mirdata/tests/resources/mir_datasets/DALI/annotations' - name = 'test' + path_save = "/home/mfuentes/astre/code/repositories/mirdata/tests/resources/mir_datasets/dali/annotations" + name = "test" track.annotation_object.write_json(path_save, name) audio, sr = track.audio @@ -51,10 +52,10 @@ def test_track(): def test_load_notes(): notes_path = ( - 'tests/resources/mir_datasets/DALI/annotations/' - + '4b196e6c99574dd49ad00d56e132712b.gz' + "tests/resources/mir_datasets/dali/annotations/" + + "4b196e6c99574dd49ad00d56e132712b.gz" ) - note_data = dali.load_annotations_granularity(notes_path, 'notes') + note_data = dali.load_annotations_granularity(notes_path, "notes") assert type(note_data) == utils.NoteData assert type(note_data.intervals) == np.ndarray @@ -67,10 +68,10 @@ def test_load_notes(): def test_load_words(): data_path = ( - 'tests/resources/mir_datasets/DALI/annotations/' - + '4b196e6c99574dd49ad00d56e132712b.gz' + "tests/resources/mir_datasets/dali/annotations/" + + "4b196e6c99574dd49ad00d56e132712b.gz" ) - word_data = dali.load_annotations_granularity(data_path, 'words') + word_data = dali.load_annotations_granularity(data_path, 
"words") assert type(word_data) == utils.LyricData assert type(word_data.start_times) == np.ndarray @@ -79,15 +80,15 @@ def test_load_words(): assert np.array_equal(word_data.start_times, np.array([24.125, 24.273, 24.42])) assert np.array_equal(word_data.end_times, np.array([24.273, 24.42, 24.568])) - assert np.array_equal(word_data.lyrics, np.array(['why', 'do', 'they'])) + assert np.array_equal(word_data.lyrics, np.array(["why", "do", "they"])) def test_load_lines(): data_path = ( - 'tests/resources/mir_datasets/DALI/annotations/' - + '4b196e6c99574dd49ad00d56e132712b.gz' + "tests/resources/mir_datasets/dali/annotations/" + + "4b196e6c99574dd49ad00d56e132712b.gz" ) - line_data = dali.load_annotations_granularity(data_path, 'lines') + line_data = dali.load_annotations_granularity(data_path, "lines") assert type(line_data) == utils.LyricData assert type(line_data.start_times) == np.ndarray @@ -100,15 +101,15 @@ def test_load_lines(): assert np.array_equal(line_data.start_times, np.array([24.125, 24.42])) assert np.array_equal(line_data.end_times, np.array([24.42, 24.568])) - assert np.array_equal(line_data.lyrics, np.array(['why do', 'they'])) + assert np.array_equal(line_data.lyrics, np.array(["why do", "they"])) def test_load_paragraphs(): data_path = ( - 'tests/resources/mir_datasets/DALI/annotations/' - + '4b196e6c99574dd49ad00d56e132712b.gz' + "tests/resources/mir_datasets/dali/annotations/" + + "4b196e6c99574dd49ad00d56e132712b.gz" ) - par_data = dali.load_annotations_granularity(data_path, 'paragraphs') + par_data = dali.load_annotations_granularity(data_path, "paragraphs") assert type(par_data) == utils.LyricData assert type(par_data.start_times) == np.ndarray @@ -117,80 +118,80 @@ def test_load_paragraphs(): assert np.array_equal(par_data.start_times, np.array([24.125, 24.420])) assert np.array_equal(par_data.end_times, np.array([24.420, 24.568])) - assert np.array_equal(par_data.lyrics, np.array(['why do', 'they'])) + assert 
np.array_equal(par_data.lyrics, np.array(["why do", "they"])) def test_load_dali_object(): data_path = ( - 'tests/resources/mir_datasets/DALI/annotations/' - + '4b196e6c99574dd49ad00d56e132712b.gz' + "tests/resources/mir_datasets/dali/annotations/" + + "4b196e6c99574dd49ad00d56e132712b.gz" ) dali_data = dali.load_annotations_class(data_path) assert type(dali_data) == DALI.Annotations - assert dali_data.annotations['annot']['notes'] == [ + assert dali_data.annotations["annot"]["notes"] == [ { - 'text': 'why', - 'freq': [1108.7305239074883, 1108.7305239074883], - 'time': [24.12471002069169, 24.272507833284063], - 'index': 0, + "text": "why", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.12471002069169, 24.272507833284063], + "index": 0, }, { - 'text': 'do', - 'freq': [1108.7305239074883, 1108.7305239074883], - 'time': [24.272507833284063, 24.42030564587644], - 'index': 1, + "text": "do", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.272507833284063, 24.42030564587644], + "index": 1, }, { - 'text': 'they', - 'freq': [1108.7305239074883, 1108.7305239074883], - 'time': [24.42030564587644, 24.568103458468812], - 'index': 2, + "text": "they", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.42030564587644, 24.568103458468812], + "index": 2, }, ] - assert dali_data.annotations['annot']['words'] == [ + assert dali_data.annotations["annot"]["words"] == [ { - 'text': 'why', - 'freq': [1108.7305239074883, 1108.7305239074883], - 'time': [24.12471002069169, 24.272507833284063], - 'index': 0, + "text": "why", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.12471002069169, 24.272507833284063], + "index": 0, }, { - 'text': 'do', - 'freq': [1108.7305239074883, 1108.7305239074883], - 'time': [24.272507833284063, 24.42030564587644], - 'index': 0, + "text": "do", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.272507833284063, 24.42030564587644], + "index": 0, }, { - 'text': 'they', - 'freq': 
[1108.7305239074883, 1108.7305239074883], - 'time': [24.42030564587644, 24.568103458468812], - 'index': 1, + "text": "they", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.42030564587644, 24.568103458468812], + "index": 1, }, ] - assert dali_data.annotations['annot']['lines'] == [ + assert dali_data.annotations["annot"]["lines"] == [ { - 'text': 'why do', - 'freq': [1108.7305239074883, 1108.7305239074883], - 'time': [24.12471002069169, 24.42030564587644], - 'index': 0, + "text": "why do", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.12471002069169, 24.42030564587644], + "index": 0, }, { - 'text': 'they', - 'freq': [1108.7305239074883, 1108.7305239074883], - 'time': [24.42030564587644, 24.568103458468812], - 'index': 1, + "text": "they", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.42030564587644, 24.568103458468812], + "index": 1, }, ] - assert dali_data.annotations['annot']['paragraphs'] == [ + assert dali_data.annotations["annot"]["paragraphs"] == [ { - 'text': 'why do', - 'freq': [1108.7305239074883, 1108.7305239074883], - 'time': [24.12471002069169, 24.42030564587644], + "text": "why do", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.12471002069169, 24.42030564587644], }, { - 'text': 'they', - 'freq': [1108.7305239074883, 1108.7305239074883], - 'time': [24.42030564587644, 24.568103458468812], + "text": "they", + "freq": [1108.7305239074883, 1108.7305239074883], + "time": [24.42030564587644, 24.568103458468812], }, ] diff --git a/tests/test_full_dataset.py b/tests/test_full_dataset.py index 1720ecec9..cba3f8e7f 100644 --- a/tests/test_full_dataset.py +++ b/tests/test_full_dataset.py @@ -3,40 +3,33 @@ This test takes a long time, but it makes sure that the datset can be locally downloaded, validated successfully, and loaded. 
""" -import importlib -from tests.test_utils import get_attributes_and_properties import os import pytest +import tqdm +from tests.test_utils import get_attributes_and_properties import mirdata @pytest.fixture() def dataset(test_dataset): - if test_dataset == '': + if test_dataset == "": return None - elif test_dataset not in mirdata.__all__: + elif test_dataset not in mirdata.DATASETS: raise ValueError("{} is not a dataset in mirdata".format(test_dataset)) - - return importlib.import_module("mirdata.{}".format(test_dataset)) - - -@pytest.fixture() -def data_home_dir(dataset): - if dataset is None: - return None - return os.path.join('tests/resources/mir_datasets_full', dataset.DATASET_DIR) + data_home = os.path.join("tests/resources/mir_datasets_full", test_dataset) + return mirdata.Dataset(test_dataset, data_home) # This is magically skipped by the the remote fixture `skip_remote` in conftest.py # when tests are run without the --local flag -def test_download(skip_remote, dataset, data_home_dir, skip_download): +def test_download(skip_remote, dataset, skip_download): if dataset is None: pytest.skip() # download the dataset if not skip_download: - dataset.download(data_home=data_home_dir) + dataset.download() print( "If this dataset does not have openly downloadable data, " @@ -45,43 +38,41 @@ def test_download(skip_remote, dataset, data_home_dir, skip_download): ) -def test_validation(skip_remote, dataset, data_home_dir): +def test_validation(skip_remote, dataset): if dataset is None: pytest.skip() # run validation - missing_files, invalid_checksums = dataset.validate( - data_home=data_home_dir, silence=True - ) + missing_files, invalid_checksums = dataset.validate(verbose=True) assert missing_files == {} assert invalid_checksums == {} -def test_load(skip_remote, dataset, data_home_dir): +def test_load(skip_remote, dataset): if dataset is None: pytest.skip() # run load - all_data = dataset.load(data_home=data_home_dir) + all_data = dataset.load_tracks() assert 
isinstance(all_data, dict) - track_ids = dataset.track_ids() + track_ids = dataset.track_ids assert set(track_ids) == set(all_data.keys()) # test that all attributes and properties can be called - for track_id in track_ids: + for track_id in tqdm.tqdm(track_ids): track = all_data[track_id] track_data = get_attributes_and_properties(track) - for attr in track_data['attributes']: + for attr in track_data["attributes"]: ret = getattr(track, attr) - for prop in track_data['properties']: + for prop in track_data["properties"]: ret = getattr(track, prop) - for cprop in track_data['cached_properties']: + for cprop in track_data["cached_properties"]: ret = getattr(track, cprop) jam = track.to_jams() diff --git a/tests/test_giantsteps_key.py b/tests/test_giantsteps_key.py index 0cf3312b1..f5b70e3b1 100644 --- a/tests/test_giantsteps_key.py +++ b/tests/test_giantsteps_key.py @@ -2,65 +2,66 @@ import numpy as np -from mirdata import giantsteps_key, utils +from mirdata.datasets import giantsteps_key +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = '3' - data_home = 'tests/resources/mir_datasets/GiantSteps_key' + default_trackid = "3" + data_home = "tests/resources/mir_datasets/giantsteps_key" track = giantsteps_key.Track(default_trackid, data_home=data_home) expected_attributes = { - 'audio_path': 'tests/resources/mir_datasets/GiantSteps_key/audio/10089 Jason Sparks - Close My Eyes feat. J. ' - 'Little (Original Mix).mp3', - 'keys_path': 'tests/resources/mir_datasets/GiantSteps_key/keys_gs+/10089 Jason Sparks - Close My Eyes feat. J. ' - 'Little (Original Mix).txt', - 'metadata_path': 'tests/resources/mir_datasets/GiantSteps_key/meta/10089 Jason Sparks - Close My Eyes feat. J. ' - 'Little (Original Mix).json', - 'title': '10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix)', - 'track_id': '3', + "audio_path": "tests/resources/mir_datasets/giantsteps_key/audio/10089 Jason Sparks - Close My Eyes feat. J. 
" + "Little (Original Mix).mp3", + "keys_path": "tests/resources/mir_datasets/giantsteps_key/keys_gs+/10089 Jason Sparks - Close My Eyes feat. J. " + "Little (Original Mix).txt", + "metadata_path": "tests/resources/mir_datasets/giantsteps_key/meta/10089 Jason Sparks - Close My Eyes feat. J. " + "Little (Original Mix).json", + "title": "10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix)", + "track_id": "3", } expected_property_types = { - 'key': str, - 'genres': dict, - 'artists': list, - 'tempo': int, + "key": str, + "genres": dict, + "artists": list, + "tempo": int, } run_track_tests(track, expected_attributes, expected_property_types) audio, sr = track.audio - assert sr == 44100, 'sample rate {} is not 44100'.format(sr) - assert audio.shape == (5294592,), 'audio shape {} was not (5294592,)'.format( + assert sr == 44100, "sample rate {} is not 44100".format(sr) + assert audio.shape == (5294592,), "audio shape {} was not (5294592,)".format( audio.shape ) def test_to_jams(): - data_home = 'tests/resources/mir_datasets/GiantSteps_key' - track = giantsteps_key.Track('3', data_home=data_home) + data_home = "tests/resources/mir_datasets/giantsteps_key" + track = giantsteps_key.Track("3", data_home=data_home) jam = track.to_jams() - assert jam['sandbox']['key'] == 'D major', 'key does not match expected' + assert jam["sandbox"]["key"] == "D major", "key does not match expected" assert ( - jam['file_metadata']['title'] - == '10089 Jason Sparks - Close My Eyes feat. J. Little (Original Mix)' - ), 'title does not match expected' + jam["file_metadata"]["title"] + == "10089 Jason Sparks - Close My Eyes feat. J. 
Little (Original Mix)" + ), "title does not match expected" sand_box = { "artists": ["Jason Sparks"], "genres": {"genres": ["Breaks"], "sub_genres": []}, "tempo": 150, "key": "D major", } - assert dict(jam['sandbox']) == sand_box, 'title does not match expected' + assert dict(jam["sandbox"]) == sand_box, "title does not match expected" def test_load_key(): key_path = ( - 'tests/resources/mir_datasets/GiantSteps_key/keys_gs+/10089 Jason Sparks - Close My Eyes feat. J. ' - + 'Little (Original Mix).txt' + "tests/resources/mir_datasets/giantsteps_key/keys_gs+/10089 Jason Sparks - Close My Eyes feat. J. " + + "Little (Original Mix).txt" ) key_data = giantsteps_key.load_key(key_path) @@ -73,11 +74,11 @@ def test_load_key(): def test_load_meta(): meta_path = ( - 'tests/resources/mir_datasets/GiantSteps_key/meta/10089 Jason Sparks - Close My Eyes feat. J. ' - + 'Little (Original Mix).json' + "tests/resources/mir_datasets/giantsteps_key/meta/10089 Jason Sparks - Close My Eyes feat. J. " + + "Little (Original Mix).json" ) - genres = {'genres': ['Breaks'], 'sub_genres': []} - artists = ['Jason Sparks'] + genres = {"genres": ["Breaks"], "sub_genres": []} + artists = ["Jason Sparks"] tempo = 150 assert type(giantsteps_key.load_genre(meta_path)) == dict diff --git a/tests/test_giantsteps_tempo.py b/tests/test_giantsteps_tempo.py index 5157e7c45..dd2d7035b 100644 --- a/tests/test_giantsteps_tempo.py +++ b/tests/test_giantsteps_tempo.py @@ -2,44 +2,47 @@ import numpy as np -from mirdata import giantsteps_tempo, utils +from mirdata.datasets import giantsteps_tempo +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = '113' - data_home = 'tests/resources/mir_datasets/GiantSteps_tempo' + default_trackid = "113" + data_home = "tests/resources/mir_datasets/giantsteps_tempo" track = giantsteps_tempo.Track(default_trackid, data_home=data_home) expected_attributes = { - 'audio_path': 
'tests/resources/mir_datasets/GiantSteps_tempo/audio/28952.LOFI.mp3', - 'annotation_v1_path': 'tests/resources/mir_datasets/GiantSteps_tempo/giantsteps-tempo-dataset' - '-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams', - 'annotation_v2_path': 'tests/resources/mir_datasets/GiantSteps_tempo/giantsteps-tempo-dataset' - '-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations_v2/jams/28952.LOFI.jams', - 'title': '28952', - 'track_id': '113', + "audio_path": "tests/resources/mir_datasets/giantsteps_tempo/audio/28952.LOFI.mp3", + "annotation_v1_path": "tests/resources/mir_datasets/giantsteps_tempo/giantsteps-tempo-dataset" + "-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams", + "annotation_v2_path": "tests/resources/mir_datasets/giantsteps_tempo/giantsteps-tempo-dataset" + "-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations_v2/jams/28952.LOFI.jams", + "title": "28952", + "track_id": "113", } expected_property_types = { - 'tempo': utils.TempoData, - 'tempo_v2': utils.TempoData, - 'genre': str + "tempo": utils.TempoData, + "tempo_v2": utils.TempoData, + "genre": str, } run_track_tests(track, expected_attributes, expected_property_types) audio, sr = track.audio - assert sr == 22050, 'sample rate {} is not 22050'.format(sr) + assert sr == 22050, "sample rate {} is not 22050".format(sr) print(audio.shape) - assert audio.shape == (2646720,), 'audio shape {} was not (2646720,)'.format( + assert audio.shape == (2646720,), "audio shape {} was not (2646720,)".format( audio.shape ) def test_load_genre(): - genre_path = 'tests/resources/mir_datasets/GiantSteps_tempo/giantsteps-tempo-dataset' \ - '-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams' + genre_path = ( + "tests/resources/mir_datasets/giantsteps_tempo/giantsteps-tempo-dataset" + "-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams" + ) genre_data = giantsteps_tempo.load_genre(genre_path) @@ -52,25 +55,29 @@ def 
test_load_genre(): def test_load_tempo(): tempo_path = ( - 'tests/resources/mir_datasets/GiantSteps_tempo/giantsteps-tempo-dataset' - '-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams' + "tests/resources/mir_datasets/giantsteps_tempo/giantsteps-tempo-dataset" + "-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations/jams/28952.LOFI.jams" ) tempo_data = giantsteps_tempo.load_tempo(tempo_path) assert type(tempo_data) == utils.TempoData - assert tempo_data == utils.TempoData(time=np.array([0.0]), duration=np.array([120.0]), - value=np.array([137.6]), confidence=np.array([1.0])) + assert tempo_data == utils.TempoData( + time=np.array([0.0]), + duration=np.array([120.0]), + value=np.array([137.6]), + confidence=np.array([1.0]), + ) tempo_path = ( - 'tests/resources/mir_datasets/GiantSteps_tempo/giantsteps-tempo-dataset' - '-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations_v2/jams/28952.LOFI.jams' + "tests/resources/mir_datasets/giantsteps_tempo/giantsteps-tempo-dataset" + "-0b7d47ba8cae59d3535a02e3db69e2cf6d0af5bb/annotations_v2/jams/28952.LOFI.jams" ) tempo_data = giantsteps_tempo.load_tempo(tempo_path) assert type(tempo_data) == utils.TempoData - assert np.array_equal(tempo_data.time, np.array([0., 0.])) - assert np.array_equal(tempo_data.duration, np.array([120., 120.])) - assert np.array_equal(tempo_data.value, np.array([77., 139.])) + assert np.array_equal(tempo_data.time, np.array([0.0, 0.0])) + assert np.array_equal(tempo_data.duration, np.array([120.0, 120.0])) + assert np.array_equal(tempo_data.value, np.array([77.0, 139.0])) assert giantsteps_tempo.load_tempo(None) is None diff --git a/tests/test_groove_midi.py b/tests/test_groove_midi.py index 9444d10f3..d9cd99ac1 100644 --- a/tests/test_groove_midi.py +++ b/tests/test_groove_midi.py @@ -3,49 +3,50 @@ import pretty_midi import shutil -from mirdata import groove_midi, utils, download_utils +from mirdata.datasets import groove_midi +from mirdata import utils, download_utils from 
tests.test_utils import run_track_tests def test_track(): - default_trackid = 'drummer1/eval_session/1' - data_home = 'tests/resources/mir_datasets/Groove-MIDI' + default_trackid = "drummer1/eval_session/1" + data_home = "tests/resources/mir_datasets/groove_midi" track = groove_midi.Track(default_trackid, data_home=data_home) expected_attributes = { - 'drummer': 'drummer1', - 'session': 'drummer1/eval_session', - 'track_id': 'drummer1/eval_session/1', - 'style': 'funk/groove1', - 'tempo': 138, - 'beat_type': 'beat', - 'time_signature': '4-4', - 'midi_filename': 'drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid', - 'audio_filename': 'drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav', - 'midi_path': os.path.join( - data_home, 'drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid' + "drummer": "drummer1", + "session": "drummer1/eval_session", + "track_id": "drummer1/eval_session/1", + "style": "funk/groove1", + "tempo": 138, + "beat_type": "beat", + "time_signature": "4-4", + "midi_filename": "drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid", + "audio_filename": "drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav", + "midi_path": os.path.join( + data_home, "drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid" ), - 'audio_path': os.path.join( - data_home, 'drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav' + "audio_path": os.path.join( + data_home, "drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav" ), - 'duration': 27.872308, - 'split': 'test', + "duration": 27.872308, + "split": "test", } expected_property_types = { - 'beats': utils.BeatData, - 'drum_events': utils.EventData, - 'midi': pretty_midi.PrettyMIDI, + "beats": utils.BeatData, + "drum_events": utils.EventData, + "midi": pretty_midi.PrettyMIDI, } assert track._track_paths == { - 'audio': [ - 'drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav', - '7f94a191506f70ac9d313b7978203c3c', + "audio": [ + "drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav", + 
"7f94a191506f70ac9d313b7978203c3c", ], - 'midi': [ - 'drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid', - 'b01a609cee84cfbc2c154bb9b6566955', + "midi": [ + "drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid", + "b01a609cee84cfbc2c154bb9b6566955", ], } @@ -65,24 +66,24 @@ def test_track(): def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/Groove-MIDI' + data_home = "tests/resources/mir_datasets/groove_midi" metadata = groove_midi._load_metadata(data_home) - assert metadata['data_home'] == data_home - assert metadata['drummer1/eval_session/1'] == { - 'drummer': 'drummer1', - 'session': 'drummer1/eval_session', - 'track_id': 'drummer1/eval_session/1', - 'style': 'funk/groove1', - 'tempo': 138, - 'beat_type': 'beat', - 'time_signature': '4-4', - 'midi_filename': 'drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid', - 'audio_filename': 'drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav', - 'duration': 27.872308, - 'split': 'test', + assert metadata["data_home"] == data_home + assert metadata["drummer1/eval_session/1"] == { + "drummer": "drummer1", + "session": "drummer1/eval_session", + "track_id": "drummer1/eval_session/1", + "style": "funk/groove1", + "tempo": 138, + "beat_type": "beat", + "time_signature": "4-4", + "midi_filename": "drummer1/eval_session/1_funk-groove1_138_beat_4-4.mid", + "audio_filename": "drummer1/eval_session/1_funk-groove1_138_beat_4-4.wav", + "duration": 27.872308, + "split": "test", } - metadata_none = groove_midi._load_metadata('asdf/asdf') + metadata_none = groove_midi._load_metadata("asdf/asdf") assert metadata_none is None @@ -93,29 +94,29 @@ def test_load_audio(): def test_download(httpserver): - data_home = 'tests/resources/mir_datasets/Groove-MIDI_download' + data_home = "tests/resources/mir_datasets/groove_midi_download" if os.path.exists(data_home): shutil.rmtree(data_home) httpserver.serve_content( - open('tests/resources/download/groove-v1-0.0.zip', 'rb').read() + 
open("tests/resources/download/groove-v1-0.0.zip", "rb").read() ) - groove_midi.REMOTES = { - 'all': download_utils.RemoteFileMetadata( - filename='groove-v1-0.0.zip', + remotes = { + "all": download_utils.RemoteFileMetadata( + filename="groove-v1-0.0.zip", url=httpserver.url, - checksum=('97a9a888d2a65cc87bb26e74df08b011'), + checksum=("97a9a888d2a65cc87bb26e74df08b011"), destination_dir=None, ) } - groove_midi.download(data_home=data_home) + groove_midi._download(data_home, remotes, None, None, False, False) assert os.path.exists(data_home) - assert not os.path.exists(os.path.join(data_home, 'groove')) + assert not os.path.exists(os.path.join(data_home, "groove")) assert os.path.exists(os.path.join(data_home, "info.csv")) - track = groove_midi.Track('drummer1/eval_session/1', data_home=data_home) + track = groove_midi.Track("drummer1/eval_session/1", data_home=data_home) assert os.path.exists(track.midi_path) assert os.path.exists(track.audio_path) diff --git a/tests/test_gtzan_genre.py b/tests/test_gtzan_genre.py index 7378c65d1..4b739a754 100644 --- a/tests/test_gtzan_genre.py +++ b/tests/test_gtzan_genre.py @@ -4,26 +4,20 @@ from tests.test_utils import run_track_tests -from mirdata import gtzan_genre +from mirdata.datasets import gtzan_genre from tests.test_utils import DEFAULT_DATA_HOME -TEST_DATA_HOME = "tests/resources/mir_datasets/GTZAN-Genre" - - -def test_track_default_data_home(): - # test data home None - track_default = gtzan_genre.Track("country.00000") - assert track_default._data_home == os.path.join(DEFAULT_DATA_HOME, "GTZAN-Genre") +TEST_DATA_HOME = "tests/resources/mir_datasets/gtzan_genre" def test_track(): default_trackid = "country.00000" track = gtzan_genre.Track(default_trackid, data_home=TEST_DATA_HOME) expected_attributes = { - 'genre': "country", - 'audio_path': "tests/resources/mir_datasets/GTZAN-Genre/" + "genre": "country", + "audio_path": "tests/resources/mir_datasets/gtzan_genre/" + "gtzan_genre/genres/country/country.00000.wav", - 
'track_id': "country.00000", + "track_id": "country.00000", } run_track_tests(track, expected_attributes, {}) @@ -33,8 +27,8 @@ def test_track(): def test_hiphop(): - track = gtzan_genre.Track('hiphop.00000', data_home=TEST_DATA_HOME) - assert track.genre == 'hip-hop' + track = gtzan_genre.Track("hiphop.00000", data_home=TEST_DATA_HOME) + assert track.genre == "hip-hop" def test_to_jams(): diff --git a/tests/test_guitarset.py b/tests/test_guitarset.py index cd6b83d43..05e4d5596 100644 --- a/tests/test_guitarset.py +++ b/tests/test_guitarset.py @@ -3,49 +3,50 @@ import numpy as np import jams -from mirdata import guitarset, utils +from mirdata.datasets import guitarset +from mirdata import utils from tests.test_utils import run_track_tests -TEST_DATA_HOME = 'tests/resources/mir_datasets/GuitarSet' -TRACK = guitarset.Track('03_BN3-119-G_solo', data_home=TEST_DATA_HOME) +TEST_DATA_HOME = "tests/resources/mir_datasets/guitarset" +TRACK = guitarset.Track("03_BN3-119-G_solo", data_home=TEST_DATA_HOME) def test_track(): - default_trackid = '03_BN3-119-G_solo' + default_trackid = "03_BN3-119-G_solo" data_home = TEST_DATA_HOME track = guitarset.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': '03_BN3-119-G_solo', - 'audio_hex_cln_path': 'tests/resources/mir_datasets/GuitarSet/' - + 'audio_hex-pickup_debleeded/03_BN3-119-G_solo_hex_cln.wav', - 'audio_hex_path': 'tests/resources/mir_datasets/GuitarSet/' - + 'audio_hex-pickup_original/03_BN3-119-G_solo_hex.wav', - 'audio_mic_path': 'tests/resources/mir_datasets/GuitarSet/' - + 'audio_mono-mic/03_BN3-119-G_solo_mic.wav', - 'audio_mix_path': 'tests/resources/mir_datasets/GuitarSet/' - + 'audio_mono-pickup_mix/03_BN3-119-G_solo_mix.wav', - 'jams_path': 'tests/resources/mir_datasets/GuitarSet/' - + 'annotation/03_BN3-119-G_solo.jams', - 'player_id': '03', - 'tempo': 119, - 'mode': 'solo', - 'style': 'Bossa Nova', + "track_id": "03_BN3-119-G_solo", + "audio_hex_cln_path": 
"tests/resources/mir_datasets/guitarset/" + + "audio_hex-pickup_debleeded/03_BN3-119-G_solo_hex_cln.wav", + "audio_hex_path": "tests/resources/mir_datasets/guitarset/" + + "audio_hex-pickup_original/03_BN3-119-G_solo_hex.wav", + "audio_mic_path": "tests/resources/mir_datasets/guitarset/" + + "audio_mono-mic/03_BN3-119-G_solo_mic.wav", + "audio_mix_path": "tests/resources/mir_datasets/guitarset/" + + "audio_mono-pickup_mix/03_BN3-119-G_solo_mix.wav", + "jams_path": "tests/resources/mir_datasets/guitarset/" + + "annotation/03_BN3-119-G_solo.jams", + "player_id": "03", + "tempo": 119, + "mode": "solo", + "style": "Bossa Nova", } expected_property_types = { - 'beats': utils.BeatData, - 'leadsheet_chords': utils.ChordData, - 'inferred_chords': utils.ChordData, - 'key_mode': utils.KeyData, - 'pitch_contours': dict, - 'notes': dict, + "beats": utils.BeatData, + "leadsheet_chords": utils.ChordData, + "inferred_chords": utils.ChordData, + "key_mode": utils.KeyData, + "pitch_contours": dict, + "notes": dict, } run_track_tests(track, expected_attributes, expected_property_types) - assert type(track.pitch_contours['E']) is utils.F0Data - assert type(track.notes['E']) is utils.NoteData + assert type(track.pitch_contours["E"]) is utils.F0Data + assert type(track.notes["E"]) is utils.NoteData def test_load_beats(): @@ -56,22 +57,22 @@ def test_load_beats(): def test_load_chords(): assert np.allclose(TRACK.leadsheet_chords.intervals[:, 0], [0]) assert np.allclose(TRACK.leadsheet_chords.intervals[:, 1], [2]) - assert TRACK.leadsheet_chords.labels == ['G:maj'] + assert TRACK.leadsheet_chords.labels == ["G:maj"] assert np.allclose(TRACK.inferred_chords.intervals[:, 0], [0]) assert np.allclose(TRACK.inferred_chords.intervals[:, 1], [2]) - assert TRACK.inferred_chords.labels == ['G:maj7/1'] + assert TRACK.inferred_chords.labels == ["G:maj7/1"] def test_load_keys(): assert np.allclose(TRACK.key_mode.start_times, [0]) assert np.allclose(TRACK.key_mode.end_times, [2]) - assert 
TRACK.key_mode.keys == ['G:major'] + assert TRACK.key_mode.keys == ["G:major"] def test_load_contours(): assert np.allclose( - TRACK.pitch_contours['e'].times[:10], + TRACK.pitch_contours["e"].times[:10], [ 0.7670358269999724, 0.7728408159999844, @@ -86,7 +87,7 @@ def test_load_contours(): ], ) assert np.allclose( - TRACK.pitch_contours['e'].frequencies[:10], + TRACK.pitch_contours["e"].frequencies[:10], [ 393.388, 393.301, @@ -100,21 +101,21 @@ def test_load_contours(): 393.37, ], ) - assert np.allclose(TRACK.pitch_contours['e'].confidence[:10], np.ones((10,))) + assert np.allclose(TRACK.pitch_contours["e"].confidence[:10], np.ones((10,))) def test_load_notes(): assert np.allclose( - TRACK.notes['e'].intervals[:, 0], + TRACK.notes["e"].intervals[:, 0], [0.7612308390022235, 1.5072852607709137, 1.7806185941042258], ) assert np.allclose( - TRACK.notes['e'].intervals[:, 1], [1.2604598639455844, 1.7336798185940552, 2.0] + TRACK.notes["e"].intervals[:, 1], [1.2604598639455844, 1.7336798185940552, 2.0] ) assert np.allclose( - TRACK.notes['e'].notes, [67.0576287044242, 71.03221526299762, 71.03297250121584] + TRACK.notes["e"].notes, [67.0576287044242, 71.03221526299762, 71.03297250121584] ) - assert np.allclose(TRACK.notes['e'].confidence, [1, 1, 1]) + assert np.allclose(TRACK.notes["e"].confidence, [1, 1, 1]) def test_audio_mono(): @@ -141,8 +142,8 @@ def test_audio_hex_cln(): def test_to_jams(): - data_home = 'tests/resources/mir_datasets/GuitarSet' - track = guitarset.Track('03_BN3-119-G_solo', data_home=data_home) + data_home = "tests/resources/mir_datasets/guitarset" + track = guitarset.Track("03_BN3-119-G_solo", data_home=data_home) jam = track.to_jams() assert type(jam) == jams.JAMS diff --git a/tests/test_ikala.py b/tests/test_ikala.py index 795322a34..b8da0b020 100644 --- a/tests/test_ikala.py +++ b/tests/test_ikala.py @@ -2,32 +2,33 @@ import numpy as np -from mirdata import ikala, utils +from mirdata.datasets import ikala +from mirdata import utils from 
tests.test_utils import run_track_tests def test_track(): - default_trackid = '10161_chorus' - data_home = 'tests/resources/mir_datasets/iKala' + default_trackid = "10161_chorus" + data_home = "tests/resources/mir_datasets/ikala" track = ikala.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': '10161_chorus', - 'audio_path': 'tests/resources/mir_datasets/iKala/' - + 'Wavfile/10161_chorus.wav', - 'song_id': '10161', - 'section': 'chorus', - 'singer_id': '1', - 'f0_path': 'tests/resources/mir_datasets/iKala/PitchLabel/10161_chorus.pv', - 'lyrics_path': 'tests/resources/mir_datasets/iKala/Lyrics/10161_chorus.lab', + "track_id": "10161_chorus", + "audio_path": "tests/resources/mir_datasets/ikala/" + + "Wavfile/10161_chorus.wav", + "song_id": "10161", + "section": "chorus", + "singer_id": "1", + "f0_path": "tests/resources/mir_datasets/ikala/PitchLabel/10161_chorus.pv", + "lyrics_path": "tests/resources/mir_datasets/ikala/Lyrics/10161_chorus.lab", } - expected_property_types = {'f0': utils.F0Data, 'lyrics': utils.LyricData} + expected_property_types = {"f0": utils.F0Data, "lyrics": utils.LyricData} assert track._track_paths == { - 'audio': ['Wavfile/10161_chorus.wav', '278ae003cb0d323e99b9a643c0f2eeda'], - 'pitch': ['PitchLabel/10161_chorus.pv', '0d93a011a9e668fd80673049089bbb14'], - 'lyrics': ['Lyrics/10161_chorus.lab', '79bbeb72b422056fd43be4e8d63319ce'], + "audio": ["Wavfile/10161_chorus.wav", "278ae003cb0d323e99b9a643c0f2eeda"], + "pitch": ["PitchLabel/10161_chorus.pv", "0d93a011a9e668fd80673049089bbb14"], + "lyrics": ["Lyrics/10161_chorus.lab", "79bbeb72b422056fd43be4e8d63319ce"], } run_track_tests(track, expected_attributes, expected_property_types) @@ -53,29 +54,29 @@ def test_track(): def test_to_jams(): - data_home = 'tests/resources/mir_datasets/iKala' - track = ikala.Track('10161_chorus', data_home=data_home) + data_home = "tests/resources/mir_datasets/ikala" + track = ikala.Track("10161_chorus", data_home=data_home) jam = 
track.to_jams() - lyrics = jam.search(namespace='lyric')[0]['data'] + lyrics = jam.search(namespace="lyric")[0]["data"] assert [lyric.time for lyric in lyrics] == [0.027, 0.232] assert [lyric.duration for lyric in lyrics] == [0.20500000000000002, 0.736] - assert [lyric.value for lyric in lyrics] == ['JUST', 'WANNA'] + assert [lyric.value for lyric in lyrics] == ["JUST", "WANNA"] assert [lyric.confidence for lyric in lyrics] == [None, None] - f0s = jam.search(namespace='pitch_contour')[0]['data'] + f0s = jam.search(namespace="pitch_contour")[0]["data"] assert [f0.time for f0 in f0s] == [0.016, 0.048] assert [f0.duration for f0 in f0s] == [0.0, 0.0] assert [f0.value for f0 in f0s] == [ - {'frequency': 0.0, 'index': 0, 'voiced': False}, - {'frequency': 260.946404518887, 'index': 0, 'voiced': True}, + {"frequency": 0.0, "index": 0, "voiced": False}, + {"frequency": 260.946404518887, "index": 0, "voiced": True}, ] assert [f0.confidence for f0 in f0s] == [0.0, 1.0] def test_load_f0(): # load a file which exists - f0_path = 'tests/resources/mir_datasets/iKala/PitchLabel/10161_chorus.pv' + f0_path = "tests/resources/mir_datasets/ikala/PitchLabel/10161_chorus.pv" f0_data = ikala.load_f0(f0_path) # check types @@ -92,7 +93,7 @@ def test_load_f0(): def test_load_lyrics(): # load a file without pronunciations - lyrics_path_simple = 'tests/resources/mir_datasets/iKala/Lyrics/10161_chorus.lab' + lyrics_path_simple = "tests/resources/mir_datasets/ikala/Lyrics/10161_chorus.lab" lyrics_data_simple = ikala.load_lyrics(lyrics_path_simple) # check types @@ -105,11 +106,11 @@ def test_load_lyrics(): # check values assert np.array_equal(lyrics_data_simple.start_times, np.array([0.027, 0.232])) assert np.array_equal(lyrics_data_simple.end_times, np.array([0.232, 0.968])) - assert np.array_equal(lyrics_data_simple.lyrics, np.array(['JUST', 'WANNA'])) + assert np.array_equal(lyrics_data_simple.lyrics, np.array(["JUST", "WANNA"])) assert np.array_equal(lyrics_data_simple.pronunciations, 
np.array([None, None])) # load a file with pronunciations - lyrics_path_pronun = 'tests/resources/mir_datasets/iKala/Lyrics/10164_chorus.lab' + lyrics_path_pronun = "tests/resources/mir_datasets/ikala/Lyrics/10164_chorus.lab" lyrics_data_pronun = ikala.load_lyrics(lyrics_path_pronun) # check types @@ -122,16 +123,16 @@ def test_load_lyrics(): # check values assert np.array_equal(lyrics_data_pronun.start_times, np.array([0.021, 0.571])) assert np.array_equal(lyrics_data_pronun.end_times, np.array([0.189, 1.415])) - assert np.array_equal(lyrics_data_pronun.lyrics, np.array(['ASDF', 'EVERYBODY'])) - assert np.array_equal(lyrics_data_pronun.pronunciations, np.array(['t i au', None])) + assert np.array_equal(lyrics_data_pronun.lyrics, np.array(["ASDF", "EVERYBODY"])) + assert np.array_equal(lyrics_data_pronun.pronunciations, np.array(["t i au", None])) def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/iKala' + data_home = "tests/resources/mir_datasets/ikala" metadata = ikala._load_metadata(data_home) - assert metadata['data_home'] == data_home - assert metadata['10161'] == '1' - assert metadata['21025'] == '1' + assert metadata["data_home"] == data_home + assert metadata["10161"] == "1" + assert metadata["21025"] == "1" - metadata_none = ikala._load_metadata('asdf/asdf') + metadata_none = ikala._load_metadata("asdf/asdf") assert metadata_none is None diff --git a/tests/test_jams_utils.py b/tests/test_jams_utils.py index 787b58a9b..dd98adf3c 100644 --- a/tests/test_jams_utils.py +++ b/tests/test_jams_utils.py @@ -12,7 +12,7 @@ def get_jam_data(jam, namespace, annot_numb): duration = [] value = [] confidence = [] - for obs in jam.search(namespace=namespace)[annot_numb]['data']: + for obs in jam.search(namespace=namespace)[annot_numb]["data"]: time.append(obs.time) duration.append(round(obs.duration, 3)) value.append(obs.value) @@ -22,15 +22,15 @@ def get_jam_data(jam, namespace, annot_numb): def test_beats(): beat_data_1 = 
[(utils.BeatData(np.array([0.2, 0.3]), np.array([1, 2])), None)] - beat_data_2 = [(utils.BeatData(np.array([0.5, 0.7]), np.array([2, 3])), 'beats_2')] + beat_data_2 = [(utils.BeatData(np.array([0.5, 0.7]), np.array([2, 3])), "beats_2")] beat_data_3 = [ - (utils.BeatData(np.array([0.0, 0.3]), np.array([1, 2])), 'beats_1'), - (utils.BeatData(np.array([0.5, 0.13]), np.array([4, 3])), 'beats_2'), + (utils.BeatData(np.array([0.0, 0.3]), np.array([1, 2])), "beats_1"), + (utils.BeatData(np.array([0.5, 0.13]), np.array([4, 3])), "beats_2"), ] - beat_data_4 = (utils.BeatData(np.array([0.0, 0.3]), np.array([1, 2])), 'beats_1') + beat_data_4 = (utils.BeatData(np.array([0.0, 0.3]), np.array([1, 2])), "beats_1") beat_data_5 = [ - (utils.BeatData(np.array([0.0, 0.3]), np.array([1, 2])), 'beats_1'), - [utils.BeatData(np.array([0.5, 0.13]), np.array([4, 3])), 'beats_2'], + (utils.BeatData(np.array([0.0, 0.3]), np.array([1, 2])), "beats_1"), + [utils.BeatData(np.array([0.5, 0.13]), np.array([4, 3])), "beats_2"], ] beat_data_6 = [(None, None)] beat_data_7 = [ @@ -38,7 +38,7 @@ def test_beats(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 'event B']), + np.array(["event A", "event B"]), ), None, ) @@ -49,27 +49,27 @@ def test_beats(): jam_3 = jams_utils.jams_converter(beat_data=beat_data_3) jam_6 = jams_utils.jams_converter(beat_data=beat_data_6) - time, duration, value, confidence = get_jam_data(jam_1, 'beat', 0) + time, duration, value, confidence = get_jam_data(jam_1, "beat", 0) assert time == [0.2, 0.3] assert duration == [0.0, 0.0] assert value == [1, 2] assert confidence == [None, None] - assert jam_2.annotations[0]['sandbox']['name'] == 'beats_2' + assert jam_2.annotations[0]["sandbox"]["name"] == "beats_2" - time, duration, value, confidence = get_jam_data(jam_3, 'beat', 0) + time, duration, value, confidence = get_jam_data(jam_3, "beat", 0) assert time == [0.0, 0.3] assert duration == [0.0, 0.0] assert value == [1, 2] assert confidence 
== [None, None] - time, duration, value, confidence = get_jam_data(jam_3, 'beat', 1) + time, duration, value, confidence = get_jam_data(jam_3, "beat", 1) assert time == [0.13, 0.5] assert duration == [0.0, 0.0] assert value == [3, 4] assert confidence == [None, None] - time, duration, value, confidence = get_jam_data(jam_6, 'beat', 0) + time, duration, value, confidence = get_jam_data(jam_6, "beat", 0) assert time == [] assert duration == [] assert value == [] @@ -90,7 +90,7 @@ def test_chords(): ( utils.ChordData( np.array([[0.0, 0.5, 1.0], [0.5, 1.0, 1.5]]).T, - np.array(['A', 'A', 'E']), + np.array(["A", "A", "E"]), ), None, ) @@ -99,30 +99,30 @@ def test_chords(): ( utils.ChordData( np.array([[0.0, 0.8, 1.0], [0.5, 1.0, 1.5]]).T, - np.array(['A', 'B', 'C']), + np.array(["A", "B", "C"]), ), - 'chords_2', + "chords_2", ) ] chord_data_3 = [ ( utils.ChordData( np.array([[0.0, 0.5, 1.0], [0.5, 1.0, 1.5]]).T, - np.array(['A', 'A', 'E']), + np.array(["A", "A", "E"]), ), - 'chords_1', + "chords_1", ), ( utils.ChordData( np.array([[0.0, 0.7, 1.0], [0.7, 1.0, 1.5]]).T, - np.array(['A', 'B', 'C']), + np.array(["A", "B", "C"]), ), - 'chords_2', + "chords_2", ), ] chord_data_4 = ( utils.ChordData( - np.array([[0.0, 0.5, 1.0], [0.5, 1.0, 1.5]]).T, np.array(['A', 'A', 'E']) + np.array([[0.0, 0.5, 1.0], [0.5, 1.0, 1.5]]).T, np.array(["A", "A", "E"]) ), None, ) @@ -130,16 +130,16 @@ def test_chords(): [ utils.ChordData( np.array([[0.0, 0.5, 1.0], [0.5, 1.0, 1.5]]).T, - np.array(['A', 'A', 'E']), + np.array(["A", "A", "E"]), ), None, ], ( utils.ChordData( np.array([[0.0, 0.8, 1.0], [0.5, 1.0, 1.5]]).T, - np.array(['A', 'B', 'C']), + np.array(["A", "B", "C"]), ), - 'chords_2', + "chords_2", ), ] chord_data_6 = [(None, None)] @@ -148,7 +148,7 @@ def test_chords(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 'event B']), + np.array(["event A", "event B"]), ), None, ) @@ -159,27 +159,27 @@ def test_chords(): jam_3 = 
jams_utils.jams_converter(chord_data=chord_data_3) jam_6 = jams_utils.jams_converter(chord_data=chord_data_6) - time, duration, value, confidence = get_jam_data(jam_1, 'chord', 0) + time, duration, value, confidence = get_jam_data(jam_1, "chord", 0) assert time == [0.0, 0.5, 1.0] assert duration == [0.5, 0.5, 0.5] - assert value == ['A', 'A', 'E'] + assert value == ["A", "A", "E"] assert confidence == [None, None, None] - assert jam_2.annotations[0]['sandbox']['name'] == 'chords_2' + assert jam_2.annotations[0]["sandbox"]["name"] == "chords_2" - time, duration, value, confidence = get_jam_data(jam_3, 'chord', 0) + time, duration, value, confidence = get_jam_data(jam_3, "chord", 0) assert time == [0.0, 0.5, 1.0] assert duration == [0.5, 0.5, 0.5] - assert value == ['A', 'A', 'E'] + assert value == ["A", "A", "E"] assert confidence == [None, None, None] - time, duration, value, confidence = get_jam_data(jam_3, 'chord', 1) + time, duration, value, confidence = get_jam_data(jam_3, "chord", 1) assert time == [0.0, 0.7, 1.0] assert duration == [0.7, 0.3, 0.5] - assert value == ['A', 'B', 'C'] + assert value == ["A", "B", "C"] assert confidence == [None, None, None] - time, duration, value, confidence = get_jam_data(jam_6, 'chord', 0) + time, duration, value, confidence = get_jam_data(jam_6, "chord", 0) assert time == [] assert duration == [] assert value == [] @@ -213,7 +213,7 @@ def test_notes(): np.array([1108.731, 1108.731, 1108.731]), np.array([1, 1, 1]), ), - 'notes_2', + "notes_2", ) ] note_data_3 = [ @@ -223,7 +223,7 @@ def test_notes(): np.array([1108.731, 1108.731, 1108.731]), np.array([1, 1, 1]), ), - 'notes_1', + "notes_1", ), ( utils.NoteData( @@ -231,7 +231,7 @@ def test_notes(): np.array([1108.731, 1108.731, 1108.731]), np.array([1, 1, 1]), ), - 'notes_2', + "notes_2", ), ] note_data_4 = ( @@ -257,7 +257,7 @@ def test_notes(): np.array([1108.731, 1108.731, 1108.731]), np.array([1, 1, 1]), ), - 'notes_2', + "notes_2", ), ] note_data_6 = [(None, None)] @@ 
-266,7 +266,7 @@ def test_notes(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 'event B']), + np.array(["event A", "event B"]), ), None, ) @@ -277,27 +277,27 @@ def test_notes(): jam_3 = jams_utils.jams_converter(note_data=note_data_3) jam_6 = jams_utils.jams_converter(note_data=note_data_6) - time, duration, value, confidence = get_jam_data(jam_1, 'note_hz', 0) + time, duration, value, confidence = get_jam_data(jam_1, "note_hz", 0) assert time == [0.0, 0.5, 1.0] assert duration == [0.5, 0.5, 0.5] assert value == [1108.731, 1108.731, 1108.731] assert confidence == [None, None, None] - assert jam_2.annotations[0]['sandbox']['name'] == 'notes_2' + assert jam_2.annotations[0]["sandbox"]["name"] == "notes_2" - time, duration, value, confidence = get_jam_data(jam_3, 'note_hz', 0) + time, duration, value, confidence = get_jam_data(jam_3, "note_hz", 0) assert time == [0.0, 0.5, 1.0] assert duration == [0.5, 0.5, 0.5] assert value == [1108.731, 1108.731, 1108.731] assert confidence == [None, None, None] - time, duration, value, confidence = get_jam_data(jam_3, 'note_hz', 1) + time, duration, value, confidence = get_jam_data(jam_3, "note_hz", 1) assert time == [0.0, 0.7, 1.0] assert duration == [0.7, 0.3, 0.5] assert value == [1108.731, 1108.731, 1108.731] assert confidence == [None, None, None] - time, duration, value, confidence = get_jam_data(jam_6, 'note_hz', 0) + time, duration, value, confidence = get_jam_data(jam_6, "note_hz", 0) assert time == [] assert duration == [] assert value == [] @@ -318,7 +318,7 @@ def test_sections(): ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), None, ) @@ -327,31 +327,31 @@ def test_sections(): ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), - 
'sections_2', + "sections_2", ) ] section_data_3 = [ ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), - 'sections_1', + "sections_1", ), ( utils.SectionData( np.array([[0.0, 15.0, 20.0], [15.0, 20.0, 30.0]]).T, - np.array(['verse A', 'verse B', 'verse C']), + np.array(["verse A", "verse B", "verse C"]), ), - 'sections_2', + "sections_2", ), ] section_data_4 = ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), None, ) @@ -359,16 +359,16 @@ def test_sections(): [ utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), None, ], ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), - 'sections_2', + "sections_2", ), ] section_data_6 = [(None, None)] @@ -377,7 +377,7 @@ def test_sections(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 'event B']), + np.array(["event A", "event B"]), ), None, ) @@ -388,27 +388,27 @@ def test_sections(): jam_3 = jams_utils.jams_converter(section_data=section_data_3) jam_6 = jams_utils.jams_converter(section_data=section_data_6) - time, duration, value, confidence = get_jam_data(jam_1, 'segment', 0) + time, duration, value, confidence = get_jam_data(jam_1, "segment", 0) assert time == [0.0, 10.0, 20.0] assert duration == [10.0, 10.0, 5.0] - assert value == ['verse A', 'verse B', 'verse A'] + assert value == ["verse A", "verse B", "verse A"] assert confidence == [None, None, None] - assert jam_2.annotations[0]['sandbox']['name'] == 'sections_2' + assert jam_2.annotations[0]["sandbox"]["name"] == "sections_2" - time, duration, value, 
confidence = get_jam_data(jam_3, 'segment', 0) + time, duration, value, confidence = get_jam_data(jam_3, "segment", 0) assert time == [0.0, 10.0, 20.0] assert duration == [10.0, 10.0, 5.0] - assert value == ['verse A', 'verse B', 'verse A'] + assert value == ["verse A", "verse B", "verse A"] assert confidence == [None, None, None] - time, duration, value, confidence = get_jam_data(jam_3, 'segment', 1) + time, duration, value, confidence = get_jam_data(jam_3, "segment", 1) assert time == [0.0, 15.0, 20.0] assert duration == [15.0, 5.0, 10.0] - assert value == ['verse A', 'verse B', 'verse C'] + assert value == ["verse A", "verse B", "verse C"] assert confidence == [None, None, None] - time, duration, value, confidence = get_jam_data(jam_6, 'segment', 0) + time, duration, value, confidence = get_jam_data(jam_6, "segment", 0) assert time == [] assert duration == [] assert value == [] @@ -431,14 +431,14 @@ def test_multi_sections(): ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), None, ), ( utils.SectionData( np.array([[0.0, 15.0, 20.0], [15.0, 20.0, 25.0]]).T, - np.array(['verse a', 'verse b', 'verse a']), + np.array(["verse a", "verse b", "verse a"]), ), None, ), @@ -453,19 +453,19 @@ def test_multi_sections(): ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), 0, ), ( utils.SectionData( np.array([[0.0, 15.0, 20.0], [15.0, 20.0, 25.0]]).T, - np.array(['verse a', 'verse b', 'verse a']), + np.array(["verse a", "verse b", "verse a"]), ), 1, ), ], - 'annotator_1', + "annotator_1", ) ] multi_section_data_3 = [ @@ -474,38 +474,38 @@ def test_multi_sections(): ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), 0, ), ( 
utils.SectionData( np.array([[0.0, 15.0, 20.0], [15.0, 20.0, 25.0]]).T, - np.array(['verse a', 'verse b', 'verse a']), + np.array(["verse a", "verse b", "verse a"]), ), 1, ), ], - 'annotator_1', + "annotator_1", ), ( [ ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), 0, ), ( utils.SectionData( np.array([[0.0, 15.0, 20.0], [15.0, 20.0, 25.0]]).T, - np.array(['verse a', 'verse b', 'verse a']), + np.array(["verse a", "verse b", "verse a"]), ), 1, ), ], - 'annotator_2', + "annotator_2", ), ] multi_section_data_4 = ( @@ -513,14 +513,14 @@ def test_multi_sections(): ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), None, ), ( utils.SectionData( np.array([[0.0, 15.0, 20.0], [15.0, 20.0, 25.0]]).T, - np.array(['verse a', 'verse b', 'verse a']), + np.array(["verse a", "verse b", "verse a"]), ), None, ), @@ -533,14 +533,14 @@ def test_multi_sections(): ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), None, ), ( utils.SectionData( np.array([[0.0, 15.0, 20.0], [15.0, 20.0, 25.0]]).T, - np.array(['verse a', 'verse b', 'verse a']), + np.array(["verse a", "verse b", "verse a"]), ), None, ), @@ -554,14 +554,14 @@ def test_multi_sections(): ( utils.SectionData( np.array([[0.0, 10.0, 20.0], [10.0, 20.0, 25.0]]).T, - np.array(['verse A', 'verse B', 'verse A']), + np.array(["verse A", "verse B", "verse A"]), ), None, ), ( utils.SectionData( np.array([[0.0, 15.0, 20.0], [15.0, 20.0, 25.0]]).T, - np.array(['verse a', 'verse b', 'verse a']), + np.array(["verse a", "verse b", "verse a"]), ), None, ), @@ -577,7 +577,7 @@ def test_multi_sections(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 
'event B']), + np.array(["event A", "event B"]), ), None, ), @@ -585,7 +585,7 @@ def test_multi_sections(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 'event B']), + np.array(["event A", "event B"]), ), None, ), @@ -599,51 +599,51 @@ def test_multi_sections(): jam_3 = jams_utils.jams_converter(multi_section_data=multi_section_data_3) jam_7 = jams_utils.jams_converter(multi_section_data=multi_section_data_7) - time, duration, value, confidence = get_jam_data(jam_1, 'multi_segment', 0) + time, duration, value, confidence = get_jam_data(jam_1, "multi_segment", 0) assert time == [0.0, 0.0, 10.0, 15.0, 20.0, 20.0] assert duration == [10.0, 15.0, 10.0, 5.0, 5.0, 5.0] assert value == [ - {'label': 'verse A', 'level': None}, - {'label': 'verse a', 'level': None}, - {'label': 'verse B', 'level': None}, - {'label': 'verse b', 'level': None}, - {'label': 'verse A', 'level': None}, - {'label': 'verse a', 'level': None}, + {"label": "verse A", "level": None}, + {"label": "verse a", "level": None}, + {"label": "verse B", "level": None}, + {"label": "verse b", "level": None}, + {"label": "verse A", "level": None}, + {"label": "verse a", "level": None}, ] assert confidence == [None, None, None, None, None, None] assert ( - jam_2.annotations[0]['annotation_metadata']['annotator']['name'] - == 'annotator_1' + jam_2.annotations[0]["annotation_metadata"]["annotator"]["name"] + == "annotator_1" ) - time, duration, value, confidence = get_jam_data(jam_3, 'multi_segment', 0) + time, duration, value, confidence = get_jam_data(jam_3, "multi_segment", 0) assert time == [0.0, 0.0, 10.0, 15.0, 20.0, 20.0] assert duration == [10.0, 15.0, 10.0, 5.0, 5.0, 5.0] assert value == [ - {'label': 'verse A', 'level': 0}, - {'label': 'verse a', 'level': 1}, - {'label': 'verse B', 'level': 0}, - {'label': 'verse b', 'level': 1}, - {'label': 'verse A', 'level': 0}, - {'label': 'verse a', 'level': 1}, + {"label": "verse A", "level": 0}, + {"label": "verse a", 
"level": 1}, + {"label": "verse B", "level": 0}, + {"label": "verse b", "level": 1}, + {"label": "verse A", "level": 0}, + {"label": "verse a", "level": 1}, ] assert confidence == [None, None, None, None, None, None] - time, duration, value, confidence = get_jam_data(jam_3, 'multi_segment', 1) + time, duration, value, confidence = get_jam_data(jam_3, "multi_segment", 1) assert time == [0.0, 0.0, 10.0, 15.0, 20.0, 20.0] assert duration == [10.0, 15.0, 10.0, 5.0, 5.0, 5.0] assert value == [ - {'label': 'verse A', 'level': 0}, - {'label': 'verse a', 'level': 1}, - {'label': 'verse B', 'level': 0}, - {'label': 'verse b', 'level': 1}, - {'label': 'verse A', 'level': 0}, - {'label': 'verse a', 'level': 1}, + {"label": "verse A", "level": 0}, + {"label": "verse a", "level": 1}, + {"label": "verse B", "level": 0}, + {"label": "verse b", "level": 1}, + {"label": "verse A", "level": 0}, + {"label": "verse a", "level": 1}, ] assert confidence == [None, None, None, None, None, None] - time, duration, value, confidence = get_jam_data(jam_7, 'multi_segment', 0) + time, duration, value, confidence = get_jam_data(jam_7, "multi_segment", 0) assert time == [] assert duration == [] assert value == [] @@ -663,22 +663,22 @@ def test_multi_sections(): def test_keys(): key_data_1 = [ - (utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(['A'])), None) + (utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(["A"])), None) ] key_data_2 = [ - (utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(['A'])), 'keys_1') + (utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(["A"])), "keys_1") ] key_data_3 = [ - (utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(['A'])), 'keys_1'), - (utils.KeyData(np.array([0.0]), np.array([50.0]), np.array(['B'])), 'keys_2'), + (utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(["A"])), "keys_1"), + (utils.KeyData(np.array([0.0]), np.array([50.0]), np.array(["B"])), "keys_2"), ] key_data_4 = ( - 
utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(['A'])), - 'keys_1', + utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(["A"])), + "keys_1", ) key_data_5 = [ - [utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(['A'])), 'keys_1'], - (utils.KeyData(np.array([0.0]), np.array([50.0]), np.array(['B'])), 'keys_2'), + [utils.KeyData(np.array([0.0]), np.array([100.0]), np.array(["A"])), "keys_1"], + (utils.KeyData(np.array([0.0]), np.array([50.0]), np.array(["B"])), "keys_2"), ] key_data_6 = [(None, None)] key_data_7 = [ @@ -686,7 +686,7 @@ def test_keys(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 'event B']), + np.array(["event A", "event B"]), ), None, ) @@ -697,27 +697,27 @@ def test_keys(): jam_3 = jams_utils.jams_converter(key_data=key_data_3) jam_6 = jams_utils.jams_converter(key_data=key_data_6) - time, duration, value, confidence = get_jam_data(jam_1, 'key', 0) + time, duration, value, confidence = get_jam_data(jam_1, "key", 0) assert time == [0.0] assert duration == [100.0] - assert value == ['A'] + assert value == ["A"] assert confidence == [None] - assert jam_2.annotations[0]['sandbox']['name'] == 'keys_1' + assert jam_2.annotations[0]["sandbox"]["name"] == "keys_1" - time, duration, value, confidence = get_jam_data(jam_3, 'key', 0) + time, duration, value, confidence = get_jam_data(jam_3, "key", 0) assert time == [0.0] assert duration == [100.0] - assert value == ['A'] + assert value == ["A"] assert confidence == [None] - time, duration, value, confidence = get_jam_data(jam_3, 'key', 1) + time, duration, value, confidence = get_jam_data(jam_3, "key", 1) assert time == [0.0] assert duration == [50.0] - assert value == ['B'] + assert value == ["B"] assert confidence == [None] - time, duration, value, confidence = get_jam_data(jam_6, 'key', 0) + time, duration, value, confidence = get_jam_data(jam_6, "key", 0) assert time == [] assert duration == [] assert value == [] @@ -747,7 +747,7 @@ def 
test_f0s(): utils.F0Data( np.array([0.016, 0.048]), np.array([0.0, 260.9]), np.array([0.0, 1.0]) ), - 'f0s_1', + "f0s_1", ) ] f0_data_3 = [ @@ -755,33 +755,33 @@ def test_f0s(): utils.F0Data( np.array([0.016, 0.048]), np.array([0.0, 260.9]), np.array([0.0, 1.0]) ), - 'f0s_1', + "f0s_1", ), ( utils.F0Data( np.array([0.003, 0.012]), np.array([0.0, 230.5]), np.array([0.0, 1.0]) ), - 'f0s_2', + "f0s_2", ), ] f0_data_4 = ( utils.F0Data( np.array([0.016, 0.048]), np.array([0.0, 260.9]), np.array([0.0, 1.0]) ), - 'f0s_1', + "f0s_1", ) f0_data_5 = [ [ utils.F0Data( np.array([0.016, 0.048]), np.array([0.0, 260.9]), np.array([0.0, 1.0]) ), - 'f0s_1', + "f0s_1", ], ( utils.F0Data( np.array([0.003, 0.012]), np.array([0.0, 230.5]), np.array([0.0, 1.0]) ), - 'f0s_2', + "f0s_2", ), ] f0_data_6 = [(None, None)] @@ -790,7 +790,7 @@ def test_f0s(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 'event B']), + np.array(["event A", "event B"]), ), None, ) @@ -801,36 +801,36 @@ def test_f0s(): jam_3 = jams_utils.jams_converter(f0_data=f0_data_3) jam_6 = jams_utils.jams_converter(f0_data=f0_data_6) - time, duration, value, confidence = get_jam_data(jam_1, 'pitch_contour', 0) + time, duration, value, confidence = get_jam_data(jam_1, "pitch_contour", 0) assert time == [0.016, 0.048] assert duration == [0.0, 0.0] assert value == [ - {'frequency': 0.0, 'index': 0, 'voiced': False}, - {'frequency': 260.9, 'index': 0, 'voiced': True}, + {"frequency": 0.0, "index": 0, "voiced": False}, + {"frequency": 260.9, "index": 0, "voiced": True}, ] assert confidence == [0.0, 1.0] - assert jam_2.annotations[0]['sandbox']['name'] == 'f0s_1' + assert jam_2.annotations[0]["sandbox"]["name"] == "f0s_1" - time, duration, value, confidence = get_jam_data(jam_3, 'pitch_contour', 0) + time, duration, value, confidence = get_jam_data(jam_3, "pitch_contour", 0) assert time == [0.016, 0.048] assert duration == [0.0, 0.0] assert value == [ - {'frequency': 0.0, 'index': 0, 
'voiced': False}, - {'frequency': 260.9, 'index': 0, 'voiced': True}, + {"frequency": 0.0, "index": 0, "voiced": False}, + {"frequency": 260.9, "index": 0, "voiced": True}, ] assert confidence == [0.0, 1.0] - time, duration, value, confidence = get_jam_data(jam_3, 'pitch_contour', 1) + time, duration, value, confidence = get_jam_data(jam_3, "pitch_contour", 1) assert time == [0.003, 0.012] assert duration == [0.0, 0.0] assert value == [ - {'frequency': 0.0, 'index': 0, 'voiced': False}, - {'frequency': 230.5, 'index': 0, 'voiced': True}, + {"frequency": 0.0, "index": 0, "voiced": False}, + {"frequency": 230.5, "index": 0, "voiced": True}, ] assert confidence == [0.0, 1.0] - time, duration, value, confidence = get_jam_data(jam_6, 'pitch_contour', 0) + time, duration, value, confidence = get_jam_data(jam_6, "pitch_contour", 0) assert time == [] assert duration == [] assert value == [] @@ -852,7 +852,7 @@ def test_lyrics(): utils.LyricData( np.array([0.027, 0.232]), np.array([0.227, 0.742]), - np.array(['The', 'Test']), + np.array(["The", "Test"]), np.array([None, None]), ), None, @@ -863,10 +863,10 @@ def test_lyrics(): utils.LyricData( np.array([0.027, 0.232]), np.array([0.227, 0.742]), - np.array(['The', 'Test']), + np.array(["The", "Test"]), np.array([None, None]), ), - 'lyrics_1', + "lyrics_1", ) ] lyrics_data_3 = [ @@ -874,48 +874,48 @@ def test_lyrics(): utils.LyricData( np.array([0.027, 0.232]), np.array([0.227, 0.742]), - np.array(['The', 'Test']), + np.array(["The", "Test"]), np.array([None, None]), ), - 'lyrics_1', + "lyrics_1", ), ( utils.LyricData( np.array([0.0, 0.232]), np.array([0.227, 0.742]), - np.array(['is', 'cool']), + np.array(["is", "cool"]), np.array([None, None]), ), - 'lyrics_2', + "lyrics_2", ), ] lyrics_data_4 = ( utils.LyricData( np.array([0.027, 0.232]), np.array([0.227, 0.742]), - np.array(['The', 'Test']), + np.array(["The", "Test"]), np.array([None, None]), ), - 'lyrics_1', + "lyrics_1", ) lyrics_data_5 = [ ( utils.LyricData( 
np.array([0.027, 0.232]), np.array([0.227, 0.742]), - np.array(['The', 'Test']), + np.array(["The", "Test"]), np.array([None, None]), ), - 'lyrics_1', + "lyrics_1", ), [ utils.LyricData( np.array([0.0, 0.232]), np.array([0.227, 0.742]), - np.array(['is', 'cool']), + np.array(["is", "cool"]), np.array([None, None]), ), - 'lyrics_2', + "lyrics_2", ], ] lyrics_data_6 = [(None, None)] @@ -924,7 +924,7 @@ def test_lyrics(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 'event B']), + np.array(["event A", "event B"]), ), None, ) @@ -935,27 +935,27 @@ def test_lyrics(): jam_3 = jams_utils.jams_converter(lyrics_data=lyrics_data_3) jam_6 = jams_utils.jams_converter(lyrics_data=lyrics_data_6) - time, duration, value, confidence = get_jam_data(jam_1, 'lyrics', 0) + time, duration, value, confidence = get_jam_data(jam_1, "lyrics", 0) assert time == [0.027, 0.232] assert duration == [0.2, 0.51] - assert value == ['The', 'Test'] + assert value == ["The", "Test"] assert confidence == [None, None] - assert jam_2.annotations[0]['sandbox']['name'] == 'lyrics_1' + assert jam_2.annotations[0]["sandbox"]["name"] == "lyrics_1" - time, duration, value, confidence = get_jam_data(jam_3, 'lyrics', 0) + time, duration, value, confidence = get_jam_data(jam_3, "lyrics", 0) assert time == [0.027, 0.232] assert duration == [0.2, 0.51] - assert value == ['The', 'Test'] + assert value == ["The", "Test"] assert confidence == [None, None] - time, duration, value, confidence = get_jam_data(jam_3, 'lyrics', 1) + time, duration, value, confidence = get_jam_data(jam_3, "lyrics", 1) assert time == [0.0, 0.232] assert duration == [0.227, 0.51] - assert value == ['is', 'cool'] + assert value == ["is", "cool"] assert confidence == [None, None] - time, duration, value, confidence = get_jam_data(jam_6, 'lyrics', 0) + time, duration, value, confidence = get_jam_data(jam_6, "lyrics", 0) assert time == [] assert duration == [] assert value == [] @@ -972,22 +972,22 @@ def 
test_lyrics(): def test_tags(): - tag_data1 = [('blues', 'I am a description')] - tag_data2 = [('disco', 'tag 1'), ('rock', 'tag 2')] - tag_data3 = [('invalid', 'asdf')] - tag_data4 = ('jazz', 'wrong format') - tag_data5 = ['wrong format too'] - tag_data6 = [(123, 'asdf')] + tag_data1 = [("blues", "I am a description")] + tag_data2 = [("disco", "tag 1"), ("rock", "tag 2")] + tag_data3 = [("invalid", "asdf")] + tag_data4 = ("jazz", "wrong format") + tag_data5 = ["wrong format too"] + tag_data6 = [(123, "asdf")] jam1 = jams_utils.jams_converter( - tags_gtzan_data=tag_data1, metadata={'duration': 10.0} + tags_gtzan_data=tag_data1, metadata={"duration": 10.0} ) assert jam1.validate() jam2 = jams_utils.jams_converter( - tags_gtzan_data=tag_data2, metadata={'duration': 10.0} + tags_gtzan_data=tag_data2, metadata={"duration": 10.0} ) assert jam2.validate() jam3 = jams_utils.jams_converter( - tags_gtzan_data=tag_data3, metadata={'duration': 10.0} + tags_gtzan_data=tag_data3, metadata={"duration": 10.0} ) with pytest.raises(jams.SchemaError): assert jam3.validate() @@ -1000,22 +1000,22 @@ def test_tags(): def test_tempos(): - tempo_data1 = [(120, 'I am a description')] - tempo_data2 = [(120.0, 'tempo 1'), (240, 'tempo 2')] - tempo_data3 = [(-1, 'asdf')] - tempo_data4 = (120.5, 'wrong format') - tempo_data5 = ['wrong format too'] - tempo_data6 = [('string!', 'string!')] + tempo_data1 = [(120, "I am a description")] + tempo_data2 = [(120.0, "tempo 1"), (240, "tempo 2")] + tempo_data3 = [(-1, "asdf")] + tempo_data4 = (120.5, "wrong format") + tempo_data5 = ["wrong format too"] + tempo_data6 = [("string!", "string!")] jam1 = jams_utils.jams_converter( - tempo_data=tempo_data1, metadata={'duration': 10.0} + tempo_data=tempo_data1, metadata={"duration": 10.0} ) assert jam1.validate() jam2 = jams_utils.jams_converter( - tempo_data=tempo_data2, metadata={'duration': 10.0} + tempo_data=tempo_data2, metadata={"duration": 10.0} ) assert jam2.validate() jam3 = 
jams_utils.jams_converter( - tempo_data=tempo_data3, metadata={'duration': 10.0} + tempo_data=tempo_data3, metadata={"duration": 10.0} ) with pytest.raises(jams.SchemaError): assert jam3.validate() @@ -1033,25 +1033,25 @@ def test_events(): utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array(['event A', 'event B']), + np.array(["event A", "event B"]), ), - 'I am a description', + "I am a description", ) ] event_data2 = [ ( utils.EventData( - np.array([0.2, 0.3]), np.array([0.4, 0.5]), np.array([2, 'event B']) + np.array([0.2, 0.3]), np.array([0.4, 0.5]), np.array([2, "event B"]) ), - 'events 1', + "events 1", ), ( utils.EventData( np.array([0.2, 0.3]), np.array([0.3, 0.4]), - np.array([{'a': 1, 2: 'b'}, 'a great label']), + np.array([{"a": 1, 2: "b"}, "a great label"]), ), - 'events 2', + "events 2", ), ] event_data3 = [ @@ -1059,24 +1059,24 @@ def test_events(): utils.EventData( np.array([20, 30]), # invalid because > duration np.array([0.3, 0.4]), - np.array([{'a': 1, 2: 'b'}, 'a great label']), + np.array([{"a": 1, 2: "b"}, "a great label"]), ), - 'asdf', + "asdf", ) ] - event_data4 = ('jazz', 'wrong format') - event_data5 = ['wrong format too'] - event_data6 = [('wrong', 'description')] + event_data4 = ("jazz", "wrong format") + event_data5 = ["wrong format too"] + event_data6 = [("wrong", "description")] jam1 = jams_utils.jams_converter( - event_data=event_data1, metadata={'duration': 10.0} + event_data=event_data1, metadata={"duration": 10.0} ) assert jam1.validate() jam2 = jams_utils.jams_converter( - event_data=event_data2, metadata={'duration': 10.0} + event_data=event_data2, metadata={"duration": 10.0} ) assert jam2.validate() jam3 = jams_utils.jams_converter( - event_data=event_data3, metadata={'duration': 10.0} + event_data=event_data3, metadata={"duration": 10.0} ) with pytest.raises(jams.SchemaError): assert jam3.validate() @@ -1090,67 +1090,67 @@ def test_events(): def test_metadata(): metadata_1 = { - 'duration': 1.5, - 'artist': 
'Meatloaf', - 'title': 'Le ciel est blue', - 'favourite_color': 'rainbow', + "duration": 1.5, + "artist": "Meatloaf", + "title": "Le ciel est blue", + "favourite_color": "rainbow", } jam_1 = jams_utils.jams_converter(lyrics_data=[(None, None)], metadata=metadata_1) - assert jam_1['file_metadata']['title'] == 'Le ciel est blue' - assert jam_1['file_metadata']['artist'] == 'Meatloaf' - assert jam_1['file_metadata']['duration'] == 1.5 - assert jam_1['sandbox']['favourite_color'] == 'rainbow' + assert jam_1["file_metadata"]["title"] == "Le ciel est blue" + assert jam_1["file_metadata"]["artist"] == "Meatloaf" + assert jam_1["file_metadata"]["duration"] == 1.5 + assert jam_1["sandbox"]["favourite_color"] == "rainbow" # test meatadata value None metadata_2 = { - 'duration': 1.5, - 'artist': 'breakmaster cylinder', - 'title': None, - 'extra': None, + "duration": 1.5, + "artist": "breakmaster cylinder", + "title": None, + "extra": None, } jam2 = jams_utils.jams_converter(metadata=metadata_2) assert jam2.validate() - assert jam2['file_metadata']['duration'] == 1.5 - assert jam2['file_metadata']['artist'] == 'breakmaster cylinder' - assert jam2['file_metadata']['title'] == '' - assert 'extra' not in jam2['sandbox'] + assert jam2["file_metadata"]["duration"] == 1.5 + assert jam2["file_metadata"]["artist"] == "breakmaster cylinder" + assert jam2["file_metadata"]["title"] == "" + assert "extra" not in jam2["sandbox"] def test_duration(): # duration from audio file jam = jams_utils.jams_converter( - audio_path='tests/resources/mir_datasets/iKala/Wavfile/10161_chorus.wav' + audio_path="tests/resources/mir_datasets/ikala/Wavfile/10161_chorus.wav" ) assert jam.file_metadata.duration == 2.0 assert jam.validate() # test invalid file path with pytest.raises(OSError): - jams_utils.jams_converter(audio_path='i/dont/exist') + jams_utils.jams_converter(audio_path="i/dont/exist") - jam1 = jams_utils.jams_converter(metadata={'duration': 4}) + jam1 = 
jams_utils.jams_converter(metadata={"duration": 4}) assert jam1.file_metadata.duration == 4.0 assert jam1.validate() # test incomplete metadata - jam2 = jams_utils.jams_converter(metadata={'artist': 'b'}) + jam2 = jams_utils.jams_converter(metadata={"artist": "b"}) with pytest.raises(jams_utils.jams.SchemaError): jam2.validate() # test metadata duration and audio file equal jam3 = jams_utils.jams_converter( - audio_path='tests/resources/mir_datasets/iKala/Wavfile/10161_chorus.wav', - metadata={'duration': 2}, + audio_path="tests/resources/mir_datasets/ikala/Wavfile/10161_chorus.wav", + metadata={"duration": 2}, ) assert jam3.file_metadata.duration == 2 assert jam3.validate() # test metadata and duration not equal jam4 = jams_utils.jams_converter( - audio_path='tests/resources/mir_datasets/iKala/Wavfile/10161_chorus.wav', - metadata={'duration': 1000}, + audio_path="tests/resources/mir_datasets/ikala/Wavfile/10161_chorus.wav", + metadata={"duration": 1000}, ) assert jam4.file_metadata.duration == 1000 assert jam4.validate() diff --git a/tests/test_loaders.py b/tests/test_loaders.py index 45f9dcf8e..9796d0cb0 100644 --- a/tests/test_loaders.py +++ b/tests/test_loaders.py @@ -5,34 +5,92 @@ from inspect import signature import io import os -import requests import sys import pytest +import requests + import mirdata -from mirdata import track +from mirdata import core from tests.test_utils import DEFAULT_DATA_HOME -DATASETS = [importlib.import_module("mirdata.{}".format(d)) for d in mirdata.__all__] +DATASETS = mirdata.DATASETS CUSTOM_TEST_TRACKS = { - 'beatles': '0111', - 'giantsteps_key': '3', - 'dali': '4b196e6c99574dd49ad00d56e132712b', - 'giantsteps_tempo': '113', - 'guitarset': '03_BN3-119-G_solo', - 'medley_solos_db': 'd07b1fc0-567d-52c2-fef4-239f31c9d40e', - 'medleydb_melody': 'MusicDelta_Beethoven', - 'mridangam_stroke': '224030', - 'rwc_classical': 'RM-C003', - 'rwc_jazz': 'RM-J004', - 'rwc_popular': 'RM-P001', - 'salami': '2', - 'tinysol': 
'Fl-ord-C4-mf-N-T14d', + "beatles": "0111", + "giantsteps_key": "3", + "dali": "4b196e6c99574dd49ad00d56e132712b", + "giantsteps_tempo": "113", + "guitarset": "03_BN3-119-G_solo", + "medley_solos_db": "d07b1fc0-567d-52c2-fef4-239f31c9d40e", + "medleydb_melody": "MusicDelta_Beethoven", + "mridangam_stroke": "224030", + "rwc_classical": "RM-C003", + "rwc_jazz": "RM-J004", + "rwc_popular": "RM-P001", + "salami": "2", + "tinysol": "Fl-ord-C4-mf-N-T14d", } +def test_dataset_attributes(): + for dataset_name in DATASETS: + dataset = mirdata.Dataset(dataset_name) + assert ( + dataset.name == dataset_name + ), "{}.dataset attribute does not match dataset name".format(dataset_name) + assert ( + dataset.bibtex is not None + ), "No BIBTEX information provided for {}".format(dataset_name) + assert ( + isinstance(dataset._remotes, dict) or dataset._remotes is None + ), "{}.REMOTES must be a dictionary".format(dataset_name) + assert isinstance(dataset._index, dict), "{}.DATA is not properly set".format( + dataset_name + ) + assert ( + isinstance(dataset._download_info, str) or dataset._download_info is None + ), "{}.DOWNLOAD_INFO must be a string".format(dataset_name) + assert type(dataset._track_object) == type( + core.Track + ), "{}.Track must be an instance of core.Track".format(dataset_name) + assert callable(dataset._download_fn), "{}._download is not a function".format( + dataset_name + ) + assert dataset.readme != "", "{} has no module readme".format(dataset_name) + + +def test_forward_compatibility(): + for dataset_name in DATASETS: + dataset_module = importlib.import_module( + "mirdata.datasets.{}".format(dataset_name) + ) + assert not hasattr( + dataset_module, "validate" + ), "{}: loaders no longer need validate methods".format(dataset_name) + assert not hasattr(dataset_module, "download"), ( + "{}: loaders no longer need download methods. 
" + + "If you want to specify a custom download function, call it _download" + ).format(dataset_name) + assert not hasattr( + dataset_module, "track_ids" + ), "{}: loaders no longer need track_ids methods".format(dataset_name) + assert not hasattr( + dataset_module, "load" + ), "{}: loaders no longer need load methods".format(dataset_name) + assert not hasattr( + dataset_module, "DATASET_DIR" + ), "{}: loaders no longer need to define DATASET_DIR".format(dataset_name) + + if hasattr(dataset_module, "Track"): + track_params = signature(dataset_module.Track).parameters + assert ( + track_params["data_home"].default == inspect._empty + ), "{}.Track should no longer take default arguments".format(dataset_name) + + def test_cite(): - for dataset in DATASETS: + for dataset_name in DATASETS: + dataset = mirdata.Dataset(dataset_name) text_trap = io.StringIO() sys.stdout = text_trap dataset.cite() @@ -44,105 +102,30 @@ def test_cite(): def test_download(mocker): - for dataset in DATASETS: - dataset_name = dataset.__name__.split(".")[1] + for dataset_name in DATASETS: + print(dataset_name) + dataset = mirdata.Dataset(dataset_name) # test parameters & defaults - assert hasattr(dataset, "download"), "{} has no download method".format( - dataset_name - ) - assert hasattr( - dataset.download, "__call__" - ), "{}.download is not callable".format(dataset_name) - params = signature(dataset.download).parameters - assert ( - "data_home" in params - ), "data_home must be an argument of {}.download".format(dataset_name) - assert ( - params["data_home"].default is None - ), "the default value of data_Home in {}.download should be None".format( + assert callable(dataset._download_fn), "{}.download is not callable".format( dataset_name ) - - # if there are no remotes, make sure partial_download, - # force_overwrite, and cleanup are not parameters - if not hasattr(dataset, "REMOTES"): - assert ( - "partial_download" not in params - ), "{} has no REMOTES, so its download method does not 
need a partial_download argument".format( - dataset_name - ) - assert ( - "force_overwrite" not in params - ), "{} has no REMOTES so its download method does not need a force_overwrite argument".format( - dataset_name - ) - assert ( - "cleanup" not in params - ), "{} has no REMOTES so its download method does not need a cleanup argument".format( - dataset_name - ) - # if there are remotes, make sure force_overwrite is specified and - # the default is False - else: - assert ( - "force_overwrite" in params - ), "{} has REMOTES, so its download method must have a force_overwrite parameter".format( - dataset_name - ) - assert ( - params["force_overwrite"].default is False - ), "the force_overwrite parameter of {}.download must default to False".format( - dataset_name - ) - - # if there are remotes but only one item, make sure partial_download - # is not a parameter - if len(dataset.REMOTES) == 1: - assert ( - "partial_download" not in params - ), "{}.REMOTES has only one item, so its download method does not need a partial_download argument".format( - dataset_name - ) - # if there is more than one item in remotes, make sure partial_download - # is a parameter and the default is None - else: - assert ( - "partial_download" in params - ), "{}.REMOTES has multiple downloads, so its download method should have a partial_download argument".format( - dataset_name - ) - assert ( - params["partial_download"].default is None - ), "the default argument of partial_download in {}.download should be None" - - extensions = [ - os.path.splitext(r.filename)[-1] for r in dataset.REMOTES.values() - ] - # if there are any zip or tar files to download, make sure cleanup - # is a parameter and its default is True - if any([e == ".zip" or e == ".gz" for e in extensions]): - assert ( - "cleanup" in params - ), "{}.REMOTES contains zip or tar files, so its download method should have a cleanup argument".format( - dataset_name - ) - assert ( - params["cleanup"].default is True - ), "the 
default value for cleanup in {}.download should be True".format( - dataset_name - ) - # if there are no zip or tar files, make sure cleanup is not a parameter - else: - assert ( - "cleanup" not in params - ), "there are no zip or tar files in {}.REMOTES so its download method does not need a cleanup argument".format( - dataset_name - ) + params = signature(dataset._download_fn).parameters + expected_params = [ + "save_dir", + "remotes", + "partial_download", + "info_message", + "force_overwrite", + "cleanup", + ] + assert set(params) == set( + expected_params + ), "{}.download must have parameters {}".format(dataset_name, expected_params) # check that the download method can be called without errors - if hasattr(dataset, "REMOTES"): - mock_downloader = mocker.patch.object(dataset, "REMOTES") + if dataset._remotes != {}: + mock_downloader = mocker.patch.object(dataset, "_remotes") if dataset_name not in DOWNLOAD_EXCEPTIONS: try: dataset.download() @@ -152,12 +135,12 @@ def test_download(mocker): mocker.resetall() # check that links are online - for key in dataset.REMOTES: + for key in dataset._remotes: # skip this test if it's in known issues if dataset_name in KNOWN_ISSUES and key in KNOWN_ISSUES[dataset_name]: continue - url = dataset.REMOTES[key].url + url = dataset._remotes[key].url try: request = requests.head(url) assert request.ok, "Link {} for {} does not return OK".format( @@ -179,30 +162,33 @@ def test_download(mocker): # This is magically skipped by the the remote fixture `skip_local` in conftest.py # when tests are run with the --local flag def test_validate(skip_local): - for dataset in DATASETS: - dataset_name = dataset.__name__.split(".")[1] - data_home = os.path.join("tests/resources/mir_datasets", dataset.DATASET_DIR) + for dataset_name in DATASETS: + data_home = os.path.join("tests/resources/mir_datasets", dataset_name) + dataset = mirdata.Dataset(dataset_name, data_home=data_home) try: - dataset.validate(data_home=data_home) + dataset.validate() 
except: assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) try: - dataset.validate(data_home=data_home, silence=True) + dataset.validate(verbose=False) except: assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) + dataset_default = mirdata.Dataset(dataset_name, data_home=None) try: - dataset.validate(data_home=None, silence=True) + dataset_default.validate(verbose=False) except: assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) def test_load_and_trackids(): - for dataset in DATASETS: - dataset_name = dataset.__name__.split(".")[1] + for dataset_name in DATASETS: + data_home = os.path.join("tests/resources/mir_datasets", dataset_name) + dataset = mirdata.Dataset(dataset_name, data_home=data_home) + dataset_default = mirdata.Dataset(dataset_name, data_home=None) try: - track_ids = dataset.track_ids() + track_ids = dataset.track_ids except: assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) @@ -211,71 +197,91 @@ def test_load_and_trackids(): ) trackid_len = len(track_ids) - data_home = os.path.join("tests/resources/mir_datasets", dataset.DATASET_DIR) - try: - dataset_data = dataset.load(data_home=data_home) - except: - assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) + # if the dataset has tracks, test the loaders + if dataset._track_object is not None: - assert type(dataset_data) is dict, "{}.load should return a dictionary".format( - dataset_name - ) - assert ( - len(dataset_data.keys()) == trackid_len - ), "the dictionary returned {}.load() does not have the same number of elements as {}.track_ids()".format( - dataset_name, dataset_name - ) + try: + choice_track = dataset.choice_track() + except: + assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) + assert isinstance( + choice_track, core.Track + ), "{}.choice_track must return an instance of type core.Track".format( + dataset_name + ) - try: - dataset_data_default = dataset.load() - except: - assert False, "{}: {}".format(dataset_name, 
sys.exc_info()[0]) + try: + dataset_data = dataset.load_tracks() + except: + assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) - assert ( - type(dataset_data_default) is dict - ), "{}.load should return a dictionary".format(dataset_name) - assert ( - len(dataset_data_default.keys()) == trackid_len - ), "the dictionary returned {}.load() does not have the same number of elements as {}.track_ids()".format( - dataset_name, dataset_name - ) + assert ( + type(dataset_data) is dict + ), "{}.load should return a dictionary".format(dataset_name) + assert ( + len(dataset_data.keys()) == trackid_len + ), "the dictionary returned {}.load() does not have the same number of elements as {}.track_ids()".format( + dataset_name, dataset_name + ) + + try: + dataset_data_default = dataset_default.load_tracks() + except: + assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) + + assert ( + type(dataset_data_default) is dict + ), "{}.load should return a dictionary".format(dataset_name) + assert ( + len(dataset_data_default.keys()) == trackid_len + ), "the dictionary returned {}.load() does not have the same number of elements as {}.track_ids()".format( + dataset_name, dataset_name + ) def test_track(): data_home_dir = "tests/resources/mir_datasets" - for dataset in DATASETS: + for dataset_name in DATASETS: - dataset_name = dataset.__name__.split(".")[1] + data_home = os.path.join(data_home_dir, dataset_name) + dataset = mirdata.Dataset(dataset_name, data_home=data_home) + dataset_default = mirdata.Dataset(dataset_name, data_home=None) + + # if the dataset doesn't have a track object, make sure it raises a value error + # and move on to the next dataset + if dataset._track_object is None: + with pytest.raises(NotImplementedError): + dataset.track("~faketrackid~?!") + continue if dataset_name in CUSTOM_TEST_TRACKS: trackid = CUSTOM_TEST_TRACKS[dataset_name] else: - trackid = dataset.track_ids()[0] + trackid = dataset.track_ids[0] try: - track_default = 
dataset.Track(trackid) + track_default = dataset_default.track(trackid) except: assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) assert track_default._data_home == os.path.join( - DEFAULT_DATA_HOME, dataset.DATASET_DIR + DEFAULT_DATA_HOME, dataset.name ), "{}: Track._data_home path is not set as expected".format(dataset_name) # test data home specified - data_home = os.path.join(data_home_dir, dataset.DATASET_DIR) try: - track_test = dataset.Track(trackid, data_home=data_home) + track_test = dataset.track(trackid) except: assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) assert isinstance( - track_test, track.Track - ), "{}.Track must be an instance of type track.Track".format(dataset_name) + track_test, core.Track + ), "{}.track must be an instance of type core.Track".format(dataset_name) assert hasattr( track_test, "to_jams" - ), "{}.Track must have a to_jams method".format(dataset_name) + ), "{}.track must have a to_jams method".format(dataset_name) # Validate JSON schema try: @@ -283,7 +289,7 @@ def test_track(): except: assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) - assert jam.validate(), "Jams validation failed for {}.Track({})".format( + assert jam.validate(), "Jams validation failed for {}.track({})".format( dataset_name, trackid ) @@ -297,16 +303,7 @@ def test_track(): assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) with pytest.raises(ValueError): - dataset.Track("~faketrackid~?!") - - try: - track_custom = dataset.Track(trackid, data_home="casa/de/data") - except: - assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) - - assert ( - track_custom._data_home == "casa/de/data" - ), "{}: Track._data_home path is not set as expected".format(dataset_name) + dataset.track("~faketrackid~?!") # for load_* functions which require more than one argument @@ -321,15 +318,18 @@ def test_track(): def test_load_methods(): - for dataset in DATASETS: - dataset_name = dataset.__name__.split(".")[1] - + 
for dataset_name in DATASETS: + dataset = mirdata.Dataset(dataset_name) all_methods = dir(dataset) load_methods = [ getattr(dataset, m) for m in all_methods if m.startswith("load_") ] for load_method in load_methods: method_name = load_method.__name__ + + # skip default methods + if method_name == "load_tracks": + continue params = [ p for p in signature(load_method).parameters.values() @@ -353,9 +353,8 @@ def test_load_methods(): def test_multitracks(): data_home_dir = "tests/resources/mir_datasets" - for dataset in DATASETS: - - dataset_name = dataset.__name__.split(".")[1] + for dataset_name in DATASETS: + dataset = mirdata.Dataset(dataset_name) # TODO this is currently an opt-in test. Make it an opt out test # once #265 is addressed @@ -371,15 +370,16 @@ def test_multitracks(): assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) # test data home specified - data_home = os.path.join(data_home_dir, dataset.DATASET_DIR) + data_home = os.path.join(data_home_dir, dataset_name) + dataset_specific = mirdata.Dataset(dataset_name, data_home=data_home) try: - mtrack_test = dataset.MultiTrack(mtrack_id, data_home=data_home) + mtrack_test = dataset_specific.MultiTrack(mtrack_id, data_home=data_home) except: assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) assert isinstance( - mtrack_test, track.MultiTrack - ), "{}.MultiTrack must be an instance of type track.MultiTrack".format( + mtrack_test, core.MultiTrack + ), "{}.MultiTrack must be an instance of type core.MultiTrack".format( dataset_name ) @@ -393,6 +393,6 @@ def test_multitracks(): except: assert False, "{}: {}".format(dataset_name, sys.exc_info()[0]) - assert jam.validate(), "Jams validation failed for {}.Track({})".format( + assert jam.validate(), "Jams validation failed for {}.MultiTrack({})".format( dataset_name, mtrack_id ) diff --git a/tests/test_maestro.py b/tests/test_maestro.py index 7315760fb..101fa8489 100644 --- a/tests/test_maestro.py +++ b/tests/test_maestro.py @@ -4,41 +4,42 @@ 
import pretty_midi import numpy as np -from mirdata import maestro, utils, download_utils +from mirdata.datasets import maestro +from mirdata import utils, download_utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = '2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1' - data_home = 'tests/resources/mir_datasets/MAESTRO' + default_trackid = "2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1" + data_home = "tests/resources/mir_datasets/maestro" track = maestro.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': '2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1', - 'midi_path': os.path.join( + "track_id": "2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1", + "midi_path": os.path.join( data_home, - '2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi', + "2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi", ), - 'audio_path': os.path.join( - data_home, '2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav' + "audio_path": os.path.join( + data_home, "2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav" ), - 'canonical_composer': 'Alban Berg', - 'canonical_title': 'Sonata Op. 1', - 'year': 2018, - 'duration': 698.661160312, - 'split': 'train', + "canonical_composer": "Alban Berg", + "canonical_title": "Sonata Op. 
1", + "year": 2018, + "duration": 698.661160312, + "split": "train", } - expected_property_types = {'notes': utils.NoteData, 'midi': pretty_midi.PrettyMIDI} + expected_property_types = {"notes": utils.NoteData, "midi": pretty_midi.PrettyMIDI} assert track._track_paths == { - 'audio': [ - '2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav', - '1694d8431f01eeb2a18444196550b99d', + "audio": [ + "2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav", + "1694d8431f01eeb2a18444196550b99d", ], - 'midi': [ - '2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi', - '4901b1578ee4fe8c1696e02f60924949', + "midi": [ + "2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi", + "4901b1578ee4fe8c1696e02f60924949", ], } @@ -52,8 +53,8 @@ def test_track(): def test_load_midi(): midi_file = ( - 'tests/resources/mir_datasets/MAESTRO/2018/' - + 'MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi' + "tests/resources/mir_datasets/maestro/2018/" + + "MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi" ) midi = maestro.load_midi(midi_file) assert len(midi.instruments) == 1 @@ -62,8 +63,8 @@ def test_load_midi(): def test_load_notes(): midi_file = ( - 'tests/resources/mir_datasets/MAESTRO/2018/' - + 'MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi' + "tests/resources/mir_datasets/maestro/2018/" + + "MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.midi" ) notes = maestro.load_notes(midi_file) expected_intervals = np.array([[0.98307292, 1.80989583], [1.78385417, 1.90625]]) @@ -73,11 +74,11 @@ def test_load_notes(): def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/MAESTRO' + data_home = "tests/resources/mir_datasets/maestro" metadata = maestro._load_metadata(data_home) - default_trackid = '2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1' + default_trackid = "2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1" - assert metadata['data_home'] == data_home + assert 
metadata["data_home"] == data_home assert metadata[default_trackid] == { "canonical_composer": "Alban Berg", "canonical_title": "Sonata Op. 1", @@ -87,87 +88,87 @@ def test_load_metadata(): "audio_filename": "2018/MIDI-Unprocessed_Chamber3_MID--AUDIO_10_R3_2018_wav--1.wav", "duration": 698.661160312, } - metadata_none = maestro._load_metadata('asdf/asdf') + metadata_none = maestro._load_metadata("asdf/asdf") assert metadata_none is None def test_download_partial(httpserver): - data_home = 'tests/resources/mir_datasets/MAESTRO_download' + data_home = "tests/resources/mir_datasets/maestro_download" if os.path.exists(data_home): shutil.rmtree(data_home) httpserver.serve_content( - open('tests/resources/download/maestro-v2.0.0.json', 'r').read() + open("tests/resources/download/maestro-v2.0.0.json", "r").read() ) - maestro.REMOTES = { - 'all': download_utils.RemoteFileMetadata( - filename='1-maestro-v2.0.0.json', + remotes = { + "all": download_utils.RemoteFileMetadata( + filename="1-maestro-v2.0.0.json", url=httpserver.url, - checksum=('d41d8cd98f00b204e9800998ecf8427e'), + checksum=("d41d8cd98f00b204e9800998ecf8427e"), destination_dir=None, ), - 'midi': download_utils.RemoteFileMetadata( - filename='2-maestro-v2.0.0.json', + "midi": download_utils.RemoteFileMetadata( + filename="2-maestro-v2.0.0.json", url=httpserver.url, - checksum=('d41d8cd98f00b204e9800998ecf8427e'), + checksum=("d41d8cd98f00b204e9800998ecf8427e"), destination_dir=None, ), - 'metadata': download_utils.RemoteFileMetadata( - filename='3-maestro-v2.0.0.json', + "metadata": download_utils.RemoteFileMetadata( + filename="3-maestro-v2.0.0.json", url=httpserver.url, - checksum=('d41d8cd98f00b204e9800998ecf8427e'), - destination_dir='maestro-v2.0.0', + checksum=("d41d8cd98f00b204e9800998ecf8427e"), + destination_dir="maestro-v2.0.0", ), } - maestro.download(data_home=data_home, partial_download=None) - assert os.path.exists(os.path.join(data_home, '1-maestro-v2.0.0.json')) - assert not 
os.path.exists(os.path.join(data_home, '2-maestro-v2.0.0.json')) - assert not os.path.exists(os.path.join(data_home, '3-maestro-v2.0.0.json')) + maestro._download(data_home, remotes, None, None, False, False) + assert os.path.exists(os.path.join(data_home, "1-maestro-v2.0.0.json")) + assert not os.path.exists(os.path.join(data_home, "2-maestro-v2.0.0.json")) + assert not os.path.exists(os.path.join(data_home, "3-maestro-v2.0.0.json")) if os.path.exists(data_home): shutil.rmtree(data_home) - maestro.download(data_home=data_home, partial_download=['all', 'midi']) - assert os.path.exists(os.path.join(data_home, '1-maestro-v2.0.0.json')) - assert not os.path.exists(os.path.join(data_home, '2-maestro-v2.0.0.json')) - assert not os.path.exists(os.path.join(data_home, '3-maestro-v2.0.0.json')) + maestro._download(data_home, remotes, ["all", "midi"], None, False, False) + assert os.path.exists(os.path.join(data_home, "1-maestro-v2.0.0.json")) + assert not os.path.exists(os.path.join(data_home, "2-maestro-v2.0.0.json")) + assert not os.path.exists(os.path.join(data_home, "3-maestro-v2.0.0.json")) if os.path.exists(data_home): shutil.rmtree(data_home) - maestro.download(data_home=data_home, partial_download=['metadata', 'midi']) - assert not os.path.exists(os.path.join(data_home, '1-maestro-v2.0.0.json')) - assert os.path.exists(os.path.join(data_home, '2-maestro-v2.0.0.json')) - assert not os.path.exists(os.path.join(data_home, '3-maestro-v2.0.0.json')) + maestro._download(data_home, remotes, ["metadata", "midi"], None, False, False) + assert not os.path.exists(os.path.join(data_home, "1-maestro-v2.0.0.json")) + assert os.path.exists(os.path.join(data_home, "2-maestro-v2.0.0.json")) + assert not os.path.exists(os.path.join(data_home, "3-maestro-v2.0.0.json")) if os.path.exists(data_home): shutil.rmtree(data_home) - maestro.download(data_home=data_home, partial_download=['metadata']) - assert not os.path.exists(os.path.join(data_home, '1-maestro-v2.0.0.json')) - assert not 
os.path.exists(os.path.join(data_home, '2-maestro-v2.0.0.json')) - assert os.path.exists(os.path.join(data_home, '3-maestro-v2.0.0.json')) + maestro._download(data_home, remotes, ["metadata"], None, False, False) + assert not os.path.exists(os.path.join(data_home, "1-maestro-v2.0.0.json")) + assert not os.path.exists(os.path.join(data_home, "2-maestro-v2.0.0.json")) + assert os.path.exists(os.path.join(data_home, "3-maestro-v2.0.0.json")) def test_download(httpserver): - data_home = 'tests/resources/mir_datasets/MAESTRO_download' + data_home = "tests/resources/mir_datasets/maestro_download" if os.path.exists(data_home): shutil.rmtree(data_home) # download the full dataset httpserver.serve_content( - open('tests/resources/download/maestro-v2.0.0.zip', 'rb').read() + open("tests/resources/download/maestro-v2.0.0.zip", "rb").read() ) - maestro.REMOTES = { - 'all': download_utils.RemoteFileMetadata( - filename='maestro-v2.0.0.zip', + remotes = { + "all": download_utils.RemoteFileMetadata( + filename="maestro-v2.0.0.zip", url=httpserver.url, - checksum=('625180ffa41cd9f2ab7252dd954b9e8a'), + checksum=("625180ffa41cd9f2ab7252dd954b9e8a"), destination_dir=None, ) } - maestro.download(data_home=data_home) + maestro._download(data_home, remotes, None, None, False, False) assert os.path.exists(data_home) - assert not os.path.exists(os.path.join(data_home, 'maestro-v2.0.0')) + assert not os.path.exists(os.path.join(data_home, "maestro-v2.0.0")) assert os.path.exists(os.path.join(data_home, "maestro-v2.0.0.json")) assert os.path.exists( @@ -188,21 +189,21 @@ def test_download(httpserver): # download the midi-only zip httpserver.serve_content( - open('tests/resources/download/maestro-v2.0.0-midi.zip', 'rb').read() + open("tests/resources/download/maestro-v2.0.0-midi.zip", "rb").read() ) - maestro.REMOTES = { - 'midi': download_utils.RemoteFileMetadata( - filename='maestro-v2.0.0-midi.zip', + remotes = { + "midi": download_utils.RemoteFileMetadata( + 
filename="maestro-v2.0.0-midi.zip", url=httpserver.url, - checksum=('c82283fff347ed2bd833693c09a9f01d'), + checksum=("c82283fff347ed2bd833693c09a9f01d"), destination_dir=None, ) } - maestro.download(data_home=data_home, partial_download=['midi']) + maestro._download(data_home, remotes, ["midi"], None, False, False) assert os.path.exists(data_home) - assert not os.path.exists(os.path.join(data_home, 'maestro-v2.0.0')) + assert not os.path.exists(os.path.join(data_home, "maestro-v2.0.0")) assert os.path.exists(os.path.join(data_home, "maestro-v2.0.0.json")) assert not os.path.exists( @@ -223,21 +224,21 @@ def test_download(httpserver): # download only the metadata httpserver.serve_content( - open('tests/resources/download/maestro-v2.0.0.json', 'rb').read() + open("tests/resources/download/maestro-v2.0.0.json", "rb").read() ) - maestro.REMOTES = { - 'metadata': download_utils.RemoteFileMetadata( - filename='maestro-v2.0.0.json', + remotes = { + "metadata": download_utils.RemoteFileMetadata( + filename="maestro-v2.0.0.json", url=httpserver.url, - checksum=('d41d8cd98f00b204e9800998ecf8427e'), + checksum=("d41d8cd98f00b204e9800998ecf8427e"), destination_dir=None, ) } - maestro.download(data_home=data_home, partial_download=['metadata']) + maestro._download(data_home, remotes, ["metadata"], None, False, False) assert os.path.exists(data_home) - assert not os.path.exists(os.path.join(data_home, 'maestro-v2.0.0')) + assert not os.path.exists(os.path.join(data_home, "maestro-v2.0.0")) assert os.path.exists(os.path.join(data_home, "maestro-v2.0.0.json")) assert not os.path.exists( diff --git a/tests/test_medley_solos_db.py b/tests/test_medley_solos_db.py index 1f5f100d8..ce71efe25 100644 --- a/tests/test_medley_solos_db.py +++ b/tests/test_medley_solos_db.py @@ -1,22 +1,22 @@ # -*- coding: utf-8 -*- -from mirdata import medley_solos_db +from mirdata.datasets import medley_solos_db from tests.test_utils import run_track_tests def test_track(): - default_trackid = 
'd07b1fc0-567d-52c2-fef4-239f31c9d40e' - data_home = 'tests/resources/mir_datasets/Medley-solos-DB' + default_trackid = "d07b1fc0-567d-52c2-fef4-239f31c9d40e" + data_home = "tests/resources/mir_datasets/medley_solos_db" track = medley_solos_db.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': 'd07b1fc0-567d-52c2-fef4-239f31c9d40e', - 'audio_path': 'tests/resources/mir_datasets/Medley-solos-DB/' - + 'audio/Medley-solos-DB_validation-3_d07b1fc0-567d-52c2-fef4-239f31c9d40e.wav', - 'instrument': 'flute', - 'instrument_id': 3, - 'song_id': 210, - 'subset': 'validation', + "track_id": "d07b1fc0-567d-52c2-fef4-239f31c9d40e", + "audio_path": "tests/resources/mir_datasets/medley_solos_db/" + + "audio/Medley-solos-DB_validation-3_d07b1fc0-567d-52c2-fef4-239f31c9d40e.wav", + "instrument": "flute", + "instrument_id": 3, + "song_id": 210, + "subset": "validation", } expected_property_types = {} @@ -30,13 +30,13 @@ def test_track(): def test_to_jams(): - data_home = 'tests/resources/mir_datasets/Medley-solos-DB' + data_home = "tests/resources/mir_datasets/medley_solos_db" track = medley_solos_db.Track( - 'd07b1fc0-567d-52c2-fef4-239f31c9d40e', data_home=data_home + "d07b1fc0-567d-52c2-fef4-239f31c9d40e", data_home=data_home ) jam = track.to_jams() - assert jam['sandbox']['instrument'] == 'flute' - assert jam['sandbox']['instrument_id'] == 3 - assert jam['sandbox']['song_id'] == 210 - assert jam['sandbox']['subset'] == 'validation' + assert jam["sandbox"]["instrument"] == "flute" + assert jam["sandbox"]["instrument_id"] == 3 + assert jam["sandbox"]["song_id"] == 210 + assert jam["sandbox"]["subset"] == "validation" diff --git a/tests/test_medleydb_melody.py b/tests/test_medleydb_melody.py index 0d3d47254..eb0893827 100644 --- a/tests/test_medleydb_melody.py +++ b/tests/test_medleydb_melody.py @@ -2,37 +2,38 @@ import numpy as np -from mirdata import medleydb_melody, utils +from mirdata.datasets import medleydb_melody +from mirdata import utils from 
tests.test_utils import run_track_tests def test_track(): - default_trackid = 'MusicDelta_Beethoven' - data_home = 'tests/resources/mir_datasets/MedleyDB-Melody' + default_trackid = "MusicDelta_Beethoven" + data_home = "tests/resources/mir_datasets/medleydb_melody" track = medleydb_melody.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': 'MusicDelta_Beethoven', - 'audio_path': 'tests/resources/mir_datasets/' - + 'MedleyDB-Melody/audio/MusicDelta_Beethoven_MIX.wav', - 'melody1_path': 'tests/resources/mir_datasets/' - + 'MedleyDB-Melody/melody1/MusicDelta_Beethoven_MELODY1.csv', - 'melody2_path': 'tests/resources/mir_datasets/' - + 'MedleyDB-Melody/melody2/MusicDelta_Beethoven_MELODY2.csv', - 'melody3_path': 'tests/resources/mir_datasets/' - + 'MedleyDB-Melody/melody3/MusicDelta_Beethoven_MELODY3.csv', - 'artist': 'MusicDelta', - 'title': 'Beethoven', - 'genre': 'Classical', - 'is_excerpt': True, - 'is_instrumental': True, - 'n_sources': 18, + "track_id": "MusicDelta_Beethoven", + "audio_path": "tests/resources/mir_datasets/" + + "medleydb_melody/audio/MusicDelta_Beethoven_MIX.wav", + "melody1_path": "tests/resources/mir_datasets/" + + "medleydb_melody/melody1/MusicDelta_Beethoven_MELODY1.csv", + "melody2_path": "tests/resources/mir_datasets/" + + "medleydb_melody/melody2/MusicDelta_Beethoven_MELODY2.csv", + "melody3_path": "tests/resources/mir_datasets/" + + "medleydb_melody/melody3/MusicDelta_Beethoven_MELODY3.csv", + "artist": "MusicDelta", + "title": "Beethoven", + "genre": "Classical", + "is_excerpt": True, + "is_instrumental": True, + "n_sources": 18, } expected_property_types = { - 'melody1': utils.F0Data, - 'melody2': utils.F0Data, - 'melody3': utils.MultipitchData, + "melody1": utils.F0Data, + "melody2": utils.F0Data, + "melody3": utils.MultipitchData, } run_track_tests(track, expected_attributes, expected_property_types) @@ -44,28 +45,28 @@ def test_track(): def test_to_jams(): - data_home = 
'tests/resources/mir_datasets/MedleyDB-Melody' - track = medleydb_melody.Track('MusicDelta_Beethoven', data_home=data_home) + data_home = "tests/resources/mir_datasets/medleydb_melody" + track = medleydb_melody.Track("MusicDelta_Beethoven", data_home=data_home) jam = track.to_jams() - f0s = jam.search(namespace='pitch_contour')[1]['data'] + f0s = jam.search(namespace="pitch_contour")[1]["data"] assert [f0.time for f0 in f0s] == [0.046439909297052155, 0.052244897959183675] assert [f0.duration for f0 in f0s] == [0.0, 0.0] assert [f0.value for f0 in f0s] == [ - {'frequency': 0.0, 'index': 0, 'voiced': False}, - {'frequency': 965.992, 'index': 0, 'voiced': True}, + {"frequency": 0.0, "index": 0, "voiced": False}, + {"frequency": 965.992, "index": 0, "voiced": True}, ] assert [f0.confidence for f0 in f0s] == [0.0, 1.0] - assert jam['file_metadata']['title'] == 'Beethoven' - assert jam['file_metadata']['artist'] == 'MusicDelta' + assert jam["file_metadata"]["title"] == "Beethoven" + assert jam["file_metadata"]["artist"] == "MusicDelta" def test_load_melody(): # load a file which exists melody_path = ( - 'tests/resources/mir_datasets/MedleyDB-Melody/' - + 'melody1/MusicDelta_Beethoven_MELODY1.csv' + "tests/resources/mir_datasets/medleydb_melody/" + + "melody1/MusicDelta_Beethoven_MELODY1.csv" ) melody_data = medleydb_melody.load_melody(melody_path) @@ -86,8 +87,8 @@ def test_load_melody(): def test_load_melody3(): # load a file which exists melody_path = ( - 'tests/resources/mir_datasets/MedleyDB-Melody/' - + 'melody3/MusicDelta_Beethoven_MELODY3.csv' + "tests/resources/mir_datasets/medleydb_melody/" + + "melody3/MusicDelta_Beethoven_MELODY3.csv" ) melody_data = medleydb_melody.load_melody3(melody_path) @@ -122,21 +123,21 @@ def test_load_melody3(): def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/MedleyDB-Melody' + data_home = "tests/resources/mir_datasets/medleydb_melody" metadata = medleydb_melody._load_metadata(data_home) - assert 
metadata['data_home'] == data_home - assert metadata['MusicDelta_Beethoven'] == { - 'audio_path': 'MedleyDB-Melody/audio/MusicDelta_Beethoven_MIX.wav', - 'melody1_path': 'MedleyDB-Melody/melody1/MusicDelta_Beethoven_MELODY1.csv', - 'melody2_path': 'MedleyDB-Melody/melody2/MusicDelta_Beethoven_MELODY2.csv', - 'melody3_path': 'MedleyDB-Melody/melody3/MusicDelta_Beethoven_MELODY3.csv', - 'artist': 'MusicDelta', - 'title': 'Beethoven', - 'genre': 'Classical', - 'is_excerpt': True, - 'is_instrumental': True, - 'n_sources': 18, + assert metadata["data_home"] == data_home + assert metadata["MusicDelta_Beethoven"] == { + "audio_path": "medleydb_melody/audio/MusicDelta_Beethoven_MIX.wav", + "melody1_path": "medleydb_melody/melody1/MusicDelta_Beethoven_MELODY1.csv", + "melody2_path": "medleydb_melody/melody2/MusicDelta_Beethoven_MELODY2.csv", + "melody3_path": "medleydb_melody/melody3/MusicDelta_Beethoven_MELODY3.csv", + "artist": "MusicDelta", + "title": "Beethoven", + "genre": "Classical", + "is_excerpt": True, + "is_instrumental": True, + "n_sources": 18, } - metadata_none = medleydb_melody._load_metadata('asdf/asdf') + metadata_none = medleydb_melody._load_metadata("asdf/asdf") assert metadata_none is None diff --git a/tests/test_medleydb_pitch.py b/tests/test_medleydb_pitch.py index 6baf6d8bb..7dbf2c42f 100644 --- a/tests/test_medleydb_pitch.py +++ b/tests/test_medleydb_pitch.py @@ -2,28 +2,29 @@ import numpy as np -from mirdata import medleydb_pitch, utils +from mirdata.datasets import medleydb_pitch +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = 'AClassicEducation_NightOwl_STEM_08' - data_home = 'tests/resources/mir_datasets/MedleyDB-Pitch' + default_trackid = "AClassicEducation_NightOwl_STEM_08" + data_home = "tests/resources/mir_datasets/medleydb_pitch" track = medleydb_pitch.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': 'AClassicEducation_NightOwl_STEM_08', - 
'audio_path': 'tests/resources/mir_datasets/' - + 'MedleyDB-Pitch/audio/AClassicEducation_NightOwl_STEM_08.wav', - 'pitch_path': 'tests/resources/mir_datasets/' - + 'MedleyDB-Pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv', - 'instrument': 'male singer', - 'artist': 'AClassicEducation', - 'title': 'NightOwl', - 'genre': 'Singer/Songwriter', + "track_id": "AClassicEducation_NightOwl_STEM_08", + "audio_path": "tests/resources/mir_datasets/" + + "medleydb_pitch/audio/AClassicEducation_NightOwl_STEM_08.wav", + "pitch_path": "tests/resources/mir_datasets/" + + "medleydb_pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv", + "instrument": "male singer", + "artist": "AClassicEducation", + "title": "NightOwl", + "genre": "Singer/Songwriter", } - expected_property_types = {'pitch': utils.F0Data} + expected_property_types = {"pitch": utils.F0Data} run_track_tests(track, expected_attributes, expected_property_types) @@ -34,30 +35,30 @@ def test_track(): def test_to_jams(): - data_home = 'tests/resources/mir_datasets/MedleyDB-Pitch' + data_home = "tests/resources/mir_datasets/medleydb_pitch" track = medleydb_pitch.Track( - 'AClassicEducation_NightOwl_STEM_08', data_home=data_home + "AClassicEducation_NightOwl_STEM_08", data_home=data_home ) jam = track.to_jams() - f0s = jam.search(namespace='pitch_contour')[0]['data'] + f0s = jam.search(namespace="pitch_contour")[0]["data"] assert [f0.time for f0 in f0s] == [0.06965986394557823, 0.07546485260770976] assert [f0.duration for f0 in f0s] == [0.0, 0.0] assert [f0.value for f0 in f0s] == [ - {'frequency': 0.0, 'index': 0, 'voiced': False}, - {'frequency': 191.877, 'index': 0, 'voiced': True}, + {"frequency": 0.0, "index": 0, "voiced": False}, + {"frequency": 191.877, "index": 0, "voiced": True}, ] assert [f0.confidence for f0 in f0s] == [0.0, 1.0] - assert jam['file_metadata']['title'] == 'NightOwl' - assert jam['file_metadata']['artist'] == 'AClassicEducation' + assert jam["file_metadata"]["title"] == "NightOwl" + assert 
jam["file_metadata"]["artist"] == "AClassicEducation" def test_load_pitch(): # load a file which exists pitch_path = ( - 'tests/resources/mir_datasets/MedleyDB-Pitch/' - + 'pitch/AClassicEducation_NightOwl_STEM_08.csv' + "tests/resources/mir_datasets/medleydb_pitch/" + + "pitch/AClassicEducation_NightOwl_STEM_08.csv" ) pitch_data = medleydb_pitch.load_pitch(pitch_path) @@ -76,14 +77,14 @@ def test_load_pitch(): def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/MedleyDB-Pitch' + data_home = "tests/resources/mir_datasets/medleydb_pitch" metadata = medleydb_pitch._load_metadata(data_home) - assert metadata['data_home'] == data_home - assert metadata['AClassicEducation_NightOwl_STEM_08'] == { - 'audio_path': 'MedleyDB-Pitch/audio/AClassicEducation_NightOwl_STEM_08.wav', - 'pitch_path': 'MedleyDB-Pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv', - 'instrument': 'male singer', - 'artist': 'AClassicEducation', - 'title': 'NightOwl', - 'genre': 'Singer/Songwriter', + assert metadata["data_home"] == data_home + assert metadata["AClassicEducation_NightOwl_STEM_08"] == { + "audio_path": "medleydb_pitch/audio/AClassicEducation_NightOwl_STEM_08.wav", + "pitch_path": "medleydb_pitch/pitch/AClassicEducation_NightOwl_STEM_08.csv", + "instrument": "male singer", + "artist": "AClassicEducation", + "title": "NightOwl", + "genre": "Singer/Songwriter", } diff --git a/tests/test_mridangam_stroke.py b/tests/test_mridangam_stroke.py index 4394cbb3e..10854a261 100644 --- a/tests/test_mridangam_stroke.py +++ b/tests/test_mridangam_stroke.py @@ -4,28 +4,20 @@ from tests.test_utils import run_track_tests -from mirdata import mridangam_stroke +from mirdata.datasets import mridangam_stroke from tests.test_utils import DEFAULT_DATA_HOME -def test_track_default_data_home(): - # test data home None - track_default = mridangam_stroke.Track("224030") - assert track_default._data_home == os.path.join( - DEFAULT_DATA_HOME, "Mridangam-Stroke" - ) - - def test_track(): 
default_trackid = "224030" - data_home = 'tests/resources/mir_datasets/Mridangam-Stroke' + data_home = "tests/resources/mir_datasets/mridangam_stroke" track = mridangam_stroke.Track(default_trackid, data_home=data_home) expected_attributes = { - 'audio_path': "tests/resources/mir_datasets/Mridangam-Stroke/mridangam_stroke_1.5/" + "audio_path": "tests/resources/mir_datasets/mridangam_stroke/mridangam_stroke_1.5/" + "B/224030__akshaylaya__bheem-b-001.wav", - 'track_id': "224030", - 'stroke_name': 'bheem', - 'tonic': 'B' + "track_id": "224030", + "stroke_name": "bheem", + "tonic": "B", } run_track_tests(track, expected_attributes, {}) @@ -37,7 +29,7 @@ def test_track(): def test_to_jams(): default_trackid = "224030" - data_home = 'tests/resources/mir_datasets/Mridangam-Stroke' + data_home = "tests/resources/mir_datasets/mridangam_stroke" track = mridangam_stroke.Track(default_trackid, data_home=data_home) jam = track.to_jams() @@ -47,9 +39,13 @@ def test_to_jams(): # Test the stroke parser parsed_stroke = jam.annotations["tag_open"][0].data[0].value assert parsed_stroke == "bheem" - assert parsed_stroke in mridangam_stroke.STROKE_DICT, "Stroke {} not in stroke dictionary".format(parsed_stroke) + assert ( + parsed_stroke in mridangam_stroke.STROKE_DICT + ), "Stroke {} not in stroke dictionary".format(parsed_stroke) # Test the tonic parser parsed_tonic = jam.sandbox.tonic assert parsed_tonic == "B" - assert parsed_tonic in mridangam_stroke.TONIC_DICT, "Stroke {} not in stroke dictionary".format(parsed_tonic) + assert ( + parsed_tonic in mridangam_stroke.TONIC_DICT + ), "Stroke {} not in stroke dictionary".format(parsed_tonic) diff --git a/tests/test_orchset.py b/tests/test_orchset.py index 668608197..59d833158 100644 --- a/tests/test_orchset.py +++ b/tests/test_orchset.py @@ -1,38 +1,39 @@ # -*- coding: utf-8 -*- - +import os, shutil import numpy as np -from mirdata import orchset, utils +from mirdata.datasets import orchset +from mirdata import utils, download_utils 
from tests.test_utils import run_track_tests def test_track(): - default_trackid = 'Beethoven-S3-I-ex1' - data_home = 'tests/resources/mir_datasets/Orchset' + default_trackid = "Beethoven-S3-I-ex1" + data_home = "tests/resources/mir_datasets/orchset" track = orchset.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': 'Beethoven-S3-I-ex1', - 'audio_path_mono': 'tests/resources/mir_datasets/Orchset/' - + 'audio/mono/Beethoven-S3-I-ex1.wav', - 'audio_path_stereo': 'tests/resources/mir_datasets/Orchset/' - + 'audio/stereo/Beethoven-S3-I-ex1.wav', - 'melody_path': 'tests/resources/mir_datasets/Orchset/' - + 'GT/Beethoven-S3-I-ex1.mel', - 'composer': 'Beethoven', - 'work': 'S3-I', - 'excerpt': '1', - 'predominant_melodic_instruments': ['strings', 'winds'], - 'alternating_melody': True, - 'contains_winds': True, - 'contains_strings': True, - 'contains_brass': False, - 'only_strings': False, - 'only_winds': False, - 'only_brass': False, + "track_id": "Beethoven-S3-I-ex1", + "audio_path_mono": "tests/resources/mir_datasets/orchset/" + + "audio/mono/Beethoven-S3-I-ex1.wav", + "audio_path_stereo": "tests/resources/mir_datasets/orchset/" + + "audio/stereo/Beethoven-S3-I-ex1.wav", + "melody_path": "tests/resources/mir_datasets/orchset/" + + "GT/Beethoven-S3-I-ex1.mel", + "composer": "Beethoven", + "work": "S3-I", + "excerpt": "1", + "predominant_melodic_instruments": ["strings", "winds"], + "alternating_melody": True, + "contains_winds": True, + "contains_strings": True, + "contains_brass": False, + "only_strings": False, + "only_winds": False, + "only_brass": False, } - expected_property_types = {'melody': utils.F0Data} + expected_property_types = {"melody": utils.F0Data} run_track_tests(track, expected_attributes, expected_property_types) @@ -47,26 +48,26 @@ def test_track(): def test_to_jams(): - data_home = 'tests/resources/mir_datasets/Orchset' - track = orchset.Track('Beethoven-S3-I-ex1', data_home=data_home) + data_home = 
"tests/resources/mir_datasets/orchset" + track = orchset.Track("Beethoven-S3-I-ex1", data_home=data_home) jam = track.to_jams() - f0s = jam.search(namespace='pitch_contour')[0]['data'] + f0s = jam.search(namespace="pitch_contour")[0]["data"] assert [f0.time for f0 in f0s] == [0.0, 0.08, 0.09] assert [f0.duration for f0 in f0s] == [0.0, 0.0, 0.0] assert [f0.value for f0 in f0s] == [ - {'frequency': 0.0, 'index': 0, 'voiced': False}, - {'frequency': 0.0, 'index': 0, 'voiced': False}, - {'frequency': 622.254, 'index': 0, 'voiced': True}, + {"frequency": 0.0, "index": 0, "voiced": False}, + {"frequency": 0.0, "index": 0, "voiced": False}, + {"frequency": 622.254, "index": 0, "voiced": True}, ] assert [f0.confidence for f0 in f0s] == [0.0, 0.0, 1.0] - assert jam['sandbox']['alternating_melody'] == True + assert jam["sandbox"]["alternating_melody"] == True def test_load_melody(): # load a file which exists - melody_path = 'tests/resources/mir_datasets/Orchset/GT/Beethoven-S3-I-ex1.mel' + melody_path = "tests/resources/mir_datasets/orchset/GT/Beethoven-S3-I-ex1.mel" melody_data = orchset.load_melody(melody_path) # check types @@ -82,79 +83,114 @@ def test_load_melody(): def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/Orchset' + data_home = "tests/resources/mir_datasets/orchset" metadata = orchset._load_metadata(data_home) - assert metadata['data_home'] == data_home - assert metadata['Beethoven-S3-I-ex1'] == { - 'predominant_melodic_instruments-raw': 'strings+winds', - 'predominant_melodic_instruments-normalized': ['strings', 'winds'], - 'alternating_melody': True, - 'contains_winds': True, - 'contains_strings': True, - 'contains_brass': False, - 'only_strings': False, - 'only_winds': False, - 'only_brass': False, - 'composer': 'Beethoven', - 'work': 'S3-I', - 'excerpt': '1', + assert metadata["data_home"] == data_home + assert metadata["Beethoven-S3-I-ex1"] == { + "predominant_melodic_instruments-raw": "strings+winds", + 
"predominant_melodic_instruments-normalized": ["strings", "winds"], + "alternating_melody": True, + "contains_winds": True, + "contains_strings": True, + "contains_brass": False, + "only_strings": False, + "only_winds": False, + "only_brass": False, + "composer": "Beethoven", + "work": "S3-I", + "excerpt": "1", } - assert metadata['Haydn-S94-Menuet-ex1'] == { - 'predominant_melodic_instruments-raw': 'string+winds', - 'predominant_melodic_instruments-normalized': ['strings', 'winds'], - 'alternating_melody': True, - 'contains_winds': True, - 'contains_strings': True, - 'contains_brass': False, - 'only_strings': False, - 'only_winds': False, - 'only_brass': False, - 'composer': 'Haydn', - 'work': 'S94-Menuet', - 'excerpt': '1', + assert metadata["Haydn-S94-Menuet-ex1"] == { + "predominant_melodic_instruments-raw": "string+winds", + "predominant_melodic_instruments-normalized": ["strings", "winds"], + "alternating_melody": True, + "contains_winds": True, + "contains_strings": True, + "contains_brass": False, + "only_strings": False, + "only_winds": False, + "only_brass": False, + "composer": "Haydn", + "work": "S94-Menuet", + "excerpt": "1", } - assert metadata['Musorgski-Ravel-PicturesExhibition-Promenade1-ex2'] == { - 'predominant_melodic_instruments-raw': 'strings', - 'predominant_melodic_instruments-normalized': ['strings'], - 'alternating_melody': False, - 'contains_winds': True, - 'contains_strings': False, - 'contains_brass': False, - 'only_strings': True, - 'only_winds': False, - 'only_brass': False, - 'composer': 'Musorgski-Ravel', - 'work': 'PicturesExhibition-Promenade1', - 'excerpt': '2', + assert metadata["Musorgski-Ravel-PicturesExhibition-Promenade1-ex2"] == { + "predominant_melodic_instruments-raw": "strings", + "predominant_melodic_instruments-normalized": ["strings"], + "alternating_melody": False, + "contains_winds": True, + "contains_strings": False, + "contains_brass": False, + "only_strings": True, + "only_winds": False, + "only_brass": False, + 
"composer": "Musorgski-Ravel", + "work": "PicturesExhibition-Promenade1", + "excerpt": "2", } - assert metadata['Rimski-Korsakov-Scheherazade-YoungPrincePrincess-ex4'] == { - 'predominant_melodic_instruments-raw': 'strings+winds', - 'predominant_melodic_instruments-normalized': ['strings', 'winds'], - 'alternating_melody': True, - 'contains_winds': True, - 'contains_strings': True, - 'contains_brass': False, - 'only_strings': False, - 'only_winds': False, - 'only_brass': False, - 'composer': 'Rimski-Korsakov', - 'work': 'Scheherazade-YoungPrincePrincess', - 'excerpt': '4', + assert metadata["Rimski-Korsakov-Scheherazade-YoungPrincePrincess-ex4"] == { + "predominant_melodic_instruments-raw": "strings+winds", + "predominant_melodic_instruments-normalized": ["strings", "winds"], + "alternating_melody": True, + "contains_winds": True, + "contains_strings": True, + "contains_brass": False, + "only_strings": False, + "only_winds": False, + "only_brass": False, + "composer": "Rimski-Korsakov", + "work": "Scheherazade-YoungPrincePrincess", + "excerpt": "4", } - assert metadata['Schubert-S8-II-ex2'] == { - 'predominant_melodic_instruments-raw': 'winds (solo)', - 'predominant_melodic_instruments-normalized': ['winds'], - 'alternating_melody': False, - 'contains_winds': False, - 'contains_strings': True, - 'contains_brass': False, - 'only_strings': False, - 'only_winds': True, - 'only_brass': False, - 'composer': 'Schubert', - 'work': 'S8-II', - 'excerpt': '2', + assert metadata["Schubert-S8-II-ex2"] == { + "predominant_melodic_instruments-raw": "winds (solo)", + "predominant_melodic_instruments-normalized": ["winds"], + "alternating_melody": False, + "contains_winds": False, + "contains_strings": True, + "contains_brass": False, + "only_strings": False, + "only_winds": True, + "only_brass": False, + "composer": "Schubert", + "work": "S8-II", + "excerpt": "2", } - metadata_none = orchset._load_metadata('asdf/asdf') + metadata_none = orchset._load_metadata("asdf/asdf") assert 
metadata_none is None + + +def test_download(httpserver): + data_home = "tests/resources/mir_datasets/orchset_download" + if os.path.exists(data_home): + shutil.rmtree(data_home) + + httpserver.serve_content( + open("tests/resources/download/Orchset_dataset_0.zip", "rb").read() + ) + + remotes = { + "all": download_utils.RemoteFileMetadata( + filename="Orchset_dataset_0.zip", + url=httpserver.url, + checksum=("4794bc3514f7e8d1727f0d975d6d1ee2"), + destination_dir=None, + ) + } + orchset._download(data_home, remotes, None, None, False, True) + + assert os.path.exists(data_home) + assert not os.path.exists(os.path.join(data_home, "Orchset")) + + assert os.path.exists(os.path.join(data_home, "README.txt")) + assert os.path.exists( + os.path.join(data_home, "Orchset - Predominant Melodic Instruments.csv") + ) + track = orchset.Track("Beethoven-S3-I-ex1", data_home=data_home) + assert os.path.exists(track.audio_path_mono) + assert os.path.exists(track.audio_path_stereo) + assert os.path.exists(track.melody_path) + + if os.path.exists(data_home): + shutil.rmtree(data_home) diff --git a/tests/test_rwc_classical.py b/tests/test_rwc_classical.py index cfe6aff21..190794fd5 100644 --- a/tests/test_rwc_classical.py +++ b/tests/test_rwc_classical.py @@ -2,34 +2,35 @@ import numpy as np -from mirdata import rwc_classical, utils +from mirdata.datasets import rwc_classical +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = 'RM-C003' - data_home = 'tests/resources/mir_datasets/RWC-Classical' + default_trackid = "RM-C003" + data_home = "tests/resources/mir_datasets/rwc_classical" track = rwc_classical.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': 'RM-C003', - 'audio_path': 'tests/resources/mir_datasets/RWC-Classical/' - + 'audio/rwc-c-m01/3.wav', - 'sections_path': 'tests/resources/mir_datasets/RWC-Classical/' - + 'annotations/AIST.RWC-MDB-C-2001.CHORUS/RM-C003.CHORUS.TXT', - 'beats_path': 
'tests/resources/mir_datasets/RWC-Classical/' - + 'annotations/AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT', - 'piece_number': 'No. 3', - 'suffix': 'M01', - 'track_number': 'Tr. 03', - 'title': 'Symphony no.5 in C minor, op.67. 1st mvmt.', - 'composer': 'Beethoven, Ludwig van', - 'artist': 'Tokyo City Philharmonic Orchestra', - 'duration': 435, - 'category': 'Symphony', + "track_id": "RM-C003", + "audio_path": "tests/resources/mir_datasets/rwc_classical/" + + "audio/rwc-c-m01/3.wav", + "sections_path": "tests/resources/mir_datasets/rwc_classical/" + + "annotations/AIST.RWC-MDB-C-2001.CHORUS/RM-C003.CHORUS.TXT", + "beats_path": "tests/resources/mir_datasets/rwc_classical/" + + "annotations/AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT", + "piece_number": "No. 3", + "suffix": "M01", + "track_number": "Tr. 03", + "title": "Symphony no.5 in C minor, op.67. 1st mvmt.", + "composer": "Beethoven, Ludwig van", + "artist": "Tokyo City Philharmonic Orchestra", + "duration": 435, + "category": "Symphony", } - expected_property_types = {'beats': utils.BeatData, 'sections': utils.SectionData} + expected_property_types = {"beats": utils.BeatData, "sections": utils.SectionData} run_track_tests(track, expected_attributes, expected_property_types) @@ -41,11 +42,11 @@ def test_track(): def test_to_jams(): - data_home = 'tests/resources/mir_datasets/RWC-Classical' - track = rwc_classical.Track('RM-C003', data_home=data_home) + data_home = "tests/resources/mir_datasets/rwc_classical" + track = rwc_classical.Track("RM-C003", data_home=data_home) jam = track.to_jams() - beats = jam.search(namespace='beat')[0]['data'] + beats = jam.search(namespace="beat")[0]["data"] assert [beat.time for beat in beats] == [ 1.65, 2.58, @@ -69,21 +70,21 @@ def test_to_jams(): None, ] - segments = jam.search(namespace='segment')[0]['data'] + segments = jam.search(namespace="segment")[0]["data"] assert [segment.time for segment in segments] == [0.29, 419.96] assert [segment.duration for segment in segments] == 
[45.85, 13.75] - assert [segment.value for segment in segments] == ['chorus A', 'ending'] + assert [segment.value for segment in segments] == ["chorus A", "ending"] assert [segment.confidence for segment in segments] == [None, None] - assert jam['file_metadata']['title'] == 'Symphony no.5 in C minor, op.67. 1st mvmt.' - assert jam['file_metadata']['artist'] == 'Tokyo City Philharmonic Orchestra' + assert jam["file_metadata"]["title"] == "Symphony no.5 in C minor, op.67. 1st mvmt." + assert jam["file_metadata"]["artist"] == "Tokyo City Philharmonic Orchestra" def test_load_sections(): # load a file which exists section_path = ( - 'tests/resources/mir_datasets/RWC-Classical/' - + 'annotations/AIST.RWC-MDB-C-2001.CHORUS/RM-C003.CHORUS.TXT' + "tests/resources/mir_datasets/rwc_classical/" + + "annotations/AIST.RWC-MDB-C-2001.CHORUS/RM-C003.CHORUS.TXT" ) section_data = rwc_classical.load_sections(section_path) @@ -95,7 +96,7 @@ def test_load_sections(): # check values assert np.array_equal(section_data.intervals[:, 0], np.array([0.29, 419.96])) assert np.array_equal(section_data.intervals[:, 1], np.array([46.14, 433.71])) - assert np.array_equal(section_data.labels, np.array(['chorus A', 'ending'])) + assert np.array_equal(section_data.labels, np.array(["chorus A", "ending"])) def test_position_in_bar(): @@ -152,8 +153,8 @@ def test_position_in_bar(): def test_load_beats(): beats_path = ( - 'tests/resources/mir_datasets/RWC-Classical/' - + 'annotations/AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT' + "tests/resources/mir_datasets/rwc_classical/" + + "annotations/AIST.RWC-MDB-C-2001.BEAT/RM-C003.BEAT.TXT" ) beat_data = rwc_classical.load_beats(beats_path) @@ -170,16 +171,16 @@ def test_load_beats(): def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/RWC-Classical' + data_home = "tests/resources/mir_datasets/rwc_classical" metadata = rwc_classical._load_metadata(data_home) - assert metadata['data_home'] == data_home - assert metadata['RM-C003'] == { - 
'piece_number': 'No. 3', - 'suffix': 'M01', - 'track_number': 'Tr. 03', - 'title': 'Symphony no.5 in C minor, op.67. 1st mvmt.', - 'composer': 'Beethoven, Ludwig van', - 'artist': 'Tokyo City Philharmonic Orchestra', - 'duration': 435, - 'category': 'Symphony', + assert metadata["data_home"] == data_home + assert metadata["RM-C003"] == { + "piece_number": "No. 3", + "suffix": "M01", + "track_number": "Tr. 03", + "title": "Symphony no.5 in C minor, op.67. 1st mvmt.", + "composer": "Beethoven, Ludwig van", + "artist": "Tokyo City Philharmonic Orchestra", + "duration": 435, + "category": "Symphony", } diff --git a/tests/test_rwc_jazz.py b/tests/test_rwc_jazz.py index d535f9cd4..63bf7d764 100644 --- a/tests/test_rwc_jazz.py +++ b/tests/test_rwc_jazz.py @@ -1,34 +1,35 @@ # -*- coding: utf-8 -*- -from mirdata import rwc_jazz, utils +from mirdata.datasets import rwc_jazz +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = 'RM-J004' - data_home = 'tests/resources/mir_datasets/RWC-Jazz' + default_trackid = "RM-J004" + data_home = "tests/resources/mir_datasets/rwc_jazz" track = rwc_jazz.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': 'RM-J004', - 'audio_path': 'tests/resources/mir_datasets/RWC-Jazz/' - + 'audio/rwc-j-m01/4.wav', - 'sections_path': 'tests/resources/mir_datasets/RWC-Jazz/' - + 'annotations/AIST.RWC-MDB-J-2001.CHORUS/RM-J004.CHORUS.TXT', - 'beats_path': 'tests/resources/mir_datasets/RWC-Jazz/' - + 'annotations/AIST.RWC-MDB-J-2001.BEAT/RM-J004.BEAT.TXT', - 'piece_number': 'No. 4', - 'suffix': 'M01', - 'track_number': 'Tr. 
04', - 'title': 'Crescent Serenade (Piano Solo)', - 'artist': 'Makoto Nakamura', - 'duration': 167, - 'variation': 'Instrumentation 1', - 'instruments': 'Pf', + "track_id": "RM-J004", + "audio_path": "tests/resources/mir_datasets/rwc_jazz/" + + "audio/rwc-j-m01/4.wav", + "sections_path": "tests/resources/mir_datasets/rwc_jazz/" + + "annotations/AIST.RWC-MDB-J-2001.CHORUS/RM-J004.CHORUS.TXT", + "beats_path": "tests/resources/mir_datasets/rwc_jazz/" + + "annotations/AIST.RWC-MDB-J-2001.BEAT/RM-J004.BEAT.TXT", + "piece_number": "No. 4", + "suffix": "M01", + "track_number": "Tr. 04", + "title": "Crescent Serenade (Piano Solo)", + "artist": "Makoto Nakamura", + "duration": 167, + "variation": "Instrumentation 1", + "instruments": "Pf", } - expected_property_types = {'beats': utils.BeatData, 'sections': utils.SectionData} + expected_property_types = {"beats": utils.BeatData, "sections": utils.SectionData} run_track_tests(track, expected_attributes, expected_property_types) @@ -40,11 +41,11 @@ def test_track(): def test_to_jams(): - data_home = 'tests/resources/mir_datasets/RWC-Jazz' - track = rwc_jazz.Track('RM-J004', data_home=data_home) + data_home = "tests/resources/mir_datasets/rwc_jazz" + track = rwc_jazz.Track("RM-J004", data_home=data_home) jam = track.to_jams() - beats = jam.search(namespace='beat')[0]['data'] + beats = jam.search(namespace="beat")[0]["data"] assert [beat.time for beat in beats] == [ 0.05, 0.86, @@ -83,7 +84,7 @@ def test_to_jams(): None, ] - segments = jam.search(namespace='segment')[0]['data'] + segments = jam.search(namespace="segment")[0]["data"] assert [segment.time for segment in segments] == [0.05, 6.53, 152.06] assert [segment.duration for segment in segments] == [ 6.48, @@ -91,40 +92,40 @@ def test_to_jams(): 13.319999999999993, ] assert [segment.value for segment in segments] == [ - 'nothing', - 'chorus A', - 'chorus B', + "nothing", + "chorus A", + "chorus B", ] assert [segment.confidence for segment in segments] == [None, None, None] 
- assert jam['file_metadata']['title'] == 'Crescent Serenade (Piano Solo)' - assert jam['file_metadata']['artist'] == 'Makoto Nakamura' + assert jam["file_metadata"]["title"] == "Crescent Serenade (Piano Solo)" + assert jam["file_metadata"]["artist"] == "Makoto Nakamura" def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/RWC-Jazz' + data_home = "tests/resources/mir_datasets/rwc_jazz" metadata = rwc_jazz._load_metadata(data_home) - assert metadata['data_home'] == data_home - assert metadata['RM-J004'] == { - 'piece_number': 'No. 4', - 'suffix': 'M01', - 'track_number': 'Tr. 04', - 'title': 'Crescent Serenade (Piano Solo)', - 'artist': 'Makoto Nakamura', - 'duration': 167, - 'variation': 'Instrumentation 1', - 'instruments': 'Pf', + assert metadata["data_home"] == data_home + assert metadata["RM-J004"] == { + "piece_number": "No. 4", + "suffix": "M01", + "track_number": "Tr. 04", + "title": "Crescent Serenade (Piano Solo)", + "artist": "Makoto Nakamura", + "duration": 167, + "variation": "Instrumentation 1", + "instruments": "Pf", } - assert metadata['RM-J044'] == { - 'piece_number': 'No. 44', - 'suffix': 'M04', - 'track_number': 'Tr. 09', - 'title': 'Joyful, Joyful, We Adore Thee', - 'artist': 'K’s Band', - 'duration': 270, - 'variation': 'Style (Free jazz)', - 'instruments': 'Pf & Bs & Dr & Gt & Ts & Fl & Bar', + assert metadata["RM-J044"] == { + "piece_number": "No. 44", + "suffix": "M04", + "track_number": "Tr. 
09", + "title": "Joyful, Joyful, We Adore Thee", + "artist": "K’s Band", + "duration": 270, + "variation": "Style (Free jazz)", + "instruments": "Pf & Bs & Dr & Gt & Ts & Fl & Bar", } - metadata_none = rwc_jazz._load_metadata('asdf/asdf') + metadata_none = rwc_jazz._load_metadata("asdf/asdf") assert metadata_none is None diff --git a/tests/test_rwc_popular.py b/tests/test_rwc_popular.py index 81baedc30..2ed4cf434 100644 --- a/tests/test_rwc_popular.py +++ b/tests/test_rwc_popular.py @@ -2,45 +2,46 @@ import numpy as np -from mirdata import rwc_popular, utils +from mirdata.datasets import rwc_popular +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = 'RM-P001' - data_home = 'tests/resources/mir_datasets/RWC-Popular' + default_trackid = "RM-P001" + data_home = "tests/resources/mir_datasets/rwc_popular" track = rwc_popular.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': 'RM-P001', - 'audio_path': 'tests/resources/mir_datasets/RWC-Popular/' - + 'audio/rwc-p-m01/1.wav', - 'sections_path': 'tests/resources/mir_datasets/RWC-Popular/' - + 'annotations/AIST.RWC-MDB-P-2001.CHORUS/RM-P001.CHORUS.TXT', - 'beats_path': 'tests/resources/mir_datasets/RWC-Popular/' - + 'annotations/AIST.RWC-MDB-P-2001.BEAT/RM-P001.BEAT.TXT', - 'chords_path': 'tests/resources/mir_datasets/RWC-Popular/' - + 'annotations/AIST.RWC-MDB-P-2001.CHORD/RWC_Pop_Chords/N001-M01-T01.lab', - 'voca_inst_path': 'tests/resources/mir_datasets/RWC-Popular/' - + 'annotations/AIST.RWC-MDB-P-2001.VOCA_INST/RM-P001.VOCA_INST.TXT', - 'piece_number': 'No. 1', - 'suffix': 'M01', - 'track_number': 'Tr. 
01', - 'title': 'Eien no replica', - 'artist': 'Kazuo Nishi', - 'singer_information': 'Male', - 'duration': 209, - 'tempo': '135', - 'instruments': 'Gt', - 'drum_information': 'Drum sequences', + "track_id": "RM-P001", + "audio_path": "tests/resources/mir_datasets/rwc_popular/" + + "audio/rwc-p-m01/1.wav", + "sections_path": "tests/resources/mir_datasets/rwc_popular/" + + "annotations/AIST.RWC-MDB-P-2001.CHORUS/RM-P001.CHORUS.TXT", + "beats_path": "tests/resources/mir_datasets/rwc_popular/" + + "annotations/AIST.RWC-MDB-P-2001.BEAT/RM-P001.BEAT.TXT", + "chords_path": "tests/resources/mir_datasets/rwc_popular/" + + "annotations/AIST.RWC-MDB-P-2001.CHORD/RWC_Pop_Chords/N001-M01-T01.lab", + "voca_inst_path": "tests/resources/mir_datasets/rwc_popular/" + + "annotations/AIST.RWC-MDB-P-2001.VOCA_INST/RM-P001.VOCA_INST.TXT", + "piece_number": "No. 1", + "suffix": "M01", + "track_number": "Tr. 01", + "title": "Eien no replica", + "artist": "Kazuo Nishi", + "singer_information": "Male", + "duration": 209, + "tempo": "135", + "instruments": "Gt", + "drum_information": "Drum sequences", } expected_property_types = { - 'beats': utils.BeatData, - 'sections': utils.SectionData, - 'chords': utils.ChordData, - 'vocal_instrument_activity': utils.EventData, + "beats": utils.BeatData, + "sections": utils.SectionData, + "chords": utils.ChordData, + "vocal_instrument_activity": utils.EventData, } run_track_tests(track, expected_attributes, expected_property_types) @@ -53,11 +54,11 @@ def test_track(): def test_to_jams(): - data_home = 'tests/resources/mir_datasets/RWC-Popular' - track = rwc_popular.Track('RM-P001', data_home=data_home) + data_home = "tests/resources/mir_datasets/rwc_popular" + track = rwc_popular.Track("RM-P001", data_home=data_home) jam = track.to_jams() - beats = jam.search(namespace='beat')[0]['data'] + beats = jam.search(namespace="beat")[0]["data"] assert [beat.time for beat in beats] == [ 0.04, 0.49, @@ -81,7 +82,7 @@ def test_to_jams(): None, ] - segments = 
jam.search(namespace='segment')[0]['data'] + segments = jam.search(namespace="segment")[0]["data"] assert [segment.time for segment in segments] == [0.04, 10.26, 188.48, 202.71] assert [segment.duration for segment in segments] == [ 10.22, @@ -90,14 +91,14 @@ def test_to_jams(): 4.449999999999989, ] assert [segment.value for segment in segments] == [ - 'intro', - 'chorus A', - 'bridge A', - 'ending', + "intro", + "chorus A", + "bridge A", + "ending", ] assert [segment.confidence for segment in segments] == [None, None, None, None] - chords = jam.search(namespace='chord')[0]['data'] + chords = jam.search(namespace="chord")[0]["data"] assert [chord.time for chord in chords] == [0.0, 0.104, 3.646, 43.992, 44.494] assert [chord.duration for chord in chords] == [ 0.104, @@ -107,22 +108,22 @@ def test_to_jams(): 3.142000000000003, ] assert [chord.value for chord in chords] == [ - 'N', - 'Ab:min', - 'E:maj', - 'Bb:maj(*3)', - 'C:min7', + "N", + "Ab:min", + "E:maj", + "Bb:maj(*3)", + "C:min7", ] assert [chord.confidence for chord in chords] == [None, None, None, None, None] - assert jam['file_metadata']['title'] == 'Eien no replica' - assert jam['file_metadata']['artist'] == 'Kazuo Nishi' + assert jam["file_metadata"]["title"] == "Eien no replica" + assert jam["file_metadata"]["artist"] == "Kazuo Nishi" def test_load_chords(): chords_path = ( - 'tests/resources/mir_datasets/RWC-Popular/' - + 'annotations/AIST.RWC-MDB-P-2001.CHORD/RWC_Pop_Chords/N001-M01-T01.lab' + "tests/resources/mir_datasets/rwc_popular/" + + "annotations/AIST.RWC-MDB-P-2001.CHORD/RWC_Pop_Chords/N001-M01-T01.lab" ) chord_data = rwc_popular.load_chords(chords_path) @@ -139,14 +140,14 @@ def test_load_chords(): chord_data.intervals[:, 1], np.array([0.104, 1.858, 5.387, 44.494, 47.636]) ) assert np.array_equal( - chord_data.labels, ['N', 'Ab:min', 'E:maj', 'Bb:maj(*3)', 'C:min7'] + chord_data.labels, ["N", "Ab:min", "E:maj", "Bb:maj(*3)", "C:min7"] ) def test_load_voca_inst(): vocinst_path = ( - 
'tests/resources/mir_datasets/RWC-Popular/' - + 'annotations/AIST.RWC-MDB-P-2001.VOCA_INST/RM-P001.VOCA_INST.TXT' + "tests/resources/mir_datasets/rwc_popular/" + + "annotations/AIST.RWC-MDB-P-2001.VOCA_INST/RM-P001.VOCA_INST.TXT" ) vocinst_data = rwc_popular.load_voca_inst(vocinst_path) @@ -190,24 +191,24 @@ def test_load_voca_inst(): assert np.array_equal( vocinst_data.event, np.array( - ['b', 'm:withm', 'b', 'm:withm', 'b', 'm:withm', 'b', 's:electricguitar'] + ["b", "m:withm", "b", "m:withm", "b", "m:withm", "b", "s:electricguitar"] ), ) def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/RWC-Popular' + data_home = "tests/resources/mir_datasets/rwc_popular" metadata = rwc_popular._load_metadata(data_home) - assert metadata['data_home'] == data_home - assert metadata['RM-P001'] == { - 'piece_number': 'No. 1', - 'suffix': 'M01', - 'track_number': 'Tr. 01', - 'title': 'Eien no replica', - 'artist': 'Kazuo Nishi', - 'singer_information': 'Male', - 'duration': 209, - 'tempo': '135', - 'instruments': 'Gt', - 'drum_information': 'Drum sequences', + assert metadata["data_home"] == data_home + assert metadata["RM-P001"] == { + "piece_number": "No. 1", + "suffix": "M01", + "track_number": "Tr. 
01", + "title": "Eien no replica", + "artist": "Kazuo Nishi", + "singer_information": "Male", + "duration": 209, + "tempo": "135", + "instruments": "Gt", + "drum_information": "Drum sequences", } diff --git a/tests/test_salami.py b/tests/test_salami.py index c65c6b0e5..8f336d97a 100644 --- a/tests/test_salami.py +++ b/tests/test_salami.py @@ -1,44 +1,45 @@ # -*- coding: utf-8 -*- import numpy as np -from mirdata import salami, utils +from mirdata.datasets import salami +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = '2' - data_home = 'tests/resources/mir_datasets/Salami' + default_trackid = "2" + data_home = "tests/resources/mir_datasets/salami" track = salami.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': '2', - 'audio_path': 'tests/resources/mir_datasets/Salami/' + 'audio/2.mp3', - 'sections_annotator1_uppercase_path': 'tests/resources/mir_datasets/Salami/' - + 'salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_uppercase.txt', - 'sections_annotator1_lowercase_path': 'tests/resources/mir_datasets/Salami/' - + 'salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_lowercase.txt', - 'sections_annotator2_uppercase_path': 'tests/resources/mir_datasets/Salami/' - + 'salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_uppercase.txt', - 'sections_annotator2_lowercase_path': 'tests/resources/mir_datasets/Salami/' - + 'salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_lowercase.txt', - 'source': 'Codaich', - 'annotator_1_id': '5', - 'annotator_2_id': '8', - 'duration': 264, - 'title': 'For_God_And_Country', - 'artist': 'The_Smashing_Pumpkins', - 'annotator_1_time': '37', - 'annotator_2_time': '45', - 'broad_genre': 'popular', - 'genre': 'Alternative_Pop___Rock', + "track_id": "2", + "audio_path": "tests/resources/mir_datasets/salami/" + "audio/2.mp3", + "sections_annotator1_uppercase_path": 
"tests/resources/mir_datasets/salami/" + + "salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_uppercase.txt", + "sections_annotator1_lowercase_path": "tests/resources/mir_datasets/salami/" + + "salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_lowercase.txt", + "sections_annotator2_uppercase_path": "tests/resources/mir_datasets/salami/" + + "salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_uppercase.txt", + "sections_annotator2_lowercase_path": "tests/resources/mir_datasets/salami/" + + "salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile2_lowercase.txt", + "source": "Codaich", + "annotator_1_id": "5", + "annotator_2_id": "8", + "duration": 264, + "title": "For_God_And_Country", + "artist": "The_Smashing_Pumpkins", + "annotator_1_time": "37", + "annotator_2_time": "45", + "broad_genre": "popular", + "genre": "Alternative_Pop___Rock", } expected_property_types = { - 'sections_annotator_1_uppercase': utils.SectionData, - 'sections_annotator_1_lowercase': utils.SectionData, - 'sections_annotator_2_uppercase': utils.SectionData, - 'sections_annotator_2_lowercase': utils.SectionData, + "sections_annotator_1_uppercase": utils.SectionData, + "sections_annotator_1_lowercase": utils.SectionData, + "sections_annotator_2_uppercase": utils.SectionData, + "sections_annotator_2_lowercase": utils.SectionData, } run_track_tests(track, expected_attributes, expected_property_types) @@ -49,34 +50,34 @@ def test_track(): assert y.shape == (89856,) # Test file with missing annotations - track = salami.Track('192', data_home=data_home) + track = salami.Track("192", data_home=data_home) # test attributes - assert track.source == 'Codaich' - assert track.annotator_1_id == '16' - assert track.annotator_2_id == '14' + assert track.source == "Codaich" + assert track.annotator_1_id == "16" + assert track.annotator_2_id == "14" assert track.duration == 209 - assert track.title == 'Sull__aria' - assert 
track.artist == 'Compilations' - assert track.annotator_1_time == '20' - assert track.annotator_2_time == '' - assert track.broad_genre == 'classical' - assert track.genre == 'Classical_-_Classical' - assert track.track_id == '192' + assert track.title == "Sull__aria" + assert track.artist == "Compilations" + assert track.annotator_1_time == "20" + assert track.annotator_2_time == "" + assert track.broad_genre == "classical" + assert track.genre == "Classical_-_Classical" + assert track.track_id == "192" assert track._data_home == data_home assert track._track_paths == { - 'audio': ['audio/192.mp3', 'd954d5dc9f17d66155d3310d838756b8'], - 'annotator_1_uppercase': [ - 'salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_uppercase.txt', - '4d268cfd27fe011dbe579f25f8d125ce', + "audio": ["audio/192.mp3", "d954d5dc9f17d66155d3310d838756b8"], + "annotator_1_uppercase": [ + "salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_uppercase.txt", + "4d268cfd27fe011dbe579f25f8d125ce", ], - 'annotator_1_lowercase': [ - 'salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_lowercase.txt', - '6640237e7844d0d9d37bf21cf96a2690', + "annotator_1_lowercase": [ + "salami-data-public-hierarchy-corrections/annotations/192/parsed/textfile1_lowercase.txt", + "6640237e7844d0d9d37bf21cf96a2690", ], - 'annotator_2_uppercase': [None, None], - 'annotator_2_lowercase': [None, None], + "annotator_2_uppercase": [None, None], + "annotator_2_lowercase": [None, None], } # test that cached properties don't fail and have the expected type @@ -86,19 +87,19 @@ def test_track(): assert track.sections_annotator_2_lowercase is None # Test file with missing annotations - track = salami.Track('1015', data_home=data_home) + track = salami.Track("1015", data_home=data_home) assert track._track_paths == { - 'audio': ['audio/1015.mp3', '811a4a6b46f0c15a61bfb299b21ebdc4'], - 'annotator_1_uppercase': [None, None], - 'annotator_1_lowercase': [None, 
None], - 'annotator_2_uppercase': [ - 'salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_uppercase.txt', - 'e4a268342a45fdffd8ec9c3b8287ad8b', + "audio": ["audio/1015.mp3", "811a4a6b46f0c15a61bfb299b21ebdc4"], + "annotator_1_uppercase": [None, None], + "annotator_1_lowercase": [None, None], + "annotator_2_uppercase": [ + "salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_uppercase.txt", + "e4a268342a45fdffd8ec9c3b8287ad8b", ], - 'annotator_2_lowercase': [ - 'salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_lowercase.txt', - '201642fcea4a27c60f7b48de46a82234', + "annotator_2_lowercase": [ + "salami-data-public-hierarchy-corrections/annotations/1015/parsed/textfile2_lowercase.txt", + "201642fcea4a27c60f7b48de46a82234", ], } @@ -111,11 +112,11 @@ def test_track(): def test_to_jams(): - data_home = 'tests/resources/mir_datasets/Salami' - track = salami.Track('2', data_home=data_home) + data_home = "tests/resources/mir_datasets/salami" + track = salami.Track("2", data_home=data_home) jam = track.to_jams() - segments = jam.search(namespace='multi_segment')[0]['data'] + segments = jam.search(namespace="multi_segment")[0]["data"] assert [segment.time for segment in segments] == [ 0.0, 0.0, @@ -141,16 +142,16 @@ def test_to_jams(): 1.6797959180000248, ] assert [segment.value for segment in segments] == [ - {'label': 'Silence', 'level': 0}, - {'label': 'Silence', 'level': 1}, - {'label': 'A', 'level': 0}, - {'label': 'b', 'level': 1}, - {'label': 'b', 'level': 1}, - {'label': 'B', 'level': 0}, - {'label': 'ab', 'level': 1}, - {'label': 'ab', 'level': 1}, - {'label': 'Silence', 'level': 0}, - {'label': 'Silence', 'level': 1}, + {"label": "Silence", "level": 0}, + {"label": "Silence", "level": 1}, + {"label": "A", "level": 0}, + {"label": "b", "level": 1}, + {"label": "b", "level": 1}, + {"label": "B", "level": 0}, + {"label": "ab", "level": 1}, + {"label": "ab", "level": 1}, + {"label": "Silence", 
"level": 0}, + {"label": "Silence", "level": 1}, ] assert [segment.confidence for segment in segments] == [ None, @@ -165,15 +166,15 @@ def test_to_jams(): None, ] - assert jam['file_metadata']['title'] == 'For_God_And_Country' - assert jam['file_metadata']['artist'] == 'The_Smashing_Pumpkins' + assert jam["file_metadata"]["title"] == "For_God_And_Country" + assert jam["file_metadata"]["artist"] == "The_Smashing_Pumpkins" def test_load_sections(): # load a file which exists sections_path = ( - 'tests/resources/mir_datasets/Salami/' - + 'salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_uppercase.txt' + "tests/resources/mir_datasets/salami/" + + "salami-data-public-hierarchy-corrections/annotations/2/parsed/textfile1_uppercase.txt" ) section_data = salami.load_sections(sections_path) @@ -192,7 +193,7 @@ def test_load_sections(): np.array([0.464399092, 14.379863945, 263.205419501, 264.885215419]), ) assert np.array_equal( - section_data.labels, np.array(['Silence', 'A', 'B', 'Silence']) + section_data.labels, np.array(["Silence", "A", "B", "Silence"]) ) # load none @@ -201,21 +202,21 @@ def test_load_sections(): def test_load_metadata(): - data_home = 'tests/resources/mir_datasets/Salami' + data_home = "tests/resources/mir_datasets/salami" metadata = salami._load_metadata(data_home) - assert metadata['data_home'] == data_home - assert metadata['2'] == { - 'source': 'Codaich', - 'annotator_1_id': '5', - 'annotator_2_id': '8', - 'duration': 264, - 'title': 'For_God_And_Country', - 'artist': 'The_Smashing_Pumpkins', - 'annotator_1_time': '37', - 'annotator_2_time': '45', - 'class': 'popular', - 'genre': 'Alternative_Pop___Rock', + assert metadata["data_home"] == data_home + assert metadata["2"] == { + "source": "Codaich", + "annotator_1_id": "5", + "annotator_2_id": "8", + "duration": 264, + "title": "For_God_And_Country", + "artist": "The_Smashing_Pumpkins", + "annotator_1_time": "37", + "annotator_2_time": "45", + "class": "popular", + "genre": 
"Alternative_Pop___Rock", } - none_metadata = salami._load_metadata('asdf/asdf') + none_metadata = salami._load_metadata("asdf/asdf") assert none_metadata is None diff --git a/tests/test_tinysol.py b/tests/test_tinysol.py index da6fb4017..b33ecd138 100644 --- a/tests/test_tinysol.py +++ b/tests/test_tinysol.py @@ -2,32 +2,33 @@ import numpy as np -from mirdata import tinysol, utils +from mirdata.datasets import tinysol +from mirdata import utils from tests.test_utils import run_track_tests def test_track(): - default_trackid = 'Fl-ord-C4-mf-N-T14d' - data_home = 'tests/resources/mir_datasets/TinySOL' + default_trackid = "Fl-ord-C4-mf-N-T14d" + data_home = "tests/resources/mir_datasets/tinysol" track = tinysol.Track(default_trackid, data_home=data_home) expected_attributes = { - 'track_id': 'Fl-ord-C4-mf-N-T14d', - 'audio_path': 'tests/resources/mir_datasets/TinySOL/' - + 'audio/Winds/Flute/ordinario/Fl-ord-C4-mf-N-T14d.wav', - 'dynamics': 'mf', - 'fold': 0, - 'family': 'Winds', - 'instrument_abbr': 'Fl', - 'instrument_full': 'Flute', - 'technique_abbr': 'ord', - 'technique_full': 'ordinario', - 'pitch': 'C4', - 'pitch_id': 60, - 'dynamics_id': 2, - 'instance_id': 0, - 'is_resampled': True, - 'string_id': None, + "track_id": "Fl-ord-C4-mf-N-T14d", + "audio_path": "tests/resources/mir_datasets/tinysol/" + + "audio/Winds/Flute/ordinario/Fl-ord-C4-mf-N-T14d.wav", + "dynamics": "mf", + "fold": 0, + "family": "Winds", + "instrument_abbr": "Fl", + "instrument_full": "Flute", + "technique_abbr": "ord", + "technique_full": "ordinario", + "pitch": "C4", + "pitch_id": 60, + "dynamics_id": 2, + "instance_id": 0, + "is_resampled": True, + "string_id": None, } expected_property_types = {} @@ -39,44 +40,44 @@ def test_track(): assert sr == 44100 # test with a string instrument - track = tinysol.Track('Cb-ord-A2-mf-2c-N', data_home=data_home) + track = tinysol.Track("Cb-ord-A2-mf-2c-N", data_home=data_home) def test_to_jams(): - data_home = 'tests/resources/mir_datasets/TinySOL' + 
data_home = "tests/resources/mir_datasets/tinysol" # Case with a wind instrument (no string_id) - track = tinysol.Track('Fl-ord-C4-mf-N-T14d', data_home=data_home) + track = tinysol.Track("Fl-ord-C4-mf-N-T14d", data_home=data_home) jam = track.to_jams() - assert jam['sandbox']['Fold'] == 0 - assert jam['sandbox']['Family'] == 'Winds' - assert jam['sandbox']['Instrument (abbr.)'] == 'Fl' - assert jam['sandbox']['Instrument (in full)'] == 'Flute' - assert jam['sandbox']['Technique (abbr.)'] == 'ord' - assert jam['sandbox']['Technique (in full)'] == 'ordinario' - assert jam['sandbox']['Pitch'] == 'C4' - assert jam['sandbox']['Pitch ID'] == 60 - assert jam['sandbox']['Dynamics'] == 'mf' - assert jam['sandbox']['Dynamics ID'] == 2 - assert jam['sandbox']['Instance ID'] == 0 - assert 'String ID' not in jam['sandbox'] - assert jam['sandbox']['Resampled'] + assert jam["sandbox"]["Fold"] == 0 + assert jam["sandbox"]["Family"] == "Winds" + assert jam["sandbox"]["Instrument (abbr.)"] == "Fl" + assert jam["sandbox"]["Instrument (in full)"] == "Flute" + assert jam["sandbox"]["Technique (abbr.)"] == "ord" + assert jam["sandbox"]["Technique (in full)"] == "ordinario" + assert jam["sandbox"]["Pitch"] == "C4" + assert jam["sandbox"]["Pitch ID"] == 60 + assert jam["sandbox"]["Dynamics"] == "mf" + assert jam["sandbox"]["Dynamics ID"] == 2 + assert jam["sandbox"]["Instance ID"] == 0 + assert "String ID" not in jam["sandbox"] + assert jam["sandbox"]["Resampled"] # Case with a string instrument - track = tinysol.Track('Cb-ord-A2-mf-2c-N', data_home=data_home) + track = tinysol.Track("Cb-ord-A2-mf-2c-N", data_home=data_home) jam = track.to_jams() - assert jam['sandbox']['Fold'] == 4 - assert jam['sandbox']['Family'] == 'Strings' - assert jam['sandbox']['Instrument (abbr.)'] == 'Cb' - assert jam['sandbox']['Instrument (in full)'] == 'Contrabass' - assert jam['sandbox']['Technique (abbr.)'] == 'ord' - assert jam['sandbox']['Technique (in full)'] == 'ordinario' - assert 
jam['sandbox']['Pitch'] == 'A2' - assert jam['sandbox']['Pitch ID'] == 45 - assert jam['sandbox']['Dynamics'] == 'mf' - assert jam['sandbox']['Dynamics ID'] == 2 - assert jam['sandbox']['Instance ID'] == 1 - assert jam['sandbox']['String ID'] == 2 - assert not jam['sandbox']['Resampled'] + assert jam["sandbox"]["Fold"] == 4 + assert jam["sandbox"]["Family"] == "Strings" + assert jam["sandbox"]["Instrument (abbr.)"] == "Cb" + assert jam["sandbox"]["Instrument (in full)"] == "Contrabass" + assert jam["sandbox"]["Technique (abbr.)"] == "ord" + assert jam["sandbox"]["Technique (in full)"] == "ordinario" + assert jam["sandbox"]["Pitch"] == "A2" + assert jam["sandbox"]["Pitch ID"] == 45 + assert jam["sandbox"]["Dynamics"] == "mf" + assert jam["sandbox"]["Dynamics ID"] == 2 + assert jam["sandbox"]["Instance ID"] == 1 + assert jam["sandbox"]["String ID"] == 2 + assert not jam["sandbox"]["Resampled"] diff --git a/tests/test_utils.py b/tests/test_utils.py index 460acee9d..3e53cab4e 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -12,23 +12,23 @@ import pytest if sys.version_info.major == 3: - builtin_module_name = 'builtins' + builtin_module_name = "builtins" else: - builtin_module_name = '__builtin__' + builtin_module_name = "__builtin__" -DEFAULT_DATA_HOME = os.path.join(os.getenv('HOME', '/tmp'), 'mir_datasets') +DEFAULT_DATA_HOME = os.path.join(os.getenv("HOME", "/tmp"), "mir_datasets") def run_track_tests(track, expected_attributes, expected_property_types): track_attr = get_attributes_and_properties(track) # test track attributes - for attr in track_attr['attributes']: + for attr in track_attr["attributes"]: print("{}: {}".format(attr, getattr(track, attr))) assert expected_attributes[attr] == getattr(track, attr) # test track property types - for prop in track_attr['cached_properties']: + for prop in track_attr["cached_properties"]: print("{}: {}".format(prop, type(getattr(track, prop)))) assert isinstance(getattr(track, prop), 
expected_property_types[prop]) @@ -39,7 +39,7 @@ def get_attributes_and_properties(class_instance): cached_properties = [] functions = [] for val in dir(class_instance.__class__): - if val.startswith('_'): + if val.startswith("_"): continue attr = getattr(class_instance.__class__, val) @@ -56,91 +56,86 @@ def get_attributes_and_properties(class_instance): itertools.chain.from_iterable([properties, cached_properties, functions]) ) for val in dir(class_instance): - if val.startswith('_'): + if val.startswith("_"): continue if val not in non_attributes: attributes.append(val) return { - 'attributes': attributes, - 'properties': properties, - 'cached_properties': cached_properties, - 'functions': functions, + "attributes": attributes, + "properties": properties, + "cached_properties": cached_properties, + "functions": functions, } @pytest.fixture def mock_validated(mocker): - return mocker.patch.object(utils, 'check_validated') + return mocker.patch.object(utils, "check_validated") @pytest.fixture def mock_validator(mocker): - return mocker.patch.object(utils, 'validator') + return mocker.patch.object(utils, "validator") @pytest.fixture def mock_check_index(mocker): - return mocker.patch.object(utils, 'check_index') + return mocker.patch.object(utils, "check_index") def test_md5(mocker): - audio_file = b'audio1234' + audio_file = b"audio1234" - expected_checksum = '6dc00d1bac757abe4ea83308dde68aab' + expected_checksum = "6dc00d1bac757abe4ea83308dde68aab" mocker.patch( - '%s.open' % builtin_module_name, new=mocker.mock_open(read_data=audio_file) + "%s.open" % builtin_module_name, new=mocker.mock_open(read_data=audio_file) ) - md5_checksum = utils.md5('test_file_path') + md5_checksum = utils.md5("test_file_path") assert expected_checksum == md5_checksum @pytest.mark.parametrize( - 'test_index,expected_missing,expected_inv_checksum', + "test_index,expected_missing,expected_inv_checksum", [ - ('test_index_valid.json', {}, {}), + ("test_index_valid.json", {}, {}), ( - 
'test_index_missing_file.json', - {'10161_chorus': ['tests/resources/10162_chorus.wav']}, + "test_index_missing_file.json", + {"10161_chorus": ["tests/resources/10162_chorus.wav"]}, {}, ), ( - 'test_index_invalid_checksum.json', + "test_index_invalid_checksum.json", {}, - {'10161_chorus': ['tests/resources/10161_chorus.wav']}, + {"10161_chorus": ["tests/resources/10161_chorus.wav"]}, ), ], ) def test_check_index(test_index, expected_missing, expected_inv_checksum): - index_path = os.path.join('tests/indexes', test_index) + index_path = os.path.join("tests/indexes", test_index) with open(index_path) as index_file: test_index = json.load(index_file) - missing_files, invalid_checksums = utils.check_index(test_index, 'tests/resources/') + missing_files, invalid_checksums = utils.check_index(test_index, "tests/resources/") assert expected_missing == missing_files assert expected_inv_checksum == invalid_checksums @pytest.mark.parametrize( - 'missing_files,invalid_checksums', + "missing_files,invalid_checksums", [ - ({'10161_chorus': ['tests/resources/10162_chorus.wav']}, {}), - ({}, {'10161_chorus': ['tests/resources/10161_chorus.wav']}), + ({"10161_chorus": ["tests/resources/10162_chorus.wav"]}, {}), + ({}, {"10161_chorus": ["tests/resources/10161_chorus.wav"]}), ({}, {}), ], ) def test_validator(mocker, mock_check_index, missing_files, invalid_checksums): mock_check_index.return_value = missing_files, invalid_checksums - m, c = utils.validator('foo', 'bar', True) + m, c = utils.validator("foo", "bar", False) assert m == missing_files assert c == invalid_checksums - mock_check_index.assert_called_once_with('foo', 'bar') + mock_check_index.assert_called_once_with("foo", "bar", False) - -# This is magically skipped by the the remote fixture `skip_local` in conftest.py -# when tests are run with the --local flag -def test_get_default_dataset_path(skip_local): - assert '/tmp/mir_datasets/data_home' == utils.get_default_dataset_path('data_home')