diff --git a/.github/workflows/python-package-pip.yaml b/.github/workflows/python-package-pip.yaml index 196f0cf..66d0e61 100644 --- a/.github/workflows/python-package-pip.yaml +++ b/.github/workflows/python-package-pip.yaml @@ -10,6 +10,7 @@ on: env: CACHE_NUMBER: 0 # increase to reset cache manually + AAC_DATASETS_ROOT: "$HOME/.cache/data" # Cancel workflow if a new push occurs concurrency: @@ -23,7 +24,7 @@ jobs: strategy: matrix: os: [ubuntu-latest] - python-version: ["3.7", "3.8", "3.9", "3.10", "3.11"] + python-version: ["3.7", "3.11"] steps: # --- INSTALLATIONS --- @@ -43,10 +44,13 @@ jobs: run: | python -m pip install "aac-datasets[dev] @ git+https://github.com/Labbeti/aac-datasets@${GITHUB_REF##*/}" - - name: Install soundfile for torchaudio + - name: Install soundfile for torchaudio, ffmpeg and yt-dlp for AudioCaps download run: | - # For soundfile dep + sudo add-apt-repository ppa:tomtomtom/yt-dlp # Add ppa repo to apt + sudo apt-get update sudo apt-get install libsndfile1 + sudo apt-get install ffmpeg + sudo apt-get install yt-dlp # --- TESTS --- - name: Compile python files @@ -60,11 +64,31 @@ jobs: - name: Check format with Black run: | python -m black --check --diff src - + - name: Print install info run: | aac-datasets-info - + ffmpeg -version + yt-dlp --version + - name: Test with pytest run: | python -m pytest -v + + - name: Build data root + run: | + dataroot=`eval echo $AAC_DATASETS_ROOT` + echo "Building directory '$dataroot'..." 
+ mkdir -p "$dataroot" + + - name: Try to download Clotho val + run: | + aac-datasets-download --verbose 2 clotho --subsets val + + - name: Try to download AudioCaps val + run: | + aac-datasets-download --verbose 2 audiocaps --subsets val --max_workers none --with_tags true + + - name: Check data root + run: | + aac-datasets-check --verbose 2 --datasets clotho audiocaps diff --git a/CHANGELOG.md b/CHANGELOG.md index a56245d..0194f92 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,14 @@ # Change log All notable changes to this project will be documented in this file. + +## [0.5.0] 2024-01-05 +### Changed +- Update typing for paths with python class `Path`. +- Refactor functional interface to load raw metadata for each dataset. +- Refactor class variables to init arguments. +- Faster AudioCaps download with `ThreadPoolExecutor`. + ## [0.4.1] 2023-10-25 ### Added - `AudioCaps.DOWNLOAD_AUDIO` class variable for compatibility with [audiocaps-download 1.0](https://github.com/MorenoLaQuatra/audiocaps-download). diff --git a/CITATION.cff b/CITATION.cff index 7a4881b..db5dd24 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -22,5 +22,5 @@ keywords: - captioning - audio-captioning license: MIT -version: 0.4.1 -date-released: '2023-10-25' +version: 0.5.0 +date-released: '2024-01-05' diff --git a/README.md b/README.md index a44dcea..1143565 100644 --- a/README.md +++ b/README.md @@ -56,29 +56,47 @@ for batch in dataloader: ... ``` -## Datasets stats +## Download datasets +To download a dataset, you can use `download` argument in dataset construction : +```python +dataset = Clotho(root=".", subset="dev", download=True) +``` +However, if you want to download datasets from a script, you can also use the following command : +```bash +aac-datasets-download --root "." 
clotho --subsets "dev" +``` + +## Datasets information Here is the statistics for each dataset : -| | AudioCaps | Clotho | MACS | WavCaps | + -For Clotho, the dev subset should be used for training, val for validation and eval for testing. +| Dataset | Sampling
rate (kHz) | Estimated
size (GB) | Source | Subsets | +|:---:|:---:|:---:|:---:|:---:| +| AudioCaps | 32 | 43 | AudioSet | `train`
`val`
`test`
`train_v2` | +| Clotho | 44.1 | 53 | Freesound | `dev`
`val`
`eval`
`dcase_aac_test`
`dcase_aac_analysis`
`dcase_t2a_audio`
`dcase_t2a_captions` | +| MACS | 48 | 13 | TAU Urban Acoustic Scenes 2019 | `full` | +| WavCaps | 32 | 941 | AudioSet
BBC Sound Effects
FreeSound
SoundBible | `as`
`as_noac`
`bbc`
`fsd`
`fsd_nocl`
`sb` | + +For Clotho, the **dev** subset should be used for training, val for validation and eval for testing. -Here is the **train** subset statistics for AudioCaps, Clotho and MACS datasets : +Here are additional statistics on the train subset for AudioCaps, Clotho and MACS: -| | AudioCaps/train | Clotho/dev | MACS/full | -|:---:|:---:|:---:|:---:| -| Nb audios | 49,838 | 3,840 | 3,930 | -| Total audio duration (h) | 136.6<sup>1</sup> | 24.0 | 10.9 | -| Audio duration range (s) | 0.5-10 | 15-30 | 10 | -| Nb captions per audio | 1 | 5 | 2-5 | -| Nb captions | 49,838 | 19,195 | 17,275 | -| Total nb words<sup>2</sup> | 402,482 | 217,362 | 160,006 | -| Sentence size<sup>2</sup> | 2-52 | 8-20 | 5-40 | +| | AudioCaps/train | Clotho/dev | MACS/full | WavCaps/full | +|:---:|:---:|:---:|:---:|:---:| +| Nb audios | 49,838 | 3,840 | 3,930 | 403,050 | +| Total audio duration (h) | 136.6<sup>1</sup> | 24.0 | 10.9 | 7563.3 | +| Audio duration range (s) | 0.5-10 | 15-30 | 10 | 1-67,109 | +| Nb captions per audio | 1 | 5 | 2-5 | 1 | +| Nb captions | 49,838 | 19,195 | 17,275 | 403,050 | +| Total nb words<sup>2</sup> | 402,482 | 217,362 | 160,006 | 3,161,823 | +| Sentence size<sup>2</sup> | 2-52 | 8-20 | 5-40 | 2-38 | +| Vocabulary<sup>2</sup> | 4724 | 4369 | 2721 | 24600 | <sup>1</sup> This duration is estimated on the total duration of 46230/49838 files of 126.7h. @@ -86,7 +104,7 @@ Here is the **train** subset statistics for AudioCaps, Clotho and MACS datasets ## Requirements -This package has been developped for Ubuntu 20.04, and it is expected to work on most Linux distributions. +This package has been developed for Ubuntu 20.04, and it is expected to work on most Linux-based distributions. ### Python packages Python requirements are automatically installed when using pip on this repository. @@ -104,7 +122,7 @@ numpy >= 1.21.2 The external requirements needed to download **AudioCaps** are **ffmpeg** and **yt-dlp**. **ffmpeg** can be install on Ubuntu using `sudo apt install ffmpeg` and **yt-dlp** from the [official repo](https://github.com/yt-dlp/yt-dlp). 
- + You can also override their paths for AudioCaps: ```python @@ -116,16 +134,6 @@ dataset = AudioCaps( ) ``` -## Download datasets -To download a dataset, you can use `download` argument in dataset construction : -```python -dataset = Clotho(root=".", subset="dev", download=True) -``` -However, if you want to download datasets from a script, you can also use the following command : -```bash -aac-datasets-download --root "." clotho --subsets "dev" -``` - ## Additional information ### Compatibility with audiocaps-download If you want to use [audiocaps-download 1.0](https://github.com/MorenoLaQuatra/audiocaps-download) package to download AudioCaps, you will have to respect the AudioCaps folder tree: @@ -139,9 +147,13 @@ downloader.download(format="wav") Then disable audio download and set the correct audio format before init AudioCaps : ```python from aac_datasets import AudioCaps -AudioCaps.AUDIO_FORMAT = "wav" -AudioCaps.DOWNLOAD_AUDIO = False # this will only download labels and metadata files -dataset = AudioCaps(root=root, subset="train", download=True) +dataset = AudioCaps( + root=root, + subset="train", + download=True, + audio_format="wav", + download_audio=False, # this will only download labels and metadata files +) ``` ## References @@ -155,21 +167,21 @@ dataset = AudioCaps(root=root, subset="train", download=True) [3] F. Font, A. Mesaros, D. P. W. Ellis, E. Fonseca, M. Fuentes, and B. Elizalde, Proceedings of the 6th Workshop on Detection and Classication of Acoustic Scenes and Events (DCASE 2021). Barcelona, Spain: Music Technology Group - Universitat Pompeu Fabra, Nov. 2021. Available: https://doi.org/10.5281/zenodo.5770113 #### WavCaps -[1] X. Mei et al., “WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research,” arXiv preprint arXiv:2303.17395, 2023, [Online]. Available: https://arxiv.org/pdf/2303.17395.pdf +[4] X. 
Mei et al., “WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research,” arXiv preprint arXiv:2303.17395, 2023, [Online]. Available: https://arxiv.org/pdf/2303.17395.pdf ## Cite the aac-datasets package If you use this software, please consider cite it as "Labbe, E. (2013). aac-datasets: Audio Captioning datasets for PyTorch.", or use the following BibTeX citation: ``` @software{ - Labbe_aac_datasets_2023, + Labbe_aac_datasets_2024, author = {Labbé, Etienne}, license = {MIT}, - month = {10}, + month = {01}, title = {{aac-datasets}}, url = {https://github.com/Labbeti/aac-datasets/}, - version = {0.4.1}, - year = {2023} + version = {0.5.0}, + year = {2024} } ``` diff --git a/data/train_v2.csv b/data/audiocaps/train_v2.csv similarity index 100% rename from data/train_v2.csv rename to data/audiocaps/train_v2.csv diff --git a/data/blacklist_audiocaps.full.csv b/data/wavcaps/blacklist_audiocaps.full.csv similarity index 100% rename from data/blacklist_audiocaps.full.csv rename to data/wavcaps/blacklist_audiocaps.full.csv diff --git a/data/blacklist_clotho.full.csv b/data/wavcaps/blacklist_clotho.full.csv similarity index 100% rename from data/blacklist_clotho.full.csv rename to data/wavcaps/blacklist_clotho.full.csv diff --git a/docs/aac_datasets.datasets.functional.audiocaps.rst b/docs/aac_datasets.datasets.functional.audiocaps.rst new file mode 100644 index 0000000..aa8c58f --- /dev/null +++ b/docs/aac_datasets.datasets.functional.audiocaps.rst @@ -0,0 +1,7 @@ +aac\_datasets.datasets.functional.audiocaps module +================================================== + +.. 
automodule:: aac_datasets.datasets.functional.audiocaps + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/aac_datasets.datasets.functional.clotho.rst b/docs/aac_datasets.datasets.functional.clotho.rst new file mode 100644 index 0000000..31213cb --- /dev/null +++ b/docs/aac_datasets.datasets.functional.clotho.rst @@ -0,0 +1,7 @@ +aac\_datasets.datasets.functional.clotho module +=============================================== + +.. automodule:: aac_datasets.datasets.functional.clotho + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/aac_datasets.datasets.functional.common.rst b/docs/aac_datasets.datasets.functional.common.rst new file mode 100644 index 0000000..1647955 --- /dev/null +++ b/docs/aac_datasets.datasets.functional.common.rst @@ -0,0 +1,7 @@ +aac\_datasets.datasets.functional.common module +=============================================== + +.. automodule:: aac_datasets.datasets.functional.common + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/aac_datasets.datasets.functional.macs.rst b/docs/aac_datasets.datasets.functional.macs.rst new file mode 100644 index 0000000..9de2b9d --- /dev/null +++ b/docs/aac_datasets.datasets.functional.macs.rst @@ -0,0 +1,7 @@ +aac\_datasets.datasets.functional.macs module +============================================= + +.. automodule:: aac_datasets.datasets.functional.macs + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/aac_datasets.datasets.functional.rst b/docs/aac_datasets.datasets.functional.rst new file mode 100644 index 0000000..88c73ac --- /dev/null +++ b/docs/aac_datasets.datasets.functional.rst @@ -0,0 +1,19 @@ +aac\_datasets.datasets.functional package +========================================= + +.. automodule:: aac_datasets.datasets.functional + :members: + :undoc-members: + :show-inheritance: + +Submodules +---------- + +.. 
toctree:: + :maxdepth: 4 + + aac_datasets.datasets.functional.audiocaps + aac_datasets.datasets.functional.clotho + aac_datasets.datasets.functional.common + aac_datasets.datasets.functional.macs + aac_datasets.datasets.functional.wavcaps diff --git a/docs/aac_datasets.datasets.functional.wavcaps.rst b/docs/aac_datasets.datasets.functional.wavcaps.rst new file mode 100644 index 0000000..ad807ee --- /dev/null +++ b/docs/aac_datasets.datasets.functional.wavcaps.rst @@ -0,0 +1,7 @@ +aac\_datasets.datasets.functional.wavcaps module +================================================ + +.. automodule:: aac_datasets.datasets.functional.wavcaps + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/aac_datasets.datasets.legacy.audiocaps.rst b/docs/aac_datasets.datasets.legacy.audiocaps.rst deleted file mode 100644 index 4a46656..0000000 --- a/docs/aac_datasets.datasets.legacy.audiocaps.rst +++ /dev/null @@ -1,7 +0,0 @@ -aac\_datasets.datasets.legacy.audiocaps module -============================================== - -.. automodule:: aac_datasets.datasets.legacy.audiocaps - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/aac_datasets.datasets.legacy.clotho.rst b/docs/aac_datasets.datasets.legacy.clotho.rst deleted file mode 100644 index c692167..0000000 --- a/docs/aac_datasets.datasets.legacy.clotho.rst +++ /dev/null @@ -1,7 +0,0 @@ -aac\_datasets.datasets.legacy.clotho module -=========================================== - -.. automodule:: aac_datasets.datasets.legacy.clotho - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/aac_datasets.datasets.legacy.macs.rst b/docs/aac_datasets.datasets.legacy.macs.rst deleted file mode 100644 index 884df06..0000000 --- a/docs/aac_datasets.datasets.legacy.macs.rst +++ /dev/null @@ -1,7 +0,0 @@ -aac\_datasets.datasets.legacy.macs module -========================================= - -.. 
automodule:: aac_datasets.datasets.legacy.macs - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/aac_datasets.datasets.legacy.rst b/docs/aac_datasets.datasets.legacy.rst deleted file mode 100644 index 0d5bd53..0000000 --- a/docs/aac_datasets.datasets.legacy.rst +++ /dev/null @@ -1,17 +0,0 @@ -aac\_datasets.datasets.legacy package -===================================== - -.. automodule:: aac_datasets.datasets.legacy - :members: - :undoc-members: - :show-inheritance: - -Submodules ----------- - -.. toctree:: - :maxdepth: 4 - - aac_datasets.datasets.legacy.audiocaps - aac_datasets.datasets.legacy.clotho - aac_datasets.datasets.legacy.macs diff --git a/docs/aac_datasets.datasets.rst b/docs/aac_datasets.datasets.rst index 55eecb7..66653f3 100644 --- a/docs/aac_datasets.datasets.rst +++ b/docs/aac_datasets.datasets.rst @@ -12,7 +12,7 @@ Subpackages .. toctree:: :maxdepth: 4 - aac_datasets.datasets.legacy + aac_datasets.datasets.functional Submodules ---------- diff --git a/docs/aac_datasets.utils.audioset_mapping.rst b/docs/aac_datasets.utils.audioset_mapping.rst new file mode 100644 index 0000000..166f61b --- /dev/null +++ b/docs/aac_datasets.utils.audioset_mapping.rst @@ -0,0 +1,7 @@ +aac\_datasets.utils.audioset\_mapping module +============================================ + +.. automodule:: aac_datasets.utils.audioset_mapping + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/aac_datasets.utils.globals.rst b/docs/aac_datasets.utils.globals.rst new file mode 100644 index 0000000..2628672 --- /dev/null +++ b/docs/aac_datasets.utils.globals.rst @@ -0,0 +1,7 @@ +aac\_datasets.utils.globals module +================================== + +.. 
automodule:: aac_datasets.utils.globals + :members: + :undoc-members: + :show-inheritance: diff --git a/docs/aac_datasets.utils.paths.rst b/docs/aac_datasets.utils.paths.rst deleted file mode 100644 index 43ba916..0000000 --- a/docs/aac_datasets.utils.paths.rst +++ /dev/null @@ -1,7 +0,0 @@ -aac\_datasets.utils.paths module -================================ - -.. automodule:: aac_datasets.utils.paths - :members: - :undoc-members: - :show-inheritance: diff --git a/docs/aac_datasets.utils.rst b/docs/aac_datasets.utils.rst index a5ccbe0..6f23e98 100644 --- a/docs/aac_datasets.utils.rst +++ b/docs/aac_datasets.utils.rst @@ -12,7 +12,9 @@ Submodules .. toctree:: :maxdepth: 4 + aac_datasets.utils.audioset_mapping + aac_datasets.utils.cmdline aac_datasets.utils.collate aac_datasets.utils.collections aac_datasets.utils.download - aac_datasets.utils.paths + aac_datasets.utils.globals diff --git a/docs/data_subsets.rst b/docs/data_subsets.rst index 40a8412..52fb158 100644 --- a/docs/data_subsets.rst +++ b/docs/data_subsets.rst @@ -3,37 +3,37 @@ About datasets subsets AudioCaps ######################## -The original AudioCaps dataset contains only 3 subsets : train, val and test. +The original AudioCaps dataset contains only 3 subsets : `train`, `val` and `test`. -A fourth subset named train_v2 is another version of the train subset where captions has been manually corrected or deleted. For more details, see paper "CoNeTTE: An efficient Audio Captioning system leveraging multiple datasets with Task Embedding". +A fourth subset named `train_v2` is another version of the train subset where captions has been manually corrected or deleted. For more details, see paper "CoNeTTE: An efficient Audio Captioning system leveraging multiple datasets with Task Embedding". 
Clotho ######################## Clotho contains 7 subsets: -- dev : contains 3.8K files for training, -- val : contains 1K files for validation, -- eval : contains 1K files for testing, -- dcase_aac_test : contains 1K files without captions used in the DCASE challenge task 6a (AAC), -- dcase_aac_analysis : contains 6K audio files without captions used in the DCASE challenge task 6a (AAC), -- dcase_t2a_audio : contains 1K audio files without captions used in the DCASE challenge task 6b (Text-to-Audio retrieval), -- dcase_t2a_captions : contains 1K captions (queries) without audios files used in the DCASE challenge task 6b (Text-to-Audio retrieval). +- `dev` : contains 3.8K files for training, +- `val` : contains 1K files for validation, +- `eval` : contains 1K files for testing, +- `dcase_aac_test` : contains 1K files without captions used in the DCASE challenge task 6a (AAC), +- `dcase_aac_analysis` : contains 6K audio files without captions used in the DCASE challenge task 6a (AAC), +- `dcase_t2a_audio` : contains 1K audio files without captions used in the DCASE challenge task 6b (Text-to-Audio retrieval), +- `dcase_t2a_captions` : contains 1K captions (queries) without audios files used in the DCASE challenge task 6b (Text-to-Audio retrieval). MACS ######################## -MACS contains only 1 subset: full. Its data is typically used as additional training data. +MACS contains only 1 subset: `full`. Its data is typically used as additional training data. WavCaps ######################## WavCaps contains 6 subsets: -- as : contains 108K files from AudioSet strongly labeled dataset, -- bbc : contains 31K files from BBC Sound Effects website, -- fsd : contains 262K files from FreeSound website, -- sb : contains 1.2K files from SoundBible website, -- as_noac : contains 99K files from as subset without overlapping data with AudioCaps, -- fsd_nocl : contains 258K files from fsd subset without overlapping data with Clotho (except for subsets of task 6a). 
+- `as` : contains 108K files from AudioSet strongly labeled dataset, +- `bbc` : contains 31K files from BBC Sound Effects website, +- `fsd` : contains 262K files from FreeSound website, +- `sb` : contains 1.2K files from SoundBible website, +- `as_noac` : contains 99K files from as subset without overlapping data with AudioCaps, +- `fsd_nocl` : contains 258K files from fsd subset without overlapping data with Clotho (except for subsets of task 6a). Since WavCaps does not contains validation or testing subsets, all of their data is used as additional training data. -The subsets as_noac and fsd_nocl are provided to avoid biases when evaluating on AudioCaps or Clotho datasets. +The subsets `as_noac` and `fsd_nocl` are provided to avoid biases when evaluating on AudioCaps or Clotho datasets. diff --git a/docs/installation.rst b/docs/installation.rst index 6724d40..6284f10 100644 --- a/docs/installation.rst +++ b/docs/installation.rst @@ -21,12 +21,14 @@ The python requirements are automatically installed when using pip on this repos pyyaml >= 6.0 tqdm >= 4.64.0 huggingface-hub>=0.15.1 + numpy>=1.21.2 External requirements (AudioCaps only) ###################################### -The external requirements needed to download **AudioCaps** are **ffmpeg** and **youtube-dl** (yt-dlp should work too). -These two programs can be download on Ubuntu using `sudo apt install ffmpeg youtube-dl`. +The external requirements needed to download **AudioCaps** are **ffmpeg** and **yt-dlp**. +.. These two programs can be download on Ubuntu using `sudo apt install ffmpeg youtube-dl`. +**ffmpeg** can be installed on Ubuntu using `sudo apt install ffmpeg` and **yt-dlp** from the `official repo <https://github.com/yt-dlp/yt-dlp>`_. 
You can also override their paths for AudioCaps: diff --git a/docs/usage.rst b/docs/usage.rst index aae413a..cbf70fc 100644 --- a/docs/usage.rst +++ b/docs/usage.rst @@ -14,13 +14,13 @@ You can download each dataset subset by using the download=True option in datase _ = Clotho("/my/path/to/data", subset="dev", download=True) You can also do the same by using functions : -.. :caption: Download Clotho development dataset (command line). +.. :caption: Download Clotho development dataset (python). .. code-block:: python - from aac_datasets.download import download_clotho + from aac_datasets.download import download_clotho_dataset - download_clotho("/my/path/to/data", subsets=("dev",), download=True) + download_clotho_dataset("/my/path/to/data", subset="dev", download=True) Or by the command line : .. :caption: Download Clotho development dataset (command line). diff --git a/examples/dataloader.ipynb b/examples/dataloader.ipynb index 8a82df9..ab5f405 100644 --- a/examples/dataloader.ipynb +++ b/examples/dataloader.ipynb @@ -60,7 +60,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "aac-datasets version: 0.4.0\n" + "aac-datasets version: 0.5.0\n" ] } ], @@ -108,7 +108,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 7, "metadata": {}, "outputs": [], "source": [ @@ -118,12 +118,12 @@ "from torch.utils.data.dataloader import DataLoader\n", "\n", "from aac_datasets import Clotho\n", - "from aac_datasets.utils import AdvancedCollate" + "from aac_datasets.utils.collate import AdvancedCollate" ] }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -188,7 +188,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.11" + "version": "3.10.4" }, "orig_nbformat": 4, "vscode": { diff --git a/pyproject.toml b/pyproject.toml index e856f0f..efaf0f9 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,16 +23,7 @@ classifiers = [ maintainers = [ 
{name = "Etienne Labbé (Labbeti)", email = "labbeti.pub@gmail.com"}, ] -dependencies = [ - "torch>=1.10.1", - "torchaudio>=0.10.1", - "py7zr>=0.17.2", - "pyyaml>=6.0", - "tqdm>=4.64.0", - "huggingface-hub>=0.15.1", - "numpy>=1.21.2", -] -dynamic = ["version"] +dynamic = ["version", "dependencies", "optional-dependencies"] [project.urls] Homepage = "https://pypi.org/project/aac-datasets/" @@ -46,19 +37,11 @@ aac-datasets-check = "aac_datasets.check:_main_check" aac-datasets-download = "aac_datasets.download:_main_download" aac-datasets-info = "aac_datasets.info:print_install_info" -[project.optional-dependencies] -dev = [ - "pytest==6.2.5", - "flake8==4.0.1", - "click==8.0.4", - "black==22.10.0", - "ipykernel==6.9.1", - "twine==4.0.1", -] - [tool.setuptools.packages.find] where = ["src"] # list of folders that contain the packages (["."] by default) include = ["aac_datasets*"] # package names should match these glob patterns (["*"] by default) [tool.setuptools.dynamic] version = {attr = "aac_datasets.__version__"} +dependencies = {file = ["requirements.txt"]} +optional-dependencies = {dev = { file = ["requirements-dev.txt"] }} diff --git a/requirements-dev.txt b/requirements-dev.txt new file mode 100644 index 0000000..8d9e47e --- /dev/null +++ b/requirements-dev.txt @@ -0,0 +1,10 @@ +# -*- coding: utf-8 -*- + +pytest==6.2.5 +flake8==4.0.1 +click==8.0.4 +black==22.10.0 +ipykernel==6.9.1 +twine==4.0.1 +sphinx +sphinx-press-theme==0.8.0 diff --git a/src/aac_datasets/__init__.py b/src/aac_datasets/__init__.py index f754989..eae6098 100644 --- a/src/aac_datasets/__init__.py +++ b/src/aac_datasets/__init__.py @@ -10,20 +10,20 @@ __maintainer__ = "Etienne Labbé (Labbeti)" __name__ = "aac-datasets" __status__ = "Development" -__version__ = "0.4.1" +__version__ = "0.5.0" from .datasets.audiocaps import AudioCaps from .datasets.clotho import Clotho from .datasets.macs import MACS from .datasets.wavcaps import WavCaps -from .utils.paths import ( +from .utils.globals import ( 
get_default_ffmpeg_path, get_default_root, - get_default_ytdl_path, + get_default_ytdlp_path, set_default_ffmpeg_path, set_default_root, - set_default_ytdl_path, + set_default_ytdlp_path, ) @@ -34,8 +34,8 @@ "WavCaps", "get_default_ffmpeg_path", "get_default_root", - "get_default_ytdl_path", + "get_default_ytdlp_path", "set_default_ffmpeg_path", "set_default_root", - "set_default_ytdl_path", + "set_default_ytdlp_path", ] diff --git a/src/aac_datasets/check.py b/src/aac_datasets/check.py index 32b7717..49e8ed5 100644 --- a/src/aac_datasets/check.py +++ b/src/aac_datasets/check.py @@ -3,9 +3,10 @@ import logging import os.path as osp +import random from argparse import ArgumentParser, Namespace -from typing import Dict, Iterable +from typing import Dict, Iterable, Union import yaml @@ -15,25 +16,32 @@ from aac_datasets.datasets.clotho import Clotho, ClothoCard from aac_datasets.datasets.macs import MACS, MACSCard from aac_datasets.datasets.wavcaps import WavCaps, WavCapsCard -from aac_datasets.utils.paths import get_default_root +from aac_datasets.utils.globals import get_default_root from aac_datasets.download import _setup_logging +DATASETS_NAMES = (AudioCapsCard.NAME, ClothoCard.NAME, MACSCard.NAME, WavCapsCard.NAME) + pylog = logging.getLogger(__name__) def check_directory( root: str, verbose: int = 0, - datasets: Iterable[str] = ("audiocaps", "clotho", "macs"), + datasets: Union[Iterable[str], str] = DATASETS_NAMES, ) -> Dict[str, Dict[str, int]]: """Check which datasets are installed in root. :param root: The directory to check. :param verbose: The verbose level. defaults to 0. - :param datasets: The datasets to search in root directory. defaults to ("audiocaps", "clotho", "macs"). + :param datasets: The datasets to search in root directory. defaults to DATASETS_NAMES. :returns: A dictionary of datanames containing the length of each subset. 
""" + if isinstance(datasets, str): + datasets = [datasets] + else: + datasets = list(datasets) + data_infos = [ (AudioCapsCard.NAME, AudioCaps), (ClothoCard.NAME, Clotho), @@ -53,9 +61,13 @@ def check_directory( pylog.info(f"Searching for {ds_name}...") found_dsets = {} - for subset in ds_class.SUBSETS: + for subset in ds_class.CARD.SUBSETS: try: ds = ds_class(root, subset, verbose=0) + if len(ds) > 0: + # Try to load a random item + idx = random.randint(0, len(ds) - 1) + _item = ds[idx] found_dsets[subset] = ds except RuntimeError: @@ -105,7 +117,7 @@ def _get_main_check_args() -> Namespace: "--datasets", type=str, nargs="+", - default=(AudioCapsCard.NAME, ClothoCard.NAME, MACSCard.NAME, WavCapsCard.NAME), + default=DATASETS_NAMES, help="The datasets to check in root directory.", ) diff --git a/src/aac_datasets/datasets/audiocaps.py b/src/aac_datasets/datasets/audiocaps.py index d9ff585..d2a6f36 100644 --- a/src/aac_datasets/datasets/audiocaps.py +++ b/src/aac_datasets/datasets/audiocaps.py @@ -1,14 +1,10 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import csv import logging -import os import os.path as osp -import subprocess -import time -from subprocess import CalledProcessError +from pathlib import Path from typing import ( Any, Callable, @@ -16,31 +12,35 @@ Dict, List, Optional, - Tuple, + Union, ) import torch import torchaudio -import tqdm from torch import Tensor -from torch.hub import download_url_to_file from typing_extensions import TypedDict, NotRequired try: - # To support torchaudio >=2.1.0 + # To support torchaudio >= 2.1.0 from torchaudio import AudioMetaData # type: ignore except ImportError: from torchaudio.backend.common import AudioMetaData -from aac_datasets.datasets.base import AACDataset, DatasetCard -from aac_datasets.utils.paths import _get_root, _get_ffmpeg_path, _get_ytdl_path +from aac_datasets.datasets.base import AACDataset +from aac_datasets.datasets.functional.audiocaps import ( + AudioCapsCard, + download_audiocaps_dataset, + 
load_audiocaps_dataset, + _get_audio_subset_dpath, +) +from aac_datasets.utils.globals import _get_root, _get_ffmpeg_path, _get_ytdlp_path pylog = logging.getLogger(__name__) -class AudioCapsItem(TypedDict, total=True): +class AudioCapsItem(TypedDict): r"""Class representing a single AudioCaps item.""" # Common attributes @@ -58,38 +58,6 @@ class AudioCapsItem(TypedDict, total=True): youtube_id: str -class AudioCapsCard(DatasetCard): - ANNOTATIONS_CREATORS: Tuple[str, ...] = ("crowdsourced",) - CAPTIONS_PER_AUDIO: Dict[str, int] = { - "train": 1, - "val": 5, - "test": 5, - "train_v2": 1, - } - CITATION: str = r""" - @inproceedings{kim_etal_2019_audiocaps, - title = {{A}udio{C}aps: Generating Captions for Audios in The Wild}, - author = {Kim, Chris Dongjoo and Kim, Byeongchang and Lee, Hyunmin and Kim, Gunhee}, - year = 2019, - month = jun, - booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, - publisher = {Association for Computational Linguistics}, - address = {Minneapolis, Minnesota}, - pages = {119--132}, - doi = {10.18653/v1/N19-1011}, - url = {https://aclanthology.org/N19-1011}, - } - """ - HOMEPAGE: str = "https://audiocaps.github.io/" - LANGUAGE: Tuple[str, ...] = ("en",) - LANGUAGE_DETAILS: Tuple[str, ...] = ("en-US",) - NAME: str = "audiocaps" - PRETTY_NAME: str = "AudioCaps" - SIZE_CATEGORIES: Tuple[str, ...] = ("10K None: """ :param root: Dataset root directory. @@ -156,23 +126,42 @@ def __init__( defaults to False. :param transform: The transform to apply to the global dict item. This transform is applied only in getitem method when argument is an integer. defaults to None. - :param flat_captions: If True, map captions to audio instead of audio to caption. - defaults to True. :param verbose: Verbose level. defaults to 0. + :param force_download: If True, force to re-download file even if they exists on disk. 
+ defaults to False. + :param verify_files: If True, check hash value when possible. + defaults to False. + + :param audio_duration: Extracted duration for each audio file in seconds. + defaults to 10.0. + :param audio_format: Audio format and extension name. + defaults to "flac". + :param audio_n_channels: Number of channels extracted for each audio file. + defaults to 1. + :param download_audio: If True, download audio, metadata and labels files. Otherwise it will only download metadata and labels files. + defaults to True. :param exclude_removed_audio: If True, the dataset will exclude from the dataset the audio not downloaded from youtube (i.e. not present on disk). If False, invalid audios will return an empty tensor of shape (0,). defaults to True. - :param with_tags: If True, load the tags from AudioSet dataset. - Note: tags needs to be downloaded with download=True & with_tags=True before being used. - defaults to False. + :param ffmpeg_path: Path to ffmpeg executable file. + defaults to "ffmpeg". + :param flat_captions: If True, map captions to audio instead of audio to caption. + defaults to True. + :param max_workers: Number of threads to download audio files in parallel. + Do not use a value too high to avoid "Too Many Requests" error. + The value None will use `min(32, os.cpu_count() + 4)` workers, which is the default of ThreadPoolExecutor. + defaults to 1. :param sr: The sample rate used for audio files in the dataset (in Hz). Since original YouTube videos are recorded in various settings, this parameter allow to download allow audio files with a specific sample rate. defaults to 32000. - :param ffmpeg_path: Path to ffmpeg executable file. - defaults to "ffmpeg". - :param ytdl_path: Path to youtube-dl or ytdlp executable. - defaults to "youtube-dl". + :param verify_files: If True, check that all files already downloaded are valid. + defaults to False. + :param with_tags: If True, load the tags from AudioSet dataset. 
+ Note: tags needs to be downloaded with download=True & with_tags=True before being used. + defaults to False. + :param ytdlp_path: Path to yt-dlp or ytdlp executable. + defaults to "yt-dlp". """ if subset not in AudioCapsCard.SUBSETS: raise ValueError( @@ -180,34 +169,35 @@ def __init__( ) root = _get_root(root) + ytdlp_path = _get_ytdlp_path(ytdlp_path) ffmpeg_path = _get_ffmpeg_path(ffmpeg_path) - ytdl_path = _get_ytdl_path(ytdl_path) if download: - _prepare_audiocaps_dataset( + download_audiocaps_dataset( root=root, subset=subset, - sr=sr, - with_tags=with_tags, + force=force_download, verbose=verbose, - force=AudioCaps.FORCE_PREPARE_DATA, - ytdl_path=ytdl_path, + verify_files=verify_files, + audio_duration=audio_duration, + audio_format=audio_format, + audio_n_channels=audio_n_channels, + download_audio=download_audio, ffmpeg_path=ffmpeg_path, - audio_format=AudioCaps.AUDIO_FORMAT, - audio_duration=AudioCaps.AUDIO_DURATION, - n_channels=AudioCaps.AUDIO_N_CHANNELS, - verify_files=AudioCaps.VERIFY_FILES, - download_audio=AudioCaps.DOWNLOAD_AUDIO, + max_workers=max_workers, + sr=sr, + with_tags=with_tags, + ytdlp_path=ytdlp_path, ) - raw_data, index_to_tagname = _load_audiocaps_dataset( + raw_data, index_to_name = load_audiocaps_dataset( root=root, subset=subset, + verbose=verbose, + audio_format=audio_format, + exclude_removed_audio=exclude_removed_audio, sr=sr, with_tags=with_tags, - exclude_removed_audio=exclude_removed_audio, - verbose=verbose, - audio_format=AudioCaps.AUDIO_FORMAT, ) audio_subset_dpath = _get_audio_subset_dpath(root, subset, sr) size = len(next(iter(raw_data.values()))) @@ -239,9 +229,9 @@ def __init__( self._download = download self._exclude_removed_audio = exclude_removed_audio self._with_tags = with_tags - self._index_to_tagname = index_to_tagname + self._index_to_name = index_to_name - self.add_post_columns( + self.add_online_columns( { "audio": AudioCaps._load_audio, "audio_metadata": AudioCaps._load_audio_metadata, @@ -262,8 +252,8 @@ 
def exclude_removed_audio(self) -> bool: return self._exclude_removed_audio @property - def index_to_tagname(self) -> List[str]: - return self._index_to_tagname + def index_to_name(self) -> Dict[int, str]: + return self._index_to_name @property def root(self) -> str: @@ -281,28 +271,6 @@ def subset(self) -> str: def with_tags(self) -> bool: return self._with_tags - # Public class methods - @classmethod - def load_class_labels_indices( - cls, - root: str, - sr: int = 32_000, - ) -> List[Dict[str, str]]: - class_labels_indices_fpath = osp.join( - _get_audiocaps_dpath(root, sr), - _AUDIOSET_LINKS["class_labels_indices"]["fname"], - ) - if not osp.isfile(class_labels_indices_fpath): - raise ValueError( - f"Cannot find class_labels_indices file in root='{root}'." - f"Maybe use AudioCaps(root, download=True, with_tags=True) before or use a different root directory." - ) - - with open(class_labels_indices_fpath, "r") as file: - reader = csv.DictReader(file) - audioset_classes_data = list(reader) - return audioset_classes_data - # Magic methods def __repr__(self) -> str: repr_dic = { @@ -340,545 +308,3 @@ def _load_audio_metadata(self, idx: int) -> AudioMetaData: fpath = self.at(idx, "fpath") audio_metadata = torchaudio.info(fpath) # type: ignore return audio_metadata - - -def _get_audiocaps_dpath(root: str, sr: int) -> str: - return osp.join(root, "AUDIOCAPS") - - -def _get_audio_subset_dpath(root: str, subset: str, sr: int) -> str: - return osp.join( - _get_audiocaps_dpath(root, sr), - f"audio_{sr}Hz", - _AUDIOCAPS_AUDIO_DNAMES[subset], - ) - - -def _is_prepared(root: str, subset: str, sr: int, verbose: int) -> bool: - links = _AUDIOCAPS_LINKS[subset] - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(_get_audiocaps_dpath(root, sr), captions_fname) - audio_subset_dpath = _get_audio_subset_dpath(root, subset, sr) - - msgs = [] - - if not osp.isdir(audio_subset_dpath): - msgs.append(f"Cannot find directory '{audio_subset_dpath}'.") - if not 
osp.isfile(captions_fpath): - msgs.append(f"Cannot find file '{captions_fpath}'.") - - if verbose >= 0: - for msg in msgs: - pylog.warning(msg) - - return len(msgs) == 0 - - -def _load_audiocaps_dataset( - root: str, - subset: str, - sr: int, - with_tags: bool, - exclude_removed_audio: bool, - verbose: int, - audio_format: str = "flac", -) -> Tuple[Dict[str, List[Any]], List[str]]: - if not _is_prepared(root, subset, sr, verbose): - raise RuntimeError( - f"Cannot load data: audiocaps_{subset} is not prepared in data root={root}. Please use download=True in dataset constructor." - ) - - links = _AUDIOCAPS_LINKS[subset] - audiocaps_root = _get_audiocaps_dpath(root, sr) - audio_subset_dpath = _get_audio_subset_dpath(root, subset, sr) - - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(audiocaps_root, captions_fname) - with open(captions_fpath, "r") as file: - reader = csv.DictReader(file) - captions_data = list(reader) - - if with_tags: - class_labels_indices_fpath = osp.join( - audiocaps_root, _AUDIOSET_LINKS["class_labels_indices"]["fname"] - ) - unbal_tags_fpath = osp.join( - audiocaps_root, _AUDIOSET_LINKS["unbalanced"]["fname"] - ) - - if not all(map(osp.isfile, (class_labels_indices_fpath, unbal_tags_fpath))): - raise FileNotFoundError( - f"Cannot load tags without tags files '{osp.basename(class_labels_indices_fpath)}' and '{osp.basename(unbal_tags_fpath)}'." - f"Please use download=True and with_tags=True in dataset constructor." 
- ) - - audioset_classes_data = AudioCaps.load_class_labels_indices(root, sr) - - with open(unbal_tags_fpath, "r") as file: - fieldnames = ("YTID", "start_seconds", "end_seconds", "positive_labels") - reader = csv.DictReader( - file, fieldnames, skipinitialspace=True, strict=True - ) - # Skip the comments - for _ in range(3): - next(reader) - unbal_tags_data = list(reader) - else: - audioset_classes_data = [] - unbal_tags_data = [] - - # Build global mappings - fnames_dic = dict.fromkeys( - f"{line['youtube_id']}_{line['start_time']}.{audio_format}" - for line in captions_data - ) - audio_fnames_on_disk = dict.fromkeys(os.listdir(audio_subset_dpath)) - if exclude_removed_audio: - fnames_lst = [fname for fname in fnames_dic if fname in audio_fnames_on_disk] - is_on_disk_lst = [True for _ in range(len(fnames_lst))] - else: - fnames_lst = list(fnames_dic) - is_on_disk_lst = [fname in audio_fnames_on_disk for fname in fnames_lst] - - dataset_size = len(fnames_lst) - fname_to_idx = {fname: i for i, fname in enumerate(fnames_lst)} - - mid_to_tag_name = {} - tag_name_to_index = {} - - for line in audioset_classes_data: - # keys: index, mid, display_name - mid_to_tag_name[line["mid"]] = line["display_name"] - tag_name_to_index[line["display_name"]] = int(line["index"]) - - classes_indexes = list(tag_name_to_index.values()) - assert len(classes_indexes) == 0 or classes_indexes == list( - range(classes_indexes[-1] + 1) - ) - index_to_tagname = list(tag_name_to_index.keys()) - - # Process each field into a single structure - all_caps_dic: Dict[str, List[Any]] = { - key: [None for _ in range(dataset_size)] - for key in ("audiocaps_ids", "youtube_id", "start_time", "captions") - } - for line in tqdm.tqdm( - captions_data, - disable=verbose <= 0, - desc=f"Loading AudioCaps ({subset}) captions...", - ): - # audiocap_id, youtube_id, start_time, caption - audiocap_id = line["audiocap_id"] - youtube_id = line["youtube_id"] - start_time = line["start_time"] - caption = 
line["caption"] - - fname = f"{youtube_id}_{start_time}.{audio_format}" - if fname in fname_to_idx: - idx = fname_to_idx[fname] - - if all_caps_dic["start_time"][idx] is None: - all_caps_dic["start_time"][idx] = start_time - all_caps_dic["youtube_id"][idx] = youtube_id - all_caps_dic["audiocaps_ids"][idx] = [audiocap_id] - all_caps_dic["captions"][idx] = [caption] - else: - assert all_caps_dic["start_time"][idx] == start_time - assert all_caps_dic["youtube_id"][idx] == youtube_id - - all_caps_dic["audiocaps_ids"][idx].append(audiocap_id) - all_caps_dic["captions"][idx].append(caption) - - # Load tags from audioset data - all_tags_lst = [[] for _ in range(dataset_size)] - - for line in tqdm.tqdm( - unbal_tags_data, - disable=verbose <= 0, - desc="Loading AudioSet tags for AudioCaps...", - ): - # keys: YTID, start_seconds, end_seconds, positive_labels - youtube_id = line["YTID"] - # Note : In audioset, start_time is a string repr of a float value, audiocaps it is a string repr of an integer - start_time = int(float(line["start_seconds"])) - fname = f"{youtube_id}_{start_time}.{audio_format}" - if fname in fname_to_idx: - tags_mid = line["positive_labels"] - tags_mid = tags_mid.split(",") - tags_names = [mid_to_tag_name[tag] for tag in tags_mid] - tags_indexes = [tag_name_to_index[tag] for tag in tags_names] - - idx = fname_to_idx[fname] - all_tags_lst[idx] = tags_indexes - - raw_data = { - "fname": fnames_lst, - "tags": all_tags_lst, - "is_on_disk": is_on_disk_lst, - } - raw_data.update(all_caps_dic) - - # Convert audiocaps_ids and start_time to ints - raw_data["audiocaps_ids"] = [ - list(map(int, item)) for item in raw_data["audiocaps_ids"] - ] - raw_data["start_time"] = list(map(int, raw_data["start_time"])) - - if verbose >= 1: - pylog.info( - f"{AudioCapsCard.PRETTY_NAME}(subset={subset}) has been loaded. 
(len={len(fnames_lst)})" - ) - - return raw_data, index_to_tagname - - -def _prepare_audiocaps_dataset( - root: str, - subset: str, - sr: int, - with_tags: bool, - verbose: int, - force: bool = False, - ytdl_path: str = ..., - ffmpeg_path: str = ..., - audio_format: str = "flac", - audio_duration: float = 10.0, - n_channels: int = 1, - verify_files: bool = False, - download_audio: bool = True, -) -> None: - if not osp.isdir(root): - raise RuntimeError(f"Cannot find root directory '{root}'.") - - _check_ytdl(ytdl_path) - _check_ffmpeg(ffmpeg_path) - - if _is_prepared(root, subset, sr, -1) and not force: - return None - - links = _AUDIOCAPS_LINKS[subset] - audiocaps_root = _get_audiocaps_dpath(root, sr) - audio_subset_dpath = _get_audio_subset_dpath(root, subset, sr) - - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(audiocaps_root, captions_fname) - - os.makedirs(audio_subset_dpath, exist_ok=True) - - if not osp.isfile(captions_fpath): - url = links["captions"]["url"] - if url is None: - raise ValueError( - f"AudioCaps subset '{subset}' cannot be automatically downloaded. 
(found url={url})" - ) - download_url_to_file(url, captions_fpath, progress=verbose >= 1) - - if download_audio: - start = time.perf_counter() - with open(captions_fpath, "r") as file: - n_samples = len(file.readlines()) - - if verbose >= 1: - pylog.info(f"Start downloading audio files for {subset} AudioCaps split.") - - with open(captions_fpath, "r") as file: - # Download audio files - reader = csv.DictReader(file) - captions_data = list(reader) - - n_download_ok = 0 - n_download_err = 0 - n_already_ok = 0 - n_already_err = 0 - - for i, line in enumerate( - tqdm.tqdm(captions_data, total=n_samples, disable=verbose < 1) - ): - # Keys: audiocap_id, youtube_id, start_time, caption - audiocap_id, youtube_id, start_time = [ - line[key] for key in ("audiocap_id", "youtube_id", "start_time") - ] - fpath = osp.join( - audio_subset_dpath, - f"{youtube_id}_{start_time}.{audio_format}", - ) - if not start_time.isdigit(): - raise RuntimeError( - f'Start time "{start_time}" is not an integer (audiocap_id={audiocap_id}, youtube_id={youtube_id}).' - ) - start_time = int(start_time) - prefix = f"[{audiocap_id:6s};{i:5d}/{n_samples}] " - - if not osp.isfile(fpath): - success = _download_and_extract_from_youtube( - youtube_id=youtube_id, - fpath_out=fpath, - start_time=start_time, - duration=audio_duration, - sr=sr, - ytdl_path=ytdl_path, - ffmpeg_path=ffmpeg_path, - n_channels=n_channels, - ) - if success: - valid_file = _check_file(fpath, sr) - if valid_file: - if verbose >= 2: - pylog.debug( - f"{prefix}File '{youtube_id}' has been downloaded and verified." - ) - n_download_ok += 1 - else: - if verbose >= 2: - pylog.warning( - f"{prefix}File '{youtube_id}' has been downloaded but it is not valid and it will be removed." - ) - os.remove(fpath) - n_download_err += 1 - else: - if verbose >= 2: - pylog.warning( - f"{prefix}Cannot extract audio from {youtube_id}. 
(maybe the source video has been removed?)" - ) - n_download_err += 1 - - elif verify_files: - valid_file = _check_file(fpath, sr) - if valid_file: - if verbose >= 2: - pylog.info( - f"{prefix}File '{youtube_id}' is already downloaded and has been verified." - ) - n_already_ok += 1 - else: - if verbose >= 2: - pylog.warning( - f"{prefix}File '{youtube_id}' is already downloaded but it is not valid and will be removed." - ) - os.remove(fpath) - n_already_err += 1 - else: - if verbose >= 2: - pylog.debug( - f"{prefix}File '{youtube_id}' is already downloaded but it is not verified due to verify_files={verify_files}." - ) - n_already_ok += 1 - - if verbose >= 1: - duration = int(time.perf_counter() - start) - pylog.info( - f"Download and preparation of AudioCaps for subset '{subset}' done in {duration}s." - ) - pylog.info(f"- {n_download_ok} downloads success,") - pylog.info(f"- {n_download_err} downloads failed,") - pylog.info(f"- {n_already_ok} already downloaded,") - pylog.info(f"- {n_already_err} already downloaded errors,") - pylog.info(f"- {n_samples} total samples.") - - if with_tags: - for key in ("class_labels_indices", "unbalanced"): - infos = _AUDIOSET_LINKS[key] - url = infos["url"] - fname = infos["fname"] - fpath = osp.join(audiocaps_root, fname) - if not osp.isfile(fpath): - if verbose >= 1: - pylog.info(f"Downloading file '{fname}'...") - download_url_to_file(url, fpath, progress=verbose >= 1) - - if verbose >= 2: - pylog.debug( - f"Dataset {AudioCapsCard.PRETTY_NAME} (subset={subset}) has been prepared." 
- ) - - -def _download_and_extract_from_youtube( - youtube_id: str, - fpath_out: str, - start_time: int, - duration: float = 10.0, - sr: int = 16000, - n_channels: int = 1, - target_format: str = "flac", - acodec: str = "flac", - ytdl_path: str = ..., - ffmpeg_path: str = ..., -) -> bool: - """Download audio from youtube with youtube-dl and ffmpeg.""" - - # Get audio download link with youtube-dl, without start time - link = _get_youtube_link(youtube_id, None) - get_url_command = [ - ytdl_path, - "--youtube-skip-dash-manifest", - "-g", - link, - ] - try: - output = subprocess.check_output(get_url_command) - except (CalledProcessError, PermissionError): - return False - - output = output.decode() - lines = output.split("\n") - if len(lines) < 2: - return False - _video_link, audio_link = lines[:2] - - # Download and extract audio from audio_link to fpath_out with ffmpeg - extract_command = [ - ffmpeg_path, - # Input - "-i", - audio_link, - # Remove video - "-vn", - # Format (flac) - "-f", - target_format, - # Audio codec (flac) - "-acodec", - acodec, - # Get only 10s of the clip after start_time - "-ss", - str(start_time), - "-t", - str(duration), - # Resample to a specific rate (default to 32 kHz) - "-ar", - str(sr), - # Compute mean of 2 channels - "-ac", - str(n_channels), - fpath_out, - ] - try: - exitcode = subprocess.check_call(extract_command) - return exitcode == 0 - except (CalledProcessError, PermissionError): - return False - - -def _check_ytdl(ytdl_path: str) -> None: - try: - subprocess.check_call( - [ytdl_path, "--help"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - except (CalledProcessError, PermissionError, FileNotFoundError) as err: - pylog.error(f"Invalid ytdlp path '{ytdl_path}'. 
({err})") - raise err - - -def _check_ffmpeg(ffmpeg_path: str) -> None: - try: - subprocess.check_call( - [ffmpeg_path, "--help"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - except (CalledProcessError, PermissionError, FileNotFoundError) as err: - pylog.error(f"Invalid ffmpeg path '{ffmpeg_path}'. ({err})") - raise err - - -def _check_file(fpath: str, expected_sr: Optional[int]) -> bool: - try: - audio, sr = torchaudio.load(fpath) # type: ignore - except RuntimeError: - message = ( - f"Found file '{fpath}' already downloaded but it is invalid (cannot load)." - ) - pylog.error(message) - return False - - if audio.nelement() == 0: - message = ( - f"Found file '{fpath}' already downloaded but it is invalid (empty audio)." - ) - pylog.error(message) - return False - - if expected_sr is not None and sr != expected_sr: - message = f"Found file '{fpath}' already downloaded but it is invalid (invalid sr={sr} != {expected_sr})." - pylog.error(message) - return False - - return True - - -def _get_youtube_link(youtube_id: str, start_time: Optional[int]) -> str: - link = f"https://www.youtube.com/watch?v={youtube_id}" - if start_time is None: - return link - else: - return f"{link}&t={start_time}s" - - -def _get_youtube_link_embed( - youtube_id: str, start_time: Optional[int], duration: float = 10.0 -) -> str: - link = f"https://www.youtube.com/embed/{youtube_id}" - if start_time is None: - return link - else: - end_time = start_time + duration - return f"{link}?start={start_time}&end={end_time}" - - -# Audio directory names per subset -_AUDIOCAPS_AUDIO_DNAMES = { - "train": "train", - "val": "val", - "test": "test", - "train_v2": "train", -} - -# Archives and file links used to download AudioCaps labels and metadata -_AUDIOCAPS_LINKS = { - "train": { - "captions": { - "url": "https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/train.csv", - "fname": "train.csv", - }, - }, - "val": { - "captions": { - "url": 
"https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/val.csv", - "fname": "val.csv", - }, - }, - "test": { - "captions": { - "url": "https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/test.csv", - "fname": "test.csv", - }, - }, - "train_v2": { - "captions": { - "url": "https://raw.githubusercontent.com/Labbeti/aac-datasets/dev/data/train_v2.csv", - "fname": "train_v2.csv", - }, - }, -} - -# Archives and file links used to download AudioSet metadata -_AUDIOSET_LINKS = { - "class_labels_indices": { - "fname": "class_labels_indices.csv", - "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv", - }, - "eval": { - "fname": "eval_segments.csv", - "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv", - }, - "balanced": { - "fname": "balanced_train_segments.csv", - "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv", - }, - "unbalanced": { - "fname": "unbalanced_train_segments.csv", - "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv", - }, -} diff --git a/src/aac_datasets/datasets/base.py b/src/aac_datasets/datasets/base.py index d1b463e..d3a93c8 100644 --- a/src/aac_datasets/datasets/base.py +++ b/src/aac_datasets/datasets/base.py @@ -24,7 +24,7 @@ from typing_extensions import TypedDict try: - # To support torchaudio >=2.1.0 + # To support torchaudio >= 2.1.0 from torchaudio import AudioMetaData # type: ignore except ImportError: from torchaudio.backend.common import AudioMetaData @@ -36,10 +36,6 @@ pylog = logging.getLogger(__name__) -class DatasetCard: - pass - - ItemType = TypeVar("ItemType", bound=TypedDict, covariant=True) @@ -78,7 +74,7 @@ def __init__( self._sr = sr self._verbose = verbose - self._post_columns_fns = {} + self._online_fns = {} self._sizes = [] if self._flat_captions: @@ -87,13 +83,20 @@ def __init__( @staticmethod def new_empty() -> 
"AACDataset": """Create a new empty dataset.""" - return AACDataset({}, None, (), False, None, 0) + return AACDataset( + raw_data={}, + transform=None, + column_names=(), + flat_captions=False, + sr=None, + verbose=0, + ) # Properties @property def all_columns(self) -> List[str]: """The name of all columns of the dataset.""" - return list(self._raw_data | self._post_columns_fns) + return list(self._raw_data | self._online_fns) @property def column_names(self) -> List[str]: @@ -188,7 +191,7 @@ def at( ) elif idx.is_floating_point(): raise TypeError( - f"Invalid tensor dtype. (found floating-point tensor but expected integer tensor)" + "Invalid tensor dtype. (found floating-point tensor but expected integer tensor)" ) idx = idx.tolist() @@ -198,7 +201,9 @@ def at( if not isinstance(column, str) and isinstance(column, Iterable): return {column_i: self.at(idx, column_i) for column_i in column} - if isinstance(idx, (int, slice)) and column in self._raw_data.keys(): + if isinstance(idx, (int, slice)) and ( + column in self._raw_data.keys() and column not in self._online_fns + ): return self._raw_data[column][idx] # type: ignore if isinstance(idx, slice): @@ -229,7 +234,7 @@ def at( return values if isinstance(idx, int): - return self._load_auto_value(column, idx) + return self._load_online_value(column, idx) else: IDX_TYPES = ("int", "Iterable[int]", "None", "slice", "Tensor") raise TypeError( @@ -242,7 +247,7 @@ def has_raw_column(self, column: str) -> bool: def has_post_column(self, column: str) -> bool: """Returns True if column name exists in post processed data.""" - return column in self._post_columns_fns + return column in self._online_fns def has_column(self, column: str) -> bool: """Returns True if column name exists in data.""" @@ -253,8 +258,8 @@ def remove_column(self, column: str) -> Union[List[Any], Callable]: if column in self._raw_data: column_data = self._raw_data.pop(column, []) return column_data - elif column in self._post_columns_fns: - fn = 
self._post_columns_fns.pop(column) + elif column in self._online_fns: + fn = self._online_fns.pop(column) return fn else: raise ValueError(f"Column '{column}' does not exists in dataset.") @@ -271,7 +276,7 @@ def rename_column( if isinstance(column_data_or_fn, List): self.add_raw_column(new_column, column_data_or_fn, allow_replace) elif isinstance(column_data_or_fn, Callable): - self.add_post_column(new_column, column_data_or_fn, allow_replace) + self.add_online_column(new_column, column_data_or_fn, allow_replace) else: raise TypeError( f"Invalid type {type(column_data_or_fn)}. (expected List or Callable)" @@ -292,46 +297,46 @@ def add_raw_column( raise ValueError(f"Invalid number of rows in column '{column}'.") self._raw_data[column] = column_data - def add_post_column( + def add_online_column( self, column: str, load_fn: Callable[[Any, int], Any], allow_replace: bool = False, ) -> None: """Add a new post-processed column to this dataset.""" - if not allow_replace and column in self._post_columns_fns: + if not allow_replace and column in self._online_fns: raise ValueError( f"Column '{column}' already exists in {self} and found argument allow_replace={allow_replace}." 
) - self._post_columns_fns[column] = load_fn + self._online_fns[column] = load_fn - def add_post_columns( + def add_online_columns( self, post_columns_fns: Dict[str, Callable[[Any, int], Any]], allow_replace: bool = False, ) -> None: """Add several new post-processed columns to this dataset.""" for name, load_fn in post_columns_fns.items(): - self.add_post_column(name, load_fn, allow_replace) + self.add_online_column(name, load_fn, allow_replace) - def load_post_column( + def preload_online_column( self, column: str, allow_replace: bool = False, ) -> Callable[[Any, int], Any]: """Load all data from a post-column data into raw data.""" - if column not in self._post_columns_fns: + if column not in self._online_fns: raise ValueError(f"Invalid argument column={column}.") column_data = [ - self._load_auto_value(column, i) + self._load_online_value(column, i) for i in tqdm.trange( len(self), disable=self._verbose < 2, desc=f"Preloading column '{column}'", ) ] - fn = self._post_columns_fns.pop(column) + fn = self._online_fns.pop(column) self.add_raw_column(column, column_data, allow_replace=allow_replace) return fn @@ -416,9 +421,9 @@ def _unflat_raw_data(self) -> None: raw_data = _unflat_raw_data(self._raw_data, self._sizes) self._raw_data = raw_data - def _load_auto_value(self, column: str, idx: int) -> Any: - if column in self._post_columns_fns: - fn = self._post_columns_fns[column] + def _load_online_value(self, column: str, idx: int) -> Any: + if column in self._online_fns: + fn = self._online_fns[column] return fn(self, idx) else: raise ValueError( @@ -427,7 +432,8 @@ def _load_auto_value(self, column: str, idx: int) -> Any: def _load_audio(self, idx: int) -> Tensor: fpath = self.at(idx, "fpath") - audio, sr = torchaudio.load(fpath) # type: ignore + audio_and_sr: Tuple[Tensor, int] = torchaudio.load(fpath) # type: ignore + audio, sr = audio_and_sr # Sanity check if audio.nelement() == 0: @@ -447,7 +453,7 @@ def _load_audio_metadata(self, idx: int) -> AudioMetaData: 
return audio_metadata def _load_duration(self, idx: int) -> float: - audio_metadata = self.at(idx, "audio_metadata") + audio_metadata: AudioMetaData = self.at(idx, "audio_metadata") duration = audio_metadata.num_frames / audio_metadata.sample_rate return duration diff --git a/src/aac_datasets/datasets/clotho.py b/src/aac_datasets/datasets/clotho.py index 17f870f..96f048d 100644 --- a/src/aac_datasets/datasets/clotho.py +++ b/src/aac_datasets/datasets/clotho.py @@ -1,37 +1,35 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import copy -import csv import logging -import os import os.path as osp +from pathlib import Path from typing import ( - Any, Callable, ClassVar, - Dict, List, Optional, - Tuple, + Union, ) -from zipfile import ZipFile -from py7zr import SevenZipFile from torch import Tensor -from torch.hub import download_url_to_file from typing_extensions import TypedDict, NotRequired -from aac_datasets.datasets.base import AACDataset, DatasetCard -from aac_datasets.utils.download import hash_file -from aac_datasets.utils.paths import _get_root +from aac_datasets.datasets.base import AACDataset +from aac_datasets.datasets.functional.clotho import ( + ClothoCard, + load_clotho_dataset, + download_clotho_dataset, + _get_audio_subset_dpath, +) +from aac_datasets.utils.globals import _get_root pylog = logging.getLogger(__name__) -class ClothoItem(TypedDict, total=True): +class ClothoItem(TypedDict): r"""Class representing a single Clotho item.""" # Common attributes @@ -51,53 +49,10 @@ class ClothoItem(TypedDict, total=True): license: NotRequired[str] -class ClothoCard(DatasetCard): - ANNOTATIONS_CREATORS: Tuple[str, ...] 
= ("crowdsourced",) - CAPTIONS_PER_AUDIO = { - "dev": 5, - "val": 5, - "eval": 5, - "dcase_aac_test": 0, - "dcase_aac_analysis": 0, - "dcase_t2a_audio": 0, - "dcase_t2a_captions": 1, - } - CITATION: str = r""" - @inproceedings{Drossos_2020_icassp, - author = "Drossos, Konstantinos and Lipping, Samuel and Virtanen, Tuomas", - title = "Clotho: an Audio Captioning Dataset", - booktitle = "Proc. IEEE Int. Conf. Acoustic., Speech and Signal Process. (ICASSP)", - year = "2020", - pages = "736-740", - abstract = "Audio captioning is the novel task of general audio content description using free text. It is an intermodal translation task (not speech-to-text), where a system accepts as an input an audio signal and outputs the textual description (i.e. the caption) of that signal. In this paper we present Clotho, a dataset for audio captioning consisting of 4981 audio samples of 15 to 30 seconds duration and 24 905 captions of eight to 20 words length, and a baseline method to provide initial results. Clotho is built with focus on audio content and caption diversity, and the splits of the data are not hampering the training or evaluation of methods. All sounds are from the Freesound platform, and captions are crowdsourced using Amazon Mechanical Turk and annotators from English speaking countries. Unique words, named entities, and speech transcription are removed with post-processing. Clotho is freely available online (https://zenodo.org/record/3490684)." - } - """ - HOMEPAGE: str = "https://zenodo.org/record/3490684" - LANGUAGE: Tuple[str, ...] = ("en",) - LANGUAGE_DETAILS: Tuple[str, ...] = ("en-US",) - DEFAULT_VERSION: str = "v2.1" - NAME: str = "clotho" - N_CHANNELS: int = 1 - PRETTY_NAME: str = "Clotho" - SAMPLE_RATE: int = 44_100 # Hz - SIZE_CATEGORIES: Tuple[str, ...] = ("1K None: """ @@ -170,10 +128,17 @@ def __init__( defaults to False. :param transform: The transform to apply to the global dict item. 
This transform is applied only in getitem method when argument is an integer. defaults to None. - :param flat_captions: If True, map captions to audio instead of audio to caption. - defaults to True. :param verbose: Verbose level to use. Can be 0 or 1. defaults to 0. + :param force_download: If True, force to re-download file even if they exists on disk. + defaults to False. + :param verify_files: If True, check hash value when possible. + defaults to False. + + :param clean_archives: If True, remove the compressed archives from disk to save space. + defaults to True. + :param flat_captions: If True, map captions to audio instead of audio to caption. + defaults to True. :param version: The version of the dataset. Can be one of :attr:`~ClothoCard.versions`. defaults to 'v2.1'. """ @@ -188,23 +153,22 @@ def __init__( f"Please consider using the fixed version 'v2.1'." ) - subsets = tuple(_CLOTHO_LINKS[version].keys()) - if subset not in subsets: + if subset not in ClothoCard.SUBSETS: raise ValueError( - f"Invalid Clotho argument subset={subset} for version={version}. Must be one of {subsets}." + f"Invalid Clotho argument subset={subset} for version={version}. Must be one of {ClothoCard.SUBSETS}." 
) root = _get_root(root) if download: - _prepare_clotho_dataset( + download_clotho_dataset( root=root, - version=version, subset=subset, + force=force_download, verbose=verbose, - force=Clotho.FORCE_PREPARE_DATA, - verify_files=Clotho.VERIFY_FILES, - clean_archives=Clotho.CLEAN_ARCHIVES, + verify_files=verify_files, + clean_archives=clean_archives, + version=version, ) # Exclude some columns containing empty values for several subsets @@ -242,8 +206,11 @@ def __init__( for name in removed_columns: column_names.remove(name) - raw_data = _load_clotho_dataset( - root=root, version=version, subset=subset, verbose=verbose + raw_data = load_clotho_dataset( + root=root, + subset=subset, + verbose=verbose, + version=version, ) size = len(next(iter(raw_data.values()))) @@ -274,7 +241,7 @@ def __init__( self._version = version if "audio" not in removed_columns: - self.add_post_columns( + self.add_online_columns( { "audio": Clotho._load_audio, "audio_metadata": Clotho._load_audio_metadata, @@ -316,584 +283,3 @@ def __repr__(self) -> str: } repr_str = ", ".join(f"{k}={v}" for k, v in repr_dic.items()) return f"{ClothoCard.PRETTY_NAME}({repr_str})" - - -def _get_clotho_dpath(root: str, version: str) -> str: - return osp.join(root, f"CLOTHO_{version}") - - -def _get_archives_dpath(root: str, version: str) -> str: - return osp.join(_get_clotho_dpath(root, version), "archives") - - -def _get_audio_dpath(root: str, version: str) -> str: - return osp.join(_get_clotho_dpath(root, version), "clotho_audio_files") - - -def _get_csv_dpath(root: str, version: str) -> str: - return osp.join(_get_clotho_dpath(root, version), "clotho_csv_files") - - -def _get_audio_subset_dpath(root: str, version: str, subset: str) -> Optional[str]: - dname = _CLOTHO_AUDIO_DNAMES[subset] - if dname is None: - return None - - return osp.join( - _get_clotho_dpath(root, version), - "clotho_audio_files", - dname, - ) - - -def _is_prepared(root: str, version: str, subset: str) -> bool: - audio_dpath = 
_get_audio_dpath(root, version) - csv_dpath = _get_csv_dpath(root, version) - if not all(map(osp.isdir, (audio_dpath, csv_dpath))): - return False - - if ClothoCard.CAPTIONS_PER_AUDIO[subset] == 0: - return True - if _CLOTHO_AUDIO_DNAMES[subset] is None: - return True - - links = _CLOTHO_LINKS[version][subset] - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(csv_dpath, captions_fname) - with open(captions_fpath, "r") as file: - reader = csv.DictReader(file) - lines = list(reader) - - audio_subset_dpath = _get_audio_subset_dpath(root, version, subset) - return len(lines) == len(os.listdir(audio_subset_dpath)) - - -def _load_clotho_dataset( - root: str, - version: str, - subset: str, - verbose: int, -) -> Dict[str, List[Any]]: - if not _is_prepared(root, version, subset): - raise RuntimeError( - f"Cannot load data: clotho_{subset} is not prepared in data root={root}. Please use download=True in dataset constructor." - ) - - # Read fpath of .wav audio files - links = _CLOTHO_LINKS[version][subset] - csv_dpath = _get_csv_dpath(root, version) - - # Read Clotho files - if "captions" in links.keys(): - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(csv_dpath, captions_fname) - - # Keys: file_name, caption_1, caption_2, caption_3, caption_4, caption_5 - with open(captions_fpath, "r") as file: - reader = csv.DictReader(file) - captions_data = list(reader) - - if subset == "dcase_t2a_captions": - captions_data = [ - data | {"file_name": f"no_fname_{i}"} - for i, data in enumerate(captions_data) - ] - - else: - captions_data = [] - - if "metadata" in links.keys(): - metadata_fname = links["metadata"]["fname"] - metadata_fpath = osp.join(csv_dpath, metadata_fname) - - # Keys: file_name, keywords, sound_id, sound_link, start_end_samples, manufacturer, license - if version in ("v2", "v2.1"): - encoding = "ISO-8859-1" - else: - encoding = None - - with open(metadata_fpath, "r", encoding=encoding) as file: - delimiter = ";" if 
subset == "dcase_aac_test" else "," - reader = csv.DictReader(file, delimiter=delimiter) - metadata = list(reader) - else: - metadata = [] - - if "captions" in links.keys(): - # note: "dev", "val", "eval" - fnames_lst = [line["file_name"] for line in captions_data] - elif "metadata" in links.keys(): - # note: for "dcase_aac_test" subset which do not have captions CSV file - fnames_lst = [line["file_name"] for line in metadata] - else: - # note 1: for "dcase_aac_analysis" subset which do not have any CSV file - # note 2: force sorted list to have the same order on all OS - audio_subset_dpath = _get_audio_subset_dpath(root, version, subset) - fnames_lst = list(sorted(os.listdir(audio_subset_dpath))) - - idx_to_fname = {i: fname for i, fname in enumerate(fnames_lst)} - fname_to_idx = {fname: i for i, fname in idx_to_fname.items()} - dataset_size = len(fnames_lst) - - # Process each item field - if len(metadata) > 0: - subset_metadata_keys = [key for key in _METADATA_KEYS if key in metadata[0]] - else: - subset_metadata_keys = [] - - all_captions_lst = [[] for _ in range(dataset_size)] - - if subset != "dcase_t2a_captions": - captions_keys = _CAPTIONS_KEYS - else: - captions_keys = ("caption",) - - for line in captions_data: - fname = line["file_name"] - idx = fname_to_idx[fname] - all_captions_lst[idx] = [line[caption_key] for caption_key in captions_keys] - - all_metadata_dic: Dict[str, List[Any]] = { - key: [None for _ in range(dataset_size)] for key in subset_metadata_keys - } - for line in metadata: - fname = line["file_name"] - if fname not in fname_to_idx: - raise KeyError( - f"Cannot find metadata fname={fname} in captions file. (subset={subset})" - ) - idx = fname_to_idx[fname] - for key in subset_metadata_keys: - # The test subset does not have keywords in metadata, but has sound_id, sound_link, etc. 
- if key in line: - all_metadata_dic[key][idx] = line[key] - - raw_data = { - "fname": fnames_lst, - "captions": all_captions_lst, - } - raw_data.update(all_metadata_dic) - - if "keywords" in raw_data: - # Split keywords into List[str] - raw_data["keywords"] = [ - keywords.split(";") if keywords is not None else [] - for keywords in raw_data["keywords"] - ] - - if subset == "dcase_t2a_audio": - # Temporary patch to avoid file loading errors - # indexes: 53, 521, 677 - replaces = { - "raindrops on metal: police background.wav": "raindrops on metal_police background.wav", - "Intersection Wet : Metro Pass.wav": "Intersection Wet_Metro Pass.wav", - "Kitchen Roomtone w: Dripping Faucet_1-2.wav": "Kitchen Roomtone w_Dripping Faucet_1-2.wav", - } - raw_data["fname"] = [replaces.get(fname, fname) for fname in raw_data["fname"]] - - if verbose >= 1: - pylog.info( - f"Dataset {ClothoCard.PRETTY_NAME} ({subset}) has been loaded. (size={len(next(iter(raw_data.values())))})" - ) - return raw_data - - -def _prepare_clotho_dataset( - root: str, - version: str, - subset: str, - verbose: int, - force: bool, - verify_files: bool, - clean_archives: bool, -) -> None: - if not osp.isdir(root): - raise RuntimeError(f"Cannot find root directory '{root}'.") - - archives_dpath = _get_archives_dpath(root, version) - audio_dpath = _get_audio_dpath(root, version) - csv_dpath = _get_csv_dpath(root, version) - - for dpath in (archives_dpath, audio_dpath, csv_dpath): - os.makedirs(dpath, exist_ok=True) - - if verbose >= 1: - pylog.info(f"Start to download files for clotho_{subset}...") - - links = copy.deepcopy(_CLOTHO_LINKS[version][subset]) - EXTENSIONS = ("7z", "csv", "zip") - - # Download csv and 7z files - for file_info in links.values(): - fname, url, hash_value = ( - file_info["fname"], - file_info["url"], - file_info["hash_value"], - ) - extension = fname.split(".")[-1] - - if extension in ("7z", "zip"): - dpath = archives_dpath - elif extension == "csv": - dpath = csv_dpath - else: - 
raise RuntimeError( - f"Found invalid extension={extension}. Must be one of {EXTENSIONS}." - ) - - fpath = osp.join(dpath, fname) - if not osp.isfile(fpath) or force: - if verbose >= 1: - pylog.info(f"Download and check file '{fname}' from url={url}...") - - download_url_to_file( - url, - fpath, - progress=verbose >= 1, - ) - - elif verbose >= 1: - pylog.info(f"File '{fname}' is already downloaded.") - - if verify_files: - hash_value = file_info["hash_value"] - file_hash_value = hash_file(fpath, hash_type="md5") - if file_hash_value != hash_value: - raise RuntimeError( - f"Invalid checksum for file '{fname}'. (expected md5 checksum '{hash_value}' but found '{file_hash_value}')\n" - f"Please try to remove manually the file '{fpath}' and rerun MACS download." - ) - elif verbose >= 2: - pylog.debug(f"File '{fname}' has a valid checksum.") - - # Extract audio files from archives - audio_subset_dpath = _get_audio_subset_dpath(root, version, subset) - if audio_subset_dpath is not None: - for file_info in links.values(): - fname = file_info["fname"] - extension = fname.split(".")[-1] - - if extension == "csv": - continue - - if extension not in ("7z", "zip"): - pylog.error( - f"Found unexpected extension={extension} for downloaded file '{fname}'. Expected one of {EXTENSIONS}." 
- ) - continue - - fpath = osp.join(archives_dpath, fname) - - if verbose >= 1: - pylog.info(f"Extract archive file fname={fname}...") - - if extension == "7z": - archive_file = SevenZipFile(fpath) - compressed_fnames = [ - osp.basename(fname) for fname in archive_file.getnames() - ] - elif extension == "zip": - archive_file = ZipFile(fpath) - compressed_fnames = [ - osp.basename(file.filename) for file in archive_file.filelist - ] - else: - raise RuntimeError(f"Invalid extension '{extension}'.") - - # Ignore dir name from archive file - compressed_fnames = [ - fname for fname in compressed_fnames if fname.endswith(".wav") - ] - extracted_fnames = ( - os.listdir(audio_subset_dpath) if osp.isdir(audio_subset_dpath) else [] - ) - - if set(extracted_fnames) != set(compressed_fnames): - # For dcase_t2a_audio subset, the name of the audio dname is also "test", so we need to move the audio files to another folder named "test_retrieval_audio". - if subset == "dcase_t2a_audio": - target_dpath = audio_subset_dpath - os.makedirs(target_dpath, exist_ok=True) - else: - target_dpath = audio_dpath - - archive_file.extractall(target_dpath) - - if subset == "dcase_t2a_audio": - extracted_dpath = osp.join(target_dpath, "test") - for fname in os.listdir(extracted_dpath): - os.rename( - osp.join(extracted_dpath, fname), - osp.join(target_dpath, fname), - ) - os.rmdir(extracted_dpath) - - # Check if files is good now - extracted_fnames = os.listdir(audio_subset_dpath) - if set(extracted_fnames) != set(compressed_fnames): - found_but_not_expected = len( - set(extracted_fnames).difference(set(compressed_fnames)) - ) - expected_but_not_found = len( - set(compressed_fnames).difference(set(extracted_fnames)) - ) - - raise RuntimeError( - f"Invalid number of audios extracted, found {len(extracted_fnames)} files but expected the same {len(compressed_fnames)} files. 
" - f"(with found_but_not_expected={found_but_not_expected} and expected_but_not_found={expected_but_not_found})" - ) - - archive_file.close() - - if clean_archives: - for file_info in links.values(): - fname = file_info["fname"] - extension = fname.split(".")[-1] - if extension not in ("7z", "zip"): - continue - - fpath = osp.join(archives_dpath, fname) - if verbose >= 1: - pylog.info(f"Removing archive file {osp.basename(fpath)}...") - os.remove(fpath) - - if verbose >= 2: - pylog.debug(f"Dataset {ClothoCard.PRETTY_NAME} ({subset}) has been prepared.") - - -# Audio directory names per subset -_CLOTHO_AUDIO_DNAMES = { - "dev": "development", - "val": "validation", - "eval": "evaluation", - "dcase_aac_test": "test", - "dcase_aac_analysis": "clotho_analysis", - "dcase_t2a_audio": "test_retrieval_audio", - "dcase_t2a_captions": None, -} - -# Archives and file links used to download Clotho -_CLOTHO_LINKS = { - "v1": { - "dev": { - "audio_archive": { - "fname": "clotho_audio_development.7z", - "url": "https://zenodo.org/record/3490684/files/clotho_audio_development.7z?download=1", - "hash_value": "e3ce88561b317cc3825e8c861cae1ec6", - }, - "captions": { - "fname": "clotho_captions_development.csv", - "url": "https://zenodo.org/record/3490684/files/clotho_captions_development.csv?download=1", - "hash_value": "dd568352389f413d832add5cf604529f", - }, - "metadata": { - "fname": "clotho_metadata_development.csv", - "url": "https://zenodo.org/record/3490684/files/clotho_metadata_development.csv?download=1", - "hash_value": "582c18ee47cebdbe33dce1feeab53a56", - }, - }, - "eval": { - "audio_archive": { - "fname": "clotho_audio_evaluation.7z", - "url": "https://zenodo.org/record/3490684/files/clotho_audio_evaluation.7z?download=1", - "hash_value": "4569624ccadf96223f19cb59fe4f849f", - }, - "captions": { - "fname": "clotho_captions_evaluation.csv", - "url": "https://zenodo.org/record/3490684/files/clotho_captions_evaluation.csv?download=1", - "hash_value": 
"1b16b9e57cf7bdb7f13a13802aeb57e2", - }, - "metadata": { - "fname": "clotho_metadata_evaluation.csv", - "url": "https://zenodo.org/record/3490684/files/clotho_metadata_evaluation.csv?download=1", - "hash_value": "13946f054d4e1bf48079813aac61bf77", - }, - }, - "test": { - "audio_archive": { - "fname": "clotho_audio_test.7z", - "url": "https://zenodo.org/record/3865658/files/clotho_audio_test.7z?download=1", - "hash_value": "9b3fe72560a621641ff4351ba1154349", - }, - "metadata": { - "fname": "clotho_metadata_test.csv", - "url": "https://zenodo.org/record/3865658/files/clotho_metadata_test.csv?download=1", - "hash_value": "52f8ad01c229a310a0ff8043df480e21", - }, - }, - }, - "v2": { - "dev": { - "audio_archive": { - "fname": "clotho_audio_development.7z", - "url": "https://zenodo.org/record/4743815/files/clotho_audio_development.7z?download=1", - "hash_value": "eda144a5e05a60e6d2e37a65fc4720a9", - }, - "captions": { - "fname": "clotho_captions_development.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_captions_development.csv?download=1", - "hash_value": "800633304e73d3daed364a2ba6069827", - }, - "metadata": { - "fname": "clotho_metadata_development.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_metadata_development.csv?download=1", - "hash_value": "5fdc51b4c4f3468ff7d251ea563588c9", - }, - }, - "val": { - "audio_archive": { - "fname": "clotho_audio_validation.7z", - "url": "https://zenodo.org/record/4743815/files/clotho_audio_validation.7z?download=1", - "hash_value": "0475bfa5793e80f748d32525018ebada", - }, - "captions": { - "fname": "clotho_captions_validation.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_captions_validation.csv?download=1", - "hash_value": "3109c353138a089c7ba724f27d71595d", - }, - "metadata": { - "fname": "clotho_metadata_validation.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_metadata_validation.csv?download=1", - "hash_value": "f69cfacebcd47c4d8d30d968f9865475", - }, - }, - 
"eval": { - "audio_archive": { - "fname": "clotho_audio_evaluation.7z", - "url": "https://zenodo.org/record/4743815/files/clotho_audio_evaluation.7z?download=1", - "hash_value": "4569624ccadf96223f19cb59fe4f849f", - }, - "captions": { - "fname": "clotho_captions_evaluation.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_captions_evaluation.csv?download=1", - "hash_value": "1b16b9e57cf7bdb7f13a13802aeb57e2", - }, - "metadata": { - "fname": "clotho_metadata_evaluation.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_metadata_evaluation.csv?download=1", - "hash_value": "13946f054d4e1bf48079813aac61bf77", - }, - }, - "dcase_aac_test": { - "audio_archive": { - "fname": "clotho_audio_test.7z", - "url": "https://zenodo.org/record/3865658/files/clotho_audio_test.7z?download=1", - "hash_value": "9b3fe72560a621641ff4351ba1154349", - }, - "metadata": { - "fname": "clotho_metadata_test.csv", - "url": "https://zenodo.org/record/3865658/files/clotho_metadata_test.csv?download=1", - "hash_value": "52f8ad01c229a310a0ff8043df480e21", - }, - }, - }, - "v2.1": { - "dev": { - "audio_archive": { - "fname": "clotho_audio_development.7z", - "url": "https://zenodo.org/record/4783391/files/clotho_audio_development.7z?download=1", - "hash_value": "c8b05bc7acdb13895bb3c6a29608667e", - }, - "captions": { - "fname": "clotho_captions_development.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_captions_development.csv?download=1", - "hash_value": "d4090b39ce9f2491908eebf4d5b09bae", - }, - "metadata": { - "fname": "clotho_metadata_development.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_metadata_development.csv?download=1", - "hash_value": "170d20935ecfdf161ce1bb154118cda5", - }, - }, - "val": { - "audio_archive": { - "fname": "clotho_audio_validation.7z", - "url": "https://zenodo.org/record/4783391/files/clotho_audio_validation.7z?download=1", - "hash_value": "7dba730be08bada48bd15dc4e668df59", - }, - "captions": { - "fname": 
"clotho_captions_validation.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_captions_validation.csv?download=1", - "hash_value": "5879e023032b22a2c930aaa0528bead4", - }, - "metadata": { - "fname": "clotho_metadata_validation.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_metadata_validation.csv?download=1", - "hash_value": "2e010427c56b1ce6008b0f03f41048ce", - }, - }, - "eval": { - "audio_archive": { - "fname": "clotho_audio_evaluation.7z", - "url": "https://zenodo.org/record/4783391/files/clotho_audio_evaluation.7z?download=1", - "hash_value": "4569624ccadf96223f19cb59fe4f849f", - }, - "captions": { - "fname": "clotho_captions_evaluation.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_captions_evaluation.csv?download=1", - "hash_value": "1b16b9e57cf7bdb7f13a13802aeb57e2", - }, - "metadata": { - "fname": "clotho_metadata_evaluation.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_metadata_evaluation.csv?download=1", - "hash_value": "13946f054d4e1bf48079813aac61bf77", - }, - }, - "dcase_aac_test": { - "audio_archive": { - "fname": "clotho_audio_test.7z", - "url": "https://zenodo.org/record/3865658/files/clotho_audio_test.7z?download=1", - "hash_value": "9b3fe72560a621641ff4351ba1154349", - }, - "metadata": { - "fname": "clotho_metadata_test.csv", - "url": "https://zenodo.org/record/3865658/files/clotho_metadata_test.csv?download=1", - "hash_value": "52f8ad01c229a310a0ff8043df480e21", - }, - }, - "dcase_aac_analysis": { - "audio_archive": { - "fname": "clotho_analysis_2022.zip", - "url": "https://zenodo.org/record/6610709/files/clotho_analysis_2022.zip?download=1", - "hash_value": "7e8fa4762cc3a7c5546606680b958d08", - }, - }, - "dcase_t2a_audio": { - "audio_archive": { - "fname": "retrieval_audio.7z", - "url": "https://zenodo.org/record/6590983/files/retrieval_audio.7z?download=1", - "hash_value": "24102395fd757c462421a483fba5c407", - }, - "metadata": { - "fname": "retrieval_audio_metadata.csv", - "url": 
"https://zenodo.org/record/6590983/files/retrieval_audio_metadata.csv?download=1", - "hash_value": "1301db07acbf1e4fabc467eb54e0d353", - }, - }, - "dcase_t2a_captions": { - "captions": { - "fname": "retrieval_captions.csv", - "url": "https://zenodo.org/record/6590983/files/retrieval_captions.csv?download=1", - "hash_value": "f9e810118be00c64ea8cd7557816d4fe", - }, - }, - }, -} - -# Captions column names in CSV files -_CAPTIONS_KEYS = ( - "caption_1", - "caption_2", - "caption_3", - "caption_4", - "caption_5", -) - -# Metadata column names in CSV files -_METADATA_KEYS = ( - "keywords", - "sound_id", - "sound_link", - "start_end_samples", - "manufacturer", - "license", -) diff --git a/src/aac_datasets/datasets/functional/__init__.py b/src/aac_datasets/datasets/functional/__init__.py new file mode 100644 index 0000000..721c818 --- /dev/null +++ b/src/aac_datasets/datasets/functional/__init__.py @@ -0,0 +1,7 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +from .audiocaps import AudioCapsCard, load_audiocaps_dataset, download_audiocaps_dataset +from .clotho import ClothoCard, load_clotho_dataset, download_clotho_dataset +from .macs import MACSCard, load_macs_dataset, download_macs_dataset +from .wavcaps import WavCapsCard, load_wavcaps_dataset, download_wavcaps_dataset diff --git a/src/aac_datasets/datasets/functional/audiocaps.py b/src/aac_datasets/datasets/functional/audiocaps.py new file mode 100644 index 0000000..7e01126 --- /dev/null +++ b/src/aac_datasets/datasets/functional/audiocaps.py @@ -0,0 +1,821 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import csv +import logging +import os +import os.path as osp +import subprocess +import time + +from concurrent.futures import ThreadPoolExecutor +from pathlib import Path +from subprocess import CalledProcessError +from typing import ( + Any, + Dict, + Iterable, + List, + Optional, + Tuple, + Union, +) + +import torchaudio +import tqdm + +from aac_datasets.datasets.functional.common import DatasetCard 
+from aac_datasets.utils.audioset_mapping import ( + download_audioset_mapping, + load_audioset_mapping, +) +from aac_datasets.utils.download import download_file +from aac_datasets.utils.globals import _get_root, _get_ffmpeg_path, _get_ytdlp_path + + +pylog = logging.getLogger(__name__) + + +class AudioCapsCard(DatasetCard): + ANNOTATIONS_CREATORS: Tuple[str, ...] = ("crowdsourced",) + CAPTIONS_PER_AUDIO: Dict[str, int] = { + "train": 1, + "val": 5, + "test": 5, + "train_v2": 1, + } + CITATION: str = r""" + @inproceedings{kim_etal_2019_audiocaps, + title = {{A}udio{C}aps: Generating Captions for Audios in The Wild}, + author = {Kim, Chris Dongjoo and Kim, Byeongchang and Lee, Hyunmin and Kim, Gunhee}, + year = 2019, + month = jun, + booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, + publisher = {Association for Computational Linguistics}, + address = {Minneapolis, Minnesota}, + pages = {119--132}, + doi = {10.18653/v1/N19-1011}, + url = {https://aclanthology.org/N19-1011}, + } + """ + DEFAULT_SUBSET: str = "train" + HOMEPAGE: str = "https://audiocaps.github.io/" + LANGUAGE: Tuple[str, ...] = ("en",) + LANGUAGE_DETAILS: Tuple[str, ...] = ("en-US",) + NAME: str = "audiocaps" + PRETTY_NAME: str = "AudioCaps" + SIZE_CATEGORIES: Tuple[str, ...] = ("10K Tuple[Dict[str, List[Any]], Dict[int, str]]: + """Load AudioCaps metadata. + + :param root: Dataset root directory. + The data will be stored in the 'AUDIOCAPS' subdirectory. + defaults to ".". + :param subset: The subset of AudioCaps to use. Can be one of :attr:`~AudioCapsCard.SUBSETS`. + defaults to "train". + :param verbose: Verbose level. + defaults to 0. + + :param audio_format: Audio format and extension name. + defaults to "flac". + :param exclude_removed_audio: If True, the dataset will exclude from the dataset the audio not downloaded from youtube (i.e. 
not present on disk). + If False, invalid audios will return an empty tensor of shape (0,). + defaults to True. + :param sr: The sample rate used for audio files in the dataset (in Hz). + Since original YouTube videos are recorded in various settings, this parameter allow to download allow audio files with a specific sample rate. + defaults to 32000. + :param with_tags: If True, load the tags from AudioSet dataset. + Note: tags needs to be downloaded with download=True & with_tags=True before being used. + defaults to False. + :returns: A dictionnary of lists containing each metadata. + Expected keys: "audiocaps_ids", "youtube_id", "start_time", "captions", "fname", "tags", "is_on_disk". + """ + + root = _get_root(root) + audiocaps_root = _get_audiocaps_root(root, sr) + audio_subset_dpath = _get_audio_subset_dpath(root, subset, sr) + + if not _is_prepared_audiocaps(root, subset, sr, audio_format, verbose): + raise RuntimeError( + f"Cannot load data: audiocaps_{subset} is not prepared in data root={root}. Please use download=True in dataset constructor." + ) + + links = _AUDIOCAPS_LINKS[subset] + captions_fname = links["captions"]["fname"] + captions_fpath = osp.join(audiocaps_root, captions_fname) + with open(captions_fpath, "r") as file: + reader = csv.DictReader(file) + captions_data = list(reader) + + if with_tags: + class_labels_indices_fpath = osp.join( + audiocaps_root, _AUDIOSET_LINKS["class_labels_indices"]["fname"] + ) + unbal_tags_fpath = osp.join( + audiocaps_root, _AUDIOSET_LINKS["unbalanced"]["fname"] + ) + + if not all(map(osp.isfile, (class_labels_indices_fpath, unbal_tags_fpath))): + raise FileNotFoundError( + f"Cannot load tags without tags files '{osp.basename(class_labels_indices_fpath)}' and '{osp.basename(unbal_tags_fpath)}'." + f"Please use download=True and with_tags=True in dataset constructor." 
+ ) + + mid_to_index: Dict[str, int] = load_audioset_mapping( + "mid", "index", offline=True, cache_path=audiocaps_root, verbose=verbose + ) + index_to_name: Dict[int, str] = load_audioset_mapping( + "index", + "display_name", + offline=True, + cache_path=audiocaps_root, + verbose=verbose, + ) + + with open(unbal_tags_fpath, "r") as file: + FIELDNAMES = ("YTID", "start_seconds", "end_seconds", "positive_labels") + reader = csv.DictReader( + file, FIELDNAMES, skipinitialspace=True, strict=True + ) + # Skip the comments + for _ in range(3): + next(reader) + unbal_tags_data = list(reader) + else: + mid_to_index = {} + index_to_name = {} + unbal_tags_data = [] + + # Build global mappings + fnames_dic = dict.fromkeys( + _AUDIO_FNAME_FORMAT.format(**line, audio_format=audio_format) + for line in captions_data + ) + audio_fnames_on_disk = dict.fromkeys(os.listdir(audio_subset_dpath)) + if exclude_removed_audio: + fnames_lst = [fname for fname in fnames_dic if fname in audio_fnames_on_disk] + is_on_disk_lst = [True for _ in range(len(fnames_lst))] + else: + fnames_lst = list(fnames_dic) + is_on_disk_lst = [fname in audio_fnames_on_disk for fname in fnames_lst] + + dataset_size = len(fnames_lst) + fname_to_idx = {fname: i for i, fname in enumerate(fnames_lst)} + + # Process each field into a single structure + all_caps_dic: Dict[str, List[Any]] = { + key: [None for _ in range(dataset_size)] + for key in ("audiocaps_ids", "youtube_id", "start_time", "captions") + } + for line in tqdm.tqdm( + captions_data, + disable=verbose <= 0, + desc=f"Loading AudioCaps ({subset}) captions...", + ): + # audiocap_id, youtube_id, start_time, caption + audiocap_id = line["audiocap_id"] + youtube_id = line["youtube_id"] + start_time = line["start_time"] + caption = line["caption"] + + fname = _AUDIO_FNAME_FORMAT.format(**line, audio_format=audio_format) + if fname in fname_to_idx: + idx = fname_to_idx[fname] + + if all_caps_dic["start_time"][idx] is None: + all_caps_dic["start_time"][idx] = 
start_time + all_caps_dic["youtube_id"][idx] = youtube_id + all_caps_dic["audiocaps_ids"][idx] = [audiocap_id] + all_caps_dic["captions"][idx] = [caption] + else: + assert all_caps_dic["start_time"][idx] == start_time + assert all_caps_dic["youtube_id"][idx] == youtube_id + + all_caps_dic["audiocaps_ids"][idx].append(audiocap_id) + all_caps_dic["captions"][idx].append(caption) + + # Load tags from audioset data + all_tags_lst = [[] for _ in range(dataset_size)] + + for line in tqdm.tqdm( + unbal_tags_data, + disable=verbose <= 0, + desc="Loading AudioSet tags for AudioCaps...", + ): + # keys: YTID, start_seconds, end_seconds, positive_labels + youtube_id = line["YTID"] + # Note : In audioset, start_time is a string repr of a float value, audiocaps it is a string repr of an integer + start_time = int(float(line["start_seconds"])) + fname = _AUDIO_FNAME_FORMAT.format( + youtube_id=youtube_id, start_time=start_time, audio_format=audio_format + ) + if fname in fname_to_idx: + tags_mid = line["positive_labels"] + tags_mid = tags_mid.split(",") + tags_indexes = [mid_to_index[tag_mid] for tag_mid in tags_mid] + + idx = fname_to_idx[fname] + all_tags_lst[idx] = tags_indexes + + raw_data = { + "fname": fnames_lst, + "tags": all_tags_lst, + "is_on_disk": is_on_disk_lst, + } + raw_data.update(all_caps_dic) + + # Convert audiocaps_ids and start_time to ints + raw_data["audiocaps_ids"] = [ + list(map(int, item)) for item in raw_data["audiocaps_ids"] + ] + raw_data["start_time"] = list(map(int, raw_data["start_time"])) + + if verbose >= 1: + pylog.info( + f"{AudioCapsCard.PRETTY_NAME}(subset={subset}) has been loaded. 
(len={len(fnames_lst)})" + ) + + return raw_data, index_to_name + + +def download_audiocaps_dataset( + # Common args + root: Union[str, Path, None] = None, + subset: str = AudioCapsCard.DEFAULT_SUBSET, + force: bool = False, + verbose: int = 0, + verify_files: bool = False, + # AudioCaps-specific args + audio_duration: float = 10.0, + audio_format: str = "flac", + audio_n_channels: int = 1, + download_audio: bool = True, + ffmpeg_path: Union[str, Path, None] = None, + max_workers: Optional[int] = 1, + sr: int = 32_000, + ytdlp_path: Union[str, Path, None] = None, + with_tags: bool = False, +) -> None: + """Prepare AudioCaps data (audio, labels, metadata). + + :param root: Dataset root directory. + The data will be stored in the 'AUDIOCAPS' subdirectory. + defaults to ".". + :param subset: The subset of AudioCaps to use. Can be one of :attr:`~AudioCapsCard.SUBSETS`. + defaults to "train". + :param force: If True, force to re-download file even if they exists on disk. + defaults to False. + :param verbose: Verbose level. + defaults to 0. + :param verify_files: If True, check hash value when possible. + defaults to True. + + :param audio_duration: Extracted duration for each audio file in seconds. + defaults to 10.0. + :param audio_format: Audio format and extension name. + defaults to "flac". + :param audio_n_channels: Number of channels extracted for each audio file. + defaults to 1. + :param download_audio: If True, download audio, metadata and labels files. Otherwise it will only donwload metadata and labels files. + defaults to True. + :param ffmpeg_path: Path to ffmpeg executable file. + defaults to "ffmpeg". + :param max_workers: Number of threads to download audio files in parallel. + Do not use a value too high to avoid "Too Many Requests" error. + The value None will use `min(32, os.cpu_count() + 4)` workers, which is the default of ThreadPoolExecutor. + defaults to 1. + :param sr: The sample rate used for audio files in the dataset (in Hz). 
+ Since original YouTube videos are recorded in various settings, this parameter allow to download allow audio files with a specific sample rate. + defaults to 32000. + :param with_tags: If True, download the tags from AudioSet dataset. + defaults to False. + :param ytdlp_path: Path to yt-dlp or ytdlp executable. + defaults to "yt-dlp". + """ + + root = _get_root(root) + ytdlp_path = _get_ytdlp_path(ytdlp_path) + ffmpeg_path = _get_ffmpeg_path(ffmpeg_path) + + if not osp.isdir(root): + raise RuntimeError(f"Cannot find root directory '{root}'.") + + _check_subprog_help(ytdlp_path, "ytdlp") + _check_subprog_help(ffmpeg_path, "ffmpeg") + + if _is_prepared_audiocaps(root, subset, sr, audio_format, -1) and not force: + return None + + audiocaps_root = _get_audiocaps_root(root, sr) + os.makedirs(audiocaps_root, exist_ok=True) + if with_tags: + _download_tags_files(root, sr, verbose) + + links = _AUDIOCAPS_LINKS[subset] + audio_subset_dpath = _get_audio_subset_dpath(root, subset, sr) + os.makedirs(audio_subset_dpath, exist_ok=True) + + captions_fname = links["captions"]["fname"] + captions_fpath = osp.join(audiocaps_root, captions_fname) + + if not osp.isfile(captions_fpath): + url = links["captions"]["url"] + if url is None: + raise ValueError( + f"AudioCaps subset '{subset}' cannot be automatically downloaded. 
(found url={url})" + ) + download_file(url, captions_fpath, verbose=verbose) + + if download_audio: + start = time.perf_counter() + if verbose >= 1: + pylog.info(f"Start downloading audio files for AudioCaps {subset} split...") + + with open(captions_fpath, "r") as file: + # Download audio files + reader = csv.DictReader(file) + captions_data = list(reader) + # Keys: audiocap_id, youtube_id, start_time, caption + + def _cast_line(line: Dict[str, Any], audio_format: str) -> Dict[str, Any]: + youtube_id = line["youtube_id"] + start_time = line["start_time"] + + if not start_time.isdigit(): + raise RuntimeError( + f"Start time '{start_time}' is not an integer (with youtube_id={youtube_id})." + ) + + start_time = int(start_time) + fname = _AUDIO_FNAME_FORMAT.format( + youtube_id=youtube_id, start_time=start_time, audio_format=audio_format + ) + + line.update({"start_time": start_time, "fname": fname}) + return line + + captions_data = [_cast_line(line, audio_format) for line in captions_data] + download_kwds = { + line["fname"]: {k: line[k] for k in ("fname", "youtube_id", "start_time")} + for line in captions_data + } + del captions_data + + present_audio_fnames = os.listdir(audio_subset_dpath) + present_audio_fpaths = [ + osp.join(audio_subset_dpath, fname) for fname in present_audio_fnames + ] + present_audio_fpaths = dict.fromkeys(present_audio_fpaths) + + common_kwds: Dict[str, Any] = dict( + audio_subset_dpath=audio_subset_dpath, + verify_files=verify_files, + present_audio_fpaths=present_audio_fpaths, + audio_duration=audio_duration, + sr=sr, + audio_n_channels=audio_n_channels, + ffmpeg_path=ffmpeg_path, + ytdlp_path=ytdlp_path, + verbose=verbose, + ) + with ThreadPoolExecutor(max_workers=max_workers) as executor: + if verbose >= 2: + pylog.debug(f"Using {executor._max_workers} workers.") + + submitted_dict = { + fname: executor.submit( + _download_from_youtube_and_verify, + **kwds, + **common_kwds, + ) + for fname, kwds in download_kwds.items() + } + for i, 
(fname, submitted) in enumerate( + tqdm.tqdm(submitted_dict.items(), disable=verbose < 1) + ): + file_exists, download_success, valid_file = submitted.result() + + if verbose < 2: + continue + + if not file_exists: + if not download_success: + msg = f"File '{fname}' cannot be downloaded. (maybe the source video has been removed?)" + elif valid_file: + msg = f"File '{fname}' has been downloaded and verified." + elif verify_files: + msg = f"File '{fname}' has been downloaded but it was not valid and has been removed." + else: + msg = f"File '{fname}' has been downloaded." + else: + if valid_file: + msg = f"File '{fname}' is already downloaded and has been verified." + elif verify_files: + msg = f"File '{fname}' is already downloaded but it was not valid and has been removed." + else: + msg = f"File '{fname}' is already downloaded." + + pylog.debug(f"[{i+1:5d}/{len(download_kwds)}] {msg}") + + if verbose >= 1: + duration = int(time.perf_counter() - start) + pylog.info( + f"Download and preparation of AudioCaps for subset '{subset}' done in {duration}s." + ) + pylog.info(f"- {len(download_kwds)} total samples.") + + if verbose >= 2: + pylog.debug( + f"Dataset {AudioCapsCard.PRETTY_NAME} (subset={subset}) has been prepared." + ) + + +def download_audiocaps_datasets( + # Common args + root: Union[str, Path, None] = None, + subsets: Union[str, Iterable[str]] = AudioCapsCard.DEFAULT_SUBSET, + force: bool = False, + verbose: int = 0, + verify_files: bool = False, + # AudioCaps-specific args + audio_duration: float = 10.0, + audio_format: str = "flac", + audio_n_channels: int = 1, + download_audio: bool = True, + ffmpeg_path: Union[str, Path, None] = None, + max_workers: Optional[int] = 1, + sr: int = 32_000, + with_tags: bool = False, + ytdlp_path: Union[str, Path, None] = None, +) -> None: + """Function helper to download a list of subsets. 
See :func:`~aac_datasets.datasets.functional.audiocaps.download_audiocaps_dataset` for details.""" + if isinstance(subsets, str): + subsets = [subsets] + else: + subsets = list(subsets) + + kwargs: Dict[str, Any] = dict( + root=root, + force=force, + verbose=verbose, + verify_files=verify_files, + audio_duration=audio_duration, + audio_format=audio_format, + audio_n_channels=audio_n_channels, + download_audio=download_audio, + ffmpeg_path=ffmpeg_path, + max_workers=max_workers, + sr=sr, + with_tags=with_tags, + ytdlp_path=ytdlp_path, + ) + for subset in subsets: + download_audiocaps_dataset( + subset=subset, + **kwargs, + ) + + +def _download_tags_files( + root: Union[str, Path, None], + sr: int, + verbose: int, +) -> None: + root = _get_root(root) + audiocaps_root = _get_audiocaps_root(root, sr) + + target = "unbalanced" + infos = _AUDIOSET_LINKS[target] + url = infos["url"] + fname = infos["fname"] + fpath = osp.join(audiocaps_root, fname) + if not osp.isfile(fpath): + if verbose >= 1: + pylog.info(f"Downloading file '{fname}'...") + download_file(url, fpath, verbose=verbose) + + download_audioset_mapping(audiocaps_root, verbose=verbose) + + +def _get_audiocaps_root(root: str, sr: int) -> str: + return osp.join(root, "AUDIOCAPS") + + +def _get_audio_subset_dpath(root: str, subset: str, sr: int) -> str: + return osp.join( + _get_audiocaps_root(root, sr), + f"audio_{sr}Hz", + _AUDIOCAPS_AUDIO_DNAMES[subset], + ) + + +def _is_prepared_audiocaps( + root: str, + subset: str = AudioCapsCard.DEFAULT_SUBSET, + sr: int = 32_000, + audio_format: str = "flac", + verbose: int = 0, +) -> bool: + links = _AUDIOCAPS_LINKS[subset] + captions_fname = links["captions"]["fname"] + captions_fpath = osp.join(_get_audiocaps_root(root, sr), captions_fname) + audio_subset_dpath = _get_audio_subset_dpath(root, subset, sr) + + msgs = [] + + if not osp.isdir(audio_subset_dpath): + msgs.append(f"Cannot find directory '{audio_subset_dpath}'.") + else: + audio_fnames = 
os.listdir(audio_subset_dpath) + audio_fnames = [fname for fname in audio_fnames if fname.endswith(audio_format)] + if len(audio_fnames) == 0: + msgs.append( + f"Cannot find any audio {audio_format} file in '{audio_subset_dpath}'." + ) + + if not osp.isfile(captions_fpath): + msgs.append(f"Cannot find file '{captions_fpath}'.") + + if verbose >= 0: + for msg in msgs: + pylog.warning(msg) + + return len(msgs) == 0 + + +def _download_from_youtube_and_verify( + fname: str, + youtube_id: str, + start_time: int, + audio_subset_dpath: str, + verify_files: bool, + present_audio_fpaths: Dict[str, None], + audio_duration: float, + sr: int, + audio_n_channels: int, + ffmpeg_path: str, + ytdlp_path: str, + verbose: int, +) -> Tuple[bool, bool, bool]: + fpath = osp.join(audio_subset_dpath, fname) + + file_exists = fpath in present_audio_fpaths + download_success = False + valid_file = False + + if not file_exists: + download_success = _download_from_youtube( + youtube_id=youtube_id, + fpath_out=fpath, + start_time=start_time, + audio_duration=audio_duration, + sr=sr, + audio_n_channels=audio_n_channels, + ffmpeg_path=ffmpeg_path, + ytdlp_path=ytdlp_path, + verbose=verbose, + ) + + if verify_files and (download_success or file_exists): + valid_file = _is_valid_audio_file( + fpath, + min_n_frames=1, + sr=sr, + n_channels=audio_n_channels, + ) + + if verify_files and not valid_file and osp.isfile(fpath): + os.remove(fpath) + + return file_exists, download_success, valid_file + + +def _download_from_youtube( + youtube_id: str, + fpath_out: str, + start_time: int, + audio_duration: float = 10.0, + sr: int = 32_000, + audio_n_channels: int = 1, + audio_format: str = "flac", + acodec: str = "flac", + ytdlp_path: Union[str, Path, None] = None, + ffmpeg_path: Union[str, Path, None] = None, + verbose: int = 0, +) -> bool: + """Download audio from youtube with yt-dlp and ffmpeg.""" + ytdlp_path = _get_ytdlp_path(ytdlp_path) + ffmpeg_path = _get_ffmpeg_path(ffmpeg_path) + + # Get audio 
download link with yt-dlp, without start time + link = _get_youtube_link(youtube_id, None) + get_url_command = [ + ytdlp_path, + "--youtube-skip-dash-manifest", + "-g", + link, + ] + try: + output = subprocess.check_output(get_url_command) + except (CalledProcessError, PermissionError) as err: + if verbose >= 2: + pylog.debug(err) + return False + + output = output.decode() + lines = output.split("\n") + if len(lines) < 2: + return False + _video_link, audio_link = lines[:2] + + # Download and extract audio from audio_link to fpath_out with ffmpeg + extract_command = [ + ffmpeg_path, + # Input + "-i", + audio_link, + # Remove video + "-vn", + # Format (flac) + "-f", + audio_format, + # Audio codec (flac) + "-acodec", + acodec, + # Get only 10s of the clip after start_time + "-ss", + str(start_time), + "-t", + str(audio_duration), + # Resample to a specific rate (default to 32 kHz) + "-ar", + str(sr), + # Compute mean of 2 channels + "-ac", + str(audio_n_channels), + fpath_out, + ] + try: + if verbose < 3: + stdout = subprocess.DEVNULL + stderr = subprocess.DEVNULL + else: + stdout = None + stderr = None + exitcode = subprocess.check_call(extract_command, stdout=stdout, stderr=stderr) + return exitcode == 0 + + except (CalledProcessError, PermissionError) as err: + if verbose >= 2: + pylog.debug(err) + return False + + +def _check_subprog_help( + path: str, + name: str, + stdout: Any = subprocess.DEVNULL, + stderr: Any = subprocess.DEVNULL, +) -> None: + try: + subprocess.check_call( + [path, "--help"], + stdout=stdout, + stderr=stderr, + ) + except (CalledProcessError, PermissionError, FileNotFoundError) as err: + pylog.error(f"Invalid {name} path '{path}'. 
({err})") + raise err + + +def _is_valid_audio_file( + fpath: str, + *, + min_n_frames: Optional[int] = None, + max_n_frames: Optional[int] = None, + sr: Optional[int] = None, + n_channels: Optional[int] = None, +) -> bool: + try: + metadata = torchaudio.info(fpath) # type: ignore + except RuntimeError: + msg = f"Found file '{fpath}' already downloaded but it is invalid (cannot load metadata)." + pylog.error(msg) + return False + + msgs = [] + if min_n_frames is not None and metadata.num_frames < min_n_frames: + msg = f"Found file '{fpath}' already downloaded but it is invalid (audio is shorter than min_n_frames={min_n_frames} samples)." + msgs.append(msg) + + if max_n_frames is not None and metadata.num_frames > max_n_frames: + msg = f"Found file '{fpath}' already downloaded but it is invalid (audio is longer than max_n_frames={max_n_frames} samples)." + msgs.append(msg) + + if sr is not None and metadata.sample_rate != sr: + msg = f"Found file '{fpath}' already downloaded but it is invalid (invalid sr={metadata.sample_rate} != {sr})." + msgs.append(msg) + + if n_channels is not None and metadata.num_channels != n_channels: + msg = f"Found file '{fpath}' already downloaded but it is invalid (invalid n_channels={metadata.num_channels} != {sr})." 
+ msgs.append(msg) + + for msg in msgs: + pylog.error(msg) + + return len(msgs) == 0 + + +def _get_youtube_link(youtube_id: str, start_time: Optional[int]) -> str: + link = f"https://www.youtube.com/watch?v={youtube_id}" + if start_time is None: + return link + else: + return f"{link}&t={start_time}s" + + +def _get_youtube_link_embed( + youtube_id: str, + start_time: Optional[int], + duration: float = 10.0, +) -> str: + link = f"https://www.youtube.com/embed/{youtube_id}" + if start_time is None: + return link + else: + end_time = start_time + duration + return f"{link}?start={start_time}&end={end_time}" + + +# Audio directory names per subset +_AUDIOCAPS_AUDIO_DNAMES = { + "train": "train", + "val": "val", + "test": "test", + "train_v2": "train", +} + +# Archives and file links used to download AudioCaps labels and metadata +_AUDIOCAPS_LINKS = { + "train": { + "captions": { + "url": "https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/train.csv", + "fname": "train.csv", + }, + }, + "val": { + "captions": { + "url": "https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/val.csv", + "fname": "val.csv", + }, + }, + "test": { + "captions": { + "url": "https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/test.csv", + "fname": "test.csv", + }, + }, + "train_v2": { + "captions": { + "url": "https://raw.githubusercontent.com/Labbeti/aac-datasets/dev/data/train_v2.csv", + "fname": "train_v2.csv", + }, + }, +} + +# Archives and file links used to download AudioSet metadata +_AUDIOSET_LINKS = { + "class_labels_indices": { + "fname": "class_labels_indices.csv", + "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv", + }, + "eval": { + "fname": "eval_segments.csv", + "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv", + }, + "balanced": { + "fname": "balanced_train_segments.csv", + "url": 
"http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv", + }, + "unbalanced": { + "fname": "unbalanced_train_segments.csv", + "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv", + }, +} + +# Audio filename format for AudioCaps +_AUDIO_FNAME_FORMAT = "{youtube_id}_{start_time}.{audio_format}" diff --git a/src/aac_datasets/datasets/functional/clotho.py b/src/aac_datasets/datasets/functional/clotho.py new file mode 100644 index 0000000..be1d80e --- /dev/null +++ b/src/aac_datasets/datasets/functional/clotho.py @@ -0,0 +1,744 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import copy +import csv +import logging +import os +import os.path as osp + +from pathlib import Path +from typing import ( + Any, + Dict, + Iterable, + List, + Optional, + Tuple, + Union, +) +from zipfile import ZipFile + +from py7zr import SevenZipFile + +from aac_datasets.datasets.functional.common import DatasetCard +from aac_datasets.utils.download import download_file, hash_file +from aac_datasets.utils.globals import _get_root + + +pylog = logging.getLogger(__name__) + + +class ClothoCard(DatasetCard): + ANNOTATIONS_CREATORS: Tuple[str, ...] = ("crowdsourced",) + CAPTIONS_PER_AUDIO: Dict[str, int] = { + "dev": 5, + "val": 5, + "eval": 5, + "dcase_aac_test": 0, + "dcase_aac_analysis": 0, + "dcase_t2a_audio": 0, + "dcase_t2a_captions": 1, + } + CITATION: str = r""" + @inproceedings{Drossos_2020_icassp, + title = {Clotho: an Audio Captioning Dataset}, + author = {Drossos, Konstantinos and Lipping, Samuel and Virtanen, Tuomas}, + year = 2020, + booktitle = {Proc. IEEE Int. Conf. Acoustic., Speech and Signal Process. (ICASSP)}, + pages = {736--740}, + abstract = {Audio captioning is the novel task of general audio content description using free text. It is an intermodal translation task (not speech-to-text), where a system accepts as an input an audio signal and outputs the textual description (i.e. 
the caption) of that signal. In this paper we present Clotho, a dataset for audio captioning consisting of 4981 audio samples of 15 to 30 seconds duration and 24 905 captions of eight to 20 words length, and a baseline method to provide initial results. Clotho is built with focus on audio content and caption diversity, and the splits of the data are not hampering the training or evaluation of methods. All sounds are from the Freesound platform, and captions are crowdsourced using Amazon Mechanical Turk and annotators from English speaking countries. Unique words, named entities, and speech transcription are removed with post-processing. Clotho is freely available online (https://zenodo.org/record/3490684).} + } + """ + DEFAULT_SUBSET: str = "dev" + HOMEPAGE: str = "https://zenodo.org/record/3490684" + LANGUAGE: Tuple[str, ...] = ("en",) + LANGUAGE_DETAILS: Tuple[str, ...] = ("en-US",) + DEFAULT_VERSION: str = "v2.1" + NAME: str = "clotho" + N_CHANNELS: int = 1 + PRETTY_NAME: str = "Clotho" + SAMPLE_RATE: int = 44_100 # Hz + SIZE_CATEGORIES: Tuple[str, ...] = ("1K Dict[str, List[Any]]: + """Load Clotho metadata. + + :param root: Dataset root directory. + defaults to ".". + :param subset: The subset of Clotho to use. Can be one of :attr:`~ClothoCard.SUBSETS`. + defaults to "dev". + :param verbose: Verbose level. + defaults to 0. + + :param version: The version of the dataset. Can be one of :attr:`~ClothoCard.versions`. + defaults to 'v2.1'. + :returns: A dictionnary of lists containing each metadata. + """ + root = _get_root(root) + if not _is_prepared_clotho(root, version, subset): + raise RuntimeError( + f"Cannot load data: clotho_{subset} is not prepared in data root={root}. Please use download=True in dataset constructor." 
+ ) + + # Read fpath of .wav audio files + links = _CLOTHO_LINKS[version][subset] + csv_dpath = _get_csv_dpath(root, version) + + # Read Clotho files + if "captions" in links.keys(): + captions_fname = links["captions"]["fname"] + captions_fpath = osp.join(csv_dpath, captions_fname) + + # Keys: file_name, caption_1, caption_2, caption_3, caption_4, caption_5 + with open(captions_fpath, "r") as file: + reader = csv.DictReader(file) + captions_data = list(reader) + + if subset == "dcase_t2a_captions": + captions_data = [ + data | {"file_name": f"no_fname_{i}"} + for i, data in enumerate(captions_data) + ] + + else: + captions_data = [] + + if "metadata" in links.keys(): + metadata_fname = links["metadata"]["fname"] + metadata_fpath = osp.join(csv_dpath, metadata_fname) + + # Keys: file_name, keywords, sound_id, sound_link, start_end_samples, manufacturer, license + if version in ("v2", "v2.1"): + encoding = "ISO-8859-1" + else: + encoding = None + + with open(metadata_fpath, "r", encoding=encoding) as file: + delimiter = ";" if subset == "dcase_aac_test" else "," + reader = csv.DictReader(file, delimiter=delimiter) + metadata = list(reader) + else: + metadata = [] + + if "captions" in links.keys(): + # note: "dev", "val", "eval" + fnames_lst = [line["file_name"] for line in captions_data] + elif "metadata" in links.keys(): + # note: for "dcase_aac_test" subset which do not have captions CSV file + fnames_lst = [line["file_name"] for line in metadata] + else: + # note 1: for "dcase_aac_analysis" subset which do not have any CSV file + # note 2: force sorted list to have the same order on all OS + audio_subset_dpath = _get_audio_subset_dpath(root, version, subset) + if audio_subset_dpath is None: + raise RuntimeError( + f"INTERNAL ERROR: Invalid audio subset dirpath. 
(found audio_subset_dpath={audio_subset_dpath}, with subset={subset})" + ) + fnames_lst = list(sorted(os.listdir(audio_subset_dpath))) + + idx_to_fname = {i: fname for i, fname in enumerate(fnames_lst)} + fname_to_idx = {fname: i for i, fname in idx_to_fname.items()} + dataset_size = len(fnames_lst) + + # Process each item field + if len(metadata) > 0: + subset_metadata_keys = [key for key in _METADATA_KEYS if key in metadata[0]] + else: + subset_metadata_keys = [] + + all_captions_lst = [[] for _ in range(dataset_size)] + + if subset != "dcase_t2a_captions": + captions_keys = _CAPTIONS_KEYS + else: + captions_keys = ("caption",) + + for line in captions_data: + fname = line["file_name"] + idx = fname_to_idx[fname] + all_captions_lst[idx] = [line[caption_key] for caption_key in captions_keys] + + all_metadata_dic: Dict[str, List[Any]] = { + key: [None for _ in range(dataset_size)] for key in subset_metadata_keys + } + for line in metadata: + fname = line["file_name"] + if fname not in fname_to_idx: + raise KeyError( + f"Cannot find metadata fname={fname} in captions file. (subset={subset})" + ) + idx = fname_to_idx[fname] + for key in subset_metadata_keys: + # The test subset does not have keywords in metadata, but has sound_id, sound_link, etc. 
+ if key in line: + all_metadata_dic[key][idx] = line[key] + + raw_data = { + "fname": fnames_lst, + "captions": all_captions_lst, + } + raw_data.update(all_metadata_dic) + + if "keywords" in raw_data: + # Split keywords into List[str] + raw_data["keywords"] = [ + keywords.split(";") if keywords is not None else [] + for keywords in raw_data["keywords"] + ] + + if subset == "dcase_t2a_audio": + # Temporary patch to avoid file loading errors + # indexes: 53, 521, 677 + replaces = { + "raindrops on metal: police background.wav": "raindrops on metal_police background.wav", + "Intersection Wet : Metro Pass.wav": "Intersection Wet_Metro Pass.wav", + "Kitchen Roomtone w: Dripping Faucet_1-2.wav": "Kitchen Roomtone w_Dripping Faucet_1-2.wav", + } + raw_data["fname"] = [replaces.get(fname, fname) for fname in raw_data["fname"]] + + if verbose >= 1: + pylog.info( + f"Dataset {ClothoCard.PRETTY_NAME} ({subset}) has been loaded. (size={len(next(iter(raw_data.values())))})" + ) + return raw_data + + +def download_clotho_dataset( + # Common args + root: Union[str, Path, None] = None, + subset: str = ClothoCard.DEFAULT_SUBSET, + force: bool = False, + verbose: int = 0, + verify_files: bool = True, + # Clotho-specific args + clean_archives: bool = True, + version: str = ClothoCard.DEFAULT_VERSION, +) -> None: + """Prepare Clotho data. + + :param root: Dataset root directory. + defaults to ".". + :param subset: The subset of Clotho to use. Can be one of :attr:`~ClothoCard.SUBSETS`. + defaults to "dev". + :param force: If True, force to download again all files. + defaults to False. + :param verbose: Verbose level. + defaults to 0. + :param verify_files: If True, check all file already downloaded are valid. + defaults to False. + + :param clean_archives: If True, remove the compressed archives from disk to save space. + defaults to True. + :param version: The version of the dataset. Can be one of :attr:`~ClothoCard.versions`. + defaults to 'v2.1'. 
+ """ + if subset == "val" and version == "v1": + pylog.error( + f"Clotho version '{version}' does not have '{subset}' subset. It will be ignored." + ) + return None + + root = _get_root(root) + if not osp.isdir(root): + raise RuntimeError(f"Cannot find root directory '{root}'.") + + archives_dpath = _get_archives_dpath(root, version) + audio_dpath = _get_audio_dpath(root, version) + csv_dpath = _get_csv_dpath(root, version) + + for dpath in (archives_dpath, audio_dpath, csv_dpath): + os.makedirs(dpath, exist_ok=True) + + if verbose >= 1: + pylog.info(f"Start to download files for clotho_{subset}...") + + links = copy.deepcopy(_CLOTHO_LINKS[version][subset]) + EXTENSIONS = ("7z", "csv", "zip") + + # Download csv and 7z files + for file_info in links.values(): + fname, url, hash_value = ( + file_info["fname"], + file_info["url"], + file_info["hash_value"], + ) + extension = fname.split(".")[-1] + + if extension in ("7z", "zip"): + dpath = archives_dpath + elif extension == "csv": + dpath = csv_dpath + else: + raise RuntimeError( + f"Found invalid extension={extension}. Must be one of {EXTENSIONS}." + ) + + fpath = osp.join(dpath, fname) + if not osp.isfile(fpath) or force: + if verbose >= 1: + pylog.info(f"Download and check file '{fname}' from url={url}...") + + download_file(url, fpath, verbose=verbose) + + elif verbose >= 1: + pylog.info(f"File '{fname}' is already downloaded.") + + if verify_files: + hash_value = file_info["hash_value"] + file_hash_value = hash_file(fpath, hash_type="md5") + if file_hash_value != hash_value: + raise RuntimeError( + f"Invalid checksum for file '{fname}'. (expected md5 checksum '{hash_value}' but found '{file_hash_value}')\n" + f"Please try to remove manually the file '{fpath}' and rerun {ClothoCard.PRETTY_NAME} download." 
+ ) + elif verbose >= 2: + pylog.debug(f"File '{fname}' has a valid checksum.") + + # Extract audio files from archives + audio_subset_dpath = _get_audio_subset_dpath(root, version, subset) + if audio_subset_dpath is not None: + for file_info in links.values(): + fname = file_info["fname"] + extension = fname.split(".")[-1] + + if extension == "csv": + continue + + if extension not in ("7z", "zip"): + pylog.error( + f"Found unexpected extension={extension} for downloaded file '{fname}'. Expected one of {EXTENSIONS}." + ) + continue + + fpath = osp.join(archives_dpath, fname) + + if verbose >= 1: + pylog.info(f"Extract archive file fname={fname}...") + + if extension == "7z": + archive_file = SevenZipFile(fpath) + compressed_fnames = [ + osp.basename(fname) for fname in archive_file.getnames() + ] + elif extension == "zip": + archive_file = ZipFile(fpath) + compressed_fnames = [ + osp.basename(file.filename) for file in archive_file.filelist + ] + else: + raise RuntimeError(f"Invalid extension '{extension}'.") + + # Ignore dir name from archive file + compressed_fnames = [ + fname for fname in compressed_fnames if fname.endswith(".wav") + ] + extracted_fnames = ( + os.listdir(audio_subset_dpath) if osp.isdir(audio_subset_dpath) else [] + ) + + if set(extracted_fnames) != set(compressed_fnames): + # For dcase_t2a_audio subset, the name of the audio dname is also "test", so we need to move the audio files to another folder named "test_retrieval_audio". 
+ if subset == "dcase_t2a_audio": + target_dpath = audio_subset_dpath + os.makedirs(target_dpath, exist_ok=True) + else: + target_dpath = audio_dpath + + archive_file.extractall(target_dpath) + + if subset == "dcase_t2a_audio": + extracted_dpath = osp.join(target_dpath, "test") + for fname in os.listdir(extracted_dpath): + os.rename( + osp.join(extracted_dpath, fname), + osp.join(target_dpath, fname), + ) + os.rmdir(extracted_dpath) + + # Check if files is good now + extracted_fnames = os.listdir(audio_subset_dpath) + if set(extracted_fnames) != set(compressed_fnames): + found_but_not_expected = len( + set(extracted_fnames).difference(set(compressed_fnames)) + ) + expected_but_not_found = len( + set(compressed_fnames).difference(set(extracted_fnames)) + ) + + raise RuntimeError( + f"Invalid number of audios extracted, found {len(extracted_fnames)} files but expected the same {len(compressed_fnames)} files. " + f"(with found_but_not_expected={found_but_not_expected} and expected_but_not_found={expected_but_not_found})" + ) + + archive_file.close() + + if clean_archives: + for file_info in links.values(): + fname = file_info["fname"] + extension = fname.split(".")[-1] + if extension not in ("7z", "zip"): + continue + + fpath = osp.join(archives_dpath, fname) + if verbose >= 1: + pylog.info(f"Removing archive file {osp.basename(fpath)}...") + os.remove(fpath) + + if verbose >= 2: + pylog.debug(f"Dataset {ClothoCard.PRETTY_NAME} ({subset}) has been prepared.") + + +def download_clotho_datasets( + # Common args + root: Union[str, Path, None] = None, + subsets: Union[str, Iterable[str]] = ClothoCard.DEFAULT_SUBSET, + force: bool = False, + verbose: int = 0, + # Clotho-specific args + clean_archives: bool = True, + verify_files: bool = True, + version: str = ClothoCard.DEFAULT_VERSION, +) -> None: + """Function helper to download a list of subsets. 
See :func:`~aac_datasets.datasets.functional.clotho.download_clotho_dataset` for details.""" + if isinstance(subsets, str): + subsets = [subsets] + else: + subsets = list(subsets) + + kwargs: Dict[str, Any] = dict( + root=root, + force=force, + verbose=verbose, + clean_archives=clean_archives, + verify_files=verify_files, + version=version, + ) + for subset in subsets: + download_clotho_dataset( + subset=subset, + **kwargs, + ) + + +def _get_clotho_root(root: str, version: str) -> str: + return osp.join(root, f"CLOTHO_{version}") + + +def _get_archives_dpath(root: str, version: str) -> str: + return osp.join(_get_clotho_root(root, version), "archives") + + +def _get_audio_dpath(root: str, version: str) -> str: + return osp.join(_get_clotho_root(root, version), "clotho_audio_files") + + +def _get_csv_dpath(root: str, version: str) -> str: + return osp.join(_get_clotho_root(root, version), "clotho_csv_files") + + +def _get_audio_subset_dpath(root: str, version: str, subset: str) -> Optional[str]: + dname = _CLOTHO_AUDIO_DNAMES[subset] + if dname is None: + return None + + return osp.join( + _get_clotho_root(root, version), + "clotho_audio_files", + dname, + ) + + +def _is_prepared_clotho(root: str, version: str, subset: str) -> bool: + audio_dpath = _get_audio_dpath(root, version) + csv_dpath = _get_csv_dpath(root, version) + if not all(map(osp.isdir, (audio_dpath, csv_dpath))): + return False + + links = _CLOTHO_LINKS[version][subset] + + if "captions" in links: + captions_fname = links["captions"]["fname"] + captions_fpath = osp.join(csv_dpath, captions_fname) + + if not osp.isfile(captions_fpath): + return False + + if "metadata" in links: + metadata_fname = links["metadata"]["fname"] + metadata_fpath = osp.join(csv_dpath, metadata_fname) + if not osp.isfile(metadata_fpath): + return False + + if "audio_archive" in links: + audio_subset_dpath = _get_audio_subset_dpath(root, version, subset) + if audio_subset_dpath is None: + raise RuntimeError( + f"INTERNAL ERROR: 
Invalid audio subset dirpath. (found audio_subset_dpath={audio_subset_dpath}, with subset={subset})" + ) + if not osp.isdir(audio_subset_dpath): + return False + + audio_fnames = os.listdir(audio_subset_dpath) + if "captions" in links: + captions_fname = links["captions"]["fname"] + captions_fpath = osp.join(csv_dpath, captions_fname) + with open(captions_fpath, "r") as file: + reader = csv.DictReader(file) + lines = list(reader) + return len(audio_fnames) == len(lines) + else: + return len(audio_fnames) > 0 + + else: + return True + + +# Audio directory names per subset +_CLOTHO_AUDIO_DNAMES = { + "dev": "development", + "val": "validation", + "eval": "evaluation", + "dcase_aac_test": "test", + "dcase_aac_analysis": "clotho_analysis", + "dcase_t2a_audio": "test_retrieval_audio", + "dcase_t2a_captions": None, +} + +# Archives and file links used to download Clotho +_CLOTHO_LINKS = { + "v1": { + "dev": { + "audio_archive": { + "fname": "clotho_audio_development.7z", + "url": "https://zenodo.org/record/3490684/files/clotho_audio_development.7z?download=1", + "hash_value": "e3ce88561b317cc3825e8c861cae1ec6", + }, + "captions": { + "fname": "clotho_captions_development.csv", + "url": "https://zenodo.org/record/3490684/files/clotho_captions_development.csv?download=1", + "hash_value": "dd568352389f413d832add5cf604529f", + }, + "metadata": { + "fname": "clotho_metadata_development.csv", + "url": "https://zenodo.org/record/3490684/files/clotho_metadata_development.csv?download=1", + "hash_value": "582c18ee47cebdbe33dce1feeab53a56", + }, + }, + "eval": { + "audio_archive": { + "fname": "clotho_audio_evaluation.7z", + "url": "https://zenodo.org/record/3490684/files/clotho_audio_evaluation.7z?download=1", + "hash_value": "4569624ccadf96223f19cb59fe4f849f", + }, + "captions": { + "fname": "clotho_captions_evaluation.csv", + "url": "https://zenodo.org/record/3490684/files/clotho_captions_evaluation.csv?download=1", + "hash_value": "1b16b9e57cf7bdb7f13a13802aeb57e2", + }, + 
"metadata": { + "fname": "clotho_metadata_evaluation.csv", + "url": "https://zenodo.org/record/3490684/files/clotho_metadata_evaluation.csv?download=1", + "hash_value": "13946f054d4e1bf48079813aac61bf77", + }, + }, + "test": { + "audio_archive": { + "fname": "clotho_audio_test.7z", + "url": "https://zenodo.org/record/3865658/files/clotho_audio_test.7z?download=1", + "hash_value": "9b3fe72560a621641ff4351ba1154349", + }, + "metadata": { + "fname": "clotho_metadata_test.csv", + "url": "https://zenodo.org/record/3865658/files/clotho_metadata_test.csv?download=1", + "hash_value": "52f8ad01c229a310a0ff8043df480e21", + }, + }, + }, + "v2": { + "dev": { + "audio_archive": { + "fname": "clotho_audio_development.7z", + "url": "https://zenodo.org/record/4743815/files/clotho_audio_development.7z?download=1", + "hash_value": "eda144a5e05a60e6d2e37a65fc4720a9", + }, + "captions": { + "fname": "clotho_captions_development.csv", + "url": "https://zenodo.org/record/4743815/files/clotho_captions_development.csv?download=1", + "hash_value": "800633304e73d3daed364a2ba6069827", + }, + "metadata": { + "fname": "clotho_metadata_development.csv", + "url": "https://zenodo.org/record/4743815/files/clotho_metadata_development.csv?download=1", + "hash_value": "5fdc51b4c4f3468ff7d251ea563588c9", + }, + }, + "val": { + "audio_archive": { + "fname": "clotho_audio_validation.7z", + "url": "https://zenodo.org/record/4743815/files/clotho_audio_validation.7z?download=1", + "hash_value": "0475bfa5793e80f748d32525018ebada", + }, + "captions": { + "fname": "clotho_captions_validation.csv", + "url": "https://zenodo.org/record/4743815/files/clotho_captions_validation.csv?download=1", + "hash_value": "3109c353138a089c7ba724f27d71595d", + }, + "metadata": { + "fname": "clotho_metadata_validation.csv", + "url": "https://zenodo.org/record/4743815/files/clotho_metadata_validation.csv?download=1", + "hash_value": "f69cfacebcd47c4d8d30d968f9865475", + }, + }, + "eval": { + "audio_archive": { + "fname": 
"clotho_audio_evaluation.7z", + "url": "https://zenodo.org/record/4743815/files/clotho_audio_evaluation.7z?download=1", + "hash_value": "4569624ccadf96223f19cb59fe4f849f", + }, + "captions": { + "fname": "clotho_captions_evaluation.csv", + "url": "https://zenodo.org/record/4743815/files/clotho_captions_evaluation.csv?download=1", + "hash_value": "1b16b9e57cf7bdb7f13a13802aeb57e2", + }, + "metadata": { + "fname": "clotho_metadata_evaluation.csv", + "url": "https://zenodo.org/record/4743815/files/clotho_metadata_evaluation.csv?download=1", + "hash_value": "13946f054d4e1bf48079813aac61bf77", + }, + }, + "dcase_aac_test": { + "audio_archive": { + "fname": "clotho_audio_test.7z", + "url": "https://zenodo.org/record/3865658/files/clotho_audio_test.7z?download=1", + "hash_value": "9b3fe72560a621641ff4351ba1154349", + }, + "metadata": { + "fname": "clotho_metadata_test.csv", + "url": "https://zenodo.org/record/3865658/files/clotho_metadata_test.csv?download=1", + "hash_value": "52f8ad01c229a310a0ff8043df480e21", + }, + }, + }, + "v2.1": { + "dev": { + "audio_archive": { + "fname": "clotho_audio_development.7z", + "url": "https://zenodo.org/record/4783391/files/clotho_audio_development.7z?download=1", + "hash_value": "c8b05bc7acdb13895bb3c6a29608667e", + }, + "captions": { + "fname": "clotho_captions_development.csv", + "url": "https://zenodo.org/record/4783391/files/clotho_captions_development.csv?download=1", + "hash_value": "d4090b39ce9f2491908eebf4d5b09bae", + }, + "metadata": { + "fname": "clotho_metadata_development.csv", + "url": "https://zenodo.org/record/4783391/files/clotho_metadata_development.csv?download=1", + "hash_value": "170d20935ecfdf161ce1bb154118cda5", + }, + }, + "val": { + "audio_archive": { + "fname": "clotho_audio_validation.7z", + "url": "https://zenodo.org/record/4783391/files/clotho_audio_validation.7z?download=1", + "hash_value": "7dba730be08bada48bd15dc4e668df59", + }, + "captions": { + "fname": "clotho_captions_validation.csv", + "url": 
"https://zenodo.org/record/4783391/files/clotho_captions_validation.csv?download=1", + "hash_value": "5879e023032b22a2c930aaa0528bead4", + }, + "metadata": { + "fname": "clotho_metadata_validation.csv", + "url": "https://zenodo.org/record/4783391/files/clotho_metadata_validation.csv?download=1", + "hash_value": "2e010427c56b1ce6008b0f03f41048ce", + }, + }, + "eval": { + "audio_archive": { + "fname": "clotho_audio_evaluation.7z", + "url": "https://zenodo.org/record/4783391/files/clotho_audio_evaluation.7z?download=1", + "hash_value": "4569624ccadf96223f19cb59fe4f849f", + }, + "captions": { + "fname": "clotho_captions_evaluation.csv", + "url": "https://zenodo.org/record/4783391/files/clotho_captions_evaluation.csv?download=1", + "hash_value": "1b16b9e57cf7bdb7f13a13802aeb57e2", + }, + "metadata": { + "fname": "clotho_metadata_evaluation.csv", + "url": "https://zenodo.org/record/4783391/files/clotho_metadata_evaluation.csv?download=1", + "hash_value": "13946f054d4e1bf48079813aac61bf77", + }, + }, + "dcase_aac_test": { + "audio_archive": { + "fname": "clotho_audio_test.7z", + "url": "https://zenodo.org/record/3865658/files/clotho_audio_test.7z?download=1", + "hash_value": "9b3fe72560a621641ff4351ba1154349", + }, + "metadata": { + "fname": "clotho_metadata_test.csv", + "url": "https://zenodo.org/record/3865658/files/clotho_metadata_test.csv?download=1", + "hash_value": "52f8ad01c229a310a0ff8043df480e21", + }, + }, + "dcase_aac_analysis": { + "audio_archive": { + "fname": "clotho_analysis_2022.zip", + "url": "https://zenodo.org/record/6610709/files/clotho_analysis_2022.zip?download=1", + "hash_value": "7e8fa4762cc3a7c5546606680b958d08", + }, + }, + "dcase_t2a_audio": { + "audio_archive": { + "fname": "retrieval_audio.7z", + "url": "https://zenodo.org/record/6590983/files/retrieval_audio.7z?download=1", + "hash_value": "24102395fd757c462421a483fba5c407", + }, + "metadata": { + "fname": "retrieval_audio_metadata.csv", + "url": 
"https://zenodo.org/record/6590983/files/retrieval_audio_metadata.csv?download=1", + "hash_value": "1301db07acbf1e4fabc467eb54e0d353", + }, + }, + "dcase_t2a_captions": { + "captions": { + "fname": "retrieval_captions.csv", + "url": "https://zenodo.org/record/6590983/files/retrieval_captions.csv?download=1", + "hash_value": "f9e810118be00c64ea8cd7557816d4fe", + }, + }, + }, +} + +# Captions column names in CSV files +_CAPTIONS_KEYS = ( + "caption_1", + "caption_2", + "caption_3", + "caption_4", + "caption_5", +) + +# Metadata column names in CSV files +_METADATA_KEYS = ( + "keywords", + "sound_id", + "sound_link", + "start_end_samples", + "manufacturer", + "license", +) diff --git a/src/aac_datasets/datasets/functional/common.py b/src/aac_datasets/datasets/functional/common.py new file mode 100644 index 0000000..eaf2ffe --- /dev/null +++ b/src/aac_datasets/datasets/functional/common.py @@ -0,0 +1,6 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + + +class DatasetCard: + pass diff --git a/src/aac_datasets/datasets/functional/macs.py b/src/aac_datasets/datasets/functional/macs.py new file mode 100644 index 0000000..2bb0ac3 --- /dev/null +++ b/src/aac_datasets/datasets/functional/macs.py @@ -0,0 +1,527 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import csv +import logging +import os +import os.path as osp +import shutil +import zipfile + +from pathlib import Path +from typing import ( + Any, + Dict, + Iterable, + List, + Tuple, + Union, +) + +import yaml + +from aac_datasets.datasets.functional.common import DatasetCard +from aac_datasets.utils.download import download_file, hash_file +from aac_datasets.utils.globals import _get_root + + +pylog = logging.getLogger(__name__) + + +class MACSCard(DatasetCard): + ANNOTATIONS_CREATORS: Tuple[str, ...] 
= ("crowdsourced",) + CITATION: str = r""" + @inproceedings{Martin2021b, + title = {Diversity and Bias in Audio Captioning Datasets}, + author = {Martin, Irene and Mesaros, Annamaria}, + year = 2021, + month = {November}, + booktitle = {Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)}, + address = {Barcelona, Spain}, + pages = {90--94}, + isbn = {978-84-09-36072-7}, + url = {https://dcase.community/documents/workshop2021/proceedings/DCASE2021Workshop_Martin_34.pdf}, + abstract = {Describing soundscapes in sentences allows better understanding of the acoustic scene than a single label indicating the acoustic scene class or a set of audio tags indicating the sound events active in the audio clip. In addition, the richness of natural language allows a range of possible descriptions for the same acoustic scene. In this work, we address the diversity obtained when collecting descriptions of soundscapes using crowdsourcing. We study how much the collection of audio captions can be guided by the instructions given in the annotation task, by analysing the possible bias introduced by auxiliary information provided in the annotation process. Our study shows that even when given hints on the audio content, different annotators describe the same soundscape using different vocabulary. In automatic captioning, hints provided as audio tags represent grounding textual information that facilitates guiding the captioning output towards specific concepts. We also release a new dataset of audio captions and audio tags produced by multiple annotators for a subset of the TAU Urban Acoustic Scenes 2018 dataset, suitable for studying guided captioning.}, + doi. = {10.5281/zenodo.5770113} + } + """ + DEFAULT_SUBSET: str = "full" + DESCRIPTION: str = "Multi-Annotator Captioned Soundscapes dataset." + HOMEPAGE: str = "https://zenodo.org/record/5114771" + LANGUAGE: Tuple[str, ...] = ("en",) + LANGUAGE_DETAILS: Tuple[str, ...] 
= ("en-US",) + MAX_CAPTIONS_PER_AUDIO: Dict[str, int] = {"full": 5} + MIN_CAPTIONS_PER_AUDIO: Dict[str, int] = {"full": 2} + NAME: str = "macs" + N_CHANNELS: int = 2 + PRETTY_NAME: str = "MACS" + SAMPLE_RATE: int = 48_000 # Hz + SIZE_CATEGORIES: Tuple[str, ...] = ("1K Tuple[Dict[str, List[Any]], Dict[int, float]]: + """Load MACS metadata. + + :param root: Dataset root directory. + defaults to ".". + :param subset: The subset of MACS to use. Can be one of :attr:`~MACSCard.SUBSETS`. + defaults to "full". + :param verbose: Verbose level. + defaults to 0. + :returns: A dictionnary of lists containing each metadata. + """ + + root = _get_root(root) + if not _is_prepared_macs(root): + raise RuntimeError( + f"Cannot load data: macs is not prepared in data root={root}. Please use download=True in dataset constructor." + ) + + macs_dpath = _get_macs_root(root) + tau_meta_dpath = _get_tau_meta_dpath(root) + + # Read data files + captions_fname = MACS_FILES["captions"]["fname"] + captions_fpath = osp.join(macs_dpath, captions_fname) + if verbose >= 2: + pylog.debug(f"Reading captions file {captions_fname}...") + + with open(captions_fpath, "r") as file: + caps_data = yaml.safe_load(file) + + tau_meta_fname = "meta.csv" + tau_meta_fpath = osp.join(tau_meta_dpath, tau_meta_fname) + if verbose >= 2: + pylog.debug(f"Reading Tau Urban acoustic scene meta file {tau_meta_fname}...") + + with open(tau_meta_fpath, "r") as file: + reader = csv.DictReader(file, delimiter="\t") + tau_tags_data = list(reader) + + competence_fname = MACS_FILES["annotators_competences"]["fname"] + competence_fpath = osp.join(macs_dpath, competence_fname) + if verbose >= 2: + pylog.debug(f"Reading file {competence_fname}...") + + with open(competence_fpath, "r") as file: + reader = csv.DictReader(file, delimiter="\t") + competences_data = list(reader) + + # Store MACS data + raw_data: Dict[str, List[Any]] = { + "fname": [item["filename"] for item in caps_data["files"]], + "captions": [ + [subitem["sentence"] 
for subitem in item["annotations"]] + for item in caps_data["files"] + ], + "tags": [ + [subitem["tags"] for subitem in item["annotations"]] + for item in caps_data["files"] + ], + "annotators_ids": [ + [subitem["annotator_id"] for subitem in item["annotations"]] + for item in caps_data["files"] + ], + } + dataset_size = len(raw_data["fname"]) + + # Build global mappings + fname_to_idx = {fname: i for i, fname in enumerate(raw_data["fname"])} + annotator_id_to_competence = { + int(annotator["annotator_id"]): float(annotator["competence"]) + for annotator in competences_data + } + + # Store TAU Urban acoustic scenes data + tau_additional_keys = ("scene_label", "identifier") + raw_data.update( + {key: [None for _ in range(dataset_size)] for key in tau_additional_keys} + ) + + tau_meta_fpath = osp.join(tau_meta_dpath, "meta.csv") + for tau_tags in tau_tags_data: + fname = osp.basename(tau_tags["filename"]) + if fname in fname_to_idx: + idx = fname_to_idx[fname] + for key in tau_additional_keys: + raw_data[key][idx] = tau_tags[key] + + # Sanity checks + assert all( + all(value is not None for value in raw_data[key]) for key in tau_additional_keys + ) + assert all(len(values) == dataset_size for values in raw_data.values()) + + if verbose >= 1: + pylog.info( + f"Dataset {MACSCard.PRETTY_NAME} ({subset}) has been loaded. (len={len(next(iter(raw_data.values())))})" + ) + + return raw_data, annotator_id_to_competence + + +def download_macs_dataset( + # Common args + root: Union[str, Path, None] = None, + subset: str = MACSCard.DEFAULT_SUBSET, + force: bool = False, + verbose: int = 0, + verify_files: bool = True, + # MACS-specific args + clean_archives: bool = True, +) -> None: + """Prepare MACS data. + + :param root: Dataset root directory. + defaults to ".". + :param subset: The subset of MACS to use. Can be one of :attr:`~MACSCard.SUBSETS`. + defaults to "full". + :param force: If True, force to download again all files. + defaults to False. 
+ :param verbose: Verbose level. + defaults to 0. + :param verify_files: If True, check all file already downloaded are valid. + defaults to False. + + :param clean_archives: If True, remove the compressed archives from disk to save space. + defaults to True. + """ + + root = _get_root(root) + if not osp.isdir(root): + raise RuntimeError(f"Cannot find root directory '{root}'.") + + macs_dpath = _get_macs_root(root) + archives_dpath = _get_archives_dpath(root) + audio_dpath = _get_audio_dpath(root) + tau_meta_dpath = _get_tau_meta_dpath(root) + + for dpath in (archives_dpath, audio_dpath, tau_meta_dpath): + os.makedirs(dpath, exist_ok=True) + + # Download MACS specific files + for file_info in MACS_FILES.values(): + fname = file_info["fname"] + fpath = osp.join(macs_dpath, fname) + + if not osp.isfile(fpath) or force: + if verbose >= 1: + pylog.info(f"Downloading captions file '{fname}'...") + + url = file_info["url"] + download_file(url, fpath, verbose=verbose) + + if verify_files: + hash_value = file_info["hash_value"] + file_hash_value = hash_file(fpath, hash_type="md5") + if file_hash_value != hash_value: + raise RuntimeError( + f"Invalid checksum for file '{fname}'. (expected md5 checksum '{hash_value}' but found '{file_hash_value}')\n" + f"Please try to remove manually the file '{fpath}' and rerun {MACSCard.PRETTY_NAME} download." + ) + elif verbose >= 2: + pylog.debug(f"File '{fname}' has a valid checksum.") + + captions_fpath = osp.join(macs_dpath, MACS_FILES["captions"]["fname"]) + with open(captions_fpath, "r") as file: + captions_data = yaml.safe_load(file) + captions_data = captions_data["files"] + + # Download TAU Urban Sound audio archives files + for i, file_info in enumerate(MACS_ARCHIVES_FILES.values()): + zip_fname = file_info["fname"] + zip_fpath = osp.join(archives_dpath, zip_fname) + + if not osp.isfile(zip_fpath) or force: + if verbose >= 1: + pylog.info( + f"Downloading audio zip file '{zip_fpath}'... 
({i+1}/{len(MACS_ARCHIVES_FILES)})" + ) + + url = file_info["url"] + download_file(url, zip_fpath, verbose=verbose) + + if verify_files: + hash_value = file_info["hash_value"] + file_hash_value = hash_file(zip_fpath, hash_type="md5") + if file_hash_value != hash_value: + raise RuntimeError( + f"Invalid checksum for file '{zip_fname}'. (expected md5 checksum '{hash_value}' but found '{file_hash_value}')\n" + f"Please try to remove manually the file '{zip_fpath}' and rerun {MACSCard.PRETTY_NAME} download." + ) + elif verbose >= 2: + pylog.debug(f"File '{zip_fname}' has a valid checksum.") + + # Extract files from TAU Urban Sound archives + macs_fnames = dict.fromkeys(data["filename"] for data in captions_data) + for i, (name, file_info) in enumerate(MACS_ARCHIVES_FILES.items()): + zip_fname = file_info["fname"] + zip_fpath = osp.join(archives_dpath, zip_fname) + + if verbose >= 2: + pylog.debug( + f"Check to extract TAU Urban acoustic scenes archive zip_fname={zip_fname}..." + ) + + is_audio_archive = name.startswith("audio") + if is_audio_archive: + target_dpath = audio_dpath + else: + target_dpath = tau_meta_dpath + + with zipfile.ZipFile(zip_fpath, "r") as file: + members_to_extract = [ + member + for member in file.namelist() + # Extract member if file if in captions yaml file and if the audio file is not already downloaded + if ( + (osp.basename(member) in macs_fnames or not is_audio_archive) + and not osp.isfile(osp.join(target_dpath, osp.basename(member))) + ) + ] + + if verbose >= 1: + pylog.info( + f"Extracting {len(members_to_extract)}/{len(file.namelist())} audio files from ZIP file '{zip_fname}'... 
({i+1}/{len(MACS_ARCHIVES_FILES)})" + ) + + if len(members_to_extract) > 0: + file.extractall(archives_dpath, members_to_extract) + for member in members_to_extract: + extracted_fpath = osp.join(archives_dpath, member) + target_fpath = osp.join(target_dpath, osp.basename(member)) + shutil.move(extracted_fpath, target_fpath) + + if clean_archives: + if verbose >= 1: + pylog.info(f"Removing archives files in {archives_dpath}...") + shutil.rmtree(archives_dpath, ignore_errors=True) + + audio_fnames = [name for name in os.listdir(audio_dpath) if name.endswith(".wav")] + assert len(audio_fnames) == len(macs_fnames) + + if verbose >= 2: + pylog.debug(f"Dataset {MACSCard.PRETTY_NAME} ({subset}) has been prepared.") + + +def download_macs_datasets( + # Common args + root: Union[str, Path, None] = None, + subsets: Union[str, Iterable[str]] = MACSCard.DEFAULT_SUBSET, + force: bool = False, + verbose: int = 0, + # MACS-specific args + clean_archives: bool = True, + verify_files: bool = True, +) -> None: + """Function helper to download a list of subsets. 
See :func:`~aac_datasets.datasets.functional.macs.download_macs_dataset` for details.""" + if isinstance(subsets, str): + subsets = [subsets] + else: + subsets = list(subsets) + + kwargs: Dict[str, Any] = dict( + root=root, + force=force, + verbose=verbose, + clean_archives=clean_archives, + verify_files=verify_files, + ) + for subset in subsets: + download_macs_dataset( + subset=subset, + **kwargs, + ) + + +def _get_macs_root(root: str) -> str: + return osp.join(root, "MACS") + + +def _get_archives_dpath(root: str) -> str: + return osp.join(_get_macs_root(root), "archives") + + +def _get_audio_dpath(root: str) -> str: + return osp.join(_get_macs_root(root), "audio") + + +def _get_tau_meta_dpath(root: str) -> str: + return osp.join(_get_macs_root(root), "tau_meta") + + +def _is_prepared_macs(root: str) -> bool: + audio_dpath = _get_audio_dpath(root) + if not osp.isdir(audio_dpath): + return False + captions_fpath = osp.join(_get_macs_root(root), MACS_FILES["captions"]["fname"]) + if not osp.isfile(captions_fpath): + return False + + with open(captions_fpath, "r") as file: + data = yaml.safe_load(file) + data = data["files"] + fnames = os.listdir(audio_dpath) + return len(data) == len(fnames) + + +# MACS-specific files links. +MACS_FILES = { + "licence": { + "fname": "LICENSE.txt", + "url": "https://zenodo.org/record/5114771/files/LICENSE.txt?download=1", + "hash_value": "d3086f4517cccc32c1bb3a081b07cfa1", + }, + "captions": { + "fname": "MACS.yaml", + "url": "https://zenodo.org/record/5114771/files/MACS.yaml?download=1", + "hash_value": "23fcb2ebd0b109094034ef9e87972256", + }, + "annotators_competences": { + "fname": "MACS_competence.csv", + "url": "https://zenodo.org/record/5114771/files/MACS_competence.csv?download=1", + "hash_value": "4dfe9f951f0af9f29cb7952ec030370a", + }, +} + +# TAU_URBAN_ACOUSTIC archives files links. 
+TAU_URBAN_ACOUSTIC_DEV_FILES = { + "audio.1": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.1.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.1.zip?download=1", + "hash_value": "aca4ebfd9ed03d5f747d6ba8c24bc728", + }, + "audio.10": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.10.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.10.zip?download=1", + "hash_value": "0ffbf60006da520cc761fb74c878b98b", + }, + "audio.11": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.11.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.11.zip?download=1", + "hash_value": "599055d93b4c11057c29be2df54538d4", + }, + "audio.12": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.12.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.12.zip?download=1", + "hash_value": "98b8d162ff3665695c4c910e6c372cc8", + }, + "audio.13": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.13.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.13.zip?download=1", + "hash_value": "a356c08b1a5a21d433eba37ef87587f4", + }, + "audio.14": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.14.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.14.zip?download=1", + "hash_value": "f8969771e7faf7dd471d1cf78b0cf011", + }, + "audio.15": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.15.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.15.zip?download=1", + "hash_value": "4758c4b0fb7484faa632266e78850820", + }, + "audio.16": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.16.zip", + "url": 
"https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.16.zip?download=1", + "hash_value": "a18acad9ede8ea76574216feb887f0bc", + }, + "audio.17": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.17.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.17.zip?download=1", + "hash_value": "1af7703484632f340da5c33662dc9632", + }, + "audio.18": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.18.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.18.zip?download=1", + "hash_value": "b67402bf3e08f4da394a7c18756c0fd2", + }, + "audio.19": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.19.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.19.zip?download=1", + "hash_value": "035db315f19106eb848b6f9b32bcc47c", + }, + "audio.2": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.2.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.2.zip?download=1", + "hash_value": "c4f170408ce77c8c70c532bf268d7be0", + }, + "audio.20": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.20.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.20.zip?download=1", + "hash_value": "9cb28c74911bf8a3eadcf53f50a5b5d6", + }, + "audio.21": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.21.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.21.zip?download=1", + "hash_value": "0e44ed85c88ec036a9725b4dd1dfaea0", + }, + "audio.3": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.3.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.3.zip?download=1", + "hash_value": "c7214a07211f10f3250290d05e72c37e", + }, + 
"audio.4": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.4.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.4.zip?download=1", + "hash_value": "a6a62110f6699cf4432072acb1dffda6", + }, + "audio.5": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.5.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.5.zip?download=1", + "hash_value": "091a0b6d3c84b8e60e46940aa7d4a8a0", + }, + "audio.6": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.6.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.6.zip?download=1", + "hash_value": "114f4ca13e074391b98a1cfd8140de65", + }, + "audio.7": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.7.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.7.zip?download=1", + "hash_value": "5951dd2968f7a514e2afbe279c4f060d", + }, + "audio.8": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.8.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.8.zip?download=1", + "hash_value": "b0b63dc95b327e1509857c8d8a663cc3", + }, + "audio.9": { + "fname": "TAU-urban-acoustic-scenes-2019-development.audio.9.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.9.zip?download=1", + "hash_value": "3c32a693a6b111ffb957be3c1dd22e9b", + }, + "doc": { + "fname": "TAU-urban-acoustic-scenes-2019-development.doc.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.doc.zip?download=1", + "hash_value": "1f6879544e80da70099a191613e7e51f", + }, + "meta": { + "fname": "TAU-urban-acoustic-scenes-2019-development.meta.zip", + "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.meta.zip?download=1", + 
"hash_value": "09782f2097e4735687af73c44919329c", + }, +} + +# List of TAU_URBAN_ACOUSTIC archives containing at least 1 MACS audio file. +MACS_ARCHIVES_FILES = { + name: TAU_URBAN_ACOUSTIC_DEV_FILES[name] + for name in ( + "audio.1", + "audio.10", + "audio.11", + "audio.12", + "audio.13", + "audio.2", + "audio.3", + "audio.9", + "meta", + ) +} diff --git a/src/aac_datasets/datasets/functional/wavcaps.py b/src/aac_datasets/datasets/functional/wavcaps.py new file mode 100644 index 0000000..69ccb8f --- /dev/null +++ b/src/aac_datasets/datasets/functional/wavcaps.py @@ -0,0 +1,687 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import csv +import json +import logging +import os +import os.path as osp +import subprocess +import zipfile + +from pathlib import Path +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union + +import tqdm + +from huggingface_hub import snapshot_download +from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE +from huggingface_hub.utils.tqdm import ( + are_progress_bars_disabled, + disable_progress_bars, + enable_progress_bars, +) +from typing_extensions import TypedDict + +from aac_datasets.datasets.functional.common import DatasetCard +from aac_datasets.utils.collections import list_dict_to_dict_list +from aac_datasets.utils.download import safe_rmdir +from aac_datasets.utils.globals import _get_root, _get_zip_path + + +pylog = logging.getLogger(__name__) + + +class WavCapsCard(DatasetCard): + ANNOTATIONS_CREATORS: Tuple[str, ...] = ("machine-generated",) + CAPTIONS_PER_AUDIO: Dict[str, int] = { + "as": 1, + "bbc": 1, + "fsd": 1, + "sb": 1, + "as_noac": 1, + "fsd_nocl": 1, + } + CITATION: str = r""" + @article{mei2023WavCaps, + title = {Wav{C}aps: A {ChatGPT}-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research}, + author = {Xinhao Mei and Chutong Meng and Haohe Liu and Qiuqiang Kong and Tom Ko and Chengqi Zhao and Mark D. 
Plumbley and Yuexian Zou and Wenwu Wang}, + year = 2023, + journal = {arXiv preprint arXiv:2303.17395}, + url = {https://arxiv.org/pdf/2303.17395.pdf} + } + """ + DEFAULT_REVISION: str = "85a0c21e26fa7696a5a74ce54fada99a9b43c6de" + DEFAULT_SUBSET: str = "as_noac" + DESCRIPTION: str = "WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research." + EXPECTED_SIZES: Dict[str, int] = { + "AudioSet_SL": 108317, + "BBC_Sound_Effects": 31201, + "FreeSound": 262300, + "SoundBible": 1320, # note: 1232 according to github+hf, but found 1320 => seems that archive contains more data than in json + } + HOMEPAGE = "https://huggingface.co/datasets/cvssp/WavCaps" + LANGUAGE: Tuple[str, ...] = ("en",) + LANGUAGE_DETAILS: Tuple[str, ...] = ("en-US",) + NAME: str = "wavcaps" + PRETTY_NAME: str = "WavCaps" + REPO_ID: str = "cvssp/WavCaps" + SOURCES: Tuple[str, ...] = tuple(EXPECTED_SIZES.keys()) + SUBSETS: Tuple[str, ...] = tuple(CAPTIONS_PER_AUDIO.keys()) + SAMPLE_RATE: int = 32_000 # Hz + SIZE_CATEGORIES: Tuple[str, ...] = ("100K Dict[str, List[Any]]: + """Load WavCaps metadata. + + :param root: Dataset root directory. + defaults to ".". + :param subset: The subset of MACS to use. Can be one of :attr:`~MACSCard.SUBSETS`. + defaults to "as_noac". + :param verbose: Verbose level. + defaults to 0. + + :param hf_cache_dir: Optional override for HuggingFace cache directory path. + defaults to None. + :param revision: Optional override for revision commit/name for HuggingFace rapository. + defaults to None. + :returns: A dictionnary of lists containing each metadata. + """ + root = _get_root(root) + if subset not in WavCapsCard.SUBSETS: + raise ValueError( + f"Invalid argument subset={subset}. 
(expected one of {WavCapsCard.SUBSETS})" + ) + + if subset == "as": + overlapped_ds = "AudioCaps" + overlapped_subsets = ("val", "test") + pylog.warning( + f"You selected WavCaps subset '{subset}', be careful to not use these data as training when evaluating on {overlapped_ds} {overlapped_subsets} subsets. " + "You can use as_noac subset for to avoid this bias with AudioCaps." + ) + + elif subset == "fsd": + overlapped_ds = "Clotho" + overlapped_subsets = ( + "val", + "eval", + "dcase_aac_test", + "dcase_aac_analysis", + "dcase_t2a_audio", + "dcase_t2a_captions", + ) + pylog.warning( + f"You selected WavCaps subset '{subset}', be careful to not use these data as training when evaluating on {overlapped_ds} {overlapped_subsets} subsets. " + f"You can use fsd_nocl subset for to avoid this bias for Clotho val, eval, dcase_t2a_audio and dcase_t2a_captions subsets. Data could still overlap with Clotho dcase_aac_test and dcase_aac_analysis subsets." + ) + + if subset in ("as_noac", "fsd_nocl"): + if subset == "as_noac": + target_subset = "as" + csv_fname = "blacklist_audiocaps.full.csv" + + elif subset == "fsd_nocl": + target_subset = "fsd" + csv_fname = "blacklist_clotho.full.csv" + + else: + raise ValueError(f"INTERNAL ERROR: Invalid argument subset={subset}.") + + raw_data = load_wavcaps_dataset( + root=root, + subset=target_subset, + verbose=verbose, + hf_cache_dir=hf_cache_dir, + revision=revision, + ) + wavcaps_ids = raw_data["id"] + + csv_fpath = ( + Path(__file__) + .parent.parent.parent.parent.joinpath("data") + .joinpath(csv_fname) + ) + with open(csv_fpath, "r") as file: + reader = csv.DictReader(file) + data = list(reader) + other_ids = [data_i["id"] for data_i in data] + other_ids = dict.fromkeys(other_ids) + + indexes = [i for i, wc_id in enumerate(wavcaps_ids) if wc_id not in other_ids] + + if verbose >= 1: + pylog.info( + f"Getting {len(indexes)}/{len(wavcaps_ids)} items from '{target_subset}' for subset '{subset}'." 
+ ) + + raw_data = { + column: [column_data[idx] for idx in indexes] + for column, column_data in raw_data.items() + } + return raw_data + + if not _is_prepared_wavcaps(root, hf_cache_dir, revision, subset, verbose): + raise RuntimeError( + f"{WavCapsCard.PRETTY_NAME} is not prepared in root={root}. Please use download=True to install it in root." + ) + + json_dpath = _get_json_dpath(root, hf_cache_dir, revision) + json_paths = [ + ("AudioSet_SL", osp.join(json_dpath, "AudioSet_SL", "as_final.json")), + ( + "BBC_Sound_Effects", + osp.join(json_dpath, "BBC_Sound_Effects", "bbc_final.json"), + ), + ("FreeSound", osp.join(json_dpath, "FreeSound", "fsd_final.json")), + ("SoundBible", osp.join(json_dpath, "SoundBible", "sb_final.json")), + ] + json_paths = [ + (source, json_path) + for source, json_path in json_paths + if _use_source(source, subset) + ] + + raw_data = {k: [] for k in _WAVCAPS_RAW_COLUMNS + ("source", "fname")} + for source, json_path in json_paths: + if verbose >= 2: + pylog.debug(f"Loading metadata in JSON '{json_path}'...") + json_data, size = _load_json(json_path) + + sources = [source] * size + json_data.pop("audio", None) + + if source == "AudioSet_SL": + ids = json_data["id"] + fnames = [id_.replace(".wav", ".flac") for id_ in ids] + raw_data["fname"] += fnames + + elif source == "BBC_Sound_Effects": + ids = json_data["id"] + fnames = [f"{id_}.flac" for id_ in ids] + raw_data["fname"] += fnames + + elif source == "FreeSound": + ids = json_data["id"] + fnames = [f"{id_}.flac" for id_ in ids] + raw_data["fname"] += fnames + + elif source == "SoundBible": + ids = json_data["id"] + fnames = [f"{id_}.flac" for id_ in ids] + raw_data["fname"] += fnames + + else: + raise RuntimeError(f"Invalid source={source}.") + + for k in _WAVCAPS_RAW_COLUMNS: + if k in json_data: + raw_data[k] += json_data[k] + elif k in _DEFAULT_VALUES: + default_val = _DEFAULT_VALUES[k] + default_values = [default_val] * size + raw_data[k] += default_values + elif k in ("audio", 
"file_name"): + pass + else: + raise RuntimeError(f"Invalid column name {k}. (with source={source})") + + raw_data["source"] += sources + + raw_data.pop("audio") + raw_data.pop("file_name") + captions = raw_data.pop("caption") + + # Convert str -> List[str] for captions to match other datasets captions type + raw_data["captions"] = [[caption] for caption in captions] + + # Force floating-point precision for duration + raw_data["duration"] = list(map(float, raw_data["duration"])) + + return raw_data + + +def download_wavcaps_dataset( + # Common args + root: Union[str, Path, None] = None, + subset: str = WavCapsCard.DEFAULT_SUBSET, + force: bool = False, + verbose: int = 0, + verify_files: bool = False, + # WavCaps-specific args + clean_archives: bool = False, + hf_cache_dir: Optional[str] = None, + repo_id: Optional[str] = None, + revision: Optional[str] = None, + zip_path: Union[str, Path, None] = None, +) -> None: + """Prepare WavCaps data. + + :param root: Dataset root directory. + defaults to ".". + :param subset: The subset of MACS to use. Can be one of :attr:`~WavCapsCard.SUBSETS`. + defaults to "as_noac". + :param force: If True, force to download again all files. + defaults to False. + :param verbose: Verbose level. + defaults to 0. + :param verify_files: If True, check all file already downloaded are valid. + defaults to False. + + :param clean_archives: If True, remove the compressed archives from disk to save space. + defaults to True. + :param hf_cache_dir: Optional override for HuggingFace cache directory path. + defaults to None. + :param repo_id: Repository ID on HuggingFace. + defaults to "cvssp/WavCaps". + :param revision: Optional override for revision commit/name for HuggingFace rapository. + defaults to None. + :param zip_path: Path to zip executable path in shell. + defaults to "zip". 
+ """ + if subset == "as_noac": + return download_wavcaps_dataset( + root=root, + subset="as", + revision=revision, + hf_cache_dir=hf_cache_dir, + force=force, + verify_files=verify_files, + clean_archives=clean_archives, + zip_path=zip_path, + verbose=verbose, + ) + elif subset == "fsd_nocl": + return download_wavcaps_dataset( + root=root, + subset="fsd", + revision=revision, + hf_cache_dir=hf_cache_dir, + force=force, + verify_files=verify_files, + clean_archives=clean_archives, + zip_path=zip_path, + verbose=verbose, + ) + + root = _get_root(root) + zip_path = _get_zip_path(zip_path) + + if subset not in WavCapsCard.SUBSETS: + raise ValueError( + f"Invalid argument subset={subset}. (expected one of {WavCapsCard.SUBSETS})" + ) + + # note: verbose=-1 to disable warning triggered when dset is not prepared + if not force and _is_prepared_wavcaps( + root, hf_cache_dir, revision, subset, verbose=-1 + ): + return None + + if hf_cache_dir is None: + hf_cache_dir = HUGGINGFACE_HUB_CACHE + if repo_id is None: + repo_id = WavCapsCard.REPO_ID + + # Download files from huggingface + ign_sources = [ + source for source in WavCapsCard.SOURCES if not _use_source(source, subset) + ] + ign_patterns = [ + pattern + for source in ign_sources + for pattern in (f"json_files/{source}/*.json", "Zip_files/*") # {source}/ + ] + if verbose >= 2: + pylog.debug(f"ign_sources={ign_sources}") + pylog.debug(f"ign_patterns={ign_patterns}") + + pbar_enabled = are_progress_bars_disabled() + if pbar_enabled and verbose <= 0: + disable_progress_bars() + + snapshot_dpath = snapshot_download( + repo_id=repo_id, + repo_type="dataset", + revision=revision, + resume_download=not force, + local_files_only=False, + cache_dir=hf_cache_dir, + allow_patterns=None, + ignore_patterns=ign_patterns, + ) + + if pbar_enabled and verbose <= 0: + enable_progress_bars() + + snapshot_abs_dpath = osp.abspath(snapshot_dpath) + wavcaps_root = _get_wavcaps_root(root, hf_cache_dir, revision) + if verbose >= 2: + 
pylog.debug(f"snapshot_dpath={snapshot_dpath}") + pylog.debug(f"snapshot_absdpath={snapshot_abs_dpath}") + pylog.debug(f"wavcaps_dpath={wavcaps_root}") + del snapshot_dpath + + # Build symlink to hf cache + if osp.exists(wavcaps_root): + if not osp.islink(wavcaps_root): + raise RuntimeError("WavCaps root exists but it is not a symlink.") + link_target_abspath = osp.abspath(osp.realpath(wavcaps_root)) + if link_target_abspath != snapshot_abs_dpath: + pylog.error( + "Target link is not pointing to current snapshot path. It will be automatically replaced." + ) + os.remove(wavcaps_root) + os.symlink(snapshot_abs_dpath, wavcaps_root, True) + else: + os.symlink(snapshot_abs_dpath, wavcaps_root, True) + + source_and_splitted = [ + ("AudioSet_SL", True), + ("BBC_Sound_Effects", True), + ("FreeSound", True), + ("SoundBible", False), + ] + source_and_splitted = { + source: is_splitted + for source, is_splitted in source_and_splitted + if _use_source(source, subset) + } + + archives_dpath = _get_archives_dpath(root, hf_cache_dir, revision) + for source, is_splitted in source_and_splitted.items(): + main_zip_fpath = osp.join( + archives_dpath, _WAVCAPS_ARCHIVE_DNAMES[source], f"{source}.zip" + ) + + if is_splitted: + merged_zip_fpath = osp.join( + archives_dpath, _WAVCAPS_ARCHIVE_DNAMES[source], f"{source}_merged.zip" + ) + else: + merged_zip_fpath = main_zip_fpath + + if is_splitted and not osp.isfile(merged_zip_fpath): + cmd = [ + zip_path, + "-FF", + main_zip_fpath, + "--out", + merged_zip_fpath, + ] + if verbose >= 2: + pylog.debug(f"Merging ZIP files for {source}...") + pylog.debug(f"Using command: {' '.join(cmd)}") + + if verbose >= 2: + stdout = None + stderr = None + else: + stdout = subprocess.DEVNULL + stderr = subprocess.DEVNULL + + subprocess.check_call(cmd, stdout=stdout, stderr=stderr) + + audio_subset_dpath = _get_audio_subset_dpath( + root, hf_cache_dir, revision, source + ) + os.makedirs(audio_subset_dpath, exist_ok=True) + + with 
zipfile.ZipFile(merged_zip_fpath, "r") as file: + flac_subnames = [name for name in file.namelist() if name.endswith(".flac")] + assert len(flac_subnames) > 0 + assert all( + osp.dirname(name) == osp.dirname(flac_subnames[0]) + for name in flac_subnames + ) + + src_root = osp.join(audio_subset_dpath, osp.dirname(flac_subnames[0])) + src_fnames_found = ( + dict.fromkeys(name for name in os.listdir(src_root)) + if osp.isdir(src_root) + else {} + ) + tgt_fnames_found = dict.fromkeys( + name for name in os.listdir(audio_subset_dpath) + ) + + missing_subnames = [ + subname + for subname in flac_subnames + if osp.basename(subname) not in src_fnames_found + and osp.basename(subname) not in tgt_fnames_found + ] + if verbose >= 2: + pylog.debug( + f"Extracting {len(missing_subnames)}/{len(flac_subnames)} audio files from {merged_zip_fpath}..." + ) + file.extractall(audio_subset_dpath, missing_subnames) + if verbose >= 2: + pylog.debug("Extraction done.") + + src_fnames_found = ( + dict.fromkeys(name for name in os.listdir(src_root)) + if osp.isdir(src_root) + else {} + ) + src_fpaths_to_move = [ + osp.join(audio_subset_dpath, subname) + for subname in flac_subnames + if osp.basename(subname) in src_fnames_found + ] + if verbose >= 2: + pylog.debug(f"Moving {len(src_fpaths_to_move)} files...") + for src_fpath in tqdm.tqdm(src_fpaths_to_move): + tgt_fpath = osp.join(audio_subset_dpath, osp.basename(src_fpath)) + os.rename(src_fpath, tgt_fpath) + if verbose >= 2: + pylog.debug("Move done.") + + if verify_files: + tgt_fnames_expected = [osp.basename(subname) for subname in flac_subnames] + tgt_fnames_found = dict.fromkeys( + fname for fname in os.listdir(audio_subset_dpath) + ) + if verbose >= 2: + pylog.debug(f"Checking {len(tgt_fnames_expected)} files...") + tgt_fnames_invalids = [ + fname for fname in tgt_fnames_expected if fname not in tgt_fnames_found + ] + if len(tgt_fnames_invalids) > 0: + raise FileNotFoundError( + f"Found 
{len(tgt_fnames_invalids)}/{len(tgt_fnames_expected)} invalid files." + ) + + safe_rmdir(audio_subset_dpath, rm_root=False, error_on_non_empty_dir=True) + + if clean_archives: + used_sources = source_and_splitted.keys() + for source in used_sources: + archive_source_dpath = osp.join( + archives_dpath, _WAVCAPS_ARCHIVE_DNAMES[source] + ) + archives_names = os.listdir(archive_source_dpath) + for name in archives_names: + if not name.endswith(".zip") and ".z" not in name: + continue + fpath = osp.join(archive_source_dpath, name) + if verbose >= 1: + pylog.info(f"Removing archive file {name} for source={source}...") + os.remove(fpath) + + +def download_wavcaps_datasets( + # Common args + root: Union[str, Path, None] = None, + subsets: Union[str, Iterable[str]] = WavCapsCard.DEFAULT_SUBSET, + force: bool = False, + verbose: int = 0, + # WavCaps-specific args + clean_archives: bool = False, + hf_cache_dir: Optional[str] = None, + repo_id: Optional[str] = None, + revision: Optional[str] = None, + verify_files: bool = False, + zip_path: Union[str, Path, None] = None, +) -> None: + """Function helper to download a list of subsets. 
See :func:`~aac_datasets.datasets.functional.wavcaps.download_wavcaps_dataset` for details.""" + if isinstance(subsets, str): + subsets = [subsets] + else: + subsets = list(subsets) + + kwargs: Dict[str, Any] = dict( + root=root, + force=force, + verbose=verbose, + clean_archives=clean_archives, + hf_cache_dir=hf_cache_dir, + repo_id=repo_id, + revision=revision, + verify_files=verify_files, + zip_path=zip_path, + ) + for subset in subsets: + download_wavcaps_dataset( + subset=subset, + **kwargs, + ) + + +def _get_wavcaps_root( + root: str, + hf_cache_dir: Optional[str], + revision: Optional[str], +) -> str: + return osp.join(root, "WavCaps") + + +def _get_json_dpath( + root: str, + hf_cache_dir: Optional[str], + revision: Optional[str], +) -> str: + return osp.join(_get_wavcaps_root(root, hf_cache_dir, revision), "json_files") + + +def _get_archives_dpath( + root: str, + hf_cache_dir: Optional[str], + revision: Optional[str], +) -> str: + return osp.join(_get_wavcaps_root(root, hf_cache_dir, revision), "Zip_files") + + +def _get_audio_dpath( + root: str, + hf_cache_dir: Optional[str], + revision: Optional[str], +) -> str: + return osp.join(_get_wavcaps_root(root, hf_cache_dir, revision), "Audio") + + +def _get_audio_subset_dpath( + root: str, + hf_cache_dir: Optional[str], + revision: Optional[str], + source: str, +) -> str: + return osp.join( + _get_audio_dpath(root, hf_cache_dir, revision), _WAVCAPS_AUDIO_DNAMES[source] + ) + + +def _is_prepared_wavcaps( + root: str, + hf_cache_dir: Optional[str], + revision: Optional[str], + subset: str, + verbose: int, +) -> bool: + sources = [source for source in WavCapsCard.SOURCES if _use_source(source, subset)] + for source in sources: + audio_fnames = os.listdir( + _get_audio_subset_dpath(root, hf_cache_dir, revision, source) + ) + expected_size = WavCapsCard.EXPECTED_SIZES[source] + if expected_size != len(audio_fnames): + if verbose >= 0: + pylog.error( + f"Invalid number of files for source={source}. 
(expected {expected_size} but found {len(audio_fnames)} files)" + ) + return False + return True + + +def _use_source(source: str, subset: str) -> bool: + return any( + ( + source == "AudioSet_SL" and subset in ("as", "as_noac"), + source == "BBC_Sound_Effects" and subset in ("bbc",), + source == "FreeSound" and subset in ("fsd", "fsd_nocl"), + source == "SoundBible" and subset in ("sb",), + ) + ) + + +def _load_json(fpath: str) -> Tuple[Dict[str, Any], int]: + with open(fpath, "r") as file: + data = json.load(file) + data = data["data"] + size = len(data) + data = list_dict_to_dict_list(data, key_mode="same") + return data, size + + +class _WavCapsRawItem(TypedDict): + # Common values + caption: str + duration: float + id: str + # Source Specific values + audio: Optional[str] + author: Optional[str] + description: Optional[str] + download_link: Optional[str] + file_name: Optional[str] + href: Optional[str] + tags: Optional[List[str]] + + +_DEFAULT_VALUES = { + "author": "", + "description": "", + "download_link": "", + "href": "", + "tags": [], +} + +_WAVCAPS_RAW_COLUMNS = tuple( + _WavCapsRawItem.__required_keys__ | _WavCapsRawItem.__optional_keys__ +) + +_WAVCAPS_AUDIO_DNAMES = { + # Source name to audio directory name + "AudioSet_SL": "AudioSet_SL", + "BBC_Sound_Effects": "BBC_Sound_Effects", + "FreeSound": "FreeSound", + "SoundBible": "SoundBible", +} + +_WAVCAPS_ARCHIVE_DNAMES = { + # Source name to audio directory name + "AudioSet_SL": "AudioSet_SL", + "BBC_Sound_Effects": "BBC_Sound_Effects", + "FreeSound": "FreeSound", + "SoundBible": "SoundBible", +} diff --git a/src/aac_datasets/datasets/legacy/__init__.py b/src/aac_datasets/datasets/legacy/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/src/aac_datasets/datasets/legacy/audiocaps.py b/src/aac_datasets/datasets/legacy/audiocaps.py deleted file mode 100644 index 0235388..0000000 --- a/src/aac_datasets/datasets/legacy/audiocaps.py +++ /dev/null @@ -1,851 +0,0 @@ -#!/usr/bin/env 
python -# -*- coding: utf-8 -*- - -import csv -import logging -import os -import os.path as osp -import subprocess -import sys -import time - -from functools import lru_cache -from subprocess import CalledProcessError -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, overload - -import torch -import torchaudio -import tqdm - -from torch import Tensor -from torch.hub import download_url_to_file -from torch.utils.data.dataset import Dataset -from typing_extensions import TypedDict - - -pylog = logging.getLogger(__name__) - - -class AudioCapsItem(TypedDict): - r"""Class representing a single AudioCaps item.""" - - # Common attributes - audio: Tensor - captions: List[str] - dataset: str - fname: str - index: int - subset: str - sr: int - # AudioCaps-specific attributes - audiocaps_ids: List[int] - start_time: int - tags: List[int] - youtube_id: str - - -AUDIOCAPS_ALL_COLUMNS = tuple( - AudioCapsItem.__required_keys__ | AudioCapsItem.__optional_keys__ -) - - -class AudioCaps(Dataset[Dict[str, Any]]): - r"""Unofficial AudioCaps PyTorch dataset. - - Subsets available are 'train', 'val' and 'test'. - - Audio is a waveform tensor of shape (1, n_times) of 10 seconds max, sampled at 32kHz. - Target is a list of strings containing the captions. - The 'train' subset has only 1 caption per sample and 'val' and 'test' have 5 captions. - - Download requires 'youtube-dl' and 'ffmpeg' commands. - You can change the default path with :attr:`~AudioCaps.YOUTUBE_DL_PATH` or :attr:`~AudioCaps.FFMPEG_PATH` global variables. - - AudioCaps paper : https://www.aclweb.org/anthology/N19-1011.pdf - - .. 
code-block:: text - :caption: Dataset folder tree - - {root} - └── AUDIOCAPS_32000Hz - ├── train.csv - ├── val.csv - ├── test.csv - └── audio - ├── train - │ └── (46230/49838 flac files, ~42G for 32kHz) - ├── val - │ └── (464/495 flac files, ~425M for 32kHz) - └── test - └── (912/975 flac files, ~832M for 32kHz) - - """ - - # Common globals - AUDIO_N_CHANNELS = 1 - CITATION: str = r""" - @inproceedings{kim_etal_2019_audiocaps, - title = {{A}udio{C}aps: Generating Captions for Audios in The Wild}, - author = {Kim, Chris Dongjoo and Kim, Byeongchang and Lee, Hyunmin and Kim, Gunhee}, - year = 2019, - month = jun, - booktitle = {Proceedings of the 2019 Conference of the North {A}merican Chapter of the Association for Computational Linguistics: Human Language Technologies, Volume 1 (Long and Short Papers)}, - publisher = {Association for Computational Linguistics}, - address = {Minneapolis, Minnesota}, - pages = {119--132}, - doi = {10.18653/v1/N19-1011}, - url = {https://aclanthology.org/N19-1011}, - } - """ - DATASET_NAME = "audiocaps" - FORCE_PREPARE_DATA: bool = False - HOMEPAGE = "https://audiocaps.github.io/" - MAX_AUDIO_SEC = 10.00096876 - MIN_AUDIO_SEC = 0.6501874 - SAMPLE_RATE = 32000 - SUBSETS = ("train", "val", "test") - VERIFY_FILES = False - - # AudioCaps-specific globals - AUDIO_FILE_EXTENSION = "flac" - CAPTIONS_PER_AUDIO = {"train": 1, "val": 5, "test": 5} - DNAME_LOG = "logs" - FFMPEG_PATH: str = "ffmpeg" - MAX_CAPTION_LENGTH = 52 - MIN_CAPTION_LENGTH = 2 - N_AUDIOSET_CLASSES: int = 527 - REDIRECT_LOG = False - YOUTUBE_DL_PATH: str = "youtube-dl" - - # Initialization - def __init__( - self, - root: str = ".", - subset: str = "train", - download: bool = False, - transform: Optional[Callable[[Dict[str, Any]], Any]] = None, - flat_captions: bool = False, - verbose: int = 0, - exclude_removed_audio: bool = True, - with_tags: bool = False, - ) -> None: - """ - :param root: Dataset root directory. 
- The data will be stored in the 'AUDIOCAPS_{SAMPLE_RATE}' subdirectory. - defaults to ".". - :param subset: The subset of AudioCaps to use. Can be one of :attr:`~AudioCaps.SUBSETS`. - defaults to "train". - :param download: Download the dataset if download=True and if the dataset is not already downloaded. - defaults to False. - :param transform: The transform to apply to the global dict item. This transform is applied only in getitem method. - defaults to None. - :param flat_captions: If True, map captions to audio instead of audio to caption. - defaults to True. - :param verbose: Verbose level. - defaults to 0. - :param exclude_removed_audio: If True, the dataset will exclude from the dataset the audio not downloaded from youtube (i.e. not present on disk). - If False, invalid audios will return an empty tensor of shape (0,). - defaults to True. - :param with_tags: If True, load the tags from AudioSet dataset. - Note: tags needs to be downloaded with download=True & with_tags=True before being used. - defaults to False. - """ - if subset not in AudioCaps.SUBSETS: - raise ValueError( - f"Invalid argument subset={subset} for AudioCaps. 
(expected one of {AudioCaps.SUBSETS})" - ) - - super().__init__() - # Attributes - self._root = root - self._subset = subset - self._download = download - self._transform = transform - self._flat_captions = flat_captions - self._verbose = verbose - self._exclude_removed_audio = exclude_removed_audio - self._with_tags = with_tags - - # Data to load - self._all_items: Dict[str, List[Any]] = {} - self._loaded = False - self._index_to_tagname: List[str] = [] - - if self._download: - self._prepare_dataset() - self._load_dataset() - - # Properties - @property - def column_names(self) -> List[str]: - """The name of each column of the dataset.""" - column_names = list(AUDIOCAPS_ALL_COLUMNS) - if not self._with_tags: - column_names.remove("tags") - return column_names - - @property - def index_to_tagname(self) -> List[str]: - """AudioSet ordered list of tag names. Returns an empty list if `with_tags` is False.""" - return self._index_to_tagname - - @property - def info(self) -> Dict[str, Any]: - """Return the global dataset info.""" - return { - "dataset": self.DATASET_NAME, - "subset": self._subset, - "with_tags": self._with_tags, - } - - @property - def shape(self) -> Tuple[int, ...]: - """The shape of the AudioCaps dataset.""" - return len(self), len(self.column_names) - - # Public methods - @overload - def at(self, idx: int) -> AudioCapsItem: - ... - - @overload - def at(self, idx: Union[Iterable[int], slice, None]) -> Dict[str, List]: - ... - - @overload - def at(self, idx: Any, column: Any) -> Any: - ... - - def at( - self, - idx: Union[int, Iterable[int], None, slice] = None, - column: Union[str, Iterable[str], None] = None, - ) -> Any: - """Get a specific data field. - - :param index: The index or slice of the value in range [0, len(dataset)-1]. - :param column: The name(s) of the column. Can be any value of :meth:`~AudioCaps.column_names`. - :returns: The field value. The type depends of the column. 
- """ - if idx is None: - idx = slice(None) - if column is None: - column = self.column_names - - if not isinstance(column, str) and isinstance(column, Iterable): - return {column_i: self.at(idx, column_i) for column_i in column} - - if isinstance(idx, (int, slice)) and column in self._all_items.keys(): - return self._all_items[column][idx] # type: ignore - - if isinstance(idx, slice): - idx = range(len(self))[idx] - - if isinstance(idx, Iterable): - idx = list(idx) - if not all(isinstance(idx_i, int) for idx_i in idx): - raise TypeError( - f"Invalid input type for idx={idx}. (expected Iterable[int], not Iterable[{idx.__class__.__name__}])" - ) - return [self.at(idx_i, column) for idx_i in idx] - - if column == "audio": - fpath = self.at(idx, "fpath") - if not self._all_items["is_on_disk"][idx]: - return torch.empty((0,)) - audio, sr = torchaudio.load(fpath) # type: ignore - - # Sanity check - if audio.nelement() == 0: - raise RuntimeError( - f"Invalid audio number of elements in '{fpath}'. (expected audio.nelements()={audio.nelement()} > 0)" - ) - if sr != self.SAMPLE_RATE: - raise RuntimeError( - f"Invalid sample rate in '{fpath}'. 
(expected {self.SAMPLE_RATE} but found sr={sr})" - ) - return audio - - elif column == "audio_metadata": - fpath = self.at(idx, "fpath") - if not self._all_items["is_on_disk"][idx]: - return None - audio_metadata = torchaudio.info(fpath) # type: ignore - return audio_metadata - - elif column == "dataset": - return self.DATASET_NAME - - elif column == "fpath": - fname = self.at(idx, "fname") - fpath = osp.join(self.__audio_subset_root, fname) - return fpath - - elif column == "index": - return idx - - elif column == "num_channels": - audio_metadata = self.at(idx, "audio_metadata") - if audio_metadata is None: - return -1 - return audio_metadata.num_channels - - elif column == "num_frames": - audio_metadata = self.at(idx, "audio_metadata") - if audio_metadata is None: - return -1 - return audio_metadata.num_frames - - elif column == "sr": - audio_metadata = self.at(idx, "audio_metadata") - if audio_metadata is None: - return -1 - return audio_metadata.sample_rate - - elif column == "subset": - return self._subset - - else: - raise ValueError( - f"Invalid argument column={column} at idx={idx}. (expected one of {tuple(self.column_names)})" - ) - - def is_loaded(self) -> bool: - """Returns True if the dataset is loaded.""" - return self._loaded - - def set_transform( - self, - transform: Optional[Callable[[Dict[str, Any]], Any]], - ) -> None: - """Set the transform applied to each row.""" - self._transform = transform - - # Magic methods - @overload - def __getitem__(self, idx: int) -> AudioCapsItem: - ... - - @overload - def __getitem__(self, idx: Union[Iterable[int], slice, None]) -> Dict[str, List]: - ... - - @overload - def __getitem__(self, idx: Any) -> Any: - ... 
- - def __getitem__(self, idx: Any) -> Any: - if ( - isinstance(idx, tuple) - and len(idx) == 2 - and (isinstance(idx[1], (str, Iterable)) or idx[1] is None) - ): - idx, column = idx - else: - column = None - - item = self.at(idx, column) - if isinstance(idx, int) and column is None and self._transform is not None: - item = self._transform(item) - return item - - def __len__(self) -> int: - """ - :return: The number of items in the dataset. - """ - return len(self._all_items["captions"]) - - def __repr__(self) -> str: - return f"AudioCaps(size={len(self)}, subset={self._subset}, num_columns={len(self.column_names)}, with_tags={self._with_tags})" - - # Public class methods - @classmethod - def load_class_labels_indices(cls, root: str) -> List[Dict[str, str]]: - class_labels_indices_fpath = osp.join( - root, - f"AUDIOCAPS_{AudioCaps.SAMPLE_RATE}Hz", - AUDIOSET_LINKS["class_labels_indices"]["fname"], - ) - if not osp.isfile(class_labels_indices_fpath): - raise ValueError( - f"Cannot find class_labels_indices file in root='{root}'." - f"Maybe use AudioCaps(root, download=True, with_tags=True) before or use a different root directory." - ) - - with open(class_labels_indices_fpath, "r") as file: - reader = csv.DictReader(file) - audioset_classes_data = list(reader) - return audioset_classes_data - - # Private methods - def __check_file(self, fpath: str) -> bool: - try: - audio, sr = torchaudio.load(fpath) # type: ignore - except RuntimeError: - message = f'Found file "{fpath}" already downloaded but it is invalid (cannot load). It will be removed.' - pylog.error(message) - return False - - if audio.nelement() == 0: - message = f'Found file "{fpath}" already downloaded but it is invalid (empty audio). It will be removed.' - pylog.error(message) - return False - - if sr != self.SAMPLE_RATE: - message = f'Found file "{fpath}" already downloaded but it is invalid (invalid sr={sr} != {self.SAMPLE_RATE}). It will be removed.' 
- pylog.error(message) - return False - - return True - - @property - @lru_cache() - def __audio_subset_root(self) -> str: - return osp.join( - self.__audiocaps_root, - "audio", - self._subset, - ) - - @property - @lru_cache() - def __audiocaps_root(self) -> str: - return osp.join(self._root, f"AUDIOCAPS_{AudioCaps.SAMPLE_RATE}Hz") - - def __is_prepared(self) -> bool: - links = AUDIOCAPS_LINKS[self._subset] - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(self.__audiocaps_root, captions_fname) - return osp.isdir(self.__audio_subset_root) and osp.isfile(captions_fpath) - - def _load_dataset(self) -> None: - if not self.__is_prepared(): - raise RuntimeError( - f"Cannot load data: audiocaps_{self._subset} is not prepared in data root={self._root}. Please use download=True in dataset constructor." - ) - - links = AUDIOCAPS_LINKS[self._subset] - - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(self.__audiocaps_root, captions_fname) - with open(captions_fpath, "r") as file: - reader = csv.DictReader(file) - captions_data = list(reader) - - if self._with_tags: - class_labels_indices_fpath = osp.join( - self.__audiocaps_root, AUDIOSET_LINKS["class_labels_indices"]["fname"] - ) - unbal_tags_fpath = osp.join( - self.__audiocaps_root, AUDIOSET_LINKS["unbalanced"]["fname"] - ) - - if not all(map(osp.isfile, (class_labels_indices_fpath, unbal_tags_fpath))): - raise FileNotFoundError( - f"Cannot load tags without tags files '{osp.basename(class_labels_indices_fpath)}' and '{osp.basename(unbal_tags_fpath)}'." - f"Please use download=True and with_tags=True in dataset constructor." 
- ) - - audioset_classes_data = AudioCaps.load_class_labels_indices(self._root) - - with open(unbal_tags_fpath, "r") as file: - fieldnames = ("YTID", "start_seconds", "end_seconds", "positive_labels") - reader = csv.DictReader( - file, fieldnames, skipinitialspace=True, strict=True - ) - # Skip the comments - for _ in range(3): - next(reader) - unbal_tags_data = list(reader) - else: - audioset_classes_data = [] - unbal_tags_data = [] - - # Build global mappings - fnames_dic = dict.fromkeys( - f"{line['youtube_id']}_{line['start_time']}.{self.AUDIO_FILE_EXTENSION}" - for line in captions_data - ) - audio_fnames_on_disk = dict.fromkeys(os.listdir(self.__audio_subset_root)) - if self._exclude_removed_audio: - fnames_lst = [ - fname for fname in fnames_dic if fname in audio_fnames_on_disk - ] - is_on_disk_lst = [True for _ in range(len(fnames_lst))] - else: - fnames_lst = list(fnames_dic) - is_on_disk_lst = [fname in audio_fnames_on_disk for fname in fnames_lst] - - dataset_size = len(fnames_lst) - fname_to_idx = {fname: i for i, fname in enumerate(fnames_lst)} - - mid_to_tag_name = {} - tag_name_to_index = {} - - for line in audioset_classes_data: - # keys: index, mid, display_name - mid_to_tag_name[line["mid"]] = line["display_name"] - tag_name_to_index[line["display_name"]] = int(line["index"]) - - classes_indexes = list(tag_name_to_index.values()) - assert len(classes_indexes) == 0 or classes_indexes == list( - range(classes_indexes[-1] + 1) - ) - self._index_to_tagname = list(tag_name_to_index.keys()) - - # Process each field into a single structure - all_caps_dic: Dict[str, List[Any]] = { - key: [None for _ in range(dataset_size)] - for key in ("audiocaps_ids", "youtube_id", "start_time", "captions") - } - for line in tqdm.tqdm( - captions_data, - disable=self._verbose <= 0, - desc=f"Loading AudioCaps ({self._subset}) captions...", - ): - # audiocap_id, youtube_id, start_time, caption - audiocap_id = line["audiocap_id"] - youtube_id = line["youtube_id"] - 
start_time = line["start_time"] - caption = line["caption"] - - fname = f"{youtube_id}_{start_time}.{self.AUDIO_FILE_EXTENSION}" - if fname in fname_to_idx: - idx = fname_to_idx[fname] - - if all_caps_dic["start_time"][idx] is None: - all_caps_dic["start_time"][idx] = start_time - all_caps_dic["youtube_id"][idx] = youtube_id - all_caps_dic["audiocaps_ids"][idx] = [audiocap_id] - all_caps_dic["captions"][idx] = [caption] - else: - assert all_caps_dic["start_time"][idx] == start_time - assert all_caps_dic["youtube_id"][idx] == youtube_id - - all_caps_dic["audiocaps_ids"][idx].append(audiocap_id) - all_caps_dic["captions"][idx].append(caption) - - # Load tags from audioset data - all_tags_lst = [[] for _ in range(dataset_size)] - - for line in tqdm.tqdm( - unbal_tags_data, - disable=self._verbose <= 0, - desc="Loading AudioSet tags...", - ): - # keys: YTID, start_seconds, end_seconds, positive_labels - youtube_id = line["YTID"] - # Note : In audioset, start_time is a string repr of a float value, audiocaps it is a string repr of an integer - start_time = int(float(line["start_seconds"])) - fname = f"{youtube_id}_{start_time}.{self.AUDIO_FILE_EXTENSION}" - if fname in fname_to_idx: - tags_mid = line["positive_labels"] - tags_mid = tags_mid.split(",") - tags_names = [mid_to_tag_name[tag] for tag in tags_mid] - tags_indexes = [tag_name_to_index[tag] for tag in tags_names] - - idx = fname_to_idx[fname] - all_tags_lst[idx] = tags_indexes - - all_items = { - "fname": fnames_lst, - "tags": all_tags_lst, - "is_on_disk": is_on_disk_lst, - } - all_items.update(all_caps_dic) - - # Convert audiocaps_ids and start_time to ints - all_items["audiocaps_ids"] = [ - list(map(int, item)) for item in all_items["audiocaps_ids"] - ] - all_items["start_time"] = list(map(int, all_items["start_time"])) - - if self._flat_captions and self.CAPTIONS_PER_AUDIO[self._subset] > 1: - all_infos_flatten = {key: [] for key in all_items.keys()} - - for i, captions in enumerate(all_items["captions"]): - 
for caption in captions: - for key in all_items.keys(): - all_infos_flatten[key].append(all_items[key][i]) - all_infos_flatten["captions"] = [caption] - - all_items = all_infos_flatten - - self._all_items = all_items - self._loaded = True - - if self._verbose >= 1: - pylog.info(f"{repr(self)} has been loaded. (len={len(self)})") - - def _prepare_dataset(self) -> None: - if not osp.isdir(self._root): - raise RuntimeError(f"Cannot find root directory '{self._root}'.") - - try: - subprocess.check_call( - [self.YOUTUBE_DL_PATH, "--help"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - except (CalledProcessError, PermissionError, FileNotFoundError) as err: - pylog.error(f"Cannot use youtube-dl path '{self.YOUTUBE_DL_PATH}'. ({err})") - raise err - - try: - subprocess.check_call( - [self.FFMPEG_PATH, "--help"], - stdout=subprocess.DEVNULL, - stderr=subprocess.DEVNULL, - ) - except (CalledProcessError, PermissionError, FileNotFoundError) as err: - pylog.error(f"Cannot use ffmpeg path '{self.FFMPEG_PATH}'. 
({err})") - raise err - - if self.__is_prepared() and not self.FORCE_PREPARE_DATA: - return None - - links = AUDIOCAPS_LINKS[self._subset] - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(self.__audiocaps_root, captions_fname) - - os.makedirs(self.__audio_subset_root, exist_ok=True) - - if not osp.isfile(captions_fpath): - url = links["captions"]["url"] - download_url_to_file(url, captions_fpath, progress=self._verbose >= 1) - - start = time.perf_counter() - with open(captions_fpath, "r") as file: - n_samples = len(file.readlines()) - - if self._verbose >= 1: - log_dpath = osp.join(self.__audiocaps_root, self.DNAME_LOG) - if not osp.isdir(log_dpath): - os.makedirs(log_dpath) - - if self.REDIRECT_LOG: - logging.basicConfig( - filename=osp.join(log_dpath, f"preparation_{self._subset}.txt"), - filemode="w", - level=logging.INFO, - force=True, - ) - pylog.info(f"Start downloading files for {self._subset} AudioCaps split.") - - with open(captions_fpath, "r") as file: - # Download audio files - reader = csv.DictReader(file) - if self._verbose >= 1: - reader = tqdm.tqdm(reader, total=n_samples) - - n_download_ok, n_download_err, n_already_ok, n_already_err = 0, 0, 0, 0 - for line in reader: - # Keys: audiocap_id, youtube_id, start_time, caption - audiocap_id, youtube_id, start_time = [ - line[key] for key in ("audiocap_id", "youtube_id", "start_time") - ] - fpath = osp.join( - self.__audio_subset_root, - f"{youtube_id}_{start_time}.{self.AUDIO_FILE_EXTENSION}", - ) - if not start_time.isdigit(): - raise RuntimeError( - f'Start time "{start_time}" is not an integer (audiocap_id={audiocap_id}, youtube_id={youtube_id}).' 
- ) - start_time = int(start_time) - - if not osp.isfile(fpath): - success = _download_and_extract_from_youtube( - youtube_id, - fpath, - start_time, - duration=self.MAX_AUDIO_SEC, - sr=self.SAMPLE_RATE, - youtube_dl_path=self.YOUTUBE_DL_PATH, - ffmpeg_path=self.FFMPEG_PATH, - n_channels=self.AUDIO_N_CHANNELS, - ) - if success: - valid_file = self.__check_file(fpath) - if valid_file: - if self._verbose >= 2: - pylog.debug( - f'[{audiocap_id:6s}] File "{youtube_id}" has been downloaded and verified.' - ) - n_download_ok += 1 - else: - if self._verbose >= 1: - pylog.warning( - f'[{audiocap_id:6s}] File "{youtube_id}" has been downloaded but it is not valid and it will be removed.' - ) - os.remove(fpath) - n_download_err += 1 - else: - pylog.error( - f'[{audiocap_id:6s}] Cannot extract audio from "{youtube_id}".' - ) - n_download_err += 1 - - elif self.VERIFY_FILES: - valid_file = self.__check_file(fpath) - if valid_file: - if self._verbose >= 2: - pylog.debug( - f'[{audiocap_id:6s}] File "{youtube_id}" is already downloaded and has been verified.' - ) - n_already_ok += 1 - else: - if self._verbose >= 1: - pylog.warning( - f'[{audiocap_id:6s}] File "{youtube_id}" is already downloaded but it is not valid and will be removed.' - ) - os.remove(fpath) - n_already_err += 1 - else: - if self._verbose >= 2: - pylog.debug( - f'[{audiocap_id:6s}] File "{youtube_id}" is already downloaded but it is not verified due to self.VERIFY_FILES={self.VERIFY_FILES}.' 
- ) - n_already_ok += 1 - - if self._with_tags: - for key in ("class_labels_indices", "unbalanced"): - infos = AUDIOSET_LINKS[key] - url = infos["url"] - fname = infos["fname"] - fpath = osp.join(self.__audiocaps_root, fname) - if not osp.isfile(fpath): - if self._verbose >= 1: - pylog.info(f"Downloading file '{fname}'...") - download_url_to_file(url, fpath, progress=self._verbose >= 1) - - if self._verbose >= 1: - duration = int(time.perf_counter() - start) - pylog.info( - f'Download and preparation of AudioCaps for subset "{self._subset}" done in {duration}s. ' - ) - pylog.info(f"- {n_download_ok} downloads success,") - pylog.info(f"- {n_download_err} downloads failed,") - pylog.info(f"- {n_already_ok} already downloaded,") - pylog.info(f"- {n_already_err} already downloaded errors,") - pylog.info(f"- {n_samples} total samples.") - - if self.REDIRECT_LOG: - logging.basicConfig( - stream=sys.stdout, - level=logging.INFO, - force=True, - ) - - if self._verbose >= 2: - pylog.debug( - f"Dataset {self.__class__.__name__} ({self._subset}) has been prepared." 
- ) - - -def _download_and_extract_from_youtube( - youtube_id: str, - fpath_out: str, - start_time: int, - duration: float = 10.0, - sr: int = 16000, - n_channels: int = 1, - target_format: str = "flac", - acodec: str = "flac", - youtube_dl_path: str = "youtube-dl", - ffmpeg_path: str = "ffmpeg", -) -> bool: - """Download audio from youtube with youtube-dl and ffmpeg.""" - - # Get audio download link with youtube-dl - link = f"https://www.youtube.com/watch?v={youtube_id}" - get_url_command = [ - youtube_dl_path, - "--youtube-skip-dash-manifest", - "-g", - link, - ] - try: - output = subprocess.check_output(get_url_command) - except (CalledProcessError, PermissionError): - return False - - output = output.decode() - lines = output.split("\n") - if len(lines) < 2: - return False - _video_link, audio_link = lines[:2] - - # Download and extract audio from audio_link to fpath_out with ffmpeg - extract_command = [ - ffmpeg_path, - # Input - "-i", - audio_link, - # Remove video - "-vn", - # Format (flac) - "-f", - target_format, - # Audio codec (flac) - "-acodec", - acodec, - # Get only 10s of the clip after start_time - "-ss", - str(start_time), - "-t", - str(duration), - # Resample to 16 kHz - "-ar", - str(sr), - # Compute mean of 2 channels - "-ac", - str(n_channels), - fpath_out, - ] - try: - exitcode = subprocess.check_call(extract_command) - return exitcode == 0 - except (CalledProcessError, PermissionError): - return False - - -AUDIOCAPS_LINKS = { - "train": { - "captions": { - "url": "https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/train.csv", - "fname": "train.csv", - }, - }, - "val": { - "captions": { - "url": "https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/val.csv", - "fname": "val.csv", - }, - }, - "test": { - "captions": { - "url": "https://raw.githubusercontent.com/cdjkim/audiocaps/master/dataset/test.csv", - "fname": "test.csv", - }, - }, -} - -AUDIOSET_LINKS = { - "class_labels_indices": { - "fname": 
"class_labels_indices.csv", - "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv", - }, - "eval": { - "fname": "eval_segments.csv", - "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/eval_segments.csv", - }, - "balanced": { - "fname": "balanced_train_segments.csv", - "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/balanced_train_segments.csv", - }, - "unbalanced": { - "fname": "unbalanced_train_segments.csv", - "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/unbalanced_train_segments.csv", - }, -} diff --git a/src/aac_datasets/datasets/legacy/clotho.py b/src/aac_datasets/datasets/legacy/clotho.py deleted file mode 100644 index 9938f52..0000000 --- a/src/aac_datasets/datasets/legacy/clotho.py +++ /dev/null @@ -1,942 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import copy -import csv -import logging -import os -import os.path as osp - -from functools import lru_cache -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, overload -from zipfile import ZipFile - -import torchaudio - -from py7zr import SevenZipFile -from torch import Tensor -from torch.hub import download_url_to_file -from torch.utils.data.dataset import Dataset -from typing_extensions import TypedDict - -from aac_datasets.utils.download import validate_file - - -pylog = logging.getLogger(__name__) - - -class ClothoItem(TypedDict): - r"""Class representing a single Clotho item.""" - - # Common attributes - audio: Tensor - captions: List[str] - dataset: str - fname: str - index: int - subset: str - sr: int - # Clotho-specific attributes - keywords: List[str] - sound_id: str # warning: some files contains "Not found" - sound_link: str # warning: some files contains "NA" - start_end_samples: str # warning: some files contains "" - manufacturer: str - license: str - - -CLOTHO_ALL_COLUMNS = tuple(ClothoItem.__required_keys__ | 
ClothoItem.__optional_keys__) - - -CLOTHO_LINKS = { - "v1": { - "dev": { - "audio_archive": { - "fname": "clotho_audio_development.7z", - "url": "https://zenodo.org/record/3490684/files/clotho_audio_development.7z?download=1", - "hash_value": "e3ce88561b317cc3825e8c861cae1ec6", - }, - "captions": { - "fname": "clotho_captions_development.csv", - "url": "https://zenodo.org/record/3490684/files/clotho_captions_development.csv?download=1", - "hash_value": "dd568352389f413d832add5cf604529f", - }, - "metadata": { - "fname": "clotho_metadata_development.csv", - "url": "https://zenodo.org/record/3490684/files/clotho_metadata_development.csv?download=1", - "hash_value": "582c18ee47cebdbe33dce1feeab53a56", - }, - }, - "eval": { - "audio_archive": { - "fname": "clotho_audio_evaluation.7z", - "url": "https://zenodo.org/record/3490684/files/clotho_audio_evaluation.7z?download=1", - "hash_value": "4569624ccadf96223f19cb59fe4f849f", - }, - "captions": { - "fname": "clotho_captions_evaluation.csv", - "url": "https://zenodo.org/record/3490684/files/clotho_captions_evaluation.csv?download=1", - "hash_value": "1b16b9e57cf7bdb7f13a13802aeb57e2", - }, - "metadata": { - "fname": "clotho_metadata_evaluation.csv", - "url": "https://zenodo.org/record/3490684/files/clotho_metadata_evaluation.csv?download=1", - "hash_value": "13946f054d4e1bf48079813aac61bf77", - }, - }, - "test": { - "audio_archive": { - "fname": "clotho_audio_test.7z", - "url": "https://zenodo.org/record/3865658/files/clotho_audio_test.7z?download=1", - "hash_value": "9b3fe72560a621641ff4351ba1154349", - }, - "metadata": { - "fname": "clotho_metadata_test.csv", - "url": "https://zenodo.org/record/3865658/files/clotho_metadata_test.csv?download=1", - "hash_value": "52f8ad01c229a310a0ff8043df480e21", - }, - }, - }, - "v2": { - "dev": { - "audio_archive": { - "fname": "clotho_audio_development.7z", - "url": "https://zenodo.org/record/4743815/files/clotho_audio_development.7z?download=1", - "hash_value": 
"eda144a5e05a60e6d2e37a65fc4720a9", - }, - "captions": { - "fname": "clotho_captions_development.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_captions_development.csv?download=1", - "hash_value": "800633304e73d3daed364a2ba6069827", - }, - "metadata": { - "fname": "clotho_metadata_development.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_metadata_development.csv?download=1", - "hash_value": "5fdc51b4c4f3468ff7d251ea563588c9", - }, - }, - "val": { - "audio_archive": { - "fname": "clotho_audio_validation.7z", - "url": "https://zenodo.org/record/4743815/files/clotho_audio_validation.7z?download=1", - "hash_value": "0475bfa5793e80f748d32525018ebada", - }, - "captions": { - "fname": "clotho_captions_validation.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_captions_validation.csv?download=1", - "hash_value": "3109c353138a089c7ba724f27d71595d", - }, - "metadata": { - "fname": "clotho_metadata_validation.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_metadata_validation.csv?download=1", - "hash_value": "f69cfacebcd47c4d8d30d968f9865475", - }, - }, - "eval": { - "audio_archive": { - "fname": "clotho_audio_evaluation.7z", - "url": "https://zenodo.org/record/4743815/files/clotho_audio_evaluation.7z?download=1", - "hash_value": "4569624ccadf96223f19cb59fe4f849f", - }, - "captions": { - "fname": "clotho_captions_evaluation.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_captions_evaluation.csv?download=1", - "hash_value": "1b16b9e57cf7bdb7f13a13802aeb57e2", - }, - "metadata": { - "fname": "clotho_metadata_evaluation.csv", - "url": "https://zenodo.org/record/4743815/files/clotho_metadata_evaluation.csv?download=1", - "hash_value": "13946f054d4e1bf48079813aac61bf77", - }, - }, - "test": { - "audio_archive": { - "fname": "clotho_audio_test.7z", - "url": "https://zenodo.org/record/3865658/files/clotho_audio_test.7z?download=1", - "hash_value": "9b3fe72560a621641ff4351ba1154349", - }, - "metadata": { - 
"fname": "clotho_metadata_test.csv", - "url": "https://zenodo.org/record/3865658/files/clotho_metadata_test.csv?download=1", - "hash_value": "52f8ad01c229a310a0ff8043df480e21", - }, - }, - }, - "v2.1": { - "dev": { - "audio_archive": { - "fname": "clotho_audio_development.7z", - "url": "https://zenodo.org/record/4783391/files/clotho_audio_development.7z?download=1", - "hash_value": "c8b05bc7acdb13895bb3c6a29608667e", - }, - "captions": { - "fname": "clotho_captions_development.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_captions_development.csv?download=1", - "hash_value": "d4090b39ce9f2491908eebf4d5b09bae", - }, - "metadata": { - "fname": "clotho_metadata_development.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_metadata_development.csv?download=1", - "hash_value": "170d20935ecfdf161ce1bb154118cda5", - }, - }, - "val": { - "audio_archive": { - "fname": "clotho_audio_validation.7z", - "url": "https://zenodo.org/record/4783391/files/clotho_audio_validation.7z?download=1", - "hash_value": "7dba730be08bada48bd15dc4e668df59", - }, - "captions": { - "fname": "clotho_captions_validation.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_captions_validation.csv?download=1", - "hash_value": "5879e023032b22a2c930aaa0528bead4", - }, - "metadata": { - "fname": "clotho_metadata_validation.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_metadata_validation.csv?download=1", - "hash_value": "2e010427c56b1ce6008b0f03f41048ce", - }, - }, - "eval": { - "audio_archive": { - "fname": "clotho_audio_evaluation.7z", - "url": "https://zenodo.org/record/4783391/files/clotho_audio_evaluation.7z?download=1", - "hash_value": "4569624ccadf96223f19cb59fe4f849f", - }, - "captions": { - "fname": "clotho_captions_evaluation.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_captions_evaluation.csv?download=1", - "hash_value": "1b16b9e57cf7bdb7f13a13802aeb57e2", - }, - "metadata": { - "fname": 
"clotho_metadata_evaluation.csv", - "url": "https://zenodo.org/record/4783391/files/clotho_metadata_evaluation.csv?download=1", - "hash_value": "13946f054d4e1bf48079813aac61bf77", - }, - }, - "test": { - "audio_archive": { - "fname": "clotho_audio_test.7z", - "url": "https://zenodo.org/record/3865658/files/clotho_audio_test.7z?download=1", - "hash_value": "9b3fe72560a621641ff4351ba1154349", - }, - "metadata": { - "fname": "clotho_metadata_test.csv", - "url": "https://zenodo.org/record/3865658/files/clotho_metadata_test.csv?download=1", - "hash_value": "52f8ad01c229a310a0ff8043df480e21", - }, - }, - "analysis": { - "audio_archive": { - "fname": "clotho_analysis_2022.zip", - "url": "https://zenodo.org/record/6610709/files/clotho_analysis_2022.zip?download=1", - "hash_value": "7e8fa4762cc3a7c5546606680b958d08", - }, - }, - "test_retrieval_audio": { - "audio_archive": { - "fname": "retrieval_audio.7z", - "url": "https://zenodo.org/record/6590983/files/retrieval_audio.7z?download=1", - "hash_value": "24102395fd757c462421a483fba5c407", - }, - "metadata": { - "fname": "retrieval_audio_metadata.csv", - "url": "https://zenodo.org/record/6590983/files/retrieval_audio_metadata.csv?download=1", - "hash_value": "1301db07acbf1e4fabc467eb54e0d353", - }, - }, - "test_retrieval_captions": { - "captions": { - "fname": "retrieval_captions.csv", - "url": "https://zenodo.org/record/6590983/files/retrieval_captions.csv?download=1", - "hash_value": "f9e810118be00c64ea8cd7557816d4fe", - }, - }, - }, -} -CLOTHO_LAST_VERSION = "v2.1" - -CLOTHO_AUDIO_DNAMES = { - "dev": "development", - "eval": "evaluation", - "val": "validation", - "test": "test", - "analysis": "clotho_analysis", - "test_retrieval_audio": "test_retrieval_audio", - "test_retrieval_captions": None, -} - -CAPTIONS_KEYS = ( - "caption_1", - "caption_2", - "caption_3", - "caption_4", - "caption_5", -) -METADATA_KEYS = ( - "keywords", - "sound_id", - "sound_link", - "start_end_samples", - "manufacturer", - "license", -) - - 
-class Clotho(Dataset[Dict[str, Any]]): - r"""Unofficial Clotho PyTorch dataset. - - Subsets available are 'train', 'val', 'eval', 'test' and 'analysis'. - - Audio are waveform sounds of 15 to 30 seconds, sampled at 44100 Hz. - Target is a list of 5 different sentences strings describing an audio sample. - The maximal number of words in captions is 20. - - Clotho V1 Paper : https://arxiv.org/pdf/1910.09387.pdf - - .. code-block:: text - :caption: Dataset folder tree for version 'v2.1' - - {root} - └── CLOTHO_v2.1 - ├── archives - | └── (5 7z files, ~8.9GB) - ├── clotho_audio_files - │ ├── clotho_analysis - │ │ └── (8360 wav files, ~19GB) - │ ├── development - │ │ └── (3839 wav files, ~7.1GB) - │ ├── evaluation - │ │ └── (1045 wav files, ~2.0GB) - │ ├── test - │ | └── (1043 wav files, ~2.0GB) - │ ├── test_retrieval_audio - │ | └── (1000 wav files, ~2.0GB) - │ └── validation - │ └── (1045 wav files, ~2.0GB) - └── clotho_csv_files - ├── clotho_captions_development.csv - ├── clotho_captions_evaluation.csv - ├── clotho_captions_validation.csv - ├── clotho_metadata_development.csv - ├── clotho_metadata_evaluation.csv - ├── clotho_metadata_test.csv - ├── clotho_metadata_validation.csv - ├── retrieval_audio_metadata.csv - └── retrieval_captions.csv - - """ - - # Common globals - AUDIO_N_CHANNELS = 1 - CITATION: str = r""" - @inproceedings{Drossos_2020_icassp, - author = "Drossos, Konstantinos and Lipping, Samuel and Virtanen, Tuomas", - title = "Clotho: an Audio Captioning Dataset", - booktitle = "Proc. IEEE Int. Conf. Acoustic., Speech and Signal Process. (ICASSP)", - year = "2020", - pages = "736-740", - abstract = "Audio captioning is the novel task of general audio content description using free text. It is an intermodal translation task (not speech-to-text), where a system accepts as an input an audio signal and outputs the textual description (i.e. the caption) of that signal. 
In this paper we present Clotho, a dataset for audio captioning consisting of 4981 audio samples of 15 to 30 seconds duration and 24 905 captions of eight to 20 words length, and a baseline method to provide initial results. Clotho is built with focus on audio content and caption diversity, and the splits of the data are not hampering the training or evaluation of methods. All sounds are from the Freesound platform, and captions are crowdsourced using Amazon Mechanical Turk and annotators from English speaking countries. Unique words, named entities, and speech transcription are removed with post-processing. Clotho is freely available online (https://zenodo.org/record/3490684)." - } - """ - DATASET_NAME = "clotho" - FORCE_PREPARE_DATA: bool = False - HOMEPAGE = "https://zenodo.org/record/3490684" - MAX_AUDIO_SEC = 30.0 - MIN_AUDIO_SEC = 15.0 - SAMPLE_RATE = 44100 - SUBSETS = tuple(CLOTHO_LINKS[CLOTHO_LAST_VERSION].keys()) - VERIFY_FILES: bool = True - - # Clotho-specific globals - CAPTION_MAX_LENGTH = 20 - CAPTION_MIN_LENGTH = 8 - CAPTIONS_PER_AUDIO = { - "dev": 5, - "val": 5, - "eval": 5, - "test": 0, - "analysis": 0, - "test_retrieval_audio": 0, - "test_retrieval_captions": 1, - } - CLEAN_ARCHIVES: bool = False - INVALID_SOUND_ID = "Not found" - INVALID_SOUND_LINK = "NA" - INVALID_START_END_SAMPLES = "" - SUBSETS_DICT = { - version: tuple(links.keys()) for version, links in CLOTHO_LINKS.items() - } - VERSIONS = tuple(CLOTHO_LINKS.keys()) - - # Initialization - def __init__( - self, - root: str = ".", - subset: str = "dev", - download: bool = False, - transform: Optional[Callable] = None, - flat_captions: bool = False, - verbose: int = 0, - version: str = "v2.1", - ) -> None: - """ - :param root: The parent of the dataset root directory. - Note: The data is stored in the 'CLOTHO_{version}' subdirectory. - defaults to ".". - :param subset: The subset of Clotho to use. Can be one of :attr:`~Clotho.SUBSETS`. - defaults to "dev". 
- :param download: Download the dataset if download=True and if the dataset is not already downloaded. - defaults to False. - :param transform: The transform to apply to the global dict item. This transform is applied only in getitem method. - defaults to None. - :param flat_captions: If True, map captions to audio instead of audio to caption. - defaults to True. - :param verbose: Verbose level to use. Can be 0 or 1. - defaults to 0. - :param version: The version of the dataset. Can be one of :attr:`~Clotho.VERSIONS`. - defaults to 'v2.1'. - """ - if version not in Clotho.VERSIONS: - raise ValueError( - f"Invalid Clotho argument version={version}. Must be one of {Clotho.VERSIONS}." - ) - - if version == "v2": - pylog.warning( - f"The version '{version}' of the Clotho dataset contains minor some errors in file names and few corrupted files." - f"Please consider using the fixed version 'v2.1'." - ) - - subsets = tuple(CLOTHO_LINKS[version].keys()) - if subset not in subsets: - raise ValueError( - f"Invalid Clotho argument subset={subset} for version={version}. Must be one of {subsets}." 
- ) - - super().__init__() - self._root = root - self._subset = subset - self._download = download - self._transform = transform - self._flat_captions = flat_captions - self._version = version - self._verbose = verbose - - self._all_items = {} - self._loaded = False - - if self._download: - self._prepare_dataset() - self._load_dataset() - - # Properties - @property - def column_names(self) -> List[str]: - """The name of each column of the dataset.""" - column_names = list(CLOTHO_ALL_COLUMNS) - column_names = [ - name - for name in column_names - if name in self._all_items or name not in METADATA_KEYS - ] - - if self._subset in ("test", "test_retrieval_audio", "analysis"): - removed_columns = ("captions",) - elif self._subset == "test_retrieval_captions": - removed_columns = ("audio", "sr", "fname") - else: - removed_columns = () - for name in removed_columns: - column_names.remove(name) - - return column_names - - @property - def info(self) -> Dict[str, Any]: - """Return the global dataset info.""" - return { - "dataset": self.DATASET_NAME, - "subset": self._subset, - "version": self._version, - } - - @property - def shape(self) -> Tuple[int, ...]: - """The shape of the Clotho dataset.""" - return len(self), len(self.column_names) - - # Public methods - @overload - def at(self, idx: int) -> ClothoItem: - ... - - @overload - def at(self, idx: Union[Iterable[int], slice, None]) -> Dict[str, List]: - ... - - @overload - def at(self, idx: Any, column: Any) -> Any: - ... - - def at( - self, - idx: Union[int, Iterable[int], None, slice] = None, - column: Union[str, Iterable[str], None] = None, - ) -> Any: - """Get a specific data field. - - :param index: The index or slice of the value in range [0, len(dataset)-1]. - :param column: The name(s) of the column. Can be any value of :meth:`~Clotho.column_names`. - :returns: The field value. The type depends of the column. 
- """ - if idx is None: - idx = slice(None) - if column is None: - column = self.column_names - - if not isinstance(column, str) and isinstance(column, Iterable): - return {column_i: self.at(idx, column_i) for column_i in column} - - if isinstance(idx, (int, slice)) and column in self._all_items.keys(): - return self._all_items[column][idx] - - if isinstance(idx, slice): - idx = range(len(self))[idx] - - if isinstance(idx, Iterable): - idx = list(idx) - if not all(isinstance(idx_i, int) for idx_i in idx): - raise TypeError( - f"Invalid input type for idx={idx}. (expected Iterable[int], not Iterable[{idx.__class__.__name__}])" - ) - return [self.at(idx_i, column) for idx_i in idx] - - if column == "audio": - fpath = self.at(idx, "fpath") - audio, sr = torchaudio.load(fpath) # type: ignore - - # Sanity check - if audio.nelement() == 0: - raise RuntimeError( - f"Invalid audio number of elements in {fpath}. (expected audio.nelement()={audio.nelement()} > 0)" - ) - if sr != self.SAMPLE_RATE: - raise RuntimeError( - f"Invalid sample rate in {fpath}. (expected {self.SAMPLE_RATE} but found sr={sr})" - ) - return audio - - elif column == "audio_metadata": - fpath = self.at(idx, "fpath") - audio_metadata = torchaudio.info(fpath) # type: ignore - return audio_metadata - - elif column == "dataset": - return self.DATASET_NAME - - elif column == "fpath": - fname = self.at(idx, "fname") - fpath = osp.join(self.__dpath_audio_subset, fname) - return fpath - - elif column == "index": - return idx - - elif column == "num_channels": - audio_metadata = self.at(idx, "audio_metadata") - return audio_metadata.num_channels - - elif column == "num_frames": - audio_metadata = self.at(idx, "audio_metadata") - return audio_metadata.num_frames - - elif column == "sr": - audio_metadata = self.at(idx, "audio_metadata") - return audio_metadata.sample_rate - - elif column == "subset": - return self._subset - - else: - raise ValueError( - f"Invalid argument column={column} at idx={idx}. 
(expected one of {tuple(self.column_names)})" - ) - - def is_loaded(self) -> bool: - """Returns True if the dataset is loaded.""" - return self._loaded - - def set_transform( - self, - transform: Optional[Callable[[Dict[str, Any]], Any]], - ) -> None: - """Set the transform applied to each row.""" - self._transform = transform - - # Magic methods - @overload - def __getitem__(self, idx: int) -> ClothoItem: - ... - - @overload - def __getitem__(self, idx: Union[Iterable[int], slice, None]) -> Dict[str, List]: - ... - - @overload - def __getitem__(self, idx: Any) -> Any: - ... - - def __getitem__(self, idx: Any) -> Any: - if ( - isinstance(idx, tuple) - and len(idx) == 2 - and (isinstance(idx[1], (str, Iterable)) or idx[1] is None) - ): - idx, column = idx - else: - column = None - - item = self.at(idx, column) - if isinstance(idx, int) and column is None and self._transform is not None: - item = self._transform(item) - return item - - def __len__(self) -> int: - """ - :return: The number of items in the dataset. 
- """ - return len(self._all_items["captions"]) - - def __repr__(self) -> str: - return f"Clotho(size={len(self)}, subset={self._subset}, num_columns={len(self.column_names)}, version={self._version})" - - # Private methods - @property - @lru_cache() - def __dpath_archives(self) -> str: - return osp.join(self.__dpath_data, "archives") - - @property - @lru_cache() - def __dpath_audio(self) -> str: - return osp.join(self.__dpath_data, "clotho_audio_files") - - @property - @lru_cache() - def __dpath_audio_subset(self) -> str: - return osp.join( - self.__dpath_data, - "clotho_audio_files", - CLOTHO_AUDIO_DNAMES[self._subset], - ) - - @property - @lru_cache() - def __dpath_csv(self) -> str: - return osp.join(self.__dpath_data, "clotho_csv_files") - - @property - @lru_cache() - def __dpath_data(self) -> str: - return osp.join(self._root, f"CLOTHO_{self._version}") - - def __is_prepared(self) -> bool: - if not all(map(osp.isdir, (self.__dpath_audio, self.__dpath_csv))): - return False - - if Clotho.CAPTIONS_PER_AUDIO[self._subset] == 0: - return True - if CLOTHO_AUDIO_DNAMES[self._subset] is None: - return True - - links = CLOTHO_LINKS[self._version][self._subset] - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(self.__dpath_csv, captions_fname) - with open(captions_fpath, "r") as file: - reader = csv.DictReader(file) - lines = list(reader) - return len(lines) == len(os.listdir(self.__dpath_audio_subset)) - - def _load_dataset(self) -> None: - if not self.__is_prepared(): - raise RuntimeError( - f"Cannot load data: clotho_{self._subset} is not prepared in data root={self._root}. Please use download=True in dataset constructor." 
- ) - - # Read fpath of .wav audio files - links = CLOTHO_LINKS[self._version][self._subset] - - # Read Clotho files - if "captions" in links.keys(): - captions_fname = links["captions"]["fname"] - captions_fpath = osp.join(self.__dpath_csv, captions_fname) - - # Keys: file_name, caption_1, caption_2, caption_3, caption_4, caption_5 - with open(captions_fpath, "r") as file: - reader = csv.DictReader(file) - captions_data = list(reader) - - if self._subset == "test_retrieval_captions": - captions_data = [ - data | {"file_name": f"no_fname_{i}"} - for i, data in enumerate(captions_data) - ] - - else: - captions_data = [] - - if "metadata" in links.keys(): - metadata_fname = links["metadata"]["fname"] - metadata_fpath = osp.join(self.__dpath_csv, metadata_fname) - - # Keys: file_name, keywords, sound_id, sound_link, start_end_samples, manufacturer, license - if self._version in ("v2", "v2.1"): - encoding = "ISO-8859-1" - else: - encoding = None - - with open(metadata_fpath, "r", encoding=encoding) as file: - delimiter = ";" if self._subset == "test" else "," - reader = csv.DictReader(file, delimiter=delimiter) - metadata = list(reader) - else: - metadata = [] - - if "captions" in links.keys(): - # note: "dev", "val", "eval" - fnames_lst = [line["file_name"] for line in captions_data] - elif "metadata" in links.keys(): - # note: for "test" subset which do not have captions CSV file - fnames_lst = [line["file_name"] for line in metadata] - else: - # note 1: for "analysis" subset which do not have any CSV file - # note 2: force sorted list to have the same order on all OS - fnames_lst = list(sorted(os.listdir(self.__dpath_audio_subset))) - - idx_to_fname = {i: fname for i, fname in enumerate(fnames_lst)} - fname_to_idx = {fname: i for i, fname in idx_to_fname.items()} - dataset_size = len(fnames_lst) - - # Process each item field - if len(metadata) > 0: - subset_metadata_keys = [key for key in METADATA_KEYS if key in metadata[0]] - else: - subset_metadata_keys = [] - - 
all_captions_lst = [[] for _ in range(dataset_size)] - - if self._subset != "test_retrieval_captions": - captions_keys = CAPTIONS_KEYS - else: - captions_keys = ("caption",) - - for line in captions_data: - fname = line["file_name"] - idx = fname_to_idx[fname] - all_captions_lst[idx] = [line[caption_key] for caption_key in captions_keys] - - all_metadata_dic: Dict[str, List[Any]] = { - key: [None for _ in range(dataset_size)] for key in subset_metadata_keys - } - for line in metadata: - fname = line["file_name"] - if fname not in fname_to_idx: - raise KeyError( - f"Cannot find metadata fname={fname} in captions file. (subset={self._subset})" - ) - idx = fname_to_idx[fname] - for key in subset_metadata_keys: - # The test subset does not have keywords in metadata, but has sound_id, sound_link, etc. - if key in line: - all_metadata_dic[key][idx] = line[key] - - all_items = { - "fname": fnames_lst, - "captions": all_captions_lst, - } - all_items.update(all_metadata_dic) - - if "keywords" in all_items: - # Split keywords into list[str] - all_items["keywords"] = [ - keywords.split(";") if keywords is not None else [] - for keywords in all_items["keywords"] - ] - - if self._subset == "test_retrieval_audio": - # Temporary patch to avoid file loading errors - # indexes: 53, 521, 677 - replaces = { - "raindrops on metal: police background.wav": "raindrops on metal_police background.wav", - "Intersection Wet : Metro Pass.wav": "Intersection Wet_Metro Pass.wav", - "Kitchen Roomtone w: Dripping Faucet_1-2.wav": "Kitchen Roomtone w_Dripping Faucet_1-2.wav", - } - all_items["fname"] = [ - replaces.get(fname, fname) for fname in all_items["fname"] - ] - - if self._flat_captions and self.CAPTIONS_PER_AUDIO[self._subset] > 1: - all_infos_flatten = {key: [] for key in all_items.keys()} - - for i, captions in enumerate(all_items["captions"]): - for caption in captions: - for key in all_items.keys(): - all_infos_flatten[key].append(all_items[key][i]) - all_infos_flatten["captions"] = 
[caption] - - all_items = all_infos_flatten - - self._all_items = all_items - self._loaded = True - - if self._verbose >= 1: - pylog.info(f"{repr(self)} has been loaded. (len={len(self)})") - - def _prepare_dataset(self) -> None: - if not osp.isdir(self._root): - raise RuntimeError(f"Cannot find root directory '{self._root}'.") - - os.makedirs(self.__dpath_archives, exist_ok=True) - os.makedirs(self.__dpath_audio, exist_ok=True) - os.makedirs(self.__dpath_csv, exist_ok=True) - - if self._verbose >= 1: - pylog.info(f"Start to download files for clotho_{self._subset}...") - - links = copy.deepcopy(CLOTHO_LINKS[self._version][self._subset]) - EXTENSIONS = ("7z", "csv", "zip") - - # Download csv and 7z files - for file_info in links.values(): - fname, url, hash_value = ( - file_info["fname"], - file_info["url"], - file_info["hash_value"], - ) - extension = fname.split(".")[-1] - - if extension in ("7z", "zip"): - dpath = self.__dpath_archives - elif extension == "csv": - dpath = self.__dpath_csv - else: - raise RuntimeError( - f"Found invalid extension={extension}. Must be one of {EXTENSIONS}." 
- ) - - fpath = osp.join(dpath, fname) - if not osp.isfile(fpath) or self.FORCE_PREPARE_DATA: - if self._verbose >= 1: - pylog.info(f"Download and check file '{fname}' from url={url}...") - - download_url_to_file( - url, - fpath, - progress=self._verbose >= 1, - ) - - elif self._verbose >= 1: - pylog.info(f"File '{fname}' is already downloaded.") - - if self.VERIFY_FILES: - valid = validate_file(fpath, hash_value, hash_type="md5") - if not valid: - raise RuntimeError(f"Invalid checksum for file {fname}.") - elif self._verbose >= 2: - pylog.debug(f"File '{fname}' has a valid checksum.") - - # Extract audio files from archives - for file_info in links.values(): - fname = file_info["fname"] - extension = fname.split(".")[-1] - - if extension == "csv": - continue - - if extension not in ("7z", "zip"): - pylog.error( - f"Found unexpected extension={extension} for downloaded file '{fname}'. Expected one of {EXTENSIONS}." - ) - continue - - fpath = osp.join(self.__dpath_archives, fname) - - if self._verbose >= 1: - pylog.info(f"Extract archive file fname={fname}...") - - if extension == "7z": - archive_file = SevenZipFile(fpath) - compressed_fnames = [ - osp.basename(fname) for fname in archive_file.getnames() - ] - elif extension == "zip": - archive_file = ZipFile(fpath) - compressed_fnames = [ - osp.basename(file.filename) for file in archive_file.filelist - ] - else: - raise RuntimeError(f"Invalid extension '{extension}'.") - - # Ignore dir name from archive file - compressed_fnames = [ - fname for fname in compressed_fnames if fname.endswith(".wav") - ] - extracted_fnames = ( - os.listdir(self.__dpath_audio_subset) - if osp.isdir(self.__dpath_audio_subset) - else [] - ) - - if set(extracted_fnames) != set(compressed_fnames): - # For test_retrieval_audio subset, the name of the audio dname is also "test", so we need to move the audio files to another folder named "test_retrieval_audio". 
- if self._subset == "test_retrieval_audio": - target_dpath = self.__dpath_audio_subset - os.makedirs(target_dpath, exist_ok=True) - else: - target_dpath = self.__dpath_audio - - archive_file.extractall(target_dpath) - - if self._subset == "test_retrieval_audio": - extracted_dpath = osp.join(target_dpath, "test") - for fname in os.listdir(extracted_dpath): - os.rename( - osp.join(extracted_dpath, fname), - osp.join(target_dpath, fname), - ) - os.rmdir(extracted_dpath) - - # Check if files is good now - extracted_fnames = os.listdir(self.__dpath_audio_subset) - if set(extracted_fnames) != set(compressed_fnames): - found_but_not_expected = len( - set(extracted_fnames).difference(set(compressed_fnames)) - ) - expected_but_not_found = len( - set(compressed_fnames).difference(set(extracted_fnames)) - ) - - raise RuntimeError( - f"Invalid number of audios extracted, found {len(extracted_fnames)} files but expected the same {len(compressed_fnames)} files. " - f"(with found_but_not_expected={found_but_not_expected} and expected_but_not_found={expected_but_not_found})" - ) - - archive_file.close() - - if self.CLEAN_ARCHIVES: - for file_info in links.values(): - fname = file_info["fname"] - extension = fname.split(".")[-1] - if extension not in ("7z", "zip"): - continue - - fpath = osp.join(self.__dpath_audio, fname) - if self._verbose >= 1: - pylog.info(f"Removing archive file {osp.basename(fpath)}...") - os.remove(fpath) - - if self._verbose >= 2: - pylog.debug( - f"Dataset {self.__class__.__name__} ({self._subset}) has been prepared." 
- ) diff --git a/src/aac_datasets/datasets/legacy/macs.py b/src/aac_datasets/datasets/legacy/macs.py deleted file mode 100644 index c771ef9..0000000 --- a/src/aac_datasets/datasets/legacy/macs.py +++ /dev/null @@ -1,732 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import copy -import csv -import logging -import os -import os.path as osp -import shutil -import zipfile - -from functools import lru_cache -from typing import Any, Callable, Dict, Iterable, List, Optional, Tuple, Union, overload - -import torchaudio -import yaml - -from torch import Tensor -from torch.hub import download_url_to_file -from torch.utils.data.dataset import Dataset -from typing_extensions import TypedDict - -from aac_datasets.utils.download import validate_file - - -pylog = logging.getLogger(__name__) - - -class MACSItem(TypedDict): - r"""Dataclass representing a single MACS item.""" - - # Common attributes - audio: Tensor - captions: List[str] - dataset: str - fname: str - index: int - subset: str - sr: int - # MACS-specific attributes - annotators_ids: List[str] - competences: List[float] - identifier: str - scene_label: str - tags: List[List[str]] - - -MACS_ALL_COLUMNS = tuple(MACSItem.__required_keys__ | MACSItem.__optional_keys__) - - -class MACS(Dataset[Dict[str, Any]]): - r"""Unofficial MACS PyTorch dataset. - - .. 
code-block:: text - :caption: Dataset folder tree - - {root} - └── MACS - ├── audio - │ └── (3930 wav files, ~13GB) - ├── LICENCE.txt - ├── MACS.yaml - ├── MACS_competence.csv - └── tau_meta - ├── fold1_evaluate.csv - ├── fold1_test.csv - ├── fold1_train.csv - └── meta.csv - """ - # Common globals - AUDIO_N_CHANNELS = 2 - CITATION: str = r""" - @inproceedings{Martin2021b, - title = {Diversity and Bias in Audio Captioning Datasets}, - author = {Martin, Irene and Mesaros, Annamaria}, - year = 2021, - month = {November}, - booktitle = {Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)}, - address = {Barcelona, Spain}, - pages = {90--94}, - isbn = {978-84-09-36072-7}, - url = {https://dcase.community/documents/workshop2021/proceedings/DCASE2021Workshop_Martin_34.pdf}, - abstract = {Describing soundscapes in sentences allows better understanding of the acoustic scene than a single label indicating the acoustic scene class or a set of audio tags indicating the sound events active in the audio clip. In addition, the richness of natural language allows a range of possible descriptions for the same acoustic scene. In this work, we address the diversity obtained when collecting descriptions of soundscapes using crowdsourcing. We study how much the collection of audio captions can be guided by the instructions given in the annotation task, by analysing the possible bias introduced by auxiliary information provided in the annotation process. Our study shows that even when given hints on the audio content, different annotators describe the same soundscape using different vocabulary. In automatic captioning, hints provided as audio tags represent grounding textual information that facilitates guiding the captioning output towards specific concepts. 
We also release a new dataset of audio captions and audio tags produced by multiple annotators for a subset of the TAU Urban Acoustic Scenes 2018 dataset, suitable for studying guided captioning.}, - doi. = {10.5281/zenodo.5770113} - } - """ - DATASET_NAME = "macs" - FORCE_PREPARE_DATA: bool = False - HOMEPAGE = "https://zenodo.org/record/5114771" - MAX_AUDIO_SEC = 10.000020833333334 - MIN_AUDIO_SEC = 9.999979166666666 - SAMPLE_RATE = 48000 # in Hz - SUBSETS = ("full",) - VERIFY_FILES: bool = True - - # MACS-specific globals - AUDIO_MAX_SIZE = 480001 - AUDIO_MIN_SIZE = 479999 - CLEAN_ARCHIVES: bool = False - MAX_CAPTIONS_PER_AUDIO = {"full": 5} - MIN_CAPTIONS_PER_AUDIO = {"full": 2} - - # Initialization - def __init__( - self, - root: str = ".", - subset: str = "full", - download: bool = False, - transform: Optional[Callable] = None, - flat_captions: bool = False, - verbose: int = 0, - ) -> None: - """ - :param root: The parent of the dataset root directory. - The data will be stored in the 'MACS' subdirectory. - defaults to ".". - :param subset: The subset of the dataset. This parameter is here only to accept the same interface than the other datasets. - The only valid subset is "full" and other values will raise a ValueError. - defaults to "full". - :param download: Download the dataset if download=True and if the dataset is not already downloaded. - defaults to False. - :param transform: The transform to apply to the global dict item. This transform is applied only in getitem method. - defaults to None. - :param flat_captions: If True, map captions to audio instead of audio to caption. - defaults to True. - :param verbose: Verbose level to use. Can be 0 or 1. - defaults to 0. - """ - if subset not in self.SUBSETS: - raise ValueError( - f"Invalid argument subset={subset} for MACS. 
(expected one of {self.SUBSETS})" - ) - - super().__init__() - self._root = root - self._subset = subset - self._download = download - self._transform = transform - self._flat_captions = flat_captions - self._verbose = verbose - - self._annotator_id_to_competence = {} - self._all_items = {} - self._loaded = False - - if self._download: - self._prepare_dataset() - self._load_dataset() - - # Properties - @property - def column_names(self) -> List[str]: - """The name of each column of the dataset.""" - column_names = list(MACS_ALL_COLUMNS) - return column_names - - @property - def info(self) -> Dict[str, Any]: - """Return the global dataset info.""" - return { - "dataset": self.DATASET_NAME, - "subset": self._subset, - } - - @property - def shape(self) -> Tuple[int, ...]: - """The shape of the MACS dataset.""" - return len(self), len(self.column_names) - - # Public methods - @overload - def at(self, idx: int) -> MACSItem: - ... - - @overload - def at(self, idx: Union[Iterable[int], slice, None]) -> Dict[str, List]: - ... - - @overload - def at(self, idx: Any, column: Any) -> Any: - ... - - def at( - self, - idx: Union[int, Iterable[int], None, slice] = None, - column: Union[str, Iterable[str], None] = None, - ) -> Any: - """Get a specific data field. - - :param index: The index or slice of the value in range [0, len(dataset)-1]. - :param column: The name(s) of the column. Can be any value of :meth:`~MACS.column_names`. - :returns: The field value. The type depends of the column. 
- """ - if idx is None: - idx = slice(None) - if column is None: - column = self.column_names - - if not isinstance(column, str) and isinstance(column, Iterable): - return {column_i: self.at(idx, column_i) for column_i in column} - - if isinstance(idx, (int, slice)) and column in self._all_items.keys(): - return self._all_items[column][idx] - - if isinstance(idx, slice): - idx = range(len(self))[idx] - - if isinstance(idx, Iterable): - idx = list(idx) - if not all(isinstance(idx_i, int) for idx_i in idx): - raise TypeError( - f"Invalid input type for idx={idx}. (expected Iterable[int], not Iterable[{idx.__class__.__name__}])" - ) - return [self.at(idx_i, column) for idx_i in idx] - - if column == "audio": - fpath = self.at(idx, "fpath") - audio, sr = torchaudio.load(fpath) # type: ignore - - # Sanity check - if audio.nelement() == 0: - raise RuntimeError( - f"Invalid audio number of elements in {fpath}. (expected audio.nelement()={audio.nelement()} > 0)" - ) - if sr != self.SAMPLE_RATE: - raise RuntimeError( - f"Invalid sample rate in {fpath}. 
(expected {self.SAMPLE_RATE} but found sr={sr})" - ) - return audio - - elif column == "audio_metadata": - fpath = self.at(idx, "fpath") - audio_metadata = torchaudio.info(fpath) # type: ignore - return audio_metadata - - elif column == "competences": - annotators_ids = self.at(idx, "annotators_ids") - competences = [self.get_competence(id_) for id_ in annotators_ids] - return competences - - elif column == "dataset": - return self.DATASET_NAME - - elif column == "fpath": - fname = self.at(idx, "fname") - fpath = osp.join(self.__dpath_audio, fname) - return fpath - - elif column == "index": - return idx - - elif column == "num_channels": - audio_metadata = self.at(idx, "audio_metadata") - return audio_metadata.num_channels - - elif column == "num_frames": - audio_metadata = self.at(idx, "audio_metadata") - return audio_metadata.num_frames - - elif column == "sr": - audio_metadata = self.at(idx, "audio_metadata") - return audio_metadata.sample_rate - - elif column == "subset": - return self._subset - - else: - raise ValueError( - f"Invalid argument column={column} at idx={idx}. (expected one of {tuple(self.column_names)})" - ) - - def get_annotator_id_to_competence_dict(self) -> Dict[int, float]: - """Get annotator to competence dictionary.""" - # Note : copy to prevent any changes on this attribute - return copy.deepcopy(self._annotator_id_to_competence) - - def get_competence(self, annotator_id: int) -> float: - """Get competence value for a specific annotator id.""" - return self._annotator_id_to_competence[annotator_id] - - def is_loaded(self) -> bool: - """Returns True if the dataset is loaded.""" - return self._loaded - - def set_transform( - self, - transform: Optional[Callable[[Dict[str, Any]], Any]], - ) -> None: - """Set the transform applied to each row.""" - self._transform = transform - - # Magic methods - @overload - def __getitem__(self, idx: int) -> MACSItem: - ... 
- - @overload - def __getitem__(self, idx: Union[Iterable[int], slice, None]) -> Dict[str, List]: - ... - - @overload - def __getitem__(self, idx: Any) -> Any: - ... - - def __getitem__(self, idx: Any) -> Any: - if ( - isinstance(idx, tuple) - and len(idx) == 2 - and (isinstance(idx[1], (str, Iterable)) or idx[1] is None) - ): - idx, column = idx - else: - column = None - - item = self.at(idx, column) - if isinstance(idx, int) and column is None and self._transform is not None: - item = self._transform(item) - return item - - def __len__(self) -> int: - return len(self._all_items["captions"]) - - def __repr__(self) -> str: - return f"MACS(size={len(self)}, subset={self._subset}, num_columns={len(self.column_names)})" - - # Private methods - @property - @lru_cache() - def __dpath_archives(self) -> str: - return osp.join(self.__dpath_data, "archives") - - @property - @lru_cache() - def __dpath_audio(self) -> str: - return osp.join(self.__dpath_data, "audio") - - @property - @lru_cache() - def __dpath_data(self) -> str: - return osp.join(self._root, "MACS") - - @property - @lru_cache() - def __dpath_tau_meta(self) -> str: - return osp.join(self.__dpath_data, "tau_meta") - - def __is_prepared(self) -> bool: - if not osp.isdir(self.__dpath_audio): - return False - captions_fpath = osp.join(self.__dpath_data, MACS_FILES["captions"]["fname"]) - if not osp.isfile(captions_fpath): - return False - - with open(captions_fpath, "r") as file: - data = yaml.safe_load(file) - data = data["files"] - fnames = os.listdir(self.__dpath_audio) - return len(data) == len(fnames) - - def _load_dataset(self) -> None: - if not self.__is_prepared(): - raise RuntimeError( - f"Cannot load data: macs is not prepared in data root={self._root}. Please use download=True in dataset constructor." 
- ) - - # Read data files - captions_fname = MACS_FILES["captions"]["fname"] - captions_fpath = osp.join(self.__dpath_data, captions_fname) - if self._verbose >= 2: - pylog.debug(f"Reading captions file {captions_fname}...") - - with open(captions_fpath, "r") as file: - caps_data = yaml.safe_load(file) - - tau_meta_fname = "meta.csv" - tau_meta_fpath = osp.join(self.__dpath_tau_meta, tau_meta_fname) - if self._verbose >= 2: - pylog.debug( - f"Reading Tau Urban acoustic scene meta file {tau_meta_fname}..." - ) - - with open(tau_meta_fpath, "r") as file: - reader = csv.DictReader(file, delimiter="\t") - tau_tags_data = list(reader) - - competence_fname = "MACS_competence.csv" - competence_fpath = osp.join(self.__dpath_data, competence_fname) - if self._verbose >= 2: - pylog.debug(f"Reading file {competence_fname}...") - - with open(competence_fpath, "r") as file: - reader = csv.DictReader(file, delimiter="\t") - competences_data = list(reader) - - # Store MACS data - all_items: Dict[str, List[Any]] = { - "fname": [item["filename"] for item in caps_data["files"]], - "captions": [ - [subitem["sentence"] for subitem in item["annotations"]] - for item in caps_data["files"] - ], - "tags": [ - [subitem["tags"] for subitem in item["annotations"]] - for item in caps_data["files"] - ], - "annotators_ids": [ - [subitem["annotator_id"] for subitem in item["annotations"]] - for item in caps_data["files"] - ], - } - dataset_size = len(all_items["fname"]) - - # Build global mappings - fname_to_idx = {fname: i for i, fname in enumerate(all_items["fname"])} - annotator_id_to_competence = { - int(annotator["annotator_id"]): float(annotator["competence"]) - for annotator in competences_data - } - - # Store TAU Urban acoustic scenes data - tau_additional_keys = ("scene_label", "identifier") - all_items.update( - {key: [None for _ in range(dataset_size)] for key in tau_additional_keys} - ) - - tau_meta_fpath = osp.join(self.__dpath_tau_meta, "meta.csv") - for tau_tags in tau_tags_data: 
- fname = osp.basename(tau_tags["filename"]) - if fname in fname_to_idx: - idx = fname_to_idx[fname] - for key in tau_additional_keys: - all_items[key][idx] = tau_tags[key] - - if self._flat_captions and self.MIN_CAPTIONS_PER_AUDIO[self._subset] > 1: - all_infos_flatten = {key: [] for key in all_items.keys()} - - for i, captions in enumerate(all_items["captions"]): - for caption in captions: - for key in all_items.keys(): - all_infos_flatten[key].append(all_items[key][i]) - all_infos_flatten["captions"] = [caption] - - all_items = all_infos_flatten - - # Sanity checks - assert all( - all(value is not None for value in all_items[key]) - for key in tau_additional_keys - ) - assert all(len(values) == dataset_size for values in all_items.values()) - - # Set attributes - self._all_items = all_items - self._annotator_id_to_competence = annotator_id_to_competence - self._loaded = True - - if self._verbose >= 1: - pylog.info(f"{repr(self)} has been loaded. (len={len(self)})") - - def _prepare_dataset(self) -> None: - if not osp.isdir(self._root): - raise RuntimeError(f"Cannot find root directory '{self._root}'.") - - os.makedirs(self.__dpath_archives, exist_ok=True) - os.makedirs(self.__dpath_audio, exist_ok=True) - os.makedirs(self.__dpath_tau_meta, exist_ok=True) - - # Download MACS specific files - for file_info in MACS_FILES.values(): - dpath = self.__dpath_data - fname = file_info["fname"] - fpath = osp.join(dpath, fname) - - if not osp.isfile(fpath) or self.FORCE_PREPARE_DATA: - if self._verbose >= 1: - pylog.info(f"Downloading captions file '{fname}'...") - - url = file_info["url"] - download_url_to_file( - url, - fpath, - progress=self._verbose >= 1, - ) - - if self.VERIFY_FILES: - hash_value = file_info["hash_value"] - valid = validate_file(fpath, hash_value, hash_type="md5") - if not valid: - raise RuntimeError(f"Invalid checksum for file {fname}.") - elif self._verbose >= 2: - pylog.debug(f"File '{fname}' has a valid checksum.") - - captions_fpath = 
osp.join(self.__dpath_data, MACS_FILES["captions"]["fname"]) - with open(captions_fpath, "r") as file: - captions_data = yaml.safe_load(file) - captions_data = captions_data["files"] - - # Download TAU Urban Sound audio archives files - for i, file_info in enumerate(MACS_ARCHIVES_FILES.values()): - dpath = self.__dpath_archives - zip_fname = file_info["fname"] - zip_fpath = osp.join(dpath, zip_fname) - - if not osp.isfile(zip_fpath) or self.FORCE_PREPARE_DATA: - if self._verbose >= 1: - pylog.info( - f"Downloading audio zip file '{zip_fpath}'... ({i+1}/{len(MACS_ARCHIVES_FILES)})" - ) - - url = file_info["url"] - download_url_to_file( - url, - zip_fpath, - progress=self._verbose >= 1, - ) - - if self.VERIFY_FILES: - hash_value = file_info["hash_value"] - valid = validate_file(zip_fpath, hash_value, hash_type="md5") - if not valid: - raise RuntimeError(f"Invalid checksum for file {zip_fname}.") - elif self._verbose >= 2: - pylog.debug(f"File '{zip_fname}' has a valid checksum.") - - # Extract files from TAU Urban Sound archives - macs_fnames = dict.fromkeys(data["filename"] for data in captions_data) - for i, (name, file_info) in enumerate(MACS_ARCHIVES_FILES.items()): - zip_fname = file_info["fname"] - zip_fpath = osp.join(self.__dpath_archives, zip_fname) - - if self._verbose >= 2: - pylog.debug( - f"Check to extract TAU Urban acoustic scenes archive zip_fname={zip_fname}..." 
- ) - - is_audio_archive = name.startswith("audio") - target_dpath = ( - self.__dpath_audio if is_audio_archive else self.__dpath_tau_meta - ) - - with zipfile.ZipFile(zip_fpath, "r") as file: - members_to_extract = [ - member - for member in file.namelist() - # Extract member if file if in captions yaml file and if the audio file is not already downloaded - if ( - (osp.basename(member) in macs_fnames or not is_audio_archive) - and not osp.isfile(osp.join(target_dpath, osp.basename(member))) - ) - ] - - if self._verbose >= 1: - pylog.info( - f"Extracting {len(members_to_extract)}/{len(file.namelist())} audio files from ZIP file '{zip_fname}'... ({i+1}/{len(MACS_ARCHIVES_FILES)})" - ) - - if len(members_to_extract) > 0: - file.extractall(self.__dpath_archives, members_to_extract) - for member in members_to_extract: - extracted_fpath = osp.join(self.__dpath_archives, member) - target_fpath = osp.join(target_dpath, osp.basename(member)) - shutil.move(extracted_fpath, target_fpath) - - if self.CLEAN_ARCHIVES: - if self._verbose >= 1: - pylog.info(f"Removing archives files in {self.__dpath_archives}...") - shutil.rmtree(self.__dpath_archives, ignore_errors=True) - - audio_fnames = [ - name for name in os.listdir(self.__dpath_audio) if name.endswith(".wav") - ] - assert len(audio_fnames) == len(macs_fnames) - - if self._verbose >= 2: - pylog.debug( - f"Dataset {self.__class__.__name__} ({self._subset}) has been prepared." - ) - - -# MACS-specific files links. 
-MACS_FILES = { - "licence": { - "fname": "LICENSE.txt", - "url": "https://zenodo.org/record/5114771/files/LICENSE.txt?download=1", - "hash_value": "d3086f4517cccc32c1bb3a081b07cfa1", - }, - "captions": { - "fname": "MACS.yaml", - "url": "https://zenodo.org/record/5114771/files/MACS.yaml?download=1", - "hash_value": "23fcb2ebd0b109094034ef9e87972256", - }, - "annotators_competences": { - "fname": "MACS_competence.csv", - "url": "https://zenodo.org/record/5114771/files/MACS_competence.csv?download=1", - "hash_value": "4dfe9f951f0af9f29cb7952ec030370a", - }, -} - -# TAU_URBAN_ACOUSTIC archives files links. -TAU_URBAN_ACOUSTIC_DEV_FILES = { - "audio.1": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.1.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.1.zip?download=1", - "hash_value": "aca4ebfd9ed03d5f747d6ba8c24bc728", - }, - "audio.10": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.10.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.10.zip?download=1", - "hash_value": "0ffbf60006da520cc761fb74c878b98b", - }, - "audio.11": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.11.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.11.zip?download=1", - "hash_value": "599055d93b4c11057c29be2df54538d4", - }, - "audio.12": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.12.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.12.zip?download=1", - "hash_value": "98b8d162ff3665695c4c910e6c372cc8", - }, - "audio.13": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.13.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.13.zip?download=1", - "hash_value": "a356c08b1a5a21d433eba37ef87587f4", - }, - "audio.14": { - "fname": 
"TAU-urban-acoustic-scenes-2019-development.audio.14.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.14.zip?download=1", - "hash_value": "f8969771e7faf7dd471d1cf78b0cf011", - }, - "audio.15": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.15.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.15.zip?download=1", - "hash_value": "4758c4b0fb7484faa632266e78850820", - }, - "audio.16": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.16.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.16.zip?download=1", - "hash_value": "a18acad9ede8ea76574216feb887f0bc", - }, - "audio.17": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.17.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.17.zip?download=1", - "hash_value": "1af7703484632f340da5c33662dc9632", - }, - "audio.18": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.18.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.18.zip?download=1", - "hash_value": "b67402bf3e08f4da394a7c18756c0fd2", - }, - "audio.19": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.19.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.19.zip?download=1", - "hash_value": "035db315f19106eb848b6f9b32bcc47c", - }, - "audio.2": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.2.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.2.zip?download=1", - "hash_value": "c4f170408ce77c8c70c532bf268d7be0", - }, - "audio.20": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.20.zip", - "url": 
"https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.20.zip?download=1", - "hash_value": "9cb28c74911bf8a3eadcf53f50a5b5d6", - }, - "audio.21": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.21.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.21.zip?download=1", - "hash_value": "0e44ed85c88ec036a9725b4dd1dfaea0", - }, - "audio.3": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.3.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.3.zip?download=1", - "hash_value": "c7214a07211f10f3250290d05e72c37e", - }, - "audio.4": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.4.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.4.zip?download=1", - "hash_value": "a6a62110f6699cf4432072acb1dffda6", - }, - "audio.5": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.5.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.5.zip?download=1", - "hash_value": "091a0b6d3c84b8e60e46940aa7d4a8a0", - }, - "audio.6": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.6.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.6.zip?download=1", - "hash_value": "114f4ca13e074391b98a1cfd8140de65", - }, - "audio.7": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.7.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.7.zip?download=1", - "hash_value": "5951dd2968f7a514e2afbe279c4f060d", - }, - "audio.8": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.8.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.8.zip?download=1", - "hash_value": "b0b63dc95b327e1509857c8d8a663cc3", - }, - "audio.9": { - 
"fname": "TAU-urban-acoustic-scenes-2019-development.audio.9.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.9.zip?download=1", - "hash_value": "3c32a693a6b111ffb957be3c1dd22e9b", - }, - "doc": { - "fname": "TAU-urban-acoustic-scenes-2019-development.doc.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.doc.zip?download=1", - "hash_value": "1f6879544e80da70099a191613e7e51f", - }, - "meta": { - "fname": "TAU-urban-acoustic-scenes-2019-development.meta.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.meta.zip?download=1", - "hash_value": "09782f2097e4735687af73c44919329c", - }, -} - -# List of TAU_URBAN_ACOUSTIC archives containing at least 1 MACS audio. -MACS_ARCHIVES_FILES = { - name: TAU_URBAN_ACOUSTIC_DEV_FILES[name] - for name in ( - "audio.1", - "audio.10", - "audio.11", - "audio.12", - "audio.13", - "audio.2", - "audio.3", - "audio.9", - "meta", - ) -} diff --git a/src/aac_datasets/datasets/macs.py b/src/aac_datasets/datasets/macs.py index 755f5e9..44eaaf0 100644 --- a/src/aac_datasets/datasets/macs.py +++ b/src/aac_datasets/datasets/macs.py @@ -2,32 +2,30 @@ # -*- coding: utf-8 -*- import copy -import csv import logging -import os import os.path as osp -import shutil -import zipfile +from pathlib import Path from typing import ( - Any, Callable, ClassVar, Dict, List, Optional, - Tuple, + Union, ) -import yaml - from torch import Tensor -from torch.hub import download_url_to_file from typing_extensions import TypedDict -from aac_datasets.datasets.base import AACDataset, DatasetCard -from aac_datasets.utils.download import hash_file -from aac_datasets.utils.paths import _get_root +from aac_datasets.datasets.base import AACDataset +from aac_datasets.datasets.functional.macs import ( + MACSCard, + load_macs_dataset, + download_macs_dataset, + _get_audio_dpath, +) +from aac_datasets.utils.globals import 
_get_root pylog = logging.getLogger(__name__) @@ -52,38 +50,6 @@ class MACSItem(TypedDict): tags: List[List[str]] -class MACSCard(DatasetCard): - ANNOTATIONS_CREATORS: Tuple[str, ...] = ("crowdsourced",) - CITATION: str = r""" - @inproceedings{Martin2021b, - title = {Diversity and Bias in Audio Captioning Datasets}, - author = {Martin, Irene and Mesaros, Annamaria}, - year = 2021, - month = {November}, - booktitle = {Proceedings of the 6th Detection and Classification of Acoustic Scenes and Events 2021 Workshop (DCASE2021)}, - address = {Barcelona, Spain}, - pages = {90--94}, - isbn = {978-84-09-36072-7}, - url = {https://dcase.community/documents/workshop2021/proceedings/DCASE2021Workshop_Martin_34.pdf}, - abstract = {Describing soundscapes in sentences allows better understanding of the acoustic scene than a single label indicating the acoustic scene class or a set of audio tags indicating the sound events active in the audio clip. In addition, the richness of natural language allows a range of possible descriptions for the same acoustic scene. In this work, we address the diversity obtained when collecting descriptions of soundscapes using crowdsourcing. We study how much the collection of audio captions can be guided by the instructions given in the annotation task, by analysing the possible bias introduced by auxiliary information provided in the annotation process. Our study shows that even when given hints on the audio content, different annotators describe the same soundscape using different vocabulary. In automatic captioning, hints provided as audio tags represent grounding textual information that facilitates guiding the captioning output towards specific concepts. We also release a new dataset of audio captions and audio tags produced by multiple annotators for a subset of the TAU Urban Acoustic Scenes 2018 dataset, suitable for studying guided captioning.}, - doi. 
= {10.5281/zenodo.5770113} - } - """ - DESCRIPTION = "Multi-Annotator Captioned Soundscapes dataset." - HOMEPAGE: str = "https://zenodo.org/record/5114771" - LANGUAGE: Tuple[str, ...] = ("en",) - LANGUAGE_DETAILS: Tuple[str, ...] = ("en-US",) - MAX_CAPTIONS_PER_AUDIO: Dict[str, int] = {"full": 5} - MIN_CAPTIONS_PER_AUDIO: Dict[str, int] = {"full": 2} - NAME: str = "macs" - N_CHANNELS: int = 2 - PRETTY_NAME: str = "MACS" - SAMPLE_RATE: int = 48_000 # Hz - SIZE_CATEGORIES: Tuple[str, ...] = ("1K None: """ :param root: The parent of the dataset root directory. @@ -133,10 +100,17 @@ def __init__( defaults to False. :param transform: The transform to apply to the global dict item. This transform is applied only in getitem method when argument is an integer. defaults to None. - :param flat_captions: If True, map captions to audio instead of audio to caption. - defaults to True. :param verbose: Verbose level to use. Can be 0 or 1. defaults to 0. + :param force_download: If True, force to re-download file even if they exists on disk. + defaults to False. + :param verify_files: If True, check hash value when possible. + defaults to False. + + :param clean_archives: If True, remove the compressed archives from disk to save space. + defaults to True. + :param flat_captions: If True, map captions to audio instead of audio to caption. + defaults to True. 
""" if subset not in MACSCard.SUBSETS: raise ValueError( @@ -146,16 +120,16 @@ def __init__( root = _get_root(root) if download: - _prepare_macs_dataset( + download_macs_dataset( root=root, subset=subset, + force=force_download, verbose=verbose, - force=MACS.FORCE_PREPARE_DATA, - verify_files=MACS.VERIFY_FILES, - clean_archives=MACS.CLEAN_ARCHIVES, + clean_archives=clean_archives, + verify_files=verify_files, ) - raw_data, annotator_id_to_competence = _load_macs_dataset( + raw_data, annotator_id_to_competence = load_macs_dataset( root=root, subset=subset, verbose=verbose, @@ -186,7 +160,7 @@ def __init__( self._verbose = verbose self._annotator_id_to_competence = annotator_id_to_competence - self.add_post_columns( + self.add_online_columns( { "audio": MACS._load_audio, "audio_metadata": MACS._load_audio_metadata, @@ -239,412 +213,3 @@ def __repr__(self) -> str: } repr_str = ", ".join(f"{k}={v}" for k, v in repr_dic.items()) return f"{MACSCard.PRETTY_NAME}({repr_str})" - - -def _get_macs_dpath(root: str) -> str: - return osp.join(root, "MACS") - - -def _get_archives_dpath(root: str) -> str: - return osp.join(_get_macs_dpath(root), "archives") - - -def _get_audio_dpath(root: str) -> str: - return osp.join(_get_macs_dpath(root), "audio") - - -def _get_tau_meta_dpath(root: str) -> str: - return osp.join(_get_macs_dpath(root), "tau_meta") - - -def _is_prepared(root: str) -> bool: - audio_dpath = _get_audio_dpath(root) - if not osp.isdir(audio_dpath): - return False - captions_fpath = osp.join(_get_macs_dpath(root), MACS_FILES["captions"]["fname"]) - if not osp.isfile(captions_fpath): - return False - - with open(captions_fpath, "r") as file: - data = yaml.safe_load(file) - data = data["files"] - fnames = os.listdir(audio_dpath) - return len(data) == len(fnames) - - -def _load_macs_dataset( - root: str, subset: str, verbose: int -) -> Tuple[Dict[str, List[Any]], Dict[int, float]]: - if not _is_prepared(root): - raise RuntimeError( - f"Cannot load data: macs is not 
prepared in data root={root}. Please use download=True in dataset constructor." - ) - - macs_dpath = _get_macs_dpath(root) - tau_meta_dpath = _get_tau_meta_dpath(root) - - # Read data files - captions_fname = MACS_FILES["captions"]["fname"] - captions_fpath = osp.join(macs_dpath, captions_fname) - if verbose >= 2: - pylog.debug(f"Reading captions file {captions_fname}...") - - with open(captions_fpath, "r") as file: - caps_data = yaml.safe_load(file) - - tau_meta_fname = "meta.csv" - tau_meta_fpath = osp.join(tau_meta_dpath, tau_meta_fname) - if verbose >= 2: - pylog.debug(f"Reading Tau Urban acoustic scene meta file {tau_meta_fname}...") - - with open(tau_meta_fpath, "r") as file: - reader = csv.DictReader(file, delimiter="\t") - tau_tags_data = list(reader) - - competence_fname = "MACS_competence.csv" - competence_fpath = osp.join(macs_dpath, competence_fname) - if verbose >= 2: - pylog.debug(f"Reading file {competence_fname}...") - - with open(competence_fpath, "r") as file: - reader = csv.DictReader(file, delimiter="\t") - competences_data = list(reader) - - # Store MACS data - raw_data: Dict[str, List[Any]] = { - "fname": [item["filename"] for item in caps_data["files"]], - "captions": [ - [subitem["sentence"] for subitem in item["annotations"]] - for item in caps_data["files"] - ], - "tags": [ - [subitem["tags"] for subitem in item["annotations"]] - for item in caps_data["files"] - ], - "annotators_ids": [ - [subitem["annotator_id"] for subitem in item["annotations"]] - for item in caps_data["files"] - ], - } - dataset_size = len(raw_data["fname"]) - - # Build global mappings - fname_to_idx = {fname: i for i, fname in enumerate(raw_data["fname"])} - annotator_id_to_competence = { - int(annotator["annotator_id"]): float(annotator["competence"]) - for annotator in competences_data - } - - # Store TAU Urban acoustic scenes data - tau_additional_keys = ("scene_label", "identifier") - raw_data.update( - {key: [None for _ in range(dataset_size)] for key in 
tau_additional_keys} - ) - - tau_meta_fpath = osp.join(tau_meta_dpath, "meta.csv") - for tau_tags in tau_tags_data: - fname = osp.basename(tau_tags["filename"]) - if fname in fname_to_idx: - idx = fname_to_idx[fname] - for key in tau_additional_keys: - raw_data[key][idx] = tau_tags[key] - - # Sanity checks - assert all( - all(value is not None for value in raw_data[key]) for key in tau_additional_keys - ) - assert all(len(values) == dataset_size for values in raw_data.values()) - - if verbose >= 1: - pylog.info( - f"Dataset {MACSCard.PRETTY_NAME} ({subset}) has been loaded. (len={len(next(iter(raw_data.values())))})" - ) - - return raw_data, annotator_id_to_competence - - -def _prepare_macs_dataset( - root: str, - subset: str, - verbose: int, - force: bool, - verify_files: bool, - clean_archives: bool, -) -> None: - if not osp.isdir(root): - raise RuntimeError(f"Cannot find root directory '{root}'.") - - macs_dpath = _get_macs_dpath(root) - archives_dpath = _get_archives_dpath(root) - audio_dpath = _get_audio_dpath(root) - tau_meta_dpath = _get_tau_meta_dpath(root) - - for dpath in (archives_dpath, audio_dpath, tau_meta_dpath): - os.makedirs(dpath, exist_ok=True) - - # Download MACS specific files - for file_info in MACS_FILES.values(): - fname = file_info["fname"] - fpath = osp.join(macs_dpath, fname) - - if not osp.isfile(fpath) or force: - if verbose >= 1: - pylog.info(f"Downloading captions file '{fname}'...") - - url = file_info["url"] - download_url_to_file( - url, - fpath, - progress=verbose >= 1, - ) - - if verify_files: - hash_value = file_info["hash_value"] - file_hash_value = hash_file(fpath, hash_type="md5") - if file_hash_value != hash_value: - raise RuntimeError( - f"Invalid checksum for file '{fname}'. (expected md5 checksum '{hash_value}' but found '{file_hash_value}')\n" - f"Please try to remove manually the file '{fpath}' and rerun MACS download." 
- ) - elif verbose >= 2: - pylog.debug(f"File '{fname}' has a valid checksum.") - - captions_fpath = osp.join(macs_dpath, MACS_FILES["captions"]["fname"]) - with open(captions_fpath, "r") as file: - captions_data = yaml.safe_load(file) - captions_data = captions_data["files"] - - # Download TAU Urban Sound audio archives files - for i, file_info in enumerate(MACS_ARCHIVES_FILES.values()): - zip_fname = file_info["fname"] - zip_fpath = osp.join(archives_dpath, zip_fname) - - if not osp.isfile(zip_fpath) or force: - if verbose >= 1: - pylog.info( - f"Downloading audio zip file '{zip_fpath}'... ({i+1}/{len(MACS_ARCHIVES_FILES)})" - ) - - url = file_info["url"] - download_url_to_file( - url, - zip_fpath, - progress=verbose >= 1, - ) - - if verify_files: - hash_value = file_info["hash_value"] - file_hash_value = hash_file(zip_fpath, hash_type="md5") - if file_hash_value != hash_value: - raise RuntimeError( - f"Invalid checksum for file '{zip_fname}'. (expected md5 checksum '{hash_value}' but found '{file_hash_value}')\n" - f"Please try to remove manually the file '{zip_fpath}' and rerun MACS download." - ) - elif verbose >= 2: - pylog.debug(f"File '{zip_fname}' has a valid checksum.") - - # Extract files from TAU Urban Sound archives - macs_fnames = dict.fromkeys(data["filename"] for data in captions_data) - for i, (name, file_info) in enumerate(MACS_ARCHIVES_FILES.items()): - zip_fname = file_info["fname"] - zip_fpath = osp.join(archives_dpath, zip_fname) - - if verbose >= 2: - pylog.debug( - f"Check to extract TAU Urban acoustic scenes archive zip_fname={zip_fname}..." 
- ) - - is_audio_archive = name.startswith("audio") - if is_audio_archive: - target_dpath = audio_dpath - else: - target_dpath = tau_meta_dpath - - with zipfile.ZipFile(zip_fpath, "r") as file: - members_to_extract = [ - member - for member in file.namelist() - # Extract member if file if in captions yaml file and if the audio file is not already downloaded - if ( - (osp.basename(member) in macs_fnames or not is_audio_archive) - and not osp.isfile(osp.join(target_dpath, osp.basename(member))) - ) - ] - - if verbose >= 1: - pylog.info( - f"Extracting {len(members_to_extract)}/{len(file.namelist())} audio files from ZIP file '{zip_fname}'... ({i+1}/{len(MACS_ARCHIVES_FILES)})" - ) - - if len(members_to_extract) > 0: - file.extractall(archives_dpath, members_to_extract) - for member in members_to_extract: - extracted_fpath = osp.join(archives_dpath, member) - target_fpath = osp.join(target_dpath, osp.basename(member)) - shutil.move(extracted_fpath, target_fpath) - - if clean_archives: - if verbose >= 1: - pylog.info(f"Removing archives files in {archives_dpath}...") - shutil.rmtree(archives_dpath, ignore_errors=True) - - audio_fnames = [name for name in os.listdir(audio_dpath) if name.endswith(".wav")] - assert len(audio_fnames) == len(macs_fnames) - - if verbose >= 2: - pylog.debug(f"Dataset {MACSCard.PRETTY_NAME} ({subset}) has been prepared.") - - -# MACS-specific files links. 
-MACS_FILES = { - "licence": { - "fname": "LICENSE.txt", - "url": "https://zenodo.org/record/5114771/files/LICENSE.txt?download=1", - "hash_value": "d3086f4517cccc32c1bb3a081b07cfa1", - }, - "captions": { - "fname": "MACS.yaml", - "url": "https://zenodo.org/record/5114771/files/MACS.yaml?download=1", - "hash_value": "23fcb2ebd0b109094034ef9e87972256", - }, - "annotators_competences": { - "fname": "MACS_competence.csv", - "url": "https://zenodo.org/record/5114771/files/MACS_competence.csv?download=1", - "hash_value": "4dfe9f951f0af9f29cb7952ec030370a", - }, -} - -# TAU_URBAN_ACOUSTIC archives files links. -TAU_URBAN_ACOUSTIC_DEV_FILES = { - "audio.1": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.1.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.1.zip?download=1", - "hash_value": "aca4ebfd9ed03d5f747d6ba8c24bc728", - }, - "audio.10": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.10.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.10.zip?download=1", - "hash_value": "0ffbf60006da520cc761fb74c878b98b", - }, - "audio.11": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.11.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.11.zip?download=1", - "hash_value": "599055d93b4c11057c29be2df54538d4", - }, - "audio.12": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.12.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.12.zip?download=1", - "hash_value": "98b8d162ff3665695c4c910e6c372cc8", - }, - "audio.13": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.13.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.13.zip?download=1", - "hash_value": "a356c08b1a5a21d433eba37ef87587f4", - }, - "audio.14": { - "fname": 
"TAU-urban-acoustic-scenes-2019-development.audio.14.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.14.zip?download=1", - "hash_value": "f8969771e7faf7dd471d1cf78b0cf011", - }, - "audio.15": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.15.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.15.zip?download=1", - "hash_value": "4758c4b0fb7484faa632266e78850820", - }, - "audio.16": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.16.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.16.zip?download=1", - "hash_value": "a18acad9ede8ea76574216feb887f0bc", - }, - "audio.17": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.17.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.17.zip?download=1", - "hash_value": "1af7703484632f340da5c33662dc9632", - }, - "audio.18": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.18.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.18.zip?download=1", - "hash_value": "b67402bf3e08f4da394a7c18756c0fd2", - }, - "audio.19": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.19.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.19.zip?download=1", - "hash_value": "035db315f19106eb848b6f9b32bcc47c", - }, - "audio.2": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.2.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.2.zip?download=1", - "hash_value": "c4f170408ce77c8c70c532bf268d7be0", - }, - "audio.20": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.20.zip", - "url": 
"https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.20.zip?download=1", - "hash_value": "9cb28c74911bf8a3eadcf53f50a5b5d6", - }, - "audio.21": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.21.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.21.zip?download=1", - "hash_value": "0e44ed85c88ec036a9725b4dd1dfaea0", - }, - "audio.3": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.3.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.3.zip?download=1", - "hash_value": "c7214a07211f10f3250290d05e72c37e", - }, - "audio.4": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.4.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.4.zip?download=1", - "hash_value": "a6a62110f6699cf4432072acb1dffda6", - }, - "audio.5": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.5.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.5.zip?download=1", - "hash_value": "091a0b6d3c84b8e60e46940aa7d4a8a0", - }, - "audio.6": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.6.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.6.zip?download=1", - "hash_value": "114f4ca13e074391b98a1cfd8140de65", - }, - "audio.7": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.7.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.7.zip?download=1", - "hash_value": "5951dd2968f7a514e2afbe279c4f060d", - }, - "audio.8": { - "fname": "TAU-urban-acoustic-scenes-2019-development.audio.8.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.8.zip?download=1", - "hash_value": "b0b63dc95b327e1509857c8d8a663cc3", - }, - "audio.9": { - 
"fname": "TAU-urban-acoustic-scenes-2019-development.audio.9.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.audio.9.zip?download=1", - "hash_value": "3c32a693a6b111ffb957be3c1dd22e9b", - }, - "doc": { - "fname": "TAU-urban-acoustic-scenes-2019-development.doc.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.doc.zip?download=1", - "hash_value": "1f6879544e80da70099a191613e7e51f", - }, - "meta": { - "fname": "TAU-urban-acoustic-scenes-2019-development.meta.zip", - "url": "https://zenodo.org/record/2589280/files/TAU-urban-acoustic-scenes-2019-development.meta.zip?download=1", - "hash_value": "09782f2097e4735687af73c44919329c", - }, -} - -# List of TAU_URBAN_ACOUSTIC archives containing at least 1 MACS audio file. -MACS_ARCHIVES_FILES = { - name: TAU_URBAN_ACOUSTIC_DEV_FILES[name] - for name in ( - "audio.1", - "audio.10", - "audio.11", - "audio.12", - "audio.13", - "audio.2", - "audio.3", - "audio.9", - "meta", - ) -} diff --git a/src/aac_datasets/datasets/wavcaps.py b/src/aac_datasets/datasets/wavcaps.py index 29fe446..00e6fe9 100644 --- a/src/aac_datasets/datasets/wavcaps.py +++ b/src/aac_datasets/datasets/wavcaps.py @@ -1,33 +1,23 @@ #!/usr/bin/env python # -*- coding: utf-8 -*- -import csv -import json import logging -import os import os.path as osp -import subprocess -import zipfile from pathlib import Path -from typing import Any, Callable, ClassVar, Dict, List, Optional, Tuple +from typing import Callable, ClassVar, List, Optional, Union -import tqdm - -from huggingface_hub import snapshot_download -from huggingface_hub.constants import HUGGINGFACE_HUB_CACHE -from huggingface_hub.utils.tqdm import ( - disable_progress_bars, - enable_progress_bars, - are_progress_bars_disabled, -) from torch import Tensor from typing_extensions import TypedDict -from aac_datasets.datasets.base import AACDataset, DatasetCard -from aac_datasets.utils.collections import 
list_dict_to_dict_list -from aac_datasets.utils.download import safe_rmdir -from aac_datasets.utils.paths import _get_root, _get_zip_path +from aac_datasets.datasets.base import AACDataset +from aac_datasets.datasets.functional.wavcaps import ( + WavCapsCard, + load_wavcaps_dataset, + download_wavcaps_dataset, + _get_audio_subset_dpath, +) +from aac_datasets.utils.globals import _get_root, _get_zip_path pylog = logging.getLogger(__name__) @@ -53,46 +43,8 @@ class WavCapsItem(TypedDict): tags: List[str] # FSD only -class WavCapsCard(DatasetCard): - ANNOTATIONS_CREATORS: Tuple[str, ...] = ("machine-generated",) - CAPTIONS_PER_AUDIO: Dict[str, int] = { - "as": 1, - "bbc": 1, - "fsd": 1, - "sb": 1, - "as_noac": 1, - "fsd_nocl": 1, - } - CITATION: str = r""" - @article{mei2023WavCaps, - title = {Wav{C}aps: A {ChatGPT}-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research}, - author = {Xinhao Mei and Chutong Meng and Haohe Liu and Qiuqiang Kong and Tom Ko and Chengqi Zhao and Mark D. Plumbley and Yuexian Zou and Wenwu Wang}, - year = 2023, - journal = {arXiv preprint arXiv:2303.17395}, - url = {https://arxiv.org/pdf/2303.17395.pdf} - } - """ - DEFAULT_REVISION: str = "85a0c21e26fa7696a5a74ce54fada99a9b43c6de" - DESCRIPTION = "WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research." - EXPECTED_SIZES: ClassVar[Dict[str, int]] = { - "AudioSet_SL": 108317, - "BBC_Sound_Effects": 31201, - "FreeSound": 262300, - "SoundBible": 1320, # note: 1232 according to github+hf, but found 1320 => seems that archive contains more data than in json - } - HOMEPAGE = "https://huggingface.co/datasets/cvssp/WavCaps" - LANGUAGE: Tuple[str, ...] = ("en",) - LANGUAGE_DETAILS: Tuple[str, ...] = ("en-US",) - NAME: str = "wavcaps" - PRETTY_NAME: str = "WavCaps" - SOURCES: ClassVar[Tuple[str, ...]] = tuple(EXPECTED_SIZES.keys()) - SUBSETS: Tuple[str, ...] 
= ("as", "bbc", "fsd", "sb", "as_noac", "fsd_nocl") - SAMPLE_RATE: int = 32_000 # Hz - TASK_CATEGORIES: Tuple[str, ...] = ("audio-to-text", "text-to-audio") - - class WavCaps(AACDataset[WavCapsItem]): - r"""Unofficial WavCaos PyTorch dataset. + r"""Unofficial WavCaps PyTorch dataset. WavCaps Paper : https://arxiv.org/pdf/2303.17395.pdf HuggingFace source : https://huggingface.co/datasets/cvssp/WavCaps @@ -151,25 +103,24 @@ class WavCaps(AACDataset[WavCapsItem]): # Common globals CARD: ClassVar[WavCapsCard] = WavCapsCard() - FORCE_PREPARE_DATA: ClassVar[bool] = False - VERIFY_FILES: ClassVar[bool] = False - - # WavCaps-specific globals - CLEAN_ARCHIVES: ClassVar[bool] = False - REPO_ID: ClassVar[str] = "cvssp/WavCaps" - RESUME_DL: ClassVar[bool] = True - SIZE_CATEGORIES: Tuple[str, ...] = ("100K None: """ :param root: The parent of the dataset root directory. @@ -181,12 +132,21 @@ def __init__( defaults to False. :param transform: The transform to apply to the global dict item. This transform is applied only in getitem method when argument is an integer. defaults to None. + :param verbose: Verbose level. Can be 0 or 1. + defaults to 0. + :param force_download: If True, force to re-download file even if they exists on disk. + defaults to False. + :param verify_files: If True, check hash value when possible. + defaults to False. + + :param clean_archives: If True, remove the compressed archives from disk to save space. + defaults to False. :param hf_cache_dir: HuggingFace cache directory. If None, use the global value :variable:`~huggingface_hub.constants.HUGGINGFACE_HUB_CACHE`. defaults to None. + :param repo_id: Repository ID on HuggingFace. + defaults to "cvssp/WavCaps". :param revision: The HuggingFace revision tag. defaults to :attr:`~WavCapsCard.DEFAULT_REVISION`. - :param verbose: Verbose level. Can be 0 or 1. - defaults to 0. :param zip_path: Path to zip executable path in shell. defaults to "zip". 
""" @@ -196,27 +156,29 @@ def __init__( ) root = _get_root(root) + zip_path = _get_zip_path(zip_path) if download: - _prepare_wavcaps_dataset( + download_wavcaps_dataset( root=root, subset=subset, - revision=revision, + force=force_download, + verbose=verbose, + clean_archives=clean_archives, hf_cache_dir=hf_cache_dir, - resume_dl=WavCaps.RESUME_DL, - force=WavCaps.FORCE_PREPARE_DATA, - verify_files=WavCaps.VERIFY_FILES, - clean_archives=WavCaps.CLEAN_ARCHIVES, + repo_id=repo_id, + revision=revision, + verify_files=verify_files, zip_path=zip_path, - verbose=verbose, ) - raw_data = _load_wavcaps_dataset( + raw_data = load_wavcaps_dataset( root=root, - hf_cache_dir=hf_cache_dir, - revision=revision, subset=subset, verbose=verbose, + hf_cache_dir=hf_cache_dir, + repo_id=repo_id, + revision=revision, ) size = len(next(iter(raw_data.values()))) @@ -247,7 +209,7 @@ def __init__( self._hf_cache_dir = hf_cache_dir self._revision = revision - self.add_post_columns( + self.add_online_columns( { "audio": WavCaps._load_audio, "audio_metadata": WavCaps._load_audio_metadata, @@ -274,528 +236,3 @@ def sr(self) -> int: @property def subset(self) -> str: return self._subset - - -def _get_wavcaps_dpath( - root: str, - hf_cache_dir: Optional[str], - revision: Optional[str], -) -> str: - return osp.join(root, "WavCaps") - - -def _get_json_dpath( - root: str, - hf_cache_dir: Optional[str], - revision: Optional[str], -) -> str: - return osp.join(_get_wavcaps_dpath(root, hf_cache_dir, revision), "json_files") - - -def _get_archives_dpath( - root: str, - hf_cache_dir: Optional[str], - revision: Optional[str], -) -> str: - return osp.join(_get_wavcaps_dpath(root, hf_cache_dir, revision), "Zip_files") - - -def _get_audio_dpath( - root: str, - hf_cache_dir: Optional[str], - revision: Optional[str], -) -> str: - return osp.join(_get_wavcaps_dpath(root, hf_cache_dir, revision), "Audio") - - -def _get_audio_subset_dpath( - root: str, - hf_cache_dir: Optional[str], - revision: Optional[str], - 
source: str, -) -> str: - return osp.join( - _get_audio_dpath(root, hf_cache_dir, revision), _WAVCAPS_AUDIO_DNAMES[source] - ) - - -def _is_prepared( - root: str, - hf_cache_dir: Optional[str], - revision: Optional[str], - subset: str, - verbose: int, -) -> bool: - sources = [source for source in WavCapsCard.SOURCES if _use_source(source, subset)] - for source in sources: - audio_fnames = os.listdir( - _get_audio_subset_dpath(root, hf_cache_dir, revision, source) - ) - expected_size = WavCapsCard.EXPECTED_SIZES[source] - if expected_size != len(audio_fnames): - if verbose >= 0: - pylog.error( - f"Invalid number of files for source={source}. (expected {expected_size} but found {len(audio_fnames)} files)" - ) - return False - return True - - -def _use_source(source: str, subset: str) -> bool: - return any( - ( - source == "AudioSet_SL" and subset in ("as", "as_noac"), - source == "BBC_Sound_Effects" and subset in ("bbc",), - source == "FreeSound" and subset in ("fsd", "fsd_nocl"), - source == "SoundBible" and subset in ("sb",), - ) - ) - - -def _load_wavcaps_dataset( - root: str, - hf_cache_dir: Optional[str], - revision: Optional[str], - subset: str, - verbose: int, -) -> Dict[str, List[Any]]: - if subset not in WavCapsCard.SUBSETS: - raise ValueError( - f"Invalid argument subset={subset}. (expected one of {WavCapsCard.SUBSETS})" - ) - - if subset == "as": - overlapped_ds = "AudioCaps" - overlapped_subsets = ("val", "test") - pylog.warning( - f"You selected WavCaps subset '{subset}', be careful to not use these data as training when evaluating on {overlapped_ds} {overlapped_subsets} subsets. " - "You can use as_noac subset for to avoid this bias with AudioCaps." 
- ) - - elif subset == "fsd": - overlapped_ds = "Clotho" - overlapped_subsets = ( - "val", - "eval", - "dcase_aac_test", - "dcase_aac_analysis", - "dcase_t2a_audio", - "dcase_t2a_captions", - ) - pylog.warning( - f"You selected WavCaps subset '{subset}', be careful to not use these data as training when evaluating on {overlapped_ds} {overlapped_subsets} subsets. " - f"You can use fsd_nocl subset for to avoid this bias for Clotho val, eval, dcase_t2a_audio and dcase_t2a_captions subsets. Data could still overlap with Clotho dcase_aac_test and dcase_aac_analysis subsets." - ) - - if subset in ("as_noac", "fsd_nocl"): - if subset == "as_noac": - target_subset = "as" - csv_fname = "blacklist_audiocaps.full.csv" - - elif subset == "fsd_nocl": - target_subset = "fsd" - csv_fname = "blacklist_clotho.full.csv" - - else: - raise ValueError(f"INTERNAL ERROR: Invalid argument subset={subset}.") - - raw_data = _load_wavcaps_dataset( - root, hf_cache_dir, revision, target_subset, verbose - ) - wavcaps_ids = raw_data["id"] - - csv_fpath = ( - Path(__file__) - .parent.parent.parent.parent.joinpath("data") - .joinpath(csv_fname) - ) - with open(csv_fpath, "r") as file: - reader = csv.DictReader(file) - data = list(reader) - other_ids = [data_i["id"] for data_i in data] - other_ids = dict.fromkeys(other_ids) - - indexes = [i for i, wc_id in enumerate(wavcaps_ids) if wc_id not in other_ids] - - if verbose >= 1: - pylog.info( - f"Getting {len(indexes)}/{len(wavcaps_ids)} items from '{target_subset}' for subset '{subset}'." - ) - - raw_data = { - column: [column_data[idx] for idx in indexes] - for column, column_data in raw_data.items() - } - return raw_data - - if not _is_prepared(root, hf_cache_dir, revision, subset, verbose): - raise RuntimeError( - f"{WavCaps.CARD.PRETTY_NAME} is not prepared in root={root}. Please use download=True to install it in root." 
- ) - - json_dpath = _get_json_dpath(root, hf_cache_dir, revision) - json_paths = [ - ("AudioSet_SL", osp.join(json_dpath, "AudioSet_SL", "as_final.json")), - ( - "BBC_Sound_Effects", - osp.join(json_dpath, "BBC_Sound_Effects", "bbc_final.json"), - ), - ("FreeSound", osp.join(json_dpath, "FreeSound", "fsd_final.json")), - ("SoundBible", osp.join(json_dpath, "SoundBible", "sb_final.json")), - ] - json_paths = [ - (source, json_path) - for source, json_path in json_paths - if _use_source(source, subset) - ] - - raw_data = {k: [] for k in _WAVCAPS_RAW_COLUMNS + ("source", "fname")} - for source, json_path in json_paths: - if verbose >= 2: - pylog.debug(f"Loading metadata in JSON '{json_path}'...") - json_data, size = _load_json(json_path) - - sources = [source] * size - json_data.pop("audio", None) - - if source == "AudioSet_SL": - ids = json_data["id"] - fnames = [id_.replace(".wav", ".flac") for id_ in ids] - raw_data["fname"] += fnames - - elif source == "BBC_Sound_Effects": - ids = json_data["id"] - fnames = [f"{id_}.flac" for id_ in ids] - raw_data["fname"] += fnames - - elif source == "FreeSound": - ids = json_data["id"] - fnames = [f"{id_}.flac" for id_ in ids] - raw_data["fname"] += fnames - - elif source == "SoundBible": - ids = json_data["id"] - fnames = [f"{id_}.flac" for id_ in ids] - raw_data["fname"] += fnames - - else: - raise RuntimeError(f"Invalid source={source}.") - - for k in _WAVCAPS_RAW_COLUMNS: - if k in json_data: - raw_data[k] += json_data[k] - elif k in _DEFAULT_VALUES: - default_val = _DEFAULT_VALUES[k] - default_values = [default_val] * size - raw_data[k] += default_values - elif k in ("audio", "file_name"): - pass - else: - raise RuntimeError(f"Invalid column name {k}. 
(with source={source})") - - raw_data["source"] += sources - - raw_data.pop("audio") - raw_data.pop("file_name") - captions = raw_data.pop("caption") - - # Convert str -> List[str] for captions to match other datasets captions type - raw_data["captions"] = [[caption] for caption in captions] - - # Force floating-point precision for duration - raw_data["duration"] = list(map(float, raw_data["duration"])) - - return raw_data - - -def _prepare_wavcaps_dataset( - root: str, - subset: str, - revision: Optional[str], - hf_cache_dir: Optional[str], - resume_dl: bool, - force: bool, - verify_files: bool, - clean_archives: bool, - zip_path: Optional[str], - verbose: int, -) -> None: - if subset == "as_noac": - return _prepare_wavcaps_dataset( - root, - "as", - revision, - hf_cache_dir, - resume_dl, - force, - verify_files, - clean_archives, - zip_path, - verbose, - ) - elif subset == "fsd_nocl": - return _prepare_wavcaps_dataset( - root, - "fsd", - revision, - hf_cache_dir, - resume_dl, - force, - verify_files, - clean_archives, - zip_path, - verbose, - ) - - zip_path = _get_zip_path(zip_path) - - if subset not in WavCapsCard.SUBSETS: - raise ValueError( - f"Invalid argument subset={subset}. 
(expected one of {WavCapsCard.SUBSETS})" - ) - - # note: verbose=-1 to disable warning triggered when dset is not prepared - if not force and _is_prepared(root, hf_cache_dir, revision, subset, verbose=-1): - return None - - if hf_cache_dir is None: - hf_cache_dir = HUGGINGFACE_HUB_CACHE - - # Download files from huggingface - ign_sources = [ - source for source in WavCapsCard.SOURCES if not _use_source(source, subset) - ] - ign_patterns = [ - pattern - for source in ign_sources - for pattern in (f"json_files/{source}/*.json", f"Zip_files/*") # {source}/ - ] - if verbose >= 2: - pylog.debug(f"ign_sources={ign_sources}") - pylog.debug(f"ign_patterns={ign_patterns}") - - pbar_enabled = are_progress_bars_disabled() - if pbar_enabled and verbose <= 0: - disable_progress_bars() - - snapshot_dpath = snapshot_download( - repo_id=WavCaps.REPO_ID, - repo_type="dataset", - revision=revision, - resume_download=resume_dl, - local_files_only=not force, - cache_dir=hf_cache_dir, - allow_patterns=None, - ignore_patterns=ign_patterns, - ) - - if pbar_enabled and verbose <= 0: - enable_progress_bars() - - snapshot_abs_dpath = osp.abspath(snapshot_dpath) - wavcaps_dpath = _get_wavcaps_dpath(root, hf_cache_dir, revision) - if verbose >= 2: - pylog.debug(f"snapshot_dpath={snapshot_dpath}") - pylog.debug(f"snapshot_absdpath={snapshot_abs_dpath}") - pylog.debug(f"wavcaps_dpath={wavcaps_dpath}") - del snapshot_dpath - - # Build symlink to hf cache - if osp.exists(wavcaps_dpath): - if not osp.islink(wavcaps_dpath): - raise RuntimeError("WavCaps root exists but it is not a symlink.") - link_target_abspath = osp.abspath(osp.realpath(wavcaps_dpath)) - if link_target_abspath != snapshot_abs_dpath: - pylog.error( - "Target link is not pointing to current snapshot path. It will be automatically replaced." 
- ) - os.remove(wavcaps_dpath) - os.symlink(snapshot_abs_dpath, wavcaps_dpath, True) - else: - os.symlink(snapshot_abs_dpath, wavcaps_dpath, True) - - source_and_splitted = [ - ("AudioSet_SL", True), - ("BBC_Sound_Effects", True), - ("FreeSound", True), - ("SoundBible", False), - ] - source_and_splitted = { - source: is_splitted - for source, is_splitted in source_and_splitted - if _use_source(source, subset) - } - - archives_dpath = _get_archives_dpath(root, hf_cache_dir, revision) - for source, is_splitted in source_and_splitted.items(): - main_zip_fpath = osp.join( - archives_dpath, _WAVCAPS_ARCHIVE_DNAMES[source], f"{source}.zip" - ) - - if is_splitted: - merged_zip_fpath = osp.join( - archives_dpath, _WAVCAPS_ARCHIVE_DNAMES[source], f"{source}_merged.zip" - ) - else: - merged_zip_fpath = main_zip_fpath - - if is_splitted and not osp.isfile(merged_zip_fpath): - cmd = [ - zip_path, - "-FF", - main_zip_fpath, - "--out", - merged_zip_fpath, - ] - if verbose >= 2: - pylog.debug(f"Merging ZIP files for {source}...") - pylog.debug(f"Using command: {' '.join(cmd)}") - - if verbose >= 2: - stdout = None - stderr = None - else: - stdout = subprocess.DEVNULL - stderr = subprocess.DEVNULL - - subprocess.check_call(cmd, stdout=stdout, stderr=stderr) - - audio_subset_dpath = _get_audio_subset_dpath( - root, hf_cache_dir, revision, source - ) - os.makedirs(audio_subset_dpath, exist_ok=True) - - with zipfile.ZipFile(merged_zip_fpath, "r") as file: - flac_subnames = [name for name in file.namelist() if name.endswith(".flac")] - assert len(flac_subnames) > 0 - assert all( - osp.dirname(name) == osp.dirname(flac_subnames[0]) - for name in flac_subnames - ) - - src_root = osp.join(audio_subset_dpath, osp.dirname(flac_subnames[0])) - src_fnames_found = ( - dict.fromkeys(name for name in os.listdir(src_root)) - if osp.isdir(src_root) - else {} - ) - tgt_fnames_found = dict.fromkeys( - name for name in os.listdir(audio_subset_dpath) - ) - - missing_subnames = [ - subname - for 
subname in flac_subnames - if osp.basename(subname) not in src_fnames_found - and osp.basename(subname) not in tgt_fnames_found - ] - if verbose >= 2: - pylog.debug( - f"Extracting {len(missing_subnames)}/{len(flac_subnames)} audio files from {merged_zip_fpath}..." - ) - file.extractall(audio_subset_dpath, missing_subnames) - if verbose >= 2: - pylog.debug(f"Extraction done.") - - src_fnames_found = ( - dict.fromkeys(name for name in os.listdir(src_root)) - if osp.isdir(src_root) - else {} - ) - src_fpaths_to_move = [ - osp.join(audio_subset_dpath, subname) - for subname in flac_subnames - if osp.basename(subname) in src_fnames_found - ] - if verbose >= 2: - pylog.debug(f"Moving {len(src_fpaths_to_move)} files...") - for src_fpath in tqdm.tqdm(src_fpaths_to_move): - tgt_fpath = osp.join(audio_subset_dpath, osp.basename(src_fpath)) - os.rename(src_fpath, tgt_fpath) - if verbose >= 2: - pylog.debug(f"Move done.") - - if verify_files: - tgt_fnames_expected = [osp.basename(subname) for subname in flac_subnames] - tgt_fnames_found = dict.fromkeys( - fname for fname in os.listdir(audio_subset_dpath) - ) - if verbose >= 2: - pylog.debug(f"Checking {len(tgt_fnames_expected)} files...") - tgt_fnames_invalids = [ - fname for fname in tgt_fnames_expected if fname not in tgt_fnames_found - ] - if len(tgt_fnames_invalids) > 0: - raise FileNotFoundError( - f"Found {len(tgt_fnames_invalids)}/{len(tgt_fnames_expected)} invalid files." 
- ) - - safe_rmdir(audio_subset_dpath, rm_root=False, error_on_non_empty_dir=True) - - if clean_archives: - used_sources = source_and_splitted.keys() - for source in used_sources: - archive_source_dpath = osp.join( - archives_dpath, _WAVCAPS_ARCHIVE_DNAMES[source] - ) - archives_names = os.listdir(archive_source_dpath) - for name in archives_names: - if not name.endswith(".zip") and ".z" not in name: - continue - fpath = osp.join(archive_source_dpath, name) - if verbose >= 1: - pylog.info(f"Removing archive file {name} for source={source}...") - os.remove(fpath) - - -def _load_json(fpath: str) -> Tuple[Dict[str, Any], int]: - with open(fpath, "r") as file: - data = json.load(file) - data = data["data"] - size = len(data) - data = list_dict_to_dict_list(data, key_mode="same") - return data, size - - -class _WavCapsRawItem(TypedDict): - # Common values - caption: str - duration: float - id: str - # Source Specific values - audio: Optional[str] - author: Optional[str] - description: Optional[str] - download_link: Optional[str] - file_name: Optional[str] - href: Optional[str] - tags: Optional[List[str]] - - -_DEFAULT_VALUES = { - "author": "", - "description": "", - "download_link": "", - "href": "", - "tags": [], -} - -_WAVCAPS_RAW_COLUMNS = tuple( - _WavCapsRawItem.__required_keys__ | _WavCapsRawItem.__optional_keys__ -) - -_WAVCAPS_AUDIO_DNAMES = { - # Source name to audio directory name - "AudioSet_SL": "AudioSet_SL", - "BBC_Sound_Effects": "BBC_Sound_Effects", - "FreeSound": "FreeSound", - "SoundBible": "SoundBible", -} - -_WAVCAPS_ARCHIVE_DNAMES = { - # Source name to audio directory name - "AudioSet_SL": "AudioSet_SL", - "BBC_Sound_Effects": "BBC_Sound_Effects", - "FreeSound": "FreeSound", - "SoundBible": "SoundBible", -} diff --git a/src/aac_datasets/download.py b/src/aac_datasets/download.py index 3672d28..0cca3db 100644 --- a/src/aac_datasets/download.py +++ b/src/aac_datasets/download.py @@ -4,21 +4,37 @@ import logging from argparse import ArgumentParser, 
Namespace -from typing import Dict, Iterable, Optional import yaml import aac_datasets -from aac_datasets.datasets.audiocaps import AudioCaps, AudioCapsCard -from aac_datasets.datasets.clotho import Clotho, ClothoCard -from aac_datasets.datasets.macs import MACS, MACSCard -from aac_datasets.datasets.wavcaps import WavCaps, WavCapsCard, HUGGINGFACE_HUB_CACHE -from aac_datasets.utils.cmdline import _str_to_bool, _setup_logging -from aac_datasets.utils.paths import ( +from aac_datasets.datasets.functional.audiocaps import ( + AudioCapsCard, + download_audiocaps_datasets, +) +from aac_datasets.datasets.functional.clotho import ( + ClothoCard, + download_clotho_datasets, +) +from aac_datasets.datasets.functional.macs import ( + MACSCard, + download_macs_datasets, +) +from aac_datasets.datasets.functional.wavcaps import ( + WavCapsCard, + download_wavcaps_datasets, +) +from aac_datasets.utils.cmdline import ( + _str_to_bool, + _str_to_opt_int, + _str_to_opt_str, + _setup_logging, +) +from aac_datasets.utils.globals import ( get_default_root, get_default_ffmpeg_path, - get_default_ytdl_path, + get_default_ytdlp_path, get_default_zip_path, ) @@ -26,110 +42,6 @@ pylog = logging.getLogger(__name__) -def download_audiocaps( - root: str = ..., - verbose: int = 1, - force: bool = False, - download: bool = True, - ffmpeg_path: str = ..., - ytdl_path: str = ..., - with_tags: bool = False, - subsets: Iterable[str] = AudioCapsCard.SUBSETS, -) -> Dict[str, AudioCaps]: - """Download :class:`~aac_datasets.datasets.audiocaps.AudioCaps` dataset subsets.""" - AudioCaps.FORCE_PREPARE_DATA = force - datasets = {} - for subset in subsets: - datasets[subset] = AudioCaps( - root, - subset, - download=download, - verbose=verbose, - with_tags=with_tags, - ffmpeg_path=ffmpeg_path, - ytdl_path=ytdl_path, - ) - return datasets - - -def download_clotho( - root: str = ..., - verbose: int = 1, - force: bool = False, - download: bool = True, - version: str = ClothoCard.DEFAULT_VERSION, - 
clean_archives: bool = False, - subsets: Iterable[str] = ClothoCard.SUBSETS, -) -> Dict[str, Clotho]: - """Download :class:`~aac_datasets.datasets.clotho.Clotho` dataset subsets.""" - subsets = list(subsets) - if version == "v1": - if "val" in subsets: - if verbose >= 0: - pylog.warning( - f"Excluding val subset since it did not exists for version '{version}'." - ) - subsets = [subset for subset in subsets if subset != "val"] - - Clotho.FORCE_PREPARE_DATA = force - Clotho.CLEAN_ARCHIVES = clean_archives - - datasets = {} - for subset in subsets: - datasets[subset] = Clotho( - root, subset, download=download, verbose=verbose, version=version - ) - return datasets - - -def download_macs( - root: str = ..., - verbose: int = 1, - force: bool = False, - download: bool = True, - clean_archives: bool = False, - verify_files: bool = True, -) -> Dict[str, MACS]: - """Download :class:`~aac_datasets.datasets.macs.MACS` dataset.""" - MACS.FORCE_PREPARE_DATA = force - MACS.CLEAN_ARCHIVES = clean_archives - MACS.VERIFY_FILES = verify_files - - datasets = {} - for subset in MACSCard.SUBSETS: - datasets[subset] = MACS(root, download=download, verbose=verbose) - return datasets - - -def download_wavcaps( - root: str = ..., - verbose: int = 1, - force: bool = False, - download: bool = True, - clean_archives: bool = False, - subsets: Iterable[str] = WavCapsCard.SUBSETS, - hf_cache_dir: Optional[str] = HUGGINGFACE_HUB_CACHE, - revision: Optional[str] = WavCapsCard.DEFAULT_REVISION, - zip_path: str = ..., -) -> Dict[str, WavCaps]: - """Download :class:`~aac_datasets.datasets.wavcaps.WavCaps` dataset.""" - - WavCaps.FORCE_PREPARE_DATA = force - WavCaps.CLEAN_ARCHIVES = clean_archives - - datasets = {} - for subset in subsets: - datasets[subset] = WavCaps( - root, - download=download, - hf_cache_dir=hf_cache_dir, - revision=revision, - verbose=verbose, - zip_path=zip_path, - ) - return datasets - - def _get_main_download_args() -> Namespace: parser = ArgumentParser( description="Download 
a dataset at specified root directory.", @@ -141,18 +53,18 @@ def _get_main_download_args() -> Namespace: default=get_default_root(), help="The path to the parent directory of the datasets.", ) - parser.add_argument( - "--verbose", - type=int, - default=1, - help="Verbose level of the script. 0 means silent mode, 1 is default mode and 2 add additional debugging outputs.", - ) parser.add_argument( "--force", type=_str_to_bool, default=False, help="Force download of files, even if they are already downloaded.", ) + parser.add_argument( + "--verbose", + type=int, + default=1, + help="Verbose level of the script. 0 means silent mode, 1 is default mode and 2 add additional debugging outputs.", + ) subparsers = parser.add_subparsers( dest="dataset", @@ -168,9 +80,9 @@ def _get_main_download_args() -> Namespace: help="Path to ffmpeg used to download audio from youtube.", ) audiocaps_subparser.add_argument( - "--ytdl_path", + "--ytdlp_path", type=str, - default=get_default_ytdl_path(), + default=get_default_ytdlp_path(), help="Path to yt-dl program used to extract metadata from a youtube video.", ) audiocaps_subparser.add_argument( @@ -187,6 +99,12 @@ def _get_main_download_args() -> Namespace: choices=AudioCapsCard.SUBSETS, help="AudioCaps subsets to download.", ) + audiocaps_subparser.add_argument( + "--max_workers", + type=_str_to_opt_int, + default=1, + help="Number of workers used for downloading multiple files in parallel.", + ) clotho_subparser = subparsers.add_parser(ClothoCard.NAME) clotho_subparser.add_argument( @@ -243,8 +161,8 @@ def _get_main_download_args() -> Namespace: ) wavcaps_subparser.add_argument( "--hf_cache_dir", - type=str, - default=HUGGINGFACE_HUB_CACHE, + type=_str_to_opt_str, + default=None, help="Hugging face cache dir.", ) wavcaps_subparser.add_argument( @@ -272,45 +190,42 @@ def _main_download() -> None: pylog.debug(yaml.dump({"Arguments": args.__dict__}, sort_keys=False)) if args.dataset == AudioCapsCard.NAME: - download_audiocaps( + 
download_audiocaps_datasets( root=args.root, - verbose=args.verbose, + subsets=args.subsets, force=args.force, - download=True, + verbose=args.verbose, ffmpeg_path=args.ffmpeg_path, - ytdl_path=args.ytdl_path, + max_workers=args.max_workers, with_tags=args.with_tags, - subsets=args.subsets, + ytdlp_path=args.ytdlp_path, ) elif args.dataset == ClothoCard.NAME: - download_clotho( + download_clotho_datasets( root=args.root, - verbose=args.verbose, + subsets=args.subsets, force=args.force, - download=True, - version=args.version, + verbose=args.verbose, clean_archives=args.clean_archives, - subsets=args.subsets, + version=args.version, ) elif args.dataset == MACSCard.NAME: - download_macs( + download_macs_datasets( root=args.root, - verbose=args.verbose, force=args.force, - download=True, + verbose=args.verbose, clean_archives=args.clean_archives, ) elif args.dataset == WavCapsCard.NAME: - download_wavcaps( + download_wavcaps_datasets( root=args.root, - verbose=args.verbose, + subsets=args.subsets, force=args.force, - download=True, + verbose=args.verbose, clean_archives=args.clean_archives, - subsets=args.subsets, hf_cache_dir=args.hf_cache_dir, revision=args.revision, zip_path=args.zip_path, diff --git a/src/aac_datasets/info.py b/src/aac_datasets/info.py index 1d7fa11..171b8f1 100644 --- a/src/aac_datasets/info.py +++ b/src/aac_datasets/info.py @@ -13,10 +13,10 @@ import aac_datasets -from aac_datasets.utils.paths import ( +from aac_datasets.utils.globals import ( get_default_root, get_default_ffmpeg_path, - get_default_ytdl_path, + get_default_ytdlp_path, ) @@ -37,7 +37,7 @@ def get_install_info() -> Dict[str, str]: "package_path": get_package_repository_path(), "root": get_default_root(), "ffmpeg_path": get_default_ffmpeg_path(), - "ytdl_path": get_default_ytdl_path(), + "ytdlp_path": get_default_ytdlp_path(), } diff --git a/src/aac_datasets/utils/__init__.py b/src/aac_datasets/utils/__init__.py index 9f4b8ea..4127099 100644 --- 
a/src/aac_datasets/utils/__init__.py +++ b/src/aac_datasets/utils/__init__.py @@ -4,5 +4,3 @@ """ Utilities modules. """ - -from .collate import BasicCollate, AdvancedCollate diff --git a/src/aac_datasets/utils/audioset_mapping.py b/src/aac_datasets/utils/audioset_mapping.py new file mode 100644 index 0000000..e7e5dda --- /dev/null +++ b/src/aac_datasets/utils/audioset_mapping.py @@ -0,0 +1,101 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import csv +import os +import os.path as osp + +from pathlib import Path +from typing import Dict, Union + +from torch.hub import download_url_to_file + + +_AUDIOSET_INFOS = { + "class_labels_indices": { + "fname": "class_labels_indices.csv", + "url": "http://storage.googleapis.com/us_audioset/youtube_corpus/v1/csv/class_labels_indices.csv", + }, +} +_DEFAULT_CACHE_PATH = Path.home().joinpath(".cache", "audioset_mapping") + + +def get_audioset_mapping_cache_path(cache_path: Union[str, Path, None] = None) -> Path: + if cache_path is not None: + return Path(cache_path) + else: + return _DEFAULT_CACHE_PATH + + +def download_audioset_mapping( + cache_path: Union[str, Path, None] = None, + verbose: int = 0, +) -> None: + cache_path = get_audioset_mapping_cache_path(cache_path) + os.makedirs(cache_path, exist_ok=True) + + info = _AUDIOSET_INFOS["class_labels_indices"] + map_fname = info["fname"] + map_fpath = cache_path.joinpath(map_fname) + + url = info["url"] + download_url_to_file(url, str(map_fpath), progress=verbose >= 1) + + +def load_audioset_mapping( + key_name: str = "index", + val_name: str = "display_name", + offline: bool = False, + cache_path: Union[str, Path, None] = None, + verbose: int = 0, +) -> Dict: + NAMES = ("index", "mid", "display_name") + if key_name not in NAMES: + raise ValueError( + f"Invalid argument key_name={key_name}. (expected one of {NAMES})" + ) + if val_name not in NAMES: + raise ValueError( + f"Invalid argument val_name={val_name}. 
(expected one of {NAMES})" + ) + if key_name == val_name: + raise ValueError( + f"Invalid arguments key_name={key_name} with val_name={val_name}. (expected different values)" + ) + + cache_path = get_audioset_mapping_cache_path(cache_path) + + info = _AUDIOSET_INFOS["class_labels_indices"] + map_fname = info["fname"] + map_fpath = cache_path.joinpath(map_fname) + + if not osp.isfile(map_fpath): + if offline: + raise FileNotFoundError( + f"Cannot find or download audioset mapping file in '{map_fpath}' with mode offline={offline}." + ) + + download_audioset_mapping(cache_path, verbose) + + with open(map_fpath, "r") as file: + reader = csv.DictReader(file, skipinitialspace=True, strict=True) + data = list(reader) + + keys = [data_i[key_name] for data_i in data] + values = [data_i[val_name] for data_i in data] + + if key_name == "index": + keys = list(map(int, keys)) + if val_name == "index": + values = list(map(int, values)) + + mapping = dict(zip(keys, values)) + return mapping + + +def load_audioset_name_to_idx( + offline: bool = False, + cache_path: Union[str, Path, None] = None, + verbose: int = 0, +) -> Dict[str, int]: + return load_audioset_mapping("display_name", "index", offline, cache_path, verbose) diff --git a/src/aac_datasets/utils/cmdline.py b/src/aac_datasets/utils/cmdline.py index e121ae4..dbffec4 100644 --- a/src/aac_datasets/utils/cmdline.py +++ b/src/aac_datasets/utils/cmdline.py @@ -4,9 +4,12 @@ import logging import sys +from typing import Optional -_TRUE_VALUES = ("true", "1", "t", "yes", "y") -_FALSE_VALUES = ("false", "0", "f", "no", "n") + +_TRUE_VALUES = ("true", "t", "yes", "y", "1") +_FALSE_VALUES = ("false", "f", "no", "n", "0") +_NONE_VALUES = ("none",) def _str_to_bool(s: str) -> bool: @@ -21,10 +24,28 @@ def _str_to_bool(s: str) -> bool: ) -def _setup_logging(pkg_name: str, verbose: int) -> None: - format_ = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s" +def _str_to_opt_int(s: str) -> Optional[int]: + s = str(s).strip().lower() 
+ if s in _NONE_VALUES: + return None + else: + return int(s) + + +def _str_to_opt_str(s: str) -> Optional[str]: + s = str(s) + if s.lower() in _NONE_VALUES: + return None + else: + return s + + +def _setup_logging(pkg_name: str, verbose: int, set_format: bool = True) -> None: handler = logging.StreamHandler(sys.stdout) - handler.setFormatter(logging.Formatter(format_)) + if set_format: + format_ = "[%(asctime)s][%(name)s][%(levelname)s] - %(message)s" + handler.setFormatter(logging.Formatter(format_)) + pkg_logger = logging.getLogger(pkg_name) found = False @@ -43,4 +64,5 @@ def _setup_logging(pkg_name: str, verbose: int) -> None: level = logging.INFO else: level = logging.DEBUG + pkg_logger.setLevel(level) diff --git a/src/aac_datasets/utils/collate.py b/src/aac_datasets/utils/collate.py index 85c032c..0adbe50 100644 --- a/src/aac_datasets/utils/collate.py +++ b/src/aac_datasets/utils/collate.py @@ -63,8 +63,7 @@ def __call__(self, batch_lst: List[Dict[str, Any]]) -> Dict[str, Any]: batch_dic[key] = values continue - are_stackables = [value.shape == values[0].shape for value in values] - if all(are_stackables): + if can_be_stacked(values): values = torch.stack(values) batch_dic[key] = values continue @@ -97,3 +96,12 @@ def pad_last_dim(tensor: Tensor, target_length: int, pad_value: float) -> Tensor """ pad_len = max(target_length - tensor.shape[-1], 0) return F.pad(tensor, [0, pad_len], value=pad_value) + + +def can_be_stacked(tensors: List[Tensor]) -> bool: + """Returns true if a list of tensors can be stacked with torch.stack function.""" + if len(tensors) == 0: + return False + shape0 = tensors[0].shape + are_stackables = [tensor.shape == shape0 for tensor in tensors] + return all(are_stackables) diff --git a/src/aac_datasets/utils/collections.py b/src/aac_datasets/utils/collections.py index 85e0fc6..9309454 100644 --- a/src/aac_datasets/utils/collections.py +++ b/src/aac_datasets/utils/collections.py @@ -1,7 +1,7 @@ #!/usr/bin/env python # -*- coding: utf-8 
-*- -from typing import Dict, List, TypeVar +from typing import Any, Dict, Iterable, List, TypeVar T = TypeVar("T") @@ -10,6 +10,7 @@ def list_dict_to_dict_list( lst: List[Dict[str, T]], key_mode: str = "intersect", + default: Any = None, ) -> Dict[str, List[T]]: """Convert list of dicts to dict of lists. @@ -17,26 +18,30 @@ def list_dict_to_dict_list( :param key_mode: Can be "same" or "intersect". If "same", all the dictionaries must contains the same keys otherwise a ValueError will be raised. If "intersect", only the intersection of all keys will be used in output. + If "union", the output dict will contains the union of all keys, and the missing value will use the argument default. :returns: The dictionary of lists. """ if len(lst) <= 0: return {} + keys = set(lst[0].keys()) if key_mode == "same": if not all(keys == set(item.keys()) for item in lst[1:]): raise ValueError("Invalid keys for batch.") elif key_mode == "intersect": - keys = intersect_lists([list(item.keys()) for item in lst]) + keys = intersect_lists([item.keys() for item in lst]) + elif key_mode == "union": + keys = union_lists([item.keys() for item in lst]) else: - KEY_MODES = ("same", "intersect") + KEY_MODES = ("same", "intersect", "union") raise ValueError( f"Invalid argument key_mode={key_mode}. 
(expected one of {KEY_MODES})" ) - return {key: [item[key] for item in lst] for key in keys} + return {key: [item.get(key, default) for item in lst] for key in keys} -def intersect_lists(lst_of_lst: List[List[T]]) -> List[T]: +def intersect_lists(lst_of_lst: List[Iterable[T]]) -> List[T]: """Performs intersection of elements in lists (like set intersection), but keep their original order.""" if len(lst_of_lst) <= 0: return [] @@ -46,3 +51,12 @@ def intersect_lists(lst_of_lst: List[List[T]]) -> List[T]: if len(out) == 0: break return out + + +def union_lists(lst_of_lst: Iterable[Iterable[T]]) -> List[T]: + """Performs union of elements in lists (like set union), but keep their original order.""" + out = {} + for lst_i in lst_of_lst: + out |= dict.fromkeys(lst_i) + out = list(out) + return out diff --git a/src/aac_datasets/utils/download.py b/src/aac_datasets/utils/download.py index 0df0dc1..a134c3c 100644 --- a/src/aac_datasets/utils/download.py +++ b/src/aac_datasets/utils/download.py @@ -3,25 +3,41 @@ import hashlib import os +import os.path as osp from pathlib import Path from typing import List, Union +from torch.hub import download_url_to_file + HASH_TYPES = ("sha256", "md5") DEFAULT_CHUNK_SIZE = 256 * 1024**2 # 256 MiB +def download_file( + url: str, + fpath: Union[str, Path], + make_intermediate: bool = False, + verbose: int = 0, +) -> None: + if make_intermediate: + dpath = osp.dirname(fpath) + os.makedirs(dpath, exist_ok=True) + + download_url_to_file(url, fpath, progress=verbose > 0) + + def safe_rmdir( - root: str, + root: Union[str, Path], rm_root: bool = True, error_on_non_empty_dir: bool = True, ) -> List[str]: """Remove all empty sub-directories. :param root: Root directory path. - :param rm_root: If True, remove the root directory. defaults to True. - :param error_on_non_empty_dir: If True, raises a RuntimeError if a subdirectory contains 1 file. + :param rm_root: If True, remove the root directory too. defaults to True. 
+ :param error_on_non_empty_dir: If True, raises a RuntimeError if a subdirectory contains at least 1 file. Otherwise it will leave non-empty directories. defaults to True. :returns: The list of directories paths deleted. """ deleted = [] @@ -36,23 +52,6 @@ def safe_rmdir( return deleted -def validate_file( - fpath: Union[str, Path], - hash_value: str, - hash_type: str = "sha256", -) -> bool: - """Validate a given file object with its hash. - - :param fpath: The filepath or the file. - :param hash_value: The hash value as string. - :param hash_type: The hash type. defaults to "sha256". - :returns: True if the file hash corresponds to the hash value. - """ - hash_value_found = hash_file(fpath, hash_type) - is_valid = hash_value_found == hash_value - return is_valid - - def hash_file( fpath: Union[str, Path], hash_type: str, diff --git a/src/aac_datasets/utils/globals.py b/src/aac_datasets/utils/globals.py new file mode 100644 index 0000000..38a746a --- /dev/null +++ b/src/aac_datasets/utils/globals.py @@ -0,0 +1,172 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +import logging +import os +import os.path as osp + +from pathlib import Path +from typing import Any, Union + + +pylog = logging.getLogger(__name__) + + +# Public functions +def get_default_root() -> str: + """Returns the default root directory path. + + If :func:`~aac_datasets.utils.globals.set_default_root` has been used before with a string argument, it will return the value given to this function. + Else if the environment variable AAC_DATASETS_ROOT has been set to a string, it will return its value. + Else it will be equal to "." by default. + """ + return __get_default_value("root") + + +def get_default_ffmpeg_path() -> str: + """Returns the default ffmpeg executable path. + + If :func:`~aac_datasets.utils.globals.set_default_ffmpeg_path` has been used before with a string argument, it will return the value given to this function. 
+ Else if the environment variable AAC_DATASETS_FFMPEG_PATH has been set to a string, it will return its value. + Else it will be equal to "ffmpeg" by default. + """ + return __get_default_value("ffmpeg") + + +def get_default_ytdlp_path() -> str: + """Returns the default yt-dlp executable path. + + If :func:`~aac_datasets.utils.globals.set_default_ytdlp_path` has been used before with a string argument, it will return the value given to this function. + Else if the environment variable AAC_DATASETS_YTDLP_PATH has been set to a string, it will return its value. + Else it will be equal to "yt-dlp" by default. + """ + return __get_default_value("ytdlp") + + +def get_default_zip_path() -> str: + """Returns the default zip executable path. + + If :func:`~aac_datasets.utils.globals.set_default_zip_path` has been used before with a string argument, it will return the value given to this function. + Else if the environment variable AAC_DATASETS_ZIP_PATH has been set to a string, it will return its value. + Else it will be equal to "zip" by default. 
+ """ + return __get_default_value("zip") + + +def set_default_root(cache_path: Union[str, Path, None]) -> None: + """Override default root directory path.""" + __set_default_value("root", cache_path) + + +def set_default_ffmpeg_path(tmp_path: Union[str, Path, None]) -> None: + """Override default ffmpeg executable path.""" + __set_default_value("ffmpeg", tmp_path) + + +def set_default_ytdlp_path(java_path: Union[str, Path, None]) -> None: + """Override default yt-dl executable path.""" + __set_default_value("ytdlp", java_path) + + +def set_default_zip_path(tmp_path: Union[str, Path, None]) -> None: + """Override default zip executable path.""" + __set_default_value("zip", tmp_path) + + +# Private functions +def _get_root(root: Union[str, Path, None] = None) -> str: + return __get_value("root", root) + + +def _get_ffmpeg_path(ffmpeg_path: Union[str, Path, None] = None) -> str: + return __get_value("ffmpeg", ffmpeg_path) + + +def _get_ytdlp_path(ytdlp_path: Union[str, Path, None] = None) -> str: + return __get_value("ytdlp", ytdlp_path) + + +def _get_zip_path(zip_path: Union[str, Path, None] = None) -> str: + return __get_value("zip", zip_path) + + +def __get_default_value(value_name: str) -> str: + values = __DEFAULT_GLOBALS[value_name]["values"] + process_func = __DEFAULT_GLOBALS[value_name]["process"] + + for source, value_or_env_varname in values.items(): + if source.startswith("env"): + value = os.getenv(value_or_env_varname, None) + else: + value = value_or_env_varname + + if value is not None: + value = process_func(value) + return value + + pylog.error(f"Values: {values}") + raise RuntimeError( + f"Invalid default value for value_name={value_name}. 
(all default values are None)" + ) + + +def __set_default_value( + value_name: str, + value: Any, +) -> None: + __DEFAULT_GLOBALS[value_name]["values"]["user"] = value + + +def __get_value(value_name: str, value: Any = None) -> Any: + if value is None or value is ...: + return __get_default_value(value_name) + else: + process_func = __DEFAULT_GLOBALS[value_name]["process"] + value = process_func(value) + return value + + +def __process_path(value: Union[str, Path, None]) -> Union[str, None]: + if value is None or value is ...: + return None + + value = str(value) + value = osp.expanduser(value) + value = osp.expandvars(value) + return value + + +__DEFAULT_GLOBALS = { + "root": { + "values": { + "user": None, + "env": "AAC_DATASETS_ROOT", + "package": ".", + }, + "process": __process_path, + }, + "ytdlp": { + "values": { + "user": None, + "env": "AAC_DATASETS_YTDLP_PATH", + "package": "yt-dlp", + }, + "process": __process_path, + }, + "ffmpeg": { + "values": { + "user": None, + "env": "AAC_DATASETS_FFMPEG_PATH", + "package": "ffmpeg", + }, + "process": __process_path, + }, + "zip": { + "values": { + "user": None, + "env": "AAC_DATASETS_ZIP_PATH", + "package": "zip", + }, + "process": __process_path, + }, +} diff --git a/src/aac_datasets/utils/paths.py b/src/aac_datasets/utils/paths.py deleted file mode 100644 index afe2312..0000000 --- a/src/aac_datasets/utils/paths.py +++ /dev/null @@ -1,158 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -import logging -import os -import os.path as osp - -from typing import Dict, Optional, Union - - -pylog = logging.getLogger(__name__) - - -__DEFAULT_PATHS: Dict[str, Dict[str, Optional[str]]] = { - "root": { - "user": None, - "env": "AAC_DATASETS_ROOT", - "package": ".", - }, - "ytdl": { - "user": None, - "env": "AAC_DATASETS_YTDL_PATH", - "package": "ytdlp", - }, - "ffmpeg": { - "user": None, - "env": "AAC_DATASETS_FFMPEG_PATH", - "package": "ffmpeg", - }, - "zip": { - "user": None, - "env": "AAC_DATASETS_ZIP_PATH", - 
"package": "zip", - }, -} - - -# Public functions -def get_default_root() -> str: - """Returns the default root directory path. - - If :func:`~aac_datasets.utils.path.set_default_root` has been used before with a string argument, it will return the value given to this function. - Else if the environment variable AAC_DATASETS_ROOT has been set to a string, it will return its value. - Else it will be equal to "." by default. - """ - return __get_default_path("root") - - -def get_default_ytdl_path() -> str: - """Returns the default youtube-dl executable path. - - If :func:`~aac_datasets.utils.path.set_default_ytdl_path` has been used before with a string argument, it will return the value given to this function. - Else if the environment variable AAC_DATASETS_YTDL_PATH has been set to a string, it will return its value. - Else it will be equal to "youtube-dl" by default. - """ - return __get_default_path("ytdl") - - -def get_default_ffmpeg_path() -> str: - """Returns the default ffmpeg executable path. - - If :func:`~aac_datasets.utils.path.set_default_ffmpeg_path` has been used before with a string argument, it will return the value given to this function. - Else if the environment variable AAC_DATASETS_FFMPEG_PATH has been set to a string, it will return its value. - Else it will be equal to "ffmpeg" by default. - """ - return __get_default_path("ffmpeg") - - -def get_default_zip_path() -> str: - """Returns the default zip executable path. - - If :func:`~aac_datasets.utils.path.set_default_zip_path` has been used before with a string argument, it will return the value given to this function. - Else if the environment variable AAC_DATASETS_ZIP_PATH has been set to a string, it will return its value. - Else it will be equal to "zip" by default. 
- """ - return __get_default_path("zip") - - -def set_default_root(cache_path: Optional[str]) -> None: - """Override default root directory path.""" - __set_default_path("root", cache_path) - - -def set_default_ytdl_path(java_path: Optional[str]) -> None: - """Override default youtube-dl executable path.""" - __set_default_path("ytdl", java_path) - - -def set_default_ffmpeg_path(tmp_path: Optional[str]) -> None: - """Override default ffmpeg executable path.""" - __set_default_path("ffmpeg", tmp_path) - - -def set_default_zip_path(tmp_path: Optional[str]) -> None: - """Override default zip executable path.""" - __set_default_path("zip", tmp_path) - - -# Private functions -def _get_root(root: Union[str, None] = None) -> str: - return __get_path("root", root) - - -def _get_ytdl_path(ytdl_path: Union[str, None] = None) -> str: - return __get_path("ytdl", ytdl_path) - - -def _get_ffmpeg_path(ffmpeg_path: Union[str, None] = None) -> str: - return __get_path("ffmpeg", ffmpeg_path) - - -def _get_zip_path(zip_path: Union[str, None] = None) -> str: - return __get_path("zip", zip_path) - - -def __get_default_path(path_name: str) -> str: - paths = __DEFAULT_PATHS[path_name] - - for name, path_or_var in paths.items(): - if path_or_var is None: - continue - - if name.startswith("env"): - path = os.getenv(path_or_var, None) - else: - path = path_or_var - - if path is not None: - path = __process_path(path) - return path - - pylog.error(f"Paths values: {paths}") - raise RuntimeError( - f"Invalid default path for path_name={path_name}. (all default paths are None)" - ) - - -def __set_default_path( - path_name: str, - path: Optional[str], -) -> None: - if path is not ... and path is not None: - path = __process_path(path) - __DEFAULT_PATHS[path_name]["user"] = path - - -def __get_path(path_name: str, path: Union[str, None] = None) -> str: - if path is ... 
or path is None: - return __get_default_path(path_name) - else: - path = __process_path(path) - return path - - -def __process_path(path: str) -> str: - path = osp.expanduser(path) - path = osp.expandvars(path) - return path