Version 0.5.1

Labbeti · Mar 4, 2024 · 650057f · 650057f
1 parent 781b759
commit 650057f
Show file tree

Hide file tree

Showing 23 changed files with 332 additions and 283 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,36 @@
+# exclude: ""
+
+repos:
+    # Format Code
+    - repo: https://github.com/ambv/black
+      rev: 22.10.0
+      hooks:
+        - id: black
+
+    # Sort imports
+    - repo: https://github.com/PyCQA/isort
+      rev: 5.12.0
+      hooks:
+      - id: isort
+        args: ["--profile", "black"]
+
+    # Formatting, Whitespace, etc
+    - repo: https://github.com/pre-commit/pre-commit-hooks
+      rev: v2.2.3
+      hooks:
+      - id: trailing-whitespace
+      - id: check-added-large-files
+        args: ['--maxkb=1000']
+      - id: check-ast
+      - id: check-json
+      - id: check-merge-conflict
+      - id: check-xml
+      - id: check-yaml
+      - id: debug-statements
+      - id: end-of-file-fixer
+      - id: requirements-txt-fixer
+      - id: mixed-line-ending
+        args: ['--fix=no']
+      - id: flake8
+        # args: ['--ignore=E203,E501,F811,E712,W503']
+        args: ['--config=.flake8']
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,11 @@
 
 All notable changes to this project will be documented in this file.
 
+## [0.5.1] 2024-03-04
+### Fixed
+- WavCaps download preparation (#3).
+- `safe_rmdir` function when sub-directories are deleted.
+
 ## [0.5.0] 2024-01-05
 ### Changed
 - Update typing for paths with python class `Path`.
@@ -83,7 +88,7 @@ All notable changes to this project will be documented in this file.
 - Weak private methods are now strongly private in datasets.
 - Rename `item_transform` to `transform` in datasets.
 - Rename `load_tags` to `with_tags` in `AudioCaps`.
- 
+
 ### Fixed
 - AudioCaps loading when `with_tags` is False.
 - Clotho files download.

diff --git a/CITATION.cff b/CITATION.cff
@@ -11,7 +11,7 @@ authors:
     affiliation: IRIT
     orcid: 'https://orcid.org/0000-0002-7219-5463'
 repository-code: 'https://github.com/Labbeti/aac-datasets/'
-abstract: Audio Captioning datasets for Pytorch.
+abstract: Audio Captioning datasets for PyTorch.
 keywords:
   - audio
   - deep-learning
@@ -22,5 +22,5 @@ keywords:
   - captioning
   - audio-captioning
 license: MIT
-version: 0.5.0
-date-released: '2024-01-05'
+version: 0.5.1
+date-released: '2024-03-04'
diff --git a/README.md b/README.md
@@ -67,7 +67,7 @@ aac-datasets-download --root "." clotho --subsets "dev"
 ```
 
 ## Datasets information
-Here is the statistics for each dataset :
+`aac-datasets` package contains 4 different datasets :
 
 <!-- | | AudioCaps | Clotho | MACS | WavCaps |
 |:---:|:---:|:---:|:---:|:---:|
@@ -85,7 +85,7 @@ Here is the statistics for each dataset :
 
 For Clotho, the **dev** subset should be used for training, val for validation and eval for testing.
 
-Here is additional statistics on the train subset for AudioCaps, Clotho and MACS:
+Here are additional statistics on the train subset for AudioCaps, Clotho, MACS and WavCaps:
 
 | | AudioCaps/train | Clotho/dev | MACS/full | WavCaps/full |
 |:---:|:---:|:---:|:---:|:---:|
@@ -96,7 +96,9 @@ Here is additional statistics on the train subset for AudioCaps, Clotho and MACS
 | Nb captions | 49,838 | 19,195 | 17,275 | 403,050 |
 | Total nb words<sup>2</sup> | 402,482 | 217,362 | 160,006 | 3,161,823 |
 | Sentence size<sup>2</sup> | 2-52 | 8-20 | 5-40 | 2-38 |
-| Vocabulary<sup>2</sup> | 4724 | 4369 | 2721 | 24600 |
+| Vocabulary<sup>2</sup> | 4724 | 4369 | 2721 | 24,600 |
+| Annotated by | Human | Human | Human | Machine |
+| Corrected by | Human | Human | None | None |
 
 <sup>1</sup> This duration is estimated on the total duration of 46230/49838 files of 126.7h.
 
@@ -167,7 +169,7 @@ dataset = AudioCaps(
 [3] F. Font, A. Mesaros, D. P. W. Ellis, E. Fonseca, M. Fuentes, and B. Elizalde, Proceedings of the 6th Workshop on Detection and Classification of Acoustic Scenes and Events (DCASE 2021). Barcelona, Spain: Music Technology Group - Universitat Pompeu Fabra, Nov. 2021. Available: https://doi.org/10.5281/zenodo.5770113
 
 #### WavCaps
-[4] X. Mei et al., “WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research,” arXiv preprint arXiv:2303.17395, 2023, [Online]. Available: https://arxiv.org/pdf/2303.17395.pdf 
+[4] X. Mei et al., “WavCaps: A ChatGPT-Assisted Weakly-Labelled Audio Captioning Dataset for Audio-Language Multimodal Research,” arXiv preprint arXiv:2303.17395, 2023, [Online]. Available: https://arxiv.org/pdf/2303.17395.pdf
 
 ## Cite the aac-datasets package
 If you use this software, please consider citing it as "Labbe, E. (2024). aac-datasets: Audio Captioning datasets for PyTorch.", or use the following BibTeX citation:
@@ -177,10 +179,10 @@ If you use this software, please consider cite it as "Labbe, E. (2013). aac-data
     Labbe_aac_datasets_2024,
     author = {Labbé, Etienne},
     license = {MIT},
-    month = {01},
+    month = {03},
     title = {{aac-datasets}},
     url = {https://github.com/Labbeti/aac-datasets/},
-    version = {0.5.0},
+    version = {0.5.1},
     year = {2024}
 }
 ```

diff --git a/docs/conf.py b/docs/conf.py
@@ -17,7 +17,6 @@
 
 import aac_datasets
 
-
 # -- Project information -----------------------------------------------------
 
 project = aac_datasets.__name__
@@ -96,5 +95,5 @@
     "torchaudio": ("https://pytorch.org/audio/stable/", None),
 }
 
-# TODO: to be used with sphinx>=7.1
+# Only works with sphinx>=7.1
 maximum_signature_line_length = 10
diff --git a/docs/data_subsets.rst b/docs/data_subsets.rst
@@ -7,7 +7,7 @@ The original AudioCaps dataset contains only 3 subsets : `train`, `val` and `tes
 
 A fourth subset named `train_v2` is another version of the train subset where captions have been manually corrected or deleted. For more details, see paper "CoNeTTE: An efficient Audio Captioning system leveraging multiple datasets with Task Embedding".
 
-Clotho 
+Clotho
 ########################
 Clotho contains 7 subsets:
 
@@ -28,12 +28,12 @@ WavCaps
 ########################
 WavCaps contains 6 subsets:
 
-- `as` : contains 108K files from AudioSet strongly labeled dataset,
+- `audioset` : contains 108K files from AudioSet strongly labeled dataset,
 - `bbc` : contains 31K files from BBC Sound Effects website,
-- `fsd` : contains 262K files from FreeSound website,
-- `sb` : contains 1.2K files from SoundBible website,
-- `as_noac` : contains 99K files from as subset without overlapping data with AudioCaps,
-- `fsd_nocl` : contains 258K files from fsd subset without overlapping data with Clotho (except for subsets of task 6a).
+- `freesound` : contains 262K files from FreeSound website,
+- `soundbible` : contains 1.2K files from SoundBible website,
+- `audioset_no_audiocaps` : contains 99K files from the `audioset` subset without overlapping data with AudioCaps,
+- `freesound_no_clotho` : contains 258K files from the `freesound` subset without overlapping data with Clotho (except for subsets of task 6a).
 
 Since WavCaps does not contain validation or testing subsets, all of their data is used as additional training data.
-The subsets as_noac and `fsd_nocl` are provided to avoid biases when evaluating on AudioCaps or Clotho datasets.
+The subsets `audioset_no_audiocaps` and `freesound_no_clotho` are provided to avoid biases when evaluating on AudioCaps or Clotho datasets.
diff --git a/docs/usage.rst b/docs/usage.rst
@@ -13,15 +13,6 @@ You can download each dataset subset by using the download=True option in datase
 
     _ = Clotho("/my/path/to/data", subset="dev", download=True)
 
-You can also do the same by using functions :
-.. :caption: Download Clotho development dataset (python).
-
-.. code-block:: python
-
-    from aac_datasets.download import download_clotho_dataset
-
-    download_clotho_dataset("/my/path/to/data", subset="dev", download=True)
-
 Or by the command line :
 .. :caption: Download Clotho development dataset (command line).
 

diff --git a/examples/dataloader.ipynb b/examples/dataloader.ipynb
@@ -60,7 +60,7 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "aac-datasets version: 0.5.0\n"
+      "aac-datasets version: 0.5.1\n"
      ]
     }
    ],
@@ -108,7 +108,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 7,
+   "execution_count": 6,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -123,7 +123,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": 8,
+   "execution_count": 7,
    "metadata": {},
    "outputs": [
     {
@@ -188,7 +188,7 @@
    "name": "python",
    "nbconvert_exporter": "python",
    "pygments_lexer": "ipython3",
-   "version": "3.10.4"
+   "version": "3.10.11"
   },
   "orig_nbformat": 4,
   "vscode": {

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -1,10 +1,11 @@
 # -*- coding: utf-8 -*-
 
-pytest==6.2.5
-flake8==4.0.1
-click==8.0.4
 black==22.10.0
+click==8.0.4
+flake8==4.0.1
 ipykernel==6.9.1
-twine==4.0.1
+pre-commit
+pytest==6.2.5
 sphinx
 sphinx-press-theme==0.8.0
+twine==4.0.1
diff --git a/src/aac_datasets/__init__.py b/src/aac_datasets/__init__.py
@@ -6,11 +6,13 @@
 
 __author__ = "Etienne Labbé (Labbeti)"
 __author_email__ = "labbeti.pub@gmail.com"
+__docs__ = "Audio Captioning Datasets"
+__docs_url__ = "https://aac-datasets.readthedocs.io/en/stable/"
 __license__ = "MIT"
 __maintainer__ = "Etienne Labbé (Labbeti)"
 __name__ = "aac-datasets"
 __status__ = "Development"
-__version__ = "0.5.0"
+__version__ = "0.5.1"
 
 
 from .datasets.audiocaps import AudioCaps
@@ -26,7 +28,6 @@
     set_default_ytdlp_path,
 )
 
-
 __all__ = [
     "AudioCaps",
     "Clotho",

diff --git a/src/aac_datasets/check.py b/src/aac_datasets/check.py
@@ -4,21 +4,18 @@
 import logging
 import os.path as osp
 import random
-
 from argparse import ArgumentParser, Namespace
 from typing import Dict, Iterable, Union
 
 import yaml
 
 import aac_datasets
-
 from aac_datasets.datasets.audiocaps import AudioCaps, AudioCapsCard
 from aac_datasets.datasets.clotho import Clotho, ClothoCard
 from aac_datasets.datasets.macs import MACS, MACSCard
 from aac_datasets.datasets.wavcaps import WavCaps, WavCapsCard
-from aac_datasets.utils.globals import get_default_root
 from aac_datasets.download import _setup_logging
-
+from aac_datasets.utils.globals import get_default_root
 
 DATASETS_NAMES = (AudioCapsCard.NAME, ClothoCard.NAME, MACSCard.NAME, WavCapsCard.NAME)
 
@@ -66,8 +63,8 @@ def check_directory(
                 ds = ds_class(root, subset, verbose=0)
                 if len(ds) > 0:
                     # Try to load a random item
-                    idx = random.randint(0, len(ds) - 1)
-                    _item = ds[idx]
+                    index = random.randint(0, len(ds) - 1)
+                    ds[index]
                 found_dsets[subset] = ds
 
             except RuntimeError:

diff --git a/src/aac_datasets/datasets/audiocaps.py b/src/aac_datasets/datasets/audiocaps.py
@@ -3,23 +3,13 @@
 
 import logging
 import os.path as osp
-
 from pathlib import Path
-from typing import (
-    Any,
-    Callable,
-    ClassVar,
-    Dict,
-    List,
-    Optional,
-    Union,
-)
+from typing import Any, Callable, ClassVar, Dict, List, Optional, Union
 
 import torch
 import torchaudio
-
 from torch import Tensor
-from typing_extensions import TypedDict, NotRequired
+from typing_extensions import NotRequired, TypedDict
 
 try:
     # To support torchaudio >= 2.1.0
@@ -30,12 +20,11 @@
 from aac_datasets.datasets.base import AACDataset
 from aac_datasets.datasets.functional.audiocaps import (
     AudioCapsCard,
+    _get_audio_subset_dpath,
     download_audiocaps_dataset,
     load_audiocaps_dataset,
-    _get_audio_subset_dpath,
 )
-from aac_datasets.utils.globals import _get_root, _get_ffmpeg_path, _get_ytdlp_path
-
+from aac_datasets.utils.globals import _get_ffmpeg_path, _get_root, _get_ytdlp_path
 
 pylog = logging.getLogger(__name__)
 
@@ -98,7 +87,7 @@ def __init__(
         root: Union[str, Path, None] = None,
         subset: str = AudioCapsCard.DEFAULT_SUBSET,
         download: bool = False,
-        transform: Optional[Callable[[Dict[str, Any]], Any]] = None,
+        transform: Optional[Callable[[AudioCapsItem], Any]] = None,
         verbose: int = 0,
         force_download: bool = False,
         verify_files: bool = False,
@@ -284,10 +273,10 @@ def __repr__(self) -> str:
         return f"{AudioCapsCard.PRETTY_NAME}({repr_str})"
 
     # Private methods
-    def _load_audio(self, idx: int) -> Tensor:
-        if not self._raw_data["is_on_disk"][idx]:
+    def _load_audio(self, index: int) -> Tensor:
+        if not self._raw_data["is_on_disk"][index]:
             return torch.empty((0,))
-        fpath = self.at(idx, "fpath")
+        fpath = self.at(index, "fpath")
         audio, sr = torchaudio.load(fpath)  # type: ignore
 
         # Sanity check
@@ -302,9 +291,9 @@ def _load_audio(self, idx: int) -> Tensor:
             )
         return audio
 
-    def _load_audio_metadata(self, idx: int) -> AudioMetaData:
-        if not self._raw_data["is_on_disk"][idx]:
+    def _load_audio_metadata(self, index: int) -> AudioMetaData:
+        if not self._raw_data["is_on_disk"][index]:
             return AudioMetaData(-1, -1, -1, -1, "unknown_encoding")
-        fpath = self.at(idx, "fpath")
+        fpath = self.at(index, "fpath")
         audio_metadata = torchaudio.info(fpath)  # type: ignore
         return audio_metadata