Use huggingface_hub cache (#7105)
* use hfh cache

* remove unused mock download manager

* fix remaining http calls

* remove test line

* use the hfh lib cache_dir for hf_hub_download

* bump hfh minimum version

* update tests

* style

* again

* lucain's comments

* fix tests

* minor

* update offline test

* don't test time out on old hfh

* minor

* disable some tests on windows

* update docs

* typo
lhoestq authored Aug 21, 2024
1 parent 70bac27 commit 2878019
Showing 15 changed files with 104 additions and 687 deletions.
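At a high level, this commit moves dataset file downloads onto `huggingface_hub`, so files land in the shared hub cache instead of a datasets-specific HTTP cache. A minimal sketch of the mechanism under that reading (the repo id and filename below are placeholders, not taken from the diff):

```py
from huggingface_hub import hf_hub_download

# Files are stored in the shared huggingface_hub cache
# (~/.cache/huggingface/hub by default), keyed by repo and revision.
local_path = hf_hub_download(
    repo_id="username/dataset",  # placeholder dataset repo
    filename="data/train.parquet",  # placeholder file
    repo_type="dataset",
)
print(local_path)  # resolved path inside the hub cache
```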
4 changes: 2 additions & 2 deletions .github/conda/meta.yaml
@@ -24,7 +24,7 @@ requirements:
     - dataclasses
     - multiprocess
     - fsspec
-    - huggingface_hub >=0.21.2,<1.0.0
+    - huggingface_hub >=0.22.0,<1.0.0
     - packaging
     - aiohttp
   run:
@@ -41,7 +41,7 @@ requirements:
     - dataclasses
     - multiprocess
     - fsspec
-    - huggingface_hub >=0.21.2,<1.0.0
+    - huggingface_hub >=0.22.0,<1.0.0
     - packaging
     - aiohttp

2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
@@ -65,7 +65,7 @@ jobs:
         run: uv pip install --system --upgrade pyarrow huggingface-hub dill
       - name: Install dependencies (minimum versions)
         if: ${{ matrix.deps_versions != 'deps-latest' }}
-        run: uv pip install --system pyarrow==15.0.0 huggingface-hub==0.21.2 transformers dill==0.3.1.1
+        run: uv pip install --system pyarrow==15.0.0 huggingface-hub==0.22.0 transformers dill==0.3.1.1
       - name: Test with pytest
         run: |
           python -m pytest -rfExX -m ${{ matrix.test }} -n 2 --dist loadfile -sv ./tests/
2 changes: 0 additions & 2 deletions docs/source/audio_dataset.mdx
@@ -523,8 +523,6 @@ The reason you need to use a combination of [`~DownloadManager.download`] and [`
 ```py
 def _split_generators(self, dl_manager):
     """Returns SplitGenerators."""
-    dl_manager.download_config.ignore_url_params = True
-
     audio_path = dl_manager.download(_AUDIO_URL)
     local_extracted_archive = dl_manager.extract(audio_path) if not dl_manager.is_streaming else None
     path_to_clips = "librivox-indonesia"
18 changes: 12 additions & 6 deletions docs/source/cache.mdx
@@ -1,8 +1,14 @@
 # Cache management
 
-When you download a dataset, the processing scripts and data are stored locally on your computer. The cache allows 🤗 Datasets to avoid re-downloading or processing the entire dataset every time you use it.
+When you download a dataset from Hugging Face, the data are stored locally on your computer.
+Files from Hugging Face are stored as usual in the `huggingface_hub` cache, which is at `~/.cache/huggingface/hub` by default.
+See the [Hub cache documentation](https://huggingface.co/docs/huggingface_hub/guides/manage-cache) for more details and how to change its location.
 
-This guide will show you how to:
+The Hub cache allows 🤗 Datasets to avoid re-downloading dataset files from Hugging Face every time you use them.
+
+🤗 Datasets also has its own cache to store datasets converted in Arrow format (the format used by [`Dataset`] objects).
+
+This guide focuses on the 🤗 Datasets cache and will show you how to:
 
 - Change the cache directory.
 - Control how a dataset is loaded from the cache.
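For context on the Hub cache described in the added lines above, a hedged sketch of inspecting it with `huggingface_hub` (the printed fields are illustrative):

```py
from huggingface_hub import scan_cache_dir

# Scan the shared hub cache (~/.cache/huggingface/hub by default).
cache_info = scan_cache_dir()
print(cache_info.size_on_disk_str)  # human-readable total size
for repo in cache_info.repos:
    # repo_type is "dataset" for dataset repos
    print(repo.repo_id, repo.repo_type, repo.size_on_disk_str)
```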
@@ -11,17 +17,17 @@ This guide will show you how to:
 
 ## Cache directory
 
-The default cache directory is `~/.cache/huggingface/datasets`. Change the cache location by setting the shell environment variable, `HF_DATASETS_CACHE` to another directory:
+The default 🤗 Datasets cache directory is `~/.cache/huggingface/datasets`. Change the cache location by setting the shell environment variable, `HF_DATASETS_CACHE` to another directory:
 
 ```
-$ export HF_DATASETS_CACHE="/path/to/another/directory"
+$ export HF_DATASETS_CACHE="/path/to/another/directory/datasets"
 ```
 
 When you load a dataset, you also have the option to change where the data is cached. Change the `cache_dir` parameter to the path you want:
 
 ```py
 >>> from datasets import load_dataset
->>> dataset = load_dataset('LOADING_SCRIPT', cache_dir="PATH/TO/MY/CACHE/DIR")
+>>> dataset = load_dataset('username/dataset', cache_dir="/path/to/another/directory/datasets")
 ```
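As a follow-up to the `cache_dir` example just above, a small sketch of verifying where the Arrow files ended up (the repo id and the printed path are illustrative):

```py
>>> from datasets import load_dataset
>>> dataset = load_dataset("username/dataset", cache_dir="/path/to/another/directory/datasets")
>>> # Each split records the Arrow files backing it in the 🤗 Datasets cache.
>>> dataset["train"].cache_files
[{'filename': '/path/to/another/directory/datasets/username___dataset/.../train.arrow'}]
```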

## Download mode
@@ -37,7 +43,7 @@ Refer to [`DownloadMode`] for a full list of download modes.

 ## Cache files
 
-Clean up the cache files in the directory with [`Dataset.cleanup_cache_files`]:
+Clean up the Arrow cache files in the directory with [`Dataset.cleanup_cache_files`]:
 
 ```py
 # Returns the number of removed cache files
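Because the diff is truncated at this point, here is a hedged completion of the [`Dataset.cleanup_cache_files`] snippet (the dataset and the returned count are illustrative):

```py
>>> from datasets import load_dataset
>>> dataset = load_dataset("username/dataset", split="train")
>>> dataset = dataset.map(lambda example: example)  # writes new Arrow cache files
>>> # Returns the number of removed cache files
>>> dataset.cleanup_cache_files()
2
```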
2 changes: 1 addition & 1 deletion setup.py
@@ -133,7 +133,7 @@
     # for data streaming via http
     "aiohttp",
     # To get datasets from the Datasets Hub on huggingface.co
-    "huggingface-hub>=0.21.2",
+    "huggingface-hub>=0.22.0",
     # Utilities from PyPA to e.g., compare versions
     "packaging",
     # To parse YAML metadata from dataset cards
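To make the minimum-version bump concrete, a sketch of the kind of guard the `packaging` dependency (listed just below the bumped line) makes possible; the exact reason 0.22.0 is required is not stated in the diff:

```py
import huggingface_hub
from packaging import version

# The commit raises the floor from 0.21.2 to 0.22.0 everywhere it is pinned.
assert version.parse(huggingface_hub.__version__) >= version.parse("0.22.0")
```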
1 change: 0 additions & 1 deletion src/datasets/builder.py
@@ -847,7 +847,6 @@ def download_and_prepare(
         )
 
         is_local = not is_remote_filesystem(self._fs)
-
         self.dl_manager = dl_manager
 
         # Prevent parallel local disk operations
4 changes: 0 additions & 4 deletions src/datasets/download/download_config.py
@@ -43,9 +43,6 @@ class DownloadConfig:
         token (`str` or `bool`, *optional*):
             Optional string or boolean to use as Bearer token
             for remote files on the Datasets Hub. If `True`, or not specified, will get token from `~/.huggingface`.
-        ignore_url_params (`bool`, defaults to `False`):
-            Whether to strip all query parameters and fragments from
-            the download URL before using it for caching the file.
         storage_options (`dict`, *optional*):
             Key/value pairs to be passed on to the dataset file-system backend, if any.
         download_desc (`str`, *optional*):
@@ -68,7 +65,6 @@ class DownloadConfig:
     num_proc: Optional[int] = None
     max_retries: int = 1
     token: Optional[Union[str, bool]] = None
-    ignore_url_params: bool = False
     storage_options: Dict[str, Any] = field(default_factory=dict)
     download_desc: Optional[str] = None
     disable_tqdm: bool = False
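A hedged sketch of constructing `DownloadConfig` after this removal; callers that previously set `ignore_url_params` simply drop it (the reading that hub-cached files no longer need URL-based cache keys is my inference from the commit, not stated here):

```py
from datasets import DownloadConfig

# All fields shown appear in the dataclass above; ignore_url_params is gone.
download_config = DownloadConfig(
    num_proc=4,      # parallel downloads
    max_retries=2,
    token=True,      # read the token from the local Hugging Face login
)
```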
240 changes: 0 additions & 240 deletions src/datasets/download/mock_download_manager.py

This file was deleted.
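With the mock download manager deleted, tests presumably hit the real cache machinery instead; a hypothetical sketch of the pattern (the fixture name, repo id, and filename are invented for illustration):

```py
import pytest
from huggingface_hub import hf_hub_download

@pytest.fixture
def downloaded_file(tmp_path):
    # Use an isolated per-test cache_dir rather than mocking the download.
    return hf_hub_download(
        repo_id="username/dataset",  # hypothetical repo
        filename="data.csv",  # hypothetical file
        repo_type="dataset",
        cache_dir=tmp_path / "hub_cache",
    )
```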

10 changes: 7 additions & 3 deletions src/datasets/load.py
@@ -37,7 +37,7 @@
 import yaml
 from fsspec.core import url_to_fs
 from huggingface_hub import DatasetCard, DatasetCardData, HfApi, HfFileSystem
-from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError
+from huggingface_hub.utils import GatedRepoError, RepositoryNotFoundError, RevisionNotFoundError, get_session
 
 from . import config
 from .arrow_dataset import Dataset
@@ -76,7 +76,7 @@
     OfflineModeIsEnabled,
     _raise_if_offline_mode_is_enabled,
     cached_path,
-    head_hf_s3,
+    get_datasets_user_agent,
     init_hf_modules,
     is_relative_path,
     relative_to_absolute_path,
@@ -276,7 +276,11 @@ def increase_load_count(name: str):
     """Update the download count of a dataset."""
     if not config.HF_HUB_OFFLINE and config.HF_UPDATE_DOWNLOAD_COUNTS:
         try:
-            head_hf_s3(name, filename=name + ".py")
+            get_session().head(
+                "/".join((config.S3_DATASETS_BUCKET_PREFIX, name, name + ".py")),
+                user_agent=get_datasets_user_agent(),
+                timeout=3,
+            )
         except Exception:
             pass
 
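For context on the replacement above: `get_session()` returns `huggingface_hub`'s shared, preconfigured `requests.Session`, so the download-count ping reuses the library's HTTP plumbing, and the surrounding try/except keeps any failure silent. A minimal sketch in isolation (the URL is a placeholder):

```py
from huggingface_hub.utils import get_session

session = get_session()  # process-wide requests.Session managed by huggingface_hub
response = session.head("https://huggingface.co", timeout=3)  # placeholder URL
print(response.status_code)
```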
