From abc21f0bd4ca10ed07cdf2dc399e40f6a31fd5b0 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Tue, 22 Oct 2024 17:21:25 +0000 Subject: [PATCH 01/11] Use tempdir for finetuning dataset download --- llmfoundry/data/finetuning/tasks.py | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 915267786f..14113cc4cc 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -34,6 +34,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: import importlib import logging import os +import tempfile import warnings from collections.abc import Mapping from functools import partial @@ -107,15 +108,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: _ALLOWED_CONTENT_KEYS = {'content'} _ALLOWED_ROLES = {'user', 'assistant', 'system', 'tool'} _ALLOWED_LAST_MESSAGE_ROLES = {'assistant'} -DOWNLOADED_FT_DATASETS_DIRPATH = os.path.abspath( - os.path.join( - os.path.realpath(__file__), - os.pardir, - os.pardir, - os.pardir, - '.downloaded_finetuning', - ), -) +DOWNLOADED_FT_DATASETS_DIRPATH = tempfile.mkdtemp() SUPPORTED_EXTENSIONS = ['.csv', '.json', '.jsonl', '.parquet'] HUGGINGFACE_FOLDER_EXTENSIONS = ['.lock', '.metadata'] DEFAULT_TARGET_RESPONSES = 'last' From 0a3846febd5b4ac068eb8217fab7843661b4d547 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Tue, 22 Oct 2024 19:21:03 +0000 Subject: [PATCH 02/11] Add debug logs --- llmfoundry/data/finetuning/dataloader.py | 2 ++ llmfoundry/data/finetuning/tasks.py | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 7c9d149fea..908144305d 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -593,6 +593,8 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: finetune_dir, f'.node_{dist.get_node_rank()}_local_rank0_completed', ) + + 
log.debug(f'Downloading dataset {name} to {destination}.') if dist.get_local_rank() == 0: try: get_file(path=name, destination=destination, overwrite=True) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 14113cc4cc..2a3b97ee73 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -901,6 +901,10 @@ def build_from_hf( dataset_name, ) + log.debug( + f'Downloading dataset {dataset_name} to {local_dataset_dir}.', + ) + if _is_empty_or_nonexistent(dirpath=local_dataset_dir): # Safely load a dataset from HF Hub with restricted file types. hf_hub.snapshot_download( From 7a29f9f1192d6362fa3d8925a146c60c05441bf6 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 23 Oct 2024 04:23:03 +0000 Subject: [PATCH 03/11] Use tempdir for dataset downloading --- llmfoundry/data/finetuning/dataloader.py | 7 +++++-- llmfoundry/data/finetuning/tasks.py | 7 +++---- llmfoundry/utils/file_utils.py | 16 ++++++++++++++++ tests/data/test_dataloader.py | 19 +++++++++---------- 4 files changed, 33 insertions(+), 16 deletions(-) create mode 100644 llmfoundry/utils/file_utils.py diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index 908144305d..ad54394253 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -20,7 +20,6 @@ from llmfoundry.data.finetuning.tasks import ( DEFAULT_TARGET_PROMPTS, DEFAULT_TARGET_RESPONSES, - DOWNLOADED_FT_DATASETS_DIRPATH, SUPPORTED_EXTENSIONS, dataset_constructor, ) @@ -32,6 +31,7 @@ MissingHuggingFaceURLSplitError, NotEnoughDatasetSamplesError, ) +from llmfoundry.utils.file_utils import dist_mkdtemp from llmfoundry.utils.registry_utils import construct_from_registry log = logging.getLogger(__name__) @@ -571,7 +571,7 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: # HF datasets does not support a split with dashes, so we replace dashes with underscores. 
hf_formatted_split = split.replace('-', '_') finetune_dir = os.path.join( - DOWNLOADED_FT_DATASETS_DIRPATH, + dist_mkdtemp(), hf_formatted_split if hf_formatted_split != 'data' else 'data_not', ) os.makedirs(finetune_dir, exist_ok=True) @@ -617,10 +617,13 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_download') + print(dist.get_local_rank(), f'signal_file_path: {signal_file_path}') + # Avoid the collective call until the local rank zero has finished trying to download the dataset # so that we don't timeout for large downloads. This syncs all processes on the node with dist.local_rank_zero_download_and_wait(signal_file_path): # Then, wait to ensure every node has finished trying to download the dataset + print('GOT TO BARRIER') dist.barrier() # clean up signal file diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 2a3b97ee73..5225a60473 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -34,7 +34,6 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: import importlib import logging import os -import tempfile import warnings from collections.abc import Mapping from functools import partial @@ -92,6 +91,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: UnableToProcessPromptResponseError, UnknownExampleTypeError, ) +from llmfoundry.utils.file_utils import dist_mkdtemp # yapf: enable from llmfoundry.utils.logging_utils import SpecificWarningFilter @@ -108,7 +108,6 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: _ALLOWED_CONTENT_KEYS = {'content'} _ALLOWED_ROLES = {'user', 'assistant', 'system', 'tool'} _ALLOWED_LAST_MESSAGE_ROLES = {'assistant'} -DOWNLOADED_FT_DATASETS_DIRPATH = tempfile.mkdtemp() SUPPORTED_EXTENSIONS = ['.csv', '.json', '.jsonl', '.parquet'] HUGGINGFACE_FOLDER_EXTENSIONS = ['.lock', '.metadata'] DEFAULT_TARGET_RESPONSES = 'last' @@ -897,11 
+896,11 @@ def build_from_hf( if not os.path.isdir(dataset_name): # dataset_name is not a local dir path, download if needed. local_dataset_dir = os.path.join( - DOWNLOADED_FT_DATASETS_DIRPATH, + dist_mkdtemp(), dataset_name, ) - log.debug( + print( f'Downloading dataset {dataset_name} to {local_dataset_dir}.', ) diff --git a/llmfoundry/utils/file_utils.py b/llmfoundry/utils/file_utils.py new file mode 100644 index 0000000000..2543aea759 --- /dev/null +++ b/llmfoundry/utils/file_utils.py @@ -0,0 +1,16 @@ +# Copyright 2024 MosaicML LLM Foundry authors +# SPDX-License-Identifier: Apache-2.0 + +import tempfile + +from composer.utils import dist + + +def dist_mkdtemp() -> str: + tempdir = None + if dist.get_local_rank() == 0: + tempdir = tempfile.mkdtemp() + tempdir = dist.all_gather_object(tempdir)[0] + if tempdir is None: + raise ValueError('Dist operation to get tempdir failed.') + return tempdir diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 5f16c86eb9..2501f4c59a 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -30,7 +30,6 @@ validate_target_settings, ) from llmfoundry.data.finetuning.tasks import ( - DOWNLOADED_FT_DATASETS_DIRPATH, HUGGINGFACE_FOLDER_EXTENSIONS, SUPPORTED_EXTENSIONS, dataset_constructor, @@ -430,9 +429,9 @@ def test_finetuning_dataloader_safe_load( hf_name: str, hf_revision: Optional[str], expectation: ContextManager, + tmp_path: pathlib.Path, ): # Clear the folder - shutil.rmtree(DOWNLOADED_FT_DATASETS_DIRPATH, ignore_errors=True) cfg = DictConfig({ 'dataset': { 'hf_name': hf_name, @@ -455,18 +454,18 @@ def test_finetuning_dataloader_safe_load( tokenizer = build_tokenizer('gpt2', {}) - with expectation: - _ = build_finetuning_dataloader( - tokenizer=tokenizer, - device_batch_size=1, - **cfg, - ) + with patch('llmfoundry.data.finetuning.tasks.dist_mkdtemp', return_value=str(tmp_path)): + with expectation: + _ = build_finetuning_dataloader( + tokenizer=tokenizer, + 
device_batch_size=1, + **cfg, + ) # If no raised errors, we should expect downloaded files with only safe file types. if isinstance(expectation, does_not_raise): - download_dir = os.path.join(DOWNLOADED_FT_DATASETS_DIRPATH, hf_name) downloaded_files = [ - file for _, _, files in os.walk(download_dir) for file in files + file for _, _, files in os.walk(tmp_path) for file in files ] assert len(downloaded_files) > 0 assert all( From 8dcc1c4df5c2062273efb2d9f53732b9dba5ba77 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 23 Oct 2024 04:23:33 +0000 Subject: [PATCH 04/11] fix --- llmfoundry/data/finetuning/tasks.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 5225a60473..57d0a1c67c 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -900,7 +900,7 @@ def build_from_hf( dataset_name, ) - print( + log.debug( f'Downloading dataset {dataset_name} to {local_dataset_dir}.', ) From 145388b1c8f6673e6711b3bbfd0d4b1da7db6785 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 23 Oct 2024 04:34:13 +0000 Subject: [PATCH 05/11] add docstring for helper --- llmfoundry/utils/file_utils.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/llmfoundry/utils/file_utils.py b/llmfoundry/utils/file_utils.py index 2543aea759..f50f5b42c4 100644 --- a/llmfoundry/utils/file_utils.py +++ b/llmfoundry/utils/file_utils.py @@ -7,6 +7,11 @@ def dist_mkdtemp() -> str: + """Creates a temp directory on local rank 0 to use for other ranks. + + Returns: + str: The path to the temporary directory. 
+ """ tempdir = None if dist.get_local_rank() == 0: tempdir = tempfile.mkdtemp() From 98cef6af8d174b43ab15f9ea95f2048ca354f4c4 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 23 Oct 2024 05:31:03 +0000 Subject: [PATCH 06/11] fix tests --- tests/data/test_dataloader.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data/test_dataloader.py b/tests/data/test_dataloader.py index 2501f4c59a..2608ccd091 100644 --- a/tests/data/test_dataloader.py +++ b/tests/data/test_dataloader.py @@ -454,7 +454,7 @@ def test_finetuning_dataloader_safe_load( tokenizer = build_tokenizer('gpt2', {}) - with patch('llmfoundry.data.finetuning.tasks.dist_mkdtemp', return_value=str(tmp_path)): + with patch('llmfoundry.data.finetuning.tasks.tempfile.mkdtemp', return_value=str(tmp_path)): with expectation: _ = build_finetuning_dataloader( tokenizer=tokenizer, From 3fa2031dce0418179bc728e6d8cec6f64b1cb12b Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 23 Oct 2024 05:32:03 +0000 Subject: [PATCH 07/11] use tempfile in tasks.py --- llmfoundry/data/finetuning/tasks.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 0e8c46e8f4..2030a87fe7 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -34,6 +34,7 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: import importlib import logging import os +import tempfile import warnings from collections.abc import Mapping from functools import partial @@ -91,7 +92,6 @@ def preprocessing_fn(example: Dict) -> Dict[str, str]: UnableToProcessPromptResponseError, UnknownExampleTypeError, ) -from llmfoundry.utils.file_utils import dist_mkdtemp # yapf: enable from llmfoundry.utils.logging_utils import SpecificWarningFilter @@ -912,7 +912,7 @@ def build_from_hf( if not os.path.isdir(dataset_name): # dataset_name is not a local dir path, download if needed. 
local_dataset_dir = os.path.join( - dist_mkdtemp(), + tempfile.mkdtemp(), dataset_name, ) From 17eb0be23cf9ad3469f09fbbf2a1c32f89c5eb42 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 23 Oct 2024 06:13:04 +0000 Subject: [PATCH 08/11] change error type --- llmfoundry/utils/file_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llmfoundry/utils/file_utils.py b/llmfoundry/utils/file_utils.py index f50f5b42c4..634bafcbd8 100644 --- a/llmfoundry/utils/file_utils.py +++ b/llmfoundry/utils/file_utils.py @@ -17,5 +17,5 @@ def dist_mkdtemp() -> str: tempdir = tempfile.mkdtemp() tempdir = dist.all_gather_object(tempdir)[0] if tempdir is None: - raise ValueError('Dist operation to get tempdir failed.') + raise RuntimeError('Dist operation to get tempdir failed.') return tempdir From f8fd8a0d86bc6d23071635f2744de67aa4b3c349 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Wed, 23 Oct 2024 18:16:17 +0000 Subject: [PATCH 09/11] address pr comments --- llmfoundry/data/finetuning/dataloader.py | 3 --- llmfoundry/utils/file_utils.py | 7 +++++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/llmfoundry/data/finetuning/dataloader.py b/llmfoundry/data/finetuning/dataloader.py index ad54394253..e73213f74a 100644 --- a/llmfoundry/data/finetuning/dataloader.py +++ b/llmfoundry/data/finetuning/dataloader.py @@ -617,13 +617,10 @@ def _download_remote_hf_dataset(remote_path: str, split: str) -> str: with open(signal_file_path, 'wb') as f: f.write(b'local_rank0_completed_download') - print(dist.get_local_rank(), f'signal_file_path: {signal_file_path}') - # Avoid the collective call until the local rank zero has finished trying to download the dataset # so that we don't timeout for large downloads. 
This syncs all processes on the node with dist.local_rank_zero_download_and_wait(signal_file_path): # Then, wait to ensure every node has finished trying to download the dataset - print('GOT TO BARRIER') dist.barrier() # clean up signal file diff --git a/llmfoundry/utils/file_utils.py b/llmfoundry/utils/file_utils.py index 634bafcbd8..1b106c4c92 100644 --- a/llmfoundry/utils/file_utils.py +++ b/llmfoundry/utils/file_utils.py @@ -13,9 +13,12 @@ def dist_mkdtemp() -> str: str: The path to the temporary directory. """ tempdir = None - if dist.get_local_rank() == 0: + local_rank = dist.get_local_rank() + global_rank = dist.get_global_rank() + if local_rank == 0: tempdir = tempfile.mkdtemp() - tempdir = dist.all_gather_object(tempdir)[0] + + tempdir = dist.all_gather_object(tempdir)[global_rank - local_rank] if tempdir is None: raise RuntimeError('Dist operation to get tempdir failed.') return tempdir From 6616e47f009742a224313ffe7c7a26a412682929 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 24 Oct 2024 15:17:06 -0700 Subject: [PATCH 10/11] code quality Co-authored-by: Mihir Patel --- llmfoundry/data/finetuning/tasks.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 2030a87fe7..5ccebcd22b 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -916,9 +916,7 @@ def build_from_hf( dataset_name, ) - log.debug( - f'Downloading dataset {dataset_name} to {local_dataset_dir}.', - ) + log.debug(f'Downloading dataset {dataset_name} to {local_dataset_dir}.') if _is_empty_or_nonexistent(dirpath=local_dataset_dir): # Safely load a dataset from HF Hub with restricted file types. 
From 221b8bd27a9156603a12e4e0160f9db5f87b7de4 Mon Sep 17 00:00:00 2001 From: Irene Dea Date: Thu, 24 Oct 2024 22:33:20 +0000 Subject: [PATCH 11/11] code quality --- llmfoundry/data/finetuning/tasks.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/llmfoundry/data/finetuning/tasks.py b/llmfoundry/data/finetuning/tasks.py index 5ccebcd22b..2030a87fe7 100644 --- a/llmfoundry/data/finetuning/tasks.py +++ b/llmfoundry/data/finetuning/tasks.py @@ -916,7 +916,9 @@ def build_from_hf( dataset_name, ) - log.debug(f'Downloading dataset {dataset_name} to {local_dataset_dir}.') + log.debug( + f'Downloading dataset {dataset_name} to {local_dataset_dir}.', + ) if _is_empty_or_nonexistent(dirpath=local_dataset_dir): # Safely load a dataset from HF Hub with restricted file types.