Make `destination_path` a required argument of the ICL task dataset constructor and `get_icl_task_dataloader`

mosaicml · Feb 17, 2023 · 08283d4
1 parent 937051b
commit 08283d4
Show file tree

Hide file tree

Showing 2 changed files with 50 additions and 28 deletions.
diff --git a/composer/datasets/in_context_learning_evaluation.py b/composer/datasets/in_context_learning_evaluation.py
@@ -238,7 +238,7 @@ def __init__(
         prompt_string: str,
         example_delimiter: str,
         continuation_delimiter: str,
-        destination_path: str = 'icl_mc_task.jsonl',
+        destination_path: str,
     ):
         try:
             from datasets import load_dataset  # pyright: ignore [reportGeneralTypeIssues]
@@ -368,16 +368,17 @@ def split_batch(self, batch: Any, microbatch_size: int):
 
 
 def get_icl_task_dataloader(
-        icl_task_type: str,
-        dataset_uri: str,
-        tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast],
-        batch_size: int,
-        max_seq_len: int,
-        pad_tok_id: int,
-        num_fewshot: int,
-        prompt_string: str,  # e.g. 'translate english to french:'
-        example_delimiter: str,  # e.g. '\n'
-        continuation_delimiter: str,  # e.g. ''
+    icl_task_type: str,
+    dataset_uri: str,
+    tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast],
+    batch_size: int,
+    max_seq_len: int,
+    pad_tok_id: int,
+    num_fewshot: int,
+    prompt_string: str,  # e.g. 'translate english to french:'
+    example_delimiter: str,  # e.g. '\n'
+    continuation_delimiter: str,  # e.g. ''
+    destination_path: str,
 ) -> DataSpec:
     """This constructs a dataloader capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below:
 
@@ -423,14 +424,27 @@ def get_icl_task_dataloader(
     """
 
     if icl_task_type == 'multiple_choice':
-        dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri, tokenizer, max_seq_len, pad_tok_id,
-                                                             num_fewshot, prompt_string, example_delimiter,
-                                                             continuation_delimiter)
+        dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri,
+                                                             tokenizer,
+                                                             max_seq_len,
+                                                             pad_tok_id,
+                                                             num_fewshot,
+                                                             prompt_string,
+                                                             example_delimiter,
+                                                             continuation_delimiter,
+                                                             destination_path=destination_path)
         batch_size = max(dataset.num_choices, batch_size)
         effective_batchsize = batch_size // dataset.num_choices
     elif icl_task_type == 'language_modeling':
-        dataset = InContextLearningLMTaskDataset(dataset_uri, tokenizer, max_seq_len, pad_tok_id, num_fewshot,
-                                                 prompt_string, example_delimiter, continuation_delimiter)
+        dataset = InContextLearningLMTaskDataset(dataset_uri,
+                                                 tokenizer,
+                                                 max_seq_len,
+                                                 pad_tok_id,
+                                                 num_fewshot,
+                                                 prompt_string,
+                                                 example_delimiter,
+                                                 continuation_delimiter,
+                                                 destination_path=destination_path)
         effective_batchsize = batch_size
     else:
         raise Exception(f'Unrecognized ICL task type: {icl_task_type}')

diff --git a/tests/datasets/test_in_context_learning_datasets.py b/tests/datasets/test_in_context_learning_datasets.py
@@ -39,7 +39,7 @@ def test_batch_padding_logic(tiny_gpt2_tokenizer):
 
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
-def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
+def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     tokenizer = tiny_gpt2_tokenizer
@@ -55,7 +55,8 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
                                  num_fewshot=0,
                                  prompt_string='',
                                  example_delimiter='\n',
-                                 continuation_delimiter='')
+                                 continuation_delimiter='',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
 
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
@@ -75,7 +76,7 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
 
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
+def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot, tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
@@ -91,7 +92,8 @@ def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
                                  num_fewshot=num_fewshot,
                                  prompt_string='',
                                  example_delimiter='\n',
-                                 continuation_delimiter='')
+                                 continuation_delimiter='',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
 
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
@@ -113,7 +115,7 @@ def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 1])
-def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
+def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot, tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
@@ -130,7 +132,8 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
                                  num_fewshot=num_fewshot,
                                  prompt_string='',
                                  example_delimiter='\n',
-                                 continuation_delimiter=': ')
+                                 continuation_delimiter=': ',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
 
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
@@ -158,7 +161,7 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
 
 
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
-def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
+def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
 
     tokenizer = tiny_gpt2_tokenizer
@@ -174,7 +177,8 @@ def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
                                  num_fewshot=1,
                                  prompt_string='',
                                  example_delimiter='\n',
-                                 continuation_delimiter=': ')
+                                 continuation_delimiter=': ',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
 
     assert isinstance(dl.dataloader, DataLoader)  # pyright
     batch = next(dl.dataloader._get_iterator())
@@ -202,7 +206,7 @@ def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
 @pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
 @pytest.mark.parametrize('num_fewshot', [0, 5])
 @device('gpu')
-def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenizer):
+def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -217,7 +221,9 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize
                                  num_fewshot=num_fewshot,
                                  prompt_string='',
                                  example_delimiter='\n',
-                                 continuation_delimiter='')
+                                 continuation_delimiter='',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+
     evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningLMAccuracy'])
     model = create_gpt2(use_pretrained=False, pretrained_model_name='EleutherAI/gpt-neo-125M')
     model.add_eval_metrics(evaluator)
@@ -230,7 +236,7 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize
 @pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl'])
 @device('gpu')
 @pytest.mark.parametrize('num_fewshot', [0, 5])
-def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenizer):
+def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path):
     pytest.importorskip('datasets')
     in_memory_logger = InMemoryLogger()  # track the logged metrics in the in_memory_logger
     local_data = os.path.join(os.path.dirname(__file__), 'local_data')
@@ -245,7 +251,9 @@ def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenize
                                  num_fewshot=num_fewshot,
                                  prompt_string='',
                                  example_delimiter='\n',
-                                 continuation_delimiter=': ')
+                                 continuation_delimiter=': ',
+                                 destination_path=str(tmp_path / 'icl.jsonl'))
+
     evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
     model = create_gpt2(use_pretrained=False, pretrained_model_name='EleutherAI/gpt-neo-125M')
     model.add_eval_metrics(evaluator)