Skip to content

Commit

Permalink
make destination_path required
Browse files Browse the repository at this point in the history
  • Loading branch information
dakinggg committed Feb 17, 2023
1 parent 937051b commit 08283d4
Show file tree
Hide file tree
Showing 2 changed files with 50 additions and 28 deletions.
46 changes: 30 additions & 16 deletions composer/datasets/in_context_learning_evaluation.py
Original file line number Diff line number Diff line change
Expand Up @@ -238,7 +238,7 @@ def __init__(
prompt_string: str,
example_delimiter: str,
continuation_delimiter: str,
destination_path: str = 'icl_mc_task.jsonl',
destination_path: str,
):
try:
from datasets import load_dataset # pyright: ignore [reportGeneralTypeIssues]
Expand Down Expand Up @@ -368,16 +368,17 @@ def split_batch(self, batch: Any, microbatch_size: int):


def get_icl_task_dataloader(
icl_task_type: str,
dataset_uri: str,
tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast],
batch_size: int,
max_seq_len: int,
pad_tok_id: int,
num_fewshot: int,
prompt_string: str, # e.g. 'translate english to french:'
example_delimiter: str, # e.g. '\n'
continuation_delimiter: str, # e.g. ''
icl_task_type: str,
dataset_uri: str,
tokenizer: Union[transformers.PreTrainedTokenizer, transformers.PreTrainedTokenizerFast],
batch_size: int,
max_seq_len: int,
pad_tok_id: int,
num_fewshot: int,
prompt_string: str, # e.g. 'translate english to french:'
example_delimiter: str, # e.g. '\n'
continuation_delimiter: str, # e.g. ''
destination_path: str,
) -> DataSpec:
"""This constructs a dataloader capable of evaluating LLMs on in-context learning language modeling tasks, for example LAMBADA. An example usage is below:
Expand Down Expand Up @@ -423,14 +424,27 @@ def get_icl_task_dataloader(
"""

if icl_task_type == 'multiple_choice':
dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri, tokenizer, max_seq_len, pad_tok_id,
num_fewshot, prompt_string, example_delimiter,
continuation_delimiter)
dataset = InContextLearningMultipleChoiceTaskDataset(dataset_uri,
tokenizer,
max_seq_len,
pad_tok_id,
num_fewshot,
prompt_string,
example_delimiter,
continuation_delimiter,
destination_path=destination_path)
batch_size = max(dataset.num_choices, batch_size)
effective_batchsize = batch_size // dataset.num_choices
elif icl_task_type == 'language_modeling':
dataset = InContextLearningLMTaskDataset(dataset_uri, tokenizer, max_seq_len, pad_tok_id, num_fewshot,
prompt_string, example_delimiter, continuation_delimiter)
dataset = InContextLearningLMTaskDataset(dataset_uri,
tokenizer,
max_seq_len,
pad_tok_id,
num_fewshot,
prompt_string,
example_delimiter,
continuation_delimiter,
destination_path=destination_path)
effective_batchsize = batch_size
else:
raise Exception(f'Unrecognized ICL task type: {icl_task_type}')
Expand Down
32 changes: 20 additions & 12 deletions tests/datasets/test_in_context_learning_datasets.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ def test_batch_padding_logic(tiny_gpt2_tokenizer):


@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
local_data = os.path.join(os.path.dirname(__file__), 'local_data')

tokenizer = tiny_gpt2_tokenizer
Expand All @@ -55,7 +55,8 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
num_fewshot=0,
prompt_string='',
example_delimiter='\n',
continuation_delimiter='')
continuation_delimiter='',
destination_path=str(tmp_path / 'icl.jsonl'))

assert isinstance(dl.dataloader, DataLoader) # pyright
batch = next(dl.dataloader._get_iterator())
Expand All @@ -75,7 +76,7 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):

@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
@pytest.mark.parametrize('num_fewshot', [0, 1])
def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot, tmp_path):
local_data = os.path.join(os.path.dirname(__file__), 'local_data')

tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
Expand All @@ -91,7 +92,8 @@ def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
num_fewshot=num_fewshot,
prompt_string='',
example_delimiter='\n',
continuation_delimiter='')
continuation_delimiter='',
destination_path=str(tmp_path / 'icl.jsonl'))

assert isinstance(dl.dataloader, DataLoader) # pyright
batch = next(dl.dataloader._get_iterator())
Expand All @@ -113,7 +115,7 @@ def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):

@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
@pytest.mark.parametrize('num_fewshot', [0, 1])
def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot, tmp_path):
local_data = os.path.join(os.path.dirname(__file__), 'local_data')

tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
Expand All @@ -130,7 +132,8 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
num_fewshot=num_fewshot,
prompt_string='',
example_delimiter='\n',
continuation_delimiter=': ')
continuation_delimiter=': ',
destination_path=str(tmp_path / 'icl.jsonl'))

assert isinstance(dl.dataloader, DataLoader) # pyright
batch = next(dl.dataloader._get_iterator())
Expand Down Expand Up @@ -158,7 +161,7 @@ def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):


@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer, tmp_path):
local_data = os.path.join(os.path.dirname(__file__), 'local_data')

tokenizer = tiny_gpt2_tokenizer
Expand All @@ -174,7 +177,8 @@ def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
num_fewshot=1,
prompt_string='',
example_delimiter='\n',
continuation_delimiter=': ')
continuation_delimiter=': ',
destination_path=str(tmp_path / 'icl.jsonl'))

assert isinstance(dl.dataloader, DataLoader) # pyright
batch = next(dl.dataloader._get_iterator())
Expand Down Expand Up @@ -202,7 +206,7 @@ def test_mc_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
@pytest.mark.parametrize('num_fewshot', [0, 5])
@device('gpu')
def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenizer):
def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenizer, tmp_path):
pytest.importorskip('datasets')
in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
Expand All @@ -217,7 +221,9 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize
num_fewshot=num_fewshot,
prompt_string='',
example_delimiter='\n',
continuation_delimiter='')
continuation_delimiter='',
destination_path=str(tmp_path / 'icl.jsonl'))

evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningLMAccuracy'])
model = create_gpt2(use_pretrained=False, pretrained_model_name='EleutherAI/gpt-neo-125M')
model.add_eval_metrics(evaluator)
Expand All @@ -230,7 +236,7 @@ def test_lm_task_evaluation(device, dataset_uri, num_fewshot, tiny_gpt2_tokenize
@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl', 'hellaswag_small.jsonl'])
@device('gpu')
@pytest.mark.parametrize('num_fewshot', [0, 5])
def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenizer):
def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenizer, tmp_path):
pytest.importorskip('datasets')
in_memory_logger = InMemoryLogger() # track the logged metrics in the in_memory_logger
local_data = os.path.join(os.path.dirname(__file__), 'local_data')
Expand All @@ -245,7 +251,9 @@ def test_mc_task_evaluation(device, num_fewshot, dataset_uri, tiny_gpt2_tokenize
num_fewshot=num_fewshot,
prompt_string='',
example_delimiter='\n',
continuation_delimiter=': ')
continuation_delimiter=': ',
destination_path=str(tmp_path / 'icl.jsonl'))

evaluator = Evaluator(label='lambada', dataloader=dl, metric_names=['InContextLearningMultipleChoiceAccuracy'])
model = create_gpt2(use_pretrained=False, pretrained_model_name='EleutherAI/gpt-neo-125M')
model.add_eval_metrics(evaluator)
Expand Down

0 comments on commit 08283d4

Please sign in to comment.