permit opt tokenizer #1958

Merged · 3 commits · Feb 14, 2023
15 changes: 8 additions & 7 deletions composer/datasets/in_context_learning_evaluation.py
@@ -153,10 +153,11 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter:

             cont = f'{continuation_delimiter}{cont}'

-            encoded_example['context'] = self.tokenizer(ctxt)
-            encoded_example['continuation'] = self.tokenizer(cont)
             encoded_example['preamble'] = self.tokenizer(
-                preamble)  # if the preamble is empty then these will be 0-length lists
+                preamble
+            )  # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer)
+            encoded_example['context'] = self.tokenizer(ctxt, add_special_tokens=False)
+            encoded_example['continuation'] = self.tokenizer(cont, add_special_tokens=False)

             examples.append(encoded_example)

@@ -298,13 +299,13 @@ def prep_examples(self, num_fewshot: int, prompt_string: str, example_delimiter:
                 'choices'], self.samples[sample_idx]['gold'],
             if len(preamble) > 0:
                 query = f'{example_delimiter}{query}'

             choices = [f'{continuation_delimiter}{choice}' for choice in choices]
-            encoded_example['query'] = self.tokenizer(query)
-            encoded_example['choices'] = [self.tokenizer(choice) for choice in choices]
             encoded_example['preamble'] = self.tokenizer(
-                preamble)  # if the preamble is empty then these will be 0-length lists
+                preamble
+            )  # if the preamble is empty then these will be 0-length lists, unless the tokenizer adds special tokens to empty strings (e.g. OPT tokenizer)
             encoded_example['gold_idx'] = gold_idx
+            encoded_example['query'] = self.tokenizer(query, add_special_tokens=False)
+            encoded_example['choices'] = [self.tokenizer(choice, add_special_tokens=False) for choice in choices]

             examples.append(encoded_example)
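
For context on the comment above: unlike GPT-2's tokenizer, OPT's prepends the BOS token </s> (id 2) to every string it encodes, including the empty string, so an "empty" preamble still tokenizes to a one-element list. A minimal sketch of that behavior (illustrative only, not part of this diff; it assumes the transformers package and the facebook/opt-125m checkpoint are available):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)

# OPT's tokenizer adds BOS (`</s>`, id 2) even to the empty string...
print(tokenizer('')['input_ids'])  # [2]

# ...while add_special_tokens=False suppresses it, which is why the context,
# continuation, query, and choice encodings above pass it:
print(tokenizer('some context', add_special_tokens=False)['input_ids'])  # no leading 2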

87 changes: 86 additions & 1 deletion tests/datasets/test_in_context_learning_datasets.py
@@ -5,6 +5,7 @@

 import pytest
 from torch.utils.data import DataLoader
+from transformers import AutoTokenizer

 from composer.core import Evaluator
 from composer.datasets.in_context_learning_evaluation import (_get_fewshot_sample_idxs, _make_padded_input,
@@ -51,7 +52,43 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
                                  batch_size,
                                  max_seq_len=seqlen,
                                  pad_tok_id=tokenizer.eos_token_id,
-                                 num_fewshot=1,
+                                 num_fewshot=0,
                                  prompt_string='',
                                  example_delimiter='\n',
                                  continuation_delimiter='')
+
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+
+
+@pytest.mark.parametrize('dataset_uri', ['lambada_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+def test_lm_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 2048
+    dl = get_icl_task_dataloader('language_modeling',
+                                 dataset_uri,
+                                 tokenizer,
+                                 batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter='')
@@ -70,6 +107,54 @@ def test_lm_task_dataloader(dataset_uri, tiny_gpt2_tokenizer):
     min_idx = min(batch['continuation_indices'][0]).item()
     max_idx = max(batch['continuation_indices'][0]).item()
     assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ' glen'
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1


+@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
+@pytest.mark.parametrize('num_fewshot', [0, 1])
+def test_mc_task_dataloader_opt_tokenizer(dataset_uri, num_fewshot):
+    local_data = os.path.join(os.path.dirname(__file__), 'local_data')
+
+    tokenizer = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)
+
+    dataset_uri = f'{local_data}/{dataset_uri}'
+    batch_size = 2
+    seqlen = 2048
+    dl = get_icl_task_dataloader('multiple_choice',
+                                 dataset_uri,
+                                 tokenizer,
+                                 batch_size,
+                                 max_seq_len=seqlen,
+                                 pad_tok_id=tokenizer.eos_token_id,
+                                 num_fewshot=num_fewshot,
+                                 prompt_string='',
+                                 example_delimiter='\n',
+                                 continuation_delimiter=': ')
+
+    assert isinstance(dl.dataloader, DataLoader)  # pyright
+    batch = next(dl.dataloader._get_iterator())
+
+    choices_per_question = 2
+    assert 'input_ids' in batch
+    assert tuple(batch['input_ids'].shape) == (batch_size, seqlen)
+    assert 'attention_mask' in batch
+    assert tuple(batch['attention_mask'].shape) == (batch_size, seqlen)
+    assert 'continuation_indices' in batch
+    assert isinstance(batch['continuation_indices'], list) and len(batch['continuation_indices']) == batch_size
+    assert 'mode' in batch
+    assert batch['mode'] == 'icl_task'
+    assert 'gold_indices' in batch
+    assert isinstance(batch['gold_indices'], list) and len(batch['gold_indices']) == batch_size // choices_per_question
+    assert 'choice_groupings' in batch
+    assert isinstance(batch['choice_groupings'], list) and len(
+        batch['choice_groupings']) == batch_size // choices_per_question
+
+    min_idx = min(batch['continuation_indices'][0]).item()
+    max_idx = max(batch['continuation_indices'][0]).item()
+    assert tokenizer.decode(batch['input_ids'][0][min_idx:max_idx + 1]) == ': Pour it onto a plate'
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).startswith('</s>')
+    assert tokenizer.decode(batch['input_ids'][0][0:min_idx]).count('</s>') == 1


@pytest.mark.parametrize('dataset_uri', ['piqa_small.jsonl'])
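
The </s> assertions in the two new tests pin down the invariant this change establishes: an assembled example contains exactly one </s> (OPT's BOS token), contributed by the preamble encoding, at position 0. A standalone sketch of that invariant (illustrative only; the example strings and variable names here are assumptions, not Composer APIs):

from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained('facebook/opt-125m', use_fast=False)

# The preamble is tokenized WITH special tokens, so it contributes the single BOS:
preamble_ids = tok('')['input_ids']  # [2]
# Query and choice are tokenized WITHOUT special tokens:
query_ids = tok('\nHow do I serve the dish?', add_special_tokens=False)['input_ids']
choice_ids = tok(': Pour it onto a plate', add_special_tokens=False)['input_ids']

# Stitching the pieces together yields exactly one BOS, at the start of the sequence:
input_ids = preamble_ids + query_ids + choice_ids
assert input_ids[0] == tok.bos_token_id
assert input_ids.count(tok.bos_token_id) == 1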