Skip to content

Commit

Permalink
Big stupid debug push
Browse files: browse the repository at this point in the history
  • Loading branch information
Rocketknight1 committed Oct 8, 2024
1 parent 98b1a35 commit b4cc131
Show file tree
Hide file tree
Showing 2 changed files with 38 additions and 36 deletions.
20 changes: 0 additions & 20 deletions src/transformers/tokenization_utils_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,6 @@ class EncodingFast:
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
CHAT_TEMPLATE_FILE = "chat_template.jinja"

# Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
FULL_TOKENIZER_FILE = "tokenizer.json"
Expand Down Expand Up @@ -2122,7 +2121,6 @@ def from_pretrained(
"tokenizer_config_file": TOKENIZER_CONFIG_FILE,
# tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
"tokenizer_file": FULL_TOKENIZER_FILE,
"chat_template_file": CHAT_TEMPLATE_FILE,
}
vocab_files = {**cls.vocab_files_names, **additional_files_names}
if "tokenizer_file" in vocab_files:
Expand Down Expand Up @@ -2243,8 +2241,6 @@ def _from_pretrained(
from_slow = kwargs.get("from_slow", False)
gguf_file = kwargs.get("gguf_file", None)
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
chat_template_file = resolved_vocab_files.pop("chat_template_file", None)
chat_template_file = None

# If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be
# loaded directly from the GGUF file.
Expand Down Expand Up @@ -2443,11 +2439,6 @@ def _from_pretrained(
if key != "additional_special_tokens":
init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])

if chat_template_file is not None:
with open(chat_template_file, encoding="utf-8") as chat_template_handle:
init_kwargs["chat_template"] = chat_template_handle.read()


# Instantiate the tokenizer.
try:
tokenizer = cls(*init_inputs, **init_kwargs)
Expand Down Expand Up @@ -2585,9 +2576,6 @@ def save_pretrained(
tokenizer_config_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
)
chat_template_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_FILE
)

tokenizer_config = copy.deepcopy(self.init_kwargs)

Expand All @@ -2607,15 +2595,7 @@ def save_pretrained(
if isinstance(self.chat_template, dict):
# Chat template dicts are saved to the config as lists of dicts with fixed key names.
# They will be reconstructed as a single dict during loading.
# We're trying to discourage chat template dicts, and they are always
# saved in the config, never as single files.
tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()]
elif kwargs.get("save_chat_template_file", False):
with open(chat_template_file, "w", encoding="utf-8") as f:
f.write(self.chat_template)
logger.info(f"chat template saved in {chat_template_file}")
if "chat_template" in tokenizer_config:
tokenizer_config.pop("chat_template") # To ensure it doesn't somehow end up in the config too
else:
tokenizer_config["chat_template"] = self.chat_template

Expand Down
54 changes: 38 additions & 16 deletions tests/test_tokenization_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -1083,8 +1083,46 @@ def test_chat_template(self):
]
expected_output = "systemsystem messageuseruser messageassistantassistant message"
tokenizers = self.get_tokenizers()
tokenizers = tokenizers[1:2]
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
# Put this block back afterwards

# tokenizer.chat_template = dummy_template
with tempfile.TemporaryDirectory() as tmp_dir_name:
# tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=True)
tokenizer.save_pretrained(tmp_dir_name)
# chat_template_file = Path(tmp_dir_name) / "chat_template.jinja"
#self.assertTrue(chat_template_file.is_file())
#self.assertEqual(chat_template_file.read_text(), dummy_template)
#config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text())
# Assert the chat template is not in the config when it's saved as a separate file
#self.assertNotIn("chat_template", config_dict)
tokenizer = tokenizer.from_pretrained(tmp_dir_name)
# TODO Why is T5 failing?

# self.assertEqual(tokenizer.chat_template, dummy_template) # Test template has persisted

with tempfile.TemporaryDirectory() as tmp_dir_name:
tokenizer.save_pretrained(tmp_dir_name)
#chat_template_file = Path(tmp_dir_name) / "chat_template.jinja"
#self.assertTrue(chat_template_file.is_file())
#self.assertEqual(chat_template_file.read_text(), dummy_template)
# config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text())
# Assert the chat template is not in the config when it's saved as a separate file
#self.assertNotIn("chat_template", config_dict)
tokenizer = tokenizer.from_pretrained(tmp_dir_name)
# TODO Why is T5 failing?

self.assertEqual(tokenizer.chat_template, dummy_template) # Test template has persisted
continue

# End block
output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False)
self.assertEqual(output, expected_output) # Test output is the same after reloading
# Check that no error raised
tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False)

output = tokenizer.apply_chat_template(
dummy_conversation, chat_template=dummy_template, tokenize=False, return_dict=False
)
Expand Down Expand Up @@ -1116,23 +1154,7 @@ def test_chat_template(self):
# Check that no error raised
tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False)

with tempfile.TemporaryDirectory() as tmp_dir_name:
tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=True)
chat_template_file = Path(tmp_dir_name) / "chat_template.jinja"
self.assertTrue(chat_template_file.is_file())
self.assertEqual(chat_template_file.read_text(), dummy_template)
config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text())
# Assert the chat template is not in the config when it's saved as a separate file
self.assertNotIn("chat_template", config_dict)
tokenizer = tokenizer.from_pretrained(tmp_dir_name)
# TODO Figure out how "chat_template" gets into init_kwargs
# TODO Ensure "chat_template_file" doesn't end up anywhere! Where is it getting into the config?

self.assertEqual(tokenizer.chat_template, dummy_template) # Test template has persisted
output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False)
self.assertEqual(output, expected_output) # Test output is the same after reloading
# Check that no error raised
tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False)

@require_jinja
def test_chat_template_batched(self):
Expand Down

0 comments on commit b4cc131

Please sign in to comment.