From b4cc13140784e272a34f45c859b8b9287e904932 Mon Sep 17 00:00:00 2001
From: Matt
Date: Tue, 8 Oct 2024 18:46:48 +0100
Subject: [PATCH] Big stupid debug push

---
 src/transformers/tokenization_utils_base.py | 20 --------
 tests/test_tokenization_common.py           | 54 +++++++++++++++------
 2 files changed, 38 insertions(+), 36 deletions(-)

diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
index ac21f0ac971bee..438ef1c8a4a5e2 100644
--- a/src/transformers/tokenization_utils_base.py
+++ b/src/transformers/tokenization_utils_base.py
@@ -145,7 +145,6 @@ class EncodingFast:
 SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
 ADDED_TOKENS_FILE = "added_tokens.json"
 TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
-CHAT_TEMPLATE_FILE = "chat_template.jinja"
 
 # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
 FULL_TOKENIZER_FILE = "tokenizer.json"
@@ -2122,7 +2121,6 @@ def from_pretrained(
             "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
             # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
             "tokenizer_file": FULL_TOKENIZER_FILE,
-            "chat_template_file": CHAT_TEMPLATE_FILE,
         }
         vocab_files = {**cls.vocab_files_names, **additional_files_names}
         if "tokenizer_file" in vocab_files:
@@ -2243,8 +2241,6 @@ def _from_pretrained(
         from_slow = kwargs.get("from_slow", False)
         gguf_file = kwargs.get("gguf_file", None)
         has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
-        chat_template_file = resolved_vocab_files.pop("chat_template_file", None)
-        chat_template_file = None
 
         # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be
         # loaded directly from the GGUF file.
@@ -2443,11 +2439,6 @@ def _from_pretrained(
             if key != "additional_special_tokens":
                 init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
 
-        if chat_template_file is not None:
-            with open(chat_template_file, encoding="utf-8") as chat_template_handle:
-                init_kwargs["chat_template"] = chat_template_handle.read()
-
-
         # Instantiate the tokenizer.
         try:
             tokenizer = cls(*init_inputs, **init_kwargs)
@@ -2585,9 +2576,6 @@ def save_pretrained(
         tokenizer_config_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
         )
-        chat_template_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_FILE
-        )
 
         tokenizer_config = copy.deepcopy(self.init_kwargs)
 
@@ -2607,15 +2595,7 @@ def save_pretrained(
         if isinstance(self.chat_template, dict):
             # Chat template dicts are saved to the config as lists of dicts with fixed key names.
             # They will be reconstructed as a single dict during loading.
-            # We're trying to discourage chat template dicts, and they are always
-            # saved in the config, never as single files.
tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()] - elif kwargs.get("save_chat_template_file", False): - with open(chat_template_file, "w", encoding="utf-8") as f: - f.write(self.chat_template) - logger.info(f"chat template saved in {chat_template_file}") - if "chat_template" in tokenizer_config: - tokenizer_config.pop("chat_template") # To ensure it doesn't somehow end up in the config too else: tokenizer_config["chat_template"] = self.chat_template diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py index c4f54b884eaf97..7babaddb70623c 100644 --- a/tests/test_tokenization_common.py +++ b/tests/test_tokenization_common.py @@ -1083,8 +1083,46 @@ def test_chat_template(self): ] expected_output = "systemsystem messageuseruser messageassistantassistant message" tokenizers = self.get_tokenizers() + tokenizers = tokenizers[1:2] for tokenizer in tokenizers: with self.subTest(f"{tokenizer.__class__.__name__}"): + # Put this block back afterwards + + # tokenizer.chat_template = dummy_template + with tempfile.TemporaryDirectory() as tmp_dir_name: + # tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=True) + tokenizer.save_pretrained(tmp_dir_name) + # chat_template_file = Path(tmp_dir_name) / "chat_template.jinja" + #self.assertTrue(chat_template_file.is_file()) + #self.assertEqual(chat_template_file.read_text(), dummy_template) + #config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text()) + # Assert the chat template is not in the config when it's saved as a separate file + #self.assertNotIn("chat_template", config_dict) + tokenizer = tokenizer.from_pretrained(tmp_dir_name) + # TODO Why is T5 failing? + + # self.assertEqual(tokenizer.chat_template, dummy_template) # Test template has persisted + + with tempfile.TemporaryDirectory() as tmp_dir_name: + tokenizer.save_pretrained(tmp_dir_name) + #chat_template_file = Path(tmp_dir_name) / "chat_template.jinja" + #self.assertTrue(chat_template_file.is_file()) + #self.assertEqual(chat_template_file.read_text(), dummy_template) + # config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text()) + # Assert the chat template is not in the config when it's saved as a separate file + #self.assertNotIn("chat_template", config_dict) + tokenizer = tokenizer.from_pretrained(tmp_dir_name) + # TODO Why is T5 failing? 
+
+                    self.assertEqual(tokenizer.chat_template, dummy_template)  # Test template has persisted
+                continue
+
+                # End block
+                output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False)
+                self.assertEqual(output, expected_output)  # Test output is the same after reloading
+                # Check that no error raised
+                tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False)
+
                 output = tokenizer.apply_chat_template(
                     dummy_conversation, chat_template=dummy_template, tokenize=False, return_dict=False
                 )
@@ -1116,23 +1154,7 @@ def test_chat_template(self):
                 # Check that no error raised
                 tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False)
 
-                with tempfile.TemporaryDirectory() as tmp_dir_name:
-                    tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=True)
-                    chat_template_file = Path(tmp_dir_name) / "chat_template.jinja"
-                    self.assertTrue(chat_template_file.is_file())
-                    self.assertEqual(chat_template_file.read_text(), dummy_template)
-                    config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text())
-                    # Assert the chat template is not in the config when it's saved as a separate file
-                    self.assertNotIn("chat_template", config_dict)
-                    tokenizer = tokenizer.from_pretrained(tmp_dir_name)
-                    # TODO Figure out how "chat_template" gets into init_kwargs
-                    # TODO Ensure "chat_template_file" doesn't end up anywhere! Where is it getting into the config?
-                    self.assertEqual(tokenizer.chat_template, dummy_template)  # Test template has persisted
-                    output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False)
-                    self.assertEqual(output, expected_output)  # Test output is the same after reloading
-                    # Check that no error raised
-                    tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False)
 
     @require_jinja
     def test_chat_template_batched(self):
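
Note on the surviving `save_pretrained` branch above: when `chat_template` is a dict of named templates, it is flattened into `tokenizer_config.json` as a list of entries with the fixed keys "name" and "template", and rebuilt into a single dict at load time. The following is a minimal sketch of that round-trip for illustration only; the two helper functions are hypothetical and are not transformers APIs.

# Minimal sketch of the chat-template dict round-trip described in the
# save_pretrained comment above. Helper names are made up for illustration.

def serialize_chat_template(chat_template):
    # Save path: a dict of named templates becomes a list of fixed-key
    # entries in the tokenizer config; string templates are stored unchanged.
    if isinstance(chat_template, dict):
        return [{"name": k, "template": v} for k, v in chat_template.items()]
    return chat_template


def deserialize_chat_template(config_value):
    # Load path: reconstruct the single dict from the saved list of entries.
    if isinstance(config_value, list):
        return {entry["name"]: entry["template"] for entry in config_value}
    return config_value


templates = {
    "default": "{% for message in messages %}{{ message['content'] }}{% endfor %}",
    "tool_use": "{{ messages[-1]['content'] }}",
}
# The save/load pair is lossless for dicts and a no-op for plain strings.
assert deserialize_chat_template(serialize_chat_template(templates)) == templates
assert deserialize_chat_template(serialize_chat_template("{{ messages }}")) == "{{ messages }}"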