Big stupid debug push

huggingface · Oct 8, 2024 · b4cc131 · b4cc131
1 parent 98b1a35
commit b4cc131
Show file tree

Hide file tree

Showing 2 changed files with 38 additions and 36 deletions.
diff --git a/src/transformers/tokenization_utils_base.py b/src/transformers/tokenization_utils_base.py
@@ -145,7 +145,6 @@ class EncodingFast:
 SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
 ADDED_TOKENS_FILE = "added_tokens.json"
 TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
-CHAT_TEMPLATE_FILE = "chat_template.jinja"
 
 # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file
 FULL_TOKENIZER_FILE = "tokenizer.json"
@@ -2122,7 +2121,6 @@ def from_pretrained(
                     "tokenizer_config_file": TOKENIZER_CONFIG_FILE,
                     # tokenizer_file used to initialize a slow from a fast. Properly copy the `addedTokens` instead of adding in random orders
                     "tokenizer_file": FULL_TOKENIZER_FILE,
-                    "chat_template_file": CHAT_TEMPLATE_FILE,
                 }
                 vocab_files = {**cls.vocab_files_names, **additional_files_names}
                 if "tokenizer_file" in vocab_files:
@@ -2243,8 +2241,6 @@ def _from_pretrained(
         from_slow = kwargs.get("from_slow", False)
         gguf_file = kwargs.get("gguf_file", None)
         has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
-        chat_template_file = resolved_vocab_files.pop("chat_template_file", None)
-        chat_template_file = None
 
         # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be
         # loaded directly from the GGUF file.
@@ -2443,11 +2439,6 @@ def _from_pretrained(
                 if key != "additional_special_tokens":
                     init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
 
-        if chat_template_file is not None:
-            with open(chat_template_file, encoding="utf-8") as chat_template_handle:
-                init_kwargs["chat_template"] = chat_template_handle.read()
-
-
         # Instantiate the tokenizer.
         try:
             tokenizer = cls(*init_inputs, **init_kwargs)
@@ -2585,9 +2576,6 @@ def save_pretrained(
         tokenizer_config_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
         )
-        chat_template_file = os.path.join(
-            save_directory, (filename_prefix + "-" if filename_prefix else "") + CHAT_TEMPLATE_FILE
-        )
 
         tokenizer_config = copy.deepcopy(self.init_kwargs)
 
@@ -2607,15 +2595,7 @@ def save_pretrained(
             if isinstance(self.chat_template, dict):
                 # Chat template dicts are saved to the config as lists of dicts with fixed key names.
                 # They will be reconstructed as a single dict during loading.
-                # We're trying to discourage chat template dicts, and they are always
-                # saved in the config, never as single files.
                 tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()]
-            elif kwargs.get("save_chat_template_file", False):
-                with open(chat_template_file, "w", encoding="utf-8") as f:
-                    f.write(self.chat_template)
-                logger.info(f"chat template saved in {chat_template_file}")
-                if "chat_template" in tokenizer_config:
-                    tokenizer_config.pop("chat_template")  # To ensure it doesn't somehow end up in the config too
             else:
                 tokenizer_config["chat_template"] = self.chat_template
 

diff --git a/tests/test_tokenization_common.py b/tests/test_tokenization_common.py
@@ -1083,8 +1083,46 @@ def test_chat_template(self):
         ]
         expected_output = "systemsystem messageuseruser messageassistantassistant message"
         tokenizers = self.get_tokenizers()
+        tokenizers = tokenizers[1:2]
         for tokenizer in tokenizers:
             with self.subTest(f"{tokenizer.__class__.__name__}"):
+                # Put this block back afterwards
+
+                # tokenizer.chat_template = dummy_template
+                with tempfile.TemporaryDirectory() as tmp_dir_name:
+                    # tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=True)
+                    tokenizer.save_pretrained(tmp_dir_name)
+                    # chat_template_file = Path(tmp_dir_name) / "chat_template.jinja"
+                    #self.assertTrue(chat_template_file.is_file())
+                    #self.assertEqual(chat_template_file.read_text(), dummy_template)
+                    #config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text())
+                    # Assert the chat template is not in the config when it's saved as a separate file
+                    #self.assertNotIn("chat_template", config_dict)
+                    tokenizer = tokenizer.from_pretrained(tmp_dir_name)
+                    # TODO Why is T5 failing?
+
+                # self.assertEqual(tokenizer.chat_template, dummy_template)  # Test template has persisted
+
+                with tempfile.TemporaryDirectory() as tmp_dir_name:
+                    tokenizer.save_pretrained(tmp_dir_name)
+                    #chat_template_file = Path(tmp_dir_name) / "chat_template.jinja"
+                    #self.assertTrue(chat_template_file.is_file())
+                    #self.assertEqual(chat_template_file.read_text(), dummy_template)
+                    # config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text())
+                    # Assert the chat template is not in the config when it's saved as a separate file
+                    #self.assertNotIn("chat_template", config_dict)
+                    tokenizer = tokenizer.from_pretrained(tmp_dir_name)
+                    # TODO Why is T5 failing?
+
+                self.assertEqual(tokenizer.chat_template, dummy_template)  # Test template has persisted
+                continue
+
+                # End block
+                output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False)
+                self.assertEqual(output, expected_output)  # Test output is the same after reloading
+                # Check that no error raised
+                tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False)
+
                 output = tokenizer.apply_chat_template(
                     dummy_conversation, chat_template=dummy_template, tokenize=False, return_dict=False
                 )
@@ -1116,23 +1154,7 @@ def test_chat_template(self):
                 # Check that no error raised
                 tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False)
 
-                with tempfile.TemporaryDirectory() as tmp_dir_name:
-                    tokenizer.save_pretrained(tmp_dir_name, save_chat_template_file=True)
-                    chat_template_file = Path(tmp_dir_name) / "chat_template.jinja"
-                    self.assertTrue(chat_template_file.is_file())
-                    self.assertEqual(chat_template_file.read_text(), dummy_template)
-                    config_dict = json.loads((Path(tmp_dir_name) / "tokenizer_config.json").read_text())
-                    # Assert the chat template is not in the config when it's saved as a separate file
-                    self.assertNotIn("chat_template", config_dict)
-                    tokenizer = tokenizer.from_pretrained(tmp_dir_name)
-                    # TODO Figure out how "chat_template" gets into init_kwargs
-                    # TODO Ensure "chat_template_file" doesn't end up anywhere! Where is it getting into the config?
 
-                self.assertEqual(tokenizer.chat_template, dummy_template)  # Test template has persisted
-                output = tokenizer.apply_chat_template(dummy_conversation, tokenize=False, return_dict=False)
-                self.assertEqual(output, expected_output)  # Test output is the same after reloading
-                # Check that no error raised
-                tokenizer.apply_chat_template(dummy_conversation, tokenize=True, return_dict=False)
 
     @require_jinja
     def test_chat_template_batched(self):