diff --git a/paddlenlp/transformers/albert/__init__.py b/paddlenlp/transformers/albert/__init__.py index 97043fd7ba68..86ecd1322521 100644 --- a/paddlenlp/transformers/albert/__init__.py +++ b/paddlenlp/transformers/albert/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .tokenizer import * diff --git a/paddlenlp/transformers/albert/tokenizer.py b/paddlenlp/transformers/albert/tokenizer.py index a7d80d0b2457..d4ff3d4e0ffd 100644 --- a/paddlenlp/transformers/albert/tokenizer.py +++ b/paddlenlp/transformers/albert/tokenizer.py @@ -20,9 +20,9 @@ import sentencepiece as spm -from .. import PretrainedTokenizer, BertTokenizer, AddedToken +from .. import AddedToken, BertTokenizer, PretrainedTokenizer -__all__ = ["AlbertTokenizer"] +__all__ = ["AlbertTokenizer", "AlbertChineseTokenizer", "AlbertEnglishTokenizer"] SPIECE_UNDERLINE = "▁" diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index 8407154c98ff..ff89b81d5cc2 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -13,11 +13,12 @@ # limitations under the License. from __future__ import annotations +import importlib import inspect import io import json import os -from collections import defaultdict +from collections import OrderedDict, defaultdict from typing import Dict, List, Type from ...utils.download import resolve_file_path @@ -30,6 +31,250 @@ "AutoConfig", ] +CONFIG_MAPPING_NAMES = OrderedDict( + [ + ("albert", "AlbertConfig"), + ("artist", "ArtistConfig"), + ("bart", "BartConfig"), + ("bert", "BertConfig"), + ("bigbird", "BigBirdConfig"), + ("bit", "BitConfig"), + ("blenderbot", "BlenderbotConfig"), + ("blenderbot_small", "BlenderbotSmallConfig"), + ("blip", "BlipConfig"), + ("blip2", "Blip2Config"), + ("bloom", "BloomConfig"), + ("chatglm", "ChatGLMConfig"), + ("chatglm_v2", "ChatGLMv2Config"), + ("chinesebert", "ChineseBertConfig"), + ("chineseclip", "ChineseCLIPConfig"), + ("clap", "ClapConfig"), + ("clip", "CLIPConfig"), + ("codegen", "CodeGenConfig"), + ("convbert", "ConvBertConfig"), + ("ctrl", "CTRLConfig"), + ("dallebart", "DalleBartConfig"), + ("deberta", "DebertaConfig"), + ("debertav2", "DebertaV2Config"), + ("distilbert", "DistilBertConfig"), + ("dpt", "DPTConfig"), + ("electra", "ElectraConfig"), + ("ernie", "ErnieConfig"), + ("ernie_code", "ErnieCodeConfig"), + ("ernie_ctm", "ErnieCtmConfig"), + ("ernie_doc", "ErnieDocConfig"), + ("ernie_gram", "ErnieGramConfig"), + ("ernie_layout", "ErnieLayoutConfig"), + ("ernie_m", "ErnieMConfig"), + ("ernie_vil", "ErnieViLConfig"), + ("fnet", "FNetConfig"), + ("funnel", "FunnelConfig"), + ("gau_alpha", "GAUAlphaConfig"), + ("gemma", "GemmaConfig"), + ("glm", "GLMConfig"), + ("gpt", "GPTConfig"), + ("gptj", "GPTJConfig"), + ("jamba", "JambaConfig"), + ("layoutlm", "LayoutLMConfig"), + ("layoutlmv2", "LayoutLMv2Config"), + ("layoutxlm", "LayoutXLMConfig"), + ("llama", "LlamaConfig"), + ("luke", "LukeConfig"), + ("mamba", "MambaConfig"), + ("mbart", "MBartConfig"), + ("megatronbert", "MegatronBertConfig"), + ("minigpt4", "MiniGPT4Config"), + ("mistral", "MistralConfig"), + ("mixtral", "MixtralConfig"), + ("mobilebert", "MobileBertConfig"), + ("mpnet", "MPNetConfig"), + ("mt5", "MT5Config"), + ("nezha", "NeZhaConfig"), + ("nystromformer", "NystromformerConfig"), + ("opt", "OPTConfig"), + ("pegasus", 
"PegasusConfig"), + ("ppminilm", "PPMiniLMConfig"), + ("prophetnet", "ProphetNetConfig"), + ("qwen", "QWenConfig"), + ("qwen2", "Qwen2Config"), + ("qwen2_moe", "Qwen2MoeConfig"), + ("reformer", "ReformerConfig"), + ("rembert", "RemBertConfig"), + ("roberta", "RobertaConfig"), + ("roformer", "RoFormerConfig"), + ("roformerv2", "RoFormerv2Config"), + ("rw", "RWConfig"), + ("skep", "SkepConfig"), + ("speecht5", "SpeechT5Config"), + ("squeezebert", "SqueezeBertConfig"), + ("t5", "T5Config"), + ("tinybert", "TinyBertConfig"), + ("unified_transformer", "UnifiedTransformerConfig"), + ("unimo", "UNIMOConfig"), + ("visualglm", "VisualGLMConfig"), + ("xlm", "XLMConfig"), + ("xlnet", "XLNetConfig"), + ("yuan", "YuanConfig"), + ] +) + + +MODEL_NAMES_MAPPING = OrderedDict( + # Base model mapping + [ + ("albert", "Albert"), + ("artist", "Artist"), + ("bart", "Bart"), + ("bert", "Bert"), + ("bigbird", "BigBird"), + ("bit", "Bit"), + ("blenderbot", "Blenderbot"), + ("blenderbot_small", "BlenderbotSmall"), + ("blip", "Blip"), + ("blip2", "Blip2"), + ("bloom", "Bloom"), + ("chatglm", "ChatGLM"), + ("chatglm_v2", "ChatGLMv2"), + ("chinesebert", "ChineseBert"), + ("chineseclip", "ChineseCLIPText"), + ("clap", "CLAP"), + ("clip", "CLIP"), + ("codegen", "CodeGen"), + ("convbert", "ConvBert"), + ("ctrl", "CTRL"), + ("dallebart", "DalleBart"), + ("deberta", "Deberta"), + ("debertav2", "DebertaV2"), + ("distilbert", "DistilBert"), + ("dpt", "DPT"), + ("electra", "Electra"), + ("ernie", "Ernie"), + ("ernie_code", "ErnieCode"), + ("ernie_ctm", "ErnieCtm"), + ("ernie_doc", "ErnieDoc"), + ("ernie_gram", "ErnieGram"), + ("ernie_layout", "ErnieLayout"), + ("ernie_m", "ErnieM"), + ("ernie_vil", "ErnieViL"), + ("fnet", "FNet"), + ("funnel", "Funnel"), + ("gau_alpha", "GAUAlpha"), + ("gemma", "Gemma"), + ("glm", "GLM"), + ("gpt", "GPT"), + ("gptj", "GPTJ"), + ("jamba", "Jamba"), + ("layoutlm", "LayoutLM"), + ("layoutlmv2", "LayoutLMv2"), + ("layoutxlm", "LayoutXLM"), + ("llama", "Llama"), + ("luke", "Luke"), + ("mamba", "Mamba"), + ("mbart", "MBart"), + ("megatronbert", "MegatronBert"), + ("minigpt4", "MiniGPT4"), + ("mistral", "Mistral"), + ("mixtral", "Mixtral"), + ("mobilebert", "MobileBert"), + ("mpnet", "MPNet"), + ("mt5", "MT5"), + ("nezha", "NeZha"), + ("nystromformer", "Nystromformer"), + ("opt", "OPT"), + ("pegasus", "Pegasus"), + ("ppminilm", "PPMiniLM"), + ("prophetnet", "ProphetNet"), + ("qwen", "QWen"), + ("qwen2", "Qwen2"), + ("qwen2_moe", "Qwen2Moe"), + ("reformer", "Reformer"), + ("rembert", "RemBert"), + ("roberta", "Roberta"), + ("roformer", "RoFormer"), + ("roformerv2", "RoFormerv2"), + ("rw", "RW"), + ("skep", "Skep"), + ("speecht5", "SpeechT5"), + ("squeezebert", "SqueezeBert"), + ("t5", "T5"), + ("tinybert", "TinyBert"), + ("unified_transformer", "UnifiedTransformer"), + ("unimo", "UNIMO"), + ("visualglm", "VisualGLM"), + ("xlm", "XLM"), + ("xlnet", "XLNet"), + ("yuan", "Yuan"), + ] +) + + +def config_class_to_model_type(config): + """Converts a config class name to the corresponding model type""" + for key, cls in CONFIG_MAPPING_NAMES.items(): + if cls == config: + return key + # if key not found check in extra content + for key, cls in CONFIG_MAPPING._extra_content.items(): + if cls.__name__ == config: + return key + return None + + +class _LazyConfigMapping(OrderedDict): + """ + A dictionary that lazily load its values when they are requested. 
+ """ + + def __init__(self, mapping): + self._mapping = mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + if key not in self._mapping: + raise KeyError(key) + value = self._mapping[key] + module_name = model_type_to_module_name(key) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module( + f".{module_name}.configuration", "paddlenlp.transformers" + ) + if hasattr(self._modules[module_name], value): + return getattr(self._modules[module_name], value) + + # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("paddlenlp") + return getattr(transformers_module, value) + + def keys(self): + return list(self._mapping.keys()) + list(self._extra_content.keys()) + + def values(self): + return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values()) + + def items(self): + return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items()) + + def __iter__(self): + return iter(list(self._mapping.keys()) + list(self._extra_content.keys())) + + def __contains__(self, item): + return item in self._mapping or item in self._extra_content + + def register(self, key, value, exist_ok=False): + """ + Register a new configuration in this mapping. + """ + if key in self._mapping.keys() and not exist_ok: + raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.") + self._extra_content[key] = value + + +CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) + def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]: """load the configurations of PretrainedConfig mapping: {: [, , ...], } @@ -64,6 +309,12 @@ def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]: return mappings +def model_type_to_module_name(key): + """Converts a config key to the corresponding module.""" + key = key.replace("-", "_") + return key + + class AutoConfig(PretrainedConfig): """ AutoConfig is a generic config class that will be instantiated as one of the @@ -191,12 +442,29 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar from_hf_hub=from_hf_hub, from_aistudio=from_aistudio, ) - if config_file is not None and os.path.exists(config_file): + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + if "model_type" in config_dict: + try: + config_class = CONFIG_MAPPING[config_dict["model_type"]] + except KeyError: + raise ValueError( + f"The checkpoint you are trying to load has model type `{config_dict['model_type']}` " + "but Transformers does not recognize this architecture. This could be because of an " + "issue with the checkpoint, or because your version of Transformers is out of date." + ) + return config_class.from_dict(config_dict, **unused_kwargs) + elif "model_type" not in config_dict and config_file is not None and os.path.exists(config_file): config_class = cls._get_config_class_from_config(pretrained_model_name_or_path, config_file) logger.info("We are using %s to load '%s'." % (config_class, pretrained_model_name_or_path)) if config_class is cls: return cls.from_file(config_file) return config_class.from_pretrained(config_file, *model_args, **kwargs) + elif config_file is None: + # Fallback: use pattern matching on the string. 
+ # We go from longer names to shorter names to catch roberta before bert (for instance) + for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True): + if pattern in str(pretrained_model_name_or_path): + return CONFIG_MAPPING[pattern].from_dict(config_dict, **unused_kwargs) else: raise RuntimeError( f"Can't load config for '{pretrained_model_name_or_path}'.\n" @@ -205,3 +473,20 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar "- or a correct model-identifier of community-contributed pretrained models,\n" "- or the correct path to a directory containing relevant config files.\n" ) + + @staticmethod + def register(model_type, config, exist_ok=False): + """ + Register a new configuration for this class. + + Args: + model_type (`str`): The model type like "bert" or "gpt". + config ([`PretrainedConfig`]): The config to register. + """ + if issubclass(config, PretrainedConfig) and config.model_type != model_type: + raise ValueError( + "The config you are passing has a `model_type` attribute that is not consistent with the model type " + f"you passed (config has {config.model_type} and you passed {model_type}. Fix one of those so they " + "match!" + ) + CONFIG_MAPPING.register(model_type, config, exist_ok=exist_ok) diff --git a/paddlenlp/transformers/auto/factory.py b/paddlenlp/transformers/auto/factory.py new file mode 100644 index 000000000000..960ed741c655 --- /dev/null +++ b/paddlenlp/transformers/auto/factory.py @@ -0,0 +1,157 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import importlib +from collections import OrderedDict + +from paddlenlp.transformers.auto.configuration import model_type_to_module_name + + +def getattribute_from_module(module, attr): + if attr is None: + return None + if isinstance(attr, tuple): + return tuple(getattribute_from_module(module, a) for a in attr) + if hasattr(module, attr): + return getattr(module, attr) + # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the + # object at the top level. + paddlenlp_module = importlib.import_module("paddlenlp") + + if module != paddlenlp_module: + try: + return getattribute_from_module(paddlenlp_module, attr) + except ValueError: + raise ValueError(f"Could not find {attr} neither in {module} nor in {paddlenlp_module}!") + else: + raise ValueError(f"Could not find {attr} in {paddlenlp_module}!") + + +class _LazyAutoMapping(OrderedDict): + """ + " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. 
+ + Args: + - config_mapping: The map model type to config class + - model_mapping: The map model type to model (or tokenizer) class + """ + + def __init__(self, config_mapping, model_mapping): + self._config_mapping = config_mapping + self._reverse_config_mapping = {v: k for k, v in config_mapping.items()} + self._model_mapping = model_mapping + self._model_mapping._model_mapping = self + self._extra_content = {} + self._modules = {} + + def __len__(self): + common_keys = set(self._config_mapping.keys()).intersection(self._model_mapping.keys()) + return len(common_keys) + len(self._extra_content) + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping: + model_name = self._model_mapping[model_type] + return self._load_attr_from_module(model_type, model_name) + + # Maybe there was several model types associated with this config. + model_types = [k for k, v in self._config_mapping.items() if v == key.__name__] + for mtype in model_types: + if mtype in self._model_mapping: + model_name = self._model_mapping[mtype] + return self._load_attr_from_module(mtype, model_name) + raise KeyError(key) + + def _load_attr_from_module(self, model_type, attr): + module_name = model_type_to_module_name(model_type) + if module_name not in self._modules: + if "Tokenizer" in model_type: + try: + self._modules[module_name] = importlib.import_module( + f".{module_name}.tokenizer", "paddlenlp.transformers" + ) + except ImportError: + pass + if module_name not in self._modules: + if "Config" in model_type: + try: + self._modules[module_name] = importlib.import_module( + f".{module_name}.configuration", "paddlenlp.transformers" + ) + except ImportError: + pass + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "paddlenlp.transformers") + return getattribute_from_module(self._modules[module_name], attr) + + def keys(self): + mapping_keys = [ + self._load_attr_from_module(key, name) + for key, name in self._config_mapping.items() + if key in self._model_mapping.keys() + ] + return mapping_keys + list(self._extra_content.keys()) + + def get(self, key, default): + try: + return self.__getitem__(key) + except KeyError: + return default + + def __bool__(self): + return bool(self.keys()) + + def values(self): + mapping_values = [ + self._load_attr_from_module(key, name) + for key, name in self._model_mapping.items() + if key in self._config_mapping.keys() + ] + return mapping_values + list(self._extra_content.values()) + + def items(self): + mapping_items = [ + ( + self._load_attr_from_module(key, self._config_mapping[key]), + self._load_attr_from_module(key, self._model_mapping[key]), + ) + for key in self._model_mapping.keys() + if key in self._config_mapping.keys() + ] + return mapping_items + list(self._extra_content.items()) + + def __iter__(self): + return iter(self.keys()) + + def __contains__(self, item): + if item in self._extra_content: + return True + if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: + return False + model_type = self._reverse_config_mapping[item.__name__] + return model_type in self._model_mapping + + def register(self, key, value, exist_ok=False): + """ + Register a new model in this mapping. 
+ """ + if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping.keys() and not exist_ok: + raise ValueError(f"'{key}' is already used by a Transformers model.") + + self._extra_content[key] = value diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 46efa4efb7ad..88392d4eb110 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -17,104 +17,271 @@ import json import os from collections import OrderedDict +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from ...utils import is_tokenizers_available from ...utils.download import resolve_file_path from ...utils.import_utils import import_module from ...utils.log import logger +from ..configuration_utils import PretrainedConfig +from ..tokenizer_utils_base import TOKENIZER_CONFIG_FILE +from ..tokenizer_utils_fast import PretrainedTokenizerFast +from .configuration import ( + CONFIG_MAPPING_NAMES, + AutoConfig, + config_class_to_model_type, + model_type_to_module_name, +) +from .factory import _LazyAutoMapping __all__ = [ "AutoTokenizer", ] -TOKENIZER_MAPPING_NAMES = OrderedDict( - [ - ("AlbertEnglishTokenizer", "albert"), - ("AlbertChineseTokenizer", "albert"), - ("BertJapaneseTokenizer", "bert_japanese"), - ("BigBirdTokenizer", "bigbird"), - ("BlenderbotSmallTokenizer", "blenderbot_small"), - ("BlenderbotTokenizer", "blenderbot"), - ("ChatGLMTokenizer", "chatglm"), - ("ChatGLMv2Tokenizer", "chatglm_v2"), - ("ChineseBertTokenizer", "chinesebert"), - ("ConvBertTokenizer", "convbert"), - ("CTRLTokenizer", "ctrl"), - ("DalleBartTokenizer", "dallebart"), - ("DistilBertTokenizer", "distilbert"), - ("ElectraTokenizer", "electra"), - ("ErnieCtmTokenizer", "ernie_ctm"), - ("ErnieDocTokenizer", "ernie_doc"), - ("ErnieDocBPETokenizer", "ernie_doc"), - ("ErnieGramTokenizer", "ernie_gram"), - ("ErnieLayoutTokenizer", "ernie_layout"), - ("ErnieMTokenizer", "ernie_m"), - ("ErnieCodeTokenizer", "ernie_code"), - ("ErnieTokenizer", "ernie"), - ("FNetTokenizer", "fnet"), - ("FunnelTokenizer", "funnel"), - ("LlamaTokenizer", "llama"), - ("LayoutXLMTokenizer", "layoutxlm"), - ("LayoutLMv2Tokenizer", "layoutlmv2"), - ("LayoutLMTokenizer", "layoutlm"), - ("LukeTokenizer", "luke"), - ("MBartTokenizer", "mbart"), - ("MBart50Tokenizer", "mbart"), - ("MegatronBertTokenizer", "megatronbert"), - ("MobileBertTokenizer", "mobilebert"), - ("MPNetTokenizer", "mpnet"), - ("NeZhaTokenizer", "nezha"), - ("NystromformerTokenizer", "nystromformer"), - ("PPMiniLMTokenizer", "ppminilm"), - ("ProphetNetTokenizer", "prophetnet"), - ("ReformerTokenizer", "reformer"), - ("RemBertTokenizer", "rembert"), - ("RobertaChineseTokenizer", "roberta"), - ("RobertaBPETokenizer", "roberta"), - ("RoFormerTokenizer", "roformer"), - ("RoFormerv2Tokenizer", "roformerv2"), - ("SkepTokenizer", "skep"), - ("SqueezeBertTokenizer", "squeezebert"), - ("TinyBertTokenizer", "tinybert"), - ("UnifiedTransformerTokenizer", "unified_transformer"), - ("UNIMOTokenizer", "unimo"), - ("XLNetTokenizer", "xlnet"), - ("XLMTokenizer", "xlm"), - ("GPTTokenizer", "gpt"), - ("GPTChineseTokenizer", "gpt"), - ("T5Tokenizer", "t5"), - ("BertTokenizer", "bert"), - ("BartTokenizer", "bart"), - ("GAUAlphaTokenizer", "gau_alpha"), - ("CodeGenTokenizer", "codegen"), - ("CLIPTokenizer", "clip"), - ("ArtistTokenizer", "artist"), - ("ChineseCLIPTokenizer", "chineseclip"), - ("ErnieViLTokenizer", 
"ernie_vil"), - ("PegasusChineseTokenizer", "pegasus"), - ("GLMBertTokenizer", "glm"), - ("GLMChineseTokenizer", "glm"), - ("GLMGPT2Tokenizer", "glm"), - ("BloomTokenizer", "bloom"), - ("SpeechT5Tokenizer", "speecht5"), - ("QWenTokenizer", "qwen"), - ("GemmaTokenizer", "gemma"), - ("YuanTokenizer", "yuan"), - ("MambaTokenizer", "mamba"), - ("JambaTokenizer", "jamba"), - ] -) +if TYPE_CHECKING: + TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict() +else: + TOKENIZER_MAPPING_NAMES = OrderedDict( + [ + ("albert", (("AlbertChineseTokenizer", "AlbertEnglishTokenizer"), None)), + ("bart", "BartTokenizer"), + ("bert", "BertTokenizer"), + ("blenderbot", "BlenderbotTokenizer"), + ("bloom", "BloomTokenizer"), + ("clip", "CLIPTokenizer"), + ("codegen", "CodeGenTokenizer"), + ("convbert", "ConvBertTokenizer"), + ("ctrl", "CTRLTokenizer"), + ("distilbert", "DistilBertTokenizer"), + ("electra", "ElectraTokenizer"), + ("ernie", "ErnieTokenizer"), + ("ernie_m", "ErnieMTokenizer"), + ("fnet", "FNetTokenizer"), + ("funnel", "FunnelTokenizer"), + ("gemma", "GemmaTokenizer"), + ("jamba", "JambaTokenizer"), + ("layoutlm", "LayoutLMTokenizer"), + ("layoutlmv2", "LayoutLMv2Tokenizer"), + ("layoutxlm", "LayoutXLMTokenizer"), + ( + "llama", + ( + "LlamaTokenizer", + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("luke", "LukeTokenizer"), + ("mamba", "MambaTokenizer"), + ("mbart", (("MBartTokenizer", "MBart50Tokenizer"), None)), + ("mobilebert", "MobileBertTokenizer"), + ("mpnet", "MPNetTokenizer"), + ("nezha", "NeZhaTokenizer"), + ("pegasus", "PegasusChineseTokenizer"), + ("prophetnet", "ProphetNetTokenizer"), + ("reformer", "ReformerTokenizer"), + ("rembert", "RemBertTokenizer"), + ("roberta", "RobertaBPETokenizer"), + ("roformer", "RoFormerTokenizer"), + ("speecht5", "SpeechT5Tokenizer"), + ("squeezebert", "SqueezeBertTokenizer"), + ("t5", "T5Tokenizer"), + ("xlm", "XLMTokenizer"), + ("xlnet", "XLNetTokenizer"), + ("bert_japanese", "BertJapaneseTokenizer"), + ("bigbird", "BigBirdTokenizer"), + ("blenderbot_small", "BlenderbotSmallTokenizer"), + ("chatglm", "ChatGLMTokenizer"), + ("chatglm_v2", "ChatGLMv2Tokenizer"), + ("chinesebert", "ChineseBertTokenizer"), + ("dallebart", "DalleBartTokenizer"), + ("ernie_ctm", "ErnieCtmTokenizer"), + ("ernie_doc", "ErnieDocBPETokenizer"), + ("ernie_gram", "ErnieGramTokenizer"), + ("ernie_layout", "ErnieLayoutTokenizer"), + ("ernie_code", "ErnieCodeTokenizer"), + ("megatronbert", "MegatronBertTokenizer"), + ("nystromformer", "NystromformerTokenizer"), + ("ppminilm", "PPMiniLMTokenizer"), + ("roformerv2", "RoFormerv2Tokenizer"), + ("skep", "SkepTokenizer"), + ("tinybert", "TinyBertTokenizer"), + ("unified_transformer", "UnifiedTransformerTokenizer"), + ("unimo", "UNIMOTokenizer"), + ("gpt", (("GPTTokenizer", "GPTChineseTokenizer"), None)), + ("gau_alpha", "GAUAlphaTokenizer"), + ("artist", "ArtistTokenizer"), + ("chineseclip", "ChineseCLIPTokenizer"), + ("ernie_vil", "ErnieViLTokenizer"), + ("glm", "GLMGPT2Tokenizer"), + ("qwen", "QWenTokenizer"), + ("qwen2", "Qwen2Tokenizer"), + ("yuan", "YuanTokenizer"), + ] + ) + + +def get_mapping_tokenizers(tokenizers, with_fast=True): + all_tokenizers = [] + if isinstance(tokenizers, tuple): + (tokenizer_slow, tokenizer_fast) = tokenizers + if isinstance(tokenizer_slow, tuple): + all_tokenizers.extend(tokenizer_slow) + else: + all_tokenizers.append(tokenizer_slow) + if with_fast and tokenizer_fast is not None: + all_tokenizers.append(tokenizer_fast) + else: + 
all_tokenizers.append(tokenizers) + return all_tokenizers def get_configurations(): MAPPING_NAMES = OrderedDict() - for key, class_name in TOKENIZER_MAPPING_NAMES.items(): - import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.tokenizer") - tokenizer_name = getattr(import_class, key) - name = tuple(tokenizer_name.pretrained_init_configuration.keys()) - MAPPING_NAMES[name] = tokenizer_name + for class_name, values in TOKENIZER_MAPPING_NAMES.items(): + all_tokenizers = get_mapping_tokenizers(values, with_fast=False) + for key in all_tokenizers: + import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.tokenizer") + tokenizer_name = getattr(import_class, key) + name = tuple(tokenizer_name.pretrained_init_configuration.keys()) + MAPPING_NAMES[name] = tokenizer_name return MAPPING_NAMES +INIT_CONFIG_MAPPING = get_configurations() + +TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) + +CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} + + +def tokenizer_class_from_name(class_name: str): + if class_name == "PretrainedTokenizerFast": + return PretrainedTokenizerFast + + for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): + all_tokenizers = get_mapping_tokenizers(tokenizers) + if class_name in all_tokenizers: + module_name = model_type_to_module_name(module_name) + try: + module = importlib.import_module(f".{module_name}", "paddlenlp.transformers") + return getattr(module, class_name) + except AttributeError: + try: + module = importlib.import_module(f".{module_name}.tokenizer", "paddlenlp.transformers") + + return getattr(module, class_name) + except AttributeError: + raise ValueError(f"Tokenizer class {class_name} is not currently imported.") + + for config, tokenizers in TOKENIZER_MAPPING._extra_content.items(): + for tokenizer in tokenizers: + if getattr(tokenizer, "__name__", None) == class_name: + return tokenizer + + # We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main + # init and we return the proper dummy to get an appropriate error message. + main_module = importlib.import_module("paddlenlp") + if hasattr(main_module, class_name): + return getattr(main_module, class_name) + + return None + + +def get_tokenizer_config( + pretrained_model_name_or_path: Union[str, os.PathLike], + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: Optional[bool] = None, + proxies: Optional[Dict[str, str]] = None, + token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + subfolder: str = "", + **kwargs, +): + """ + Loads the tokenizer configuration from a pretrained model tokenizer configuration. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. 
+ resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + subfolder (`str`, *optional*, defaults to `""`): + In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. + + + + Passing `token=True` is required when you want to use a private model. + + + + Returns: + `Dict`: The configuration of the tokenizer. + + Examples: + + ```python + # Download configuration from huggingface.co and cache. + tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased") + # This model does not have a tokenizer config so the result will be an empty dict. + tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base") + + # Save a pretrained tokenizer locally and you can reload its config + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") + tokenizer.save_pretrained("tokenizer-test") + tokenizer_config = get_tokenizer_config("tokenizer-test") + ```""" + + resolved_config_file = resolve_file_path( + pretrained_model_name_or_path, + TOKENIZER_CONFIG_FILE, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + subfolder=subfolder, + ) + if resolved_config_file is None: + logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.") + return {} + with open(resolved_config_file, encoding="utf-8") as reader: + result = json.load(reader) + + return result + + class AutoTokenizer: """ AutoClass can help you automatically retrieve the relevant model given the provided @@ -123,15 +290,12 @@ class AutoTokenizer: base tokenizer classes when created with the AutoTokenizer.from_pretrained() classmethod. """ - MAPPING_NAMES = get_configurations() - _tokenizer_mapping = MAPPING_NAMES - _name_mapping = TOKENIZER_MAPPING_NAMES - tokenizer_config_file = "tokenizer_config.json" + _tokenizer_mapping = get_configurations() - def __init__(self, *args, **kwargs): + def __init__(self): raise EnvironmentError( - f"{self.__class__.__name__} is designed to be instantiated " - f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`" + "AutoTokenizer is designed to be instantiated " + "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method." ) @classmethod @@ -190,7 +354,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): - Name of a community-contributed pretrained model. 
- Local directory path which contains tokenizer related resources and tokenizer config file ("tokenizer_config.json"). - *args (tuple): position arguments for model `__init__`. If provided, + *model_args (tuple): position arguments for model `__init__`. If provided, use these as position argument values for tokenizer initialization. **kwargs (dict): keyword arguments for model `__init__`. If provided, use these to update pre-defined keyword argument values for tokenizer @@ -219,20 +383,17 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): print(type(tokenizer)) # """ - # Default not to use fast tokenizer - use_faster = kwargs.pop("use_faster", None) - use_fast = kwargs.pop("use_fast", None) - if use_fast is not None or use_faster is not None: - raise ValueError("use_fast is deprecated") + config = kwargs.pop("config", None) + kwargs["_from_auto"] = True - cache_dir = kwargs.get("cache_dir", None) - subfolder = kwargs.get("subfolder", "") - if subfolder is None: - subfolder = "" - from_aistudio = kwargs.get("from_aistudio", False) - from_hf_hub = kwargs.get("from_hf_hub", False) + use_fast = kwargs.pop("use_fast", False) + tokenizer_type = kwargs.pop("tokenizer_type", None) + if tokenizer_type is not None: + # TODO: Support tokenizer_type + raise NotImplementedError("tokenizer_type is not supported yet.") all_tokenizer_names = [] + for names, tokenizer_class in cls._tokenizer_mapping.items(): for name in names: all_tokenizer_names.append(name) @@ -245,25 +406,56 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): logger.info("We are using %s to load '%s'." % (tokenizer_class, pretrained_model_name_or_path)) return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - config_file = resolve_file_path( - pretrained_model_name_or_path, - cls.tokenizer_config_file, - subfolder, - cache_dir=cache_dir, - from_hf_hub=from_hf_hub, - from_aistudio=from_aistudio, - ) - if config_file is not None and os.path.exists(config_file): - tokenizer_class = cls._get_tokenizer_class_from_config( - pretrained_model_name_or_path, config_file, use_fast - ) - logger.info(f"We are using {tokenizer_class} to load '{pretrained_model_name_or_path}'.") + tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) + config_tokenizer_class = tokenizer_config.get("tokenizer_class") + if config_tokenizer_class is None: + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + config_tokenizer_class = config.tokenizer_class + if config_tokenizer_class is not None: + tokenizer_class = None + if use_fast and not config_tokenizer_class.endswith("Fast"): + tokenizer_class_candidate = f"{config_tokenizer_class}Fast" + tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) + if tokenizer_class is None: + tokenizer_class_candidate = config_tokenizer_class + tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) + if tokenizer_class is None: + raise ValueError( + f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported." 
+ ) return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - else: - raise RuntimeError( - f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n" - f"Please make sure that '{pretrained_model_name_or_path}' is:\n" - "- a correct model-identifier of built-in pretrained models,\n" - "- or a correct model-identifier of community-contributed pretrained models,\n" - "- or the correct path to a directory containing relevant tokenizer files.\n" - ) + + # TODO: if model is an encoder decoder + + model_type = config_class_to_model_type(type(config).__name__) + if model_type is not None: + tokenizer_class_py = TOKENIZER_MAPPING[type(config)] + if isinstance(tokenizer_class_py, (list, tuple)): + (tokenizer_class_py, tokenizer_class_fast) = tokenizer_class_py + else: + tokenizer_class_fast = None + if tokenizer_class_fast and (use_fast or tokenizer_class_py is None): + return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + if tokenizer_class_py is not None: + if isinstance(tokenizer_class_py, str): + return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + # Use the first tokenizer class in the list + print("We are using %s to load '%s'." % (tokenizer_class_py[0], pretrained_model_name_or_path)) + return tokenizer_class_py[0].from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) + else: + raise ValueError( + "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed " + "in order to use this tokenizer." + ) + raise RuntimeError( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n" + f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + "- a correct model-identifier of built-in pretrained models,\n" + "- or a correct model-identifier of community-contributed pretrained models,\n" + "- or the correct path to a directory containing relevant tokenizer files.\n" + ) diff --git a/paddlenlp/transformers/configuration_utils.py b/paddlenlp/transformers/configuration_utils.py index ebb905a68f89..c1fd2e0c530f 100644 --- a/paddlenlp/transformers/configuration_utils.py +++ b/paddlenlp/transformers/configuration_utils.py @@ -826,7 +826,8 @@ def get_config_dict( # Get config dict associated with the base config file config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) - + if config_dict is None: + return {}, kwargs # That config file may point us toward another config file to use. 
if "configuration_files" in config_dict: original_kwargs["cache_dir"] = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder) @@ -875,9 +876,8 @@ def _get_config_dict( from_aistudio=from_aistudio, from_hf_hub=from_hf_hub, ) - assert ( - resolved_config_file is not None - ), f"please make sure one of the {filenames} under {pretrained_model_name_or_path}" + if resolved_config_file is None: + return None, kwargs try: logger.info(f"Loading configuration file {resolved_config_file}") # Load config dict diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py index eafa3572a450..adc3c52130e6 100644 --- a/paddlenlp/transformers/convert_slow_tokenizer.py +++ b/paddlenlp/transformers/convert_slow_tokenizer.py @@ -26,9 +26,40 @@ decoders, normalizers, pre_tokenizers, + processors, ) from tokenizers.models import BPE, Unigram +from paddlenlp.utils.import_utils import ( + is_protobuf_available, + is_sentencepiece_available, +) + + +def import_protobuf(error_message=""): + if is_sentencepiece_available(): + from sentencepiece import sentencepiece_model_pb2 + + return sentencepiece_model_pb2 + if is_protobuf_available(): + import google.protobuf + + if version.parse(google.protobuf.__version__) < version.parse("4.0.0"): + from transformers.utils import sentencepiece_model_pb2 + else: + from transformers.utils import ( + sentencepiece_model_pb2_new as sentencepiece_model_pb2, + ) + return sentencepiece_model_pb2 + else: + raise ImportError( + f""" +{error_message} requires the protobuf library but it was not found in your environment. Checkout the instructions on the +installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones +that match your environment. Please note that you may need to restart your runtime after installation. +""" + ) + # Copied from transformers, adapted for tokenizers >= 0.19.0 def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str: @@ -199,15 +230,61 @@ def converted(self) -> Tokenizer: return tokenizer -class TikTokenConverter(Converter): - def extract(self, tiktoken_file: str): - from .tiktoken_model_utils import bpe, bytes_to_unicode, load_tiktoken_bpe +# Copied from paddlenlp/transformers/gpt/tokenizer.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = chr + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + + +class TikTokenConverter: + """ + A general tiktoken converter. 
+ """ - bpe_ranks = ( - self.original_tokenizer.mergeable_ranks - if hasattr(self.original_tokenizer, "mergeable_ranks") and self.original_tokenizer.mergeable_ranks - else load_tiktoken_bpe(tiktoken_file) - ) + def __init__( + self, + vocab_file=None, + pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", + add_prefix_space=False, + additional_special_tokens=None, + *args, + **kwargs, + ): + super().__init__(*args) + self.vocab_file = vocab_file + self.pattern = pattern + self.add_prefix_space = add_prefix_space + self.additional_special_tokens = additional_special_tokens + + def extract_vocab_merges_from_model(self, tiktoken_url: str): + try: + from tiktoken.load import load_tiktoken_bpe + except Exception: + raise ValueError( + "`tiktoken` is required to read a `tiktoken` file. Install it with " "`pip install tiktoken`." + ) + + bpe_ranks = load_tiktoken_bpe(tiktoken_url) byte_encoder = bytes_to_unicode() def token_bytes_to_string(b): @@ -219,12 +296,39 @@ def token_bytes_to_string(b): vocab[token_bytes_to_string(token)] = rank if len(token) == 1: continue - merged = tuple(bpe(bpe_ranks, token, max_rank=rank)) - if len(merged) == 2: - merges.append(tuple(map(token_bytes_to_string, merged))) - + local = [] + for index in range(1, len(token)): + piece_l, piece_r = token[:index], token[index:] + if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks: + local.append((piece_l, piece_r, rank)) + local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False) + merges.extend(local) + merges = sorted(merges, key=lambda val: val[2], reverse=False) + merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges] return vocab, merges + def tokenizer(self): + vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab_file) + tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False)) + if hasattr(tokenizer.model, "ignore_merges"): + tokenizer.model.ignore_merges = True + return tokenizer + + def converted(self) -> Tokenizer: + tokenizer = self.tokenizer() + tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False), + pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False), + ] + ) + tokenizer.decoder = decoders.ByteLevel() + tokenizer.add_special_tokens(self.additional_special_tokens) + + tokenizer.post_processor = processors.ByteLevel(trim_offsets=False) + + return tokenizer + class LlamaConverter(SpmConverter): handle_byte_fallback = True @@ -298,7 +402,7 @@ def pre_tokenizer(self, replacement, add_prefix_space): } -def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: +def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer: """ Utilities to convert a slow tokenizer instance in a fast tokenizer instance. @@ -313,12 +417,18 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: """ tokenizer_class_name = transformer_tokenizer.__class__.__name__ - if tokenizer_class_name not in SLOW_TO_FAST_CONVERTERS: - raise ValueError( - f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance. " - f"No converter was found. 
Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}" - ) - - converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] - - return converter_class(transformer_tokenizer).converted() + if tokenizer_class_name in SLOW_TO_FAST_CONVERTERS and not from_tiktoken: + converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] + return converter_class(transformer_tokenizer).converted() + else: + try: + return TikTokenConverter( + vocab_file=transformer_tokenizer.vocab_file, + additional_special_tokens=transformer_tokenizer.additional_special_tokens, + ).converted() + except Exception: + raise ValueError( + f"Converting from Tiktoken failed. If a converter for SentencePiece is available, provide a model path " + f"with a SentencePiece tokenizer.model file. " + f"Currently available slow->fast converters: {list(SLOW_TO_FAST_CONVERTERS.keys())}" + ) diff --git a/paddlenlp/transformers/ernie/__init__.py b/paddlenlp/transformers/ernie/__init__.py index 97043fd7ba68..91cb3725f5fe 100644 --- a/paddlenlp/transformers/ernie/__init__.py +++ b/paddlenlp/transformers/ernie/__init__.py @@ -11,3 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .configuration import * +from .modeling import * +from .tokenizer import * diff --git a/paddlenlp/transformers/llama/tokenizer.py b/paddlenlp/transformers/llama/tokenizer.py index 373d741fdf2e..be688206e2ad 100644 --- a/paddlenlp/transformers/llama/tokenizer.py +++ b/paddlenlp/transformers/llama/tokenizer.py @@ -19,6 +19,8 @@ import sentencepiece as spm +from paddlenlp.transformers.convert_slow_tokenizer import import_protobuf + from ...utils.log import logger from ..
import PretrainedTokenizer @@ -70,8 +72,7 @@ def __init__( self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.decode_with_prefix_space = decode_with_prefix_space - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(vocab_file) + self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", True)) @property def vocab_size(self): @@ -98,6 +99,23 @@ def bos_token_id(self) -> Optional[int]: def eos_token_id(self) -> Optional[int]: return self.sp_model.eos_id() + def get_spm_processor(self, from_slow=True): + tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) + if from_slow: # no dependency on protobuf + tokenizer.Load(self.vocab_file) + return tokenizer + + with open(self.vocab_file, "rb") as f: + sp_model = f.read() + model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)") + model = model_pb2.ModelProto.FromString(sp_model) + normalizer_spec = model_pb2.NormalizerSpec() + normalizer_spec.add_dummy_prefix = False + model.normalizer_spec.MergeFrom(normalizer_spec) + sp_model = model.SerializeToString() + tokenizer.LoadFromSerializedProto(sp_model) + return tokenizer + def get_vocab(self): """Returns vocab as a dict""" vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} diff --git a/paddlenlp/transformers/llama/tokenizer_fast.py b/paddlenlp/transformers/llama/tokenizer_fast.py index 1543e14b61b1..13416c92c270 100644 --- a/paddlenlp/transformers/llama/tokenizer_fast.py +++ b/paddlenlp/transformers/llama/tokenizer_fast.py @@ -24,7 +24,7 @@ __all__ = ["LlamaTokenizerFast"] -VOCAB_FILES_NAMES = {"vocab_file": "spiece.bpe.model", "tokenizer_file": "tokenizer.json"} +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"} B_INST, E_INST = "[INST]", "[/INST]" B_SYS, E_SYS = "<<SYS>>\n", "\n<</SYS>>\n\n" diff --git a/paddlenlp/transformers/mbart/__init__.py b/paddlenlp/transformers/mbart/__init__.py index 97043fd7ba68..ebdc0b0919be 100644 --- a/paddlenlp/transformers/mbart/__init__.py +++ b/paddlenlp/transformers/mbart/__init__.py @@ -11,3 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .configuration import * +from .tokenizer import * diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index 7a47ae01a50a..e044e7e5830b 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -51,9 +51,25 @@ from ..utils.download import resolve_file_path from ..utils.env import CHAT_TEMPLATE_CONFIG_NAME, TOKENIZER_CONFIG_NAME -from ..utils.import_utils import is_tokenizers_available +from ..utils.import_utils import is_protobuf_available, is_tokenizers_available from ..utils.log import logger + +def import_protobuf_decode_error(error_message=""): + if is_protobuf_available(): + from google.protobuf.message import DecodeError + + return DecodeError + else: + raise ImportError( + f""" +{error_message} requires the protobuf library but it was not found in your environment. Checkout the instructions on the +installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones +that match your environment. Please note that you may need to restart your runtime after installation.
+""" + ) + + if is_tokenizers_available(): from tokenizers import AddedToken from tokenizers import Encoding as EncodingFast @@ -142,6 +158,7 @@ class TensorType(ExplicitEnum): SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" +FULL_TOKENIZER_FILE = "tokenizer.json" def to_py_obj(obj): @@ -1436,10 +1453,13 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class def __repr__(self) -> str: + added_tokens_decoder_rep = "\n\t".join([f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()]) return ( - f"{'PretrainedTokenizer'}(name_or_path='{self.name_or_path}', " - f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, " - f"padding_side='{self.padding_side}', truncation_side='{self.truncation_side}', special_tokens={self.special_tokens_map_extended})" + f"{self.__class__.__name__}(name_or_path='{self.name_or_path}'," + f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast}," + f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}'," + f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), " + " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}" ) def get_vocab(self) -> Dict[str, int]: @@ -1495,20 +1515,18 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # Load from local directory path tokenizer = BertTokenizer.from_pretrained('./my_bert/') """ - - pretrained_model_name_or_path = str(pretrained_model_name_or_path) cache_dir = kwargs.pop("cache_dir", None) from_hf_hub = kwargs.pop("from_hf_hub", False) from_aistudio = kwargs.pop("from_aistudio", False) subfolder = kwargs.pop("subfolder", "") return_tokenizer_file_dir = kwargs.pop("return_tokenizer_file_dir", False) - if subfolder is None: - subfolder = "" - + pretrained_model_name_or_path = str(pretrained_model_name_or_path) vocab_files = {} init_configuration = {} + # is_local = os.path.isdir(pretrained_model_name_or_path) + additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, @@ -1517,7 +1535,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): } vocab_files_target = {**cls.resource_files_names, **additional_files_names} - # From HF Hub or AI Studio if from_hf_hub or from_aistudio: # Only include the necessary resource files specified by the tokenizer cls @@ -1541,8 +1558,8 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # Assuming from community-contributed pretrained models for file_id, file_name in vocab_files_target.items(): vocab_files[file_id] = file_name - resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): if file_path is None or os.path.isfile(file_path): resolved_vocab_files[file_id] = file_path @@ -1555,12 +1572,49 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): from_aistudio=from_aistudio, from_hf_hub=from_hf_hub, ) - for file_id, file_path in resolved_vocab_files.items(): if resolved_vocab_files[file_id] is not None: cache_dir = os.path.dirname(resolved_vocab_files[file_id]) break + return cls._from_pretrained( + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *args, + cache_dir=cache_dir, + return_tokenizer_file_dir=return_tokenizer_file_dir, + from_hf_hub=from_hf_hub, + **kwargs, + ) + @classmethod + def 
_from_pretrained( + cls, + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + cache_dir=None, + return_tokenizer_file_dir=False, + from_hf_hub=False, + **kwargs, + ): + if cls.__name__.endswith("Fast"): + from_slow = kwargs.get("from_slow", False) + else: + from_slow = kwargs.get("from_slow", True) + has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None + if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None: + slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( + copy.deepcopy(resolved_vocab_files), + pretrained_model_name_or_path, + copy.deepcopy(init_configuration), + *init_inputs, + cache_dir=cache_dir, + **(copy.deepcopy(kwargs)), + ) + else: + slow_tokenizer = None tokenizer_config_file_dir_list = set() for k, v in resolved_vocab_files.items(): if v is not None and os.path.isfile(v): @@ -1568,6 +1622,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): tokenizer_config_file_dir_list = list(tokenizer_config_file_dir_list) # TODO: check this assert len(tokenizer_config_file_dir_list) > 0, "All tokenizer files should be in the same directory." + # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None @@ -1575,9 +1630,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): if tokenizer_config_file is not None: with io.open(tokenizer_config_file, encoding="utf-8") as f: init_kwargs = json.load(f) + init_kwargs.pop("tokenizer_class", None) else: init_kwargs = init_configuration + if slow_tokenizer is not None: + init_kwargs["__slow_tokenizer"] = slow_tokenizer + init_kwargs["name_or_path"] = pretrained_model_name_or_path + init_kwargs["from_slow"] = from_slow + pass_added_tokens_file = False # Handle tokenizer serialization of added and special tokens added_tokens_decoder: Dict[int, AddedToken] = {} @@ -1597,11 +1658,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): pass_added_tokens_file = True # position args are stored in kwargs, maybe better not include - init_args = init_kwargs.pop("init_args", ()) init_kwargs.pop("init_class", None) # Update with newly provided args and kwargs - init_args = init_args if not args else args init_kwargs.update(kwargs) def convert_added_tokens(obj): @@ -1645,7 +1704,22 @@ def convert_added_tokens(obj): init_kwargs.pop("tokenizer_file") # TODO(guosheng): avoid reduplication of position args and key word args - tokenizer = cls(*init_args, **init_kwargs) + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except import_protobuf_decode_error(): + logger.info( + "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead." + "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).", + ) + return False + except RuntimeError as e: + if "sentencepiece_processor.cc" in str(e): + logger.info( + "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead." 
+ "(SentencePiece RuntimeError: Tried to load SPM model with non-SPM vocab file).", + ) + return False + chat_template = init_kwargs.pop("chat_template", None) if chat_template is not None: tokenizer.init_chat_template(chat_template) diff --git a/paddlenlp/transformers/tokenizer_utils_fast.py b/paddlenlp/transformers/tokenizer_utils_fast.py index 6d49ac7fde71..60fd432bb9d8 100644 --- a/paddlenlp/transformers/tokenizer_utils_fast.py +++ b/paddlenlp/transformers/tokenizer_utils_fast.py @@ -35,7 +35,7 @@ WordPieceTrainer, ) -from ..utils.env import ADDED_TOKENS_NAME, FULL_TOKENIZER_NAME +from ..utils.env import ADDED_TOKENS_NAME, FULL_TOKENIZER_NAME, TIKTOKEN_VOCAB_FILE from .convert_slow_tokenizer import convert_slow_tokenizer from .tokenizer_utils import ChatTemplateMixin, PretrainedTokenizer from .tokenizer_utils_base import ( @@ -60,7 +60,7 @@ "WordPiece": WordPieceTrainer, } -VOCAB_FILES_NAMES = {"tokenizer_file": FULL_TOKENIZER_NAME} +VOCAB_FILES_NAMES = {"tokenizer_file": FULL_TOKENIZER_NAME, "vocab_file": TIKTOKEN_VOCAB_FILE} class PretrainedTokenizerFast(ChatTemplateMixin, PretrainedTokenizerBase): @@ -97,13 +97,19 @@ def __init__(self, *args, **kwargs): elif fast_tokenizer_file is not None and not from_slow: # We have a serialization from tokenizers which let us directly build the backend fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file) - elif slow_tokenizer is not None: + elif slow_tokenizer: # We need to convert a slow tokenizer to build the backend fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) - elif self.slow_tokenizer_class is not None: + elif self.slow_tokenizer_class is not None and slow_tokenizer is not False: # We need to create and convert a slow tokenizer to build the backend slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs) fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) + elif not slow_tokenizer: + # We try to load with tiktoken + self.vocab_file = kwargs.get("vocab_file", None) + self.additional_special_tokens = kwargs.get("additional_special_tokens", []) + fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True) + slow_tokenizer = None else: raise ValueError( "Couldn't instantiate the backend tokenizer from one of: \n" @@ -626,7 +632,6 @@ def _encode_plus( ) self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose) - return batched_output def convert_tokens_to_string(self, tokens: List[str]) -> str: diff --git a/paddlenlp/utils/__init__.py b/paddlenlp/utils/__init__.py index 7f52ac762a00..a8c4dc487a0e 100644 --- a/paddlenlp/utils/__init__.py +++ b/paddlenlp/utils/__init__.py @@ -18,7 +18,7 @@ from .batch_sampler import * from .env import CONFIG_NAME, GENERATION_CONFIG_NAME, LEGACY_CONFIG_NAME -from .import_utils import install_package, uninstall_package +from .import_utils import * from .infohub import infohub from .initializer import to from .serialization import load_torch diff --git a/paddlenlp/utils/env.py b/paddlenlp/utils/env.py index f57380fb4698..d1fbbb1a60ba 100644 --- a/paddlenlp/utils/env.py +++ b/paddlenlp/utils/env.py @@ -74,7 +74,7 @@ def _get_bool_env(env_key: str, default_value: str) -> bool: GENERATION_CONFIG_NAME = "generation_config.json" # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file FULL_TOKENIZER_NAME = "tokenizer.json" - +TIKTOKEN_VOCAB_FILE = "tokenizer.model" LORA_CONFIG_NAME = "lora_config.json" LORA_WEIGHTS_NAME = "lora_model_state.pdparams" diff --git a/paddlenlp/utils/import_utils.py 
diff --git a/paddlenlp/utils/import_utils.py b/paddlenlp/utils/import_utils.py
index 3da810b7b0b7..2c3796214a7f 100644
--- a/paddlenlp/utils/import_utils.py
+++ b/paddlenlp/utils/import_utils.py
@@ -18,19 +18,113 @@
 import shutil
 import site
 import sys
-from typing import Optional, Type
+from typing import Optional, Tuple, Type, Union

 import pip

 from paddlenlp.utils.log import logger

+# TODO: This doesn't work for all packages (`bs4`, `faiss`, etc.) Talk to Sylvain to see how to do with it better.
+def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]:
+    # Check if the package spec exists and grab its version to avoid importing a local directory
+    package_exists = importlib.util.find_spec(pkg_name) is not None
+    package_version = "N/A"
+    if package_exists:
+        try:
+            # Primary method to get the package version
+            package_version = importlib.metadata.version(pkg_name)
+        except importlib.metadata.PackageNotFoundError:
+            # Fallback method: Only for "torch" and versions containing "dev"
+            if pkg_name == "torch":
+                try:
+                    package = importlib.import_module(pkg_name)
+                    temp_version = getattr(package, "__version__", "N/A")
+                    # Check if the version contains "dev"
+                    if "dev" in temp_version:
+                        package_version = temp_version
+                        package_exists = True
+                    else:
+                        package_exists = False
+                except ImportError:
+                    # If the package can't be imported, it's not available
+                    package_exists = False
+            else:
+                # For packages other than "torch", don't attempt the fallback and set as not available
+                package_exists = False
+    logger.debug(f"Detected {pkg_name} version: {package_version}")
+    if return_version:
+        return package_exists, package_version
+    else:
+        return package_exists
+
+
+_g2p_en_available = _is_package_available("g2p_en")
+_sentencepiece_available = _is_package_available("sentencepiece")
+_sklearn_available = importlib.util.find_spec("sklearn") is not None
+if _sklearn_available:
+    try:
+        importlib.metadata.version("scikit-learn")
+    except importlib.metadata.PackageNotFoundError:
+        _sklearn_available = False
+
+
 def is_datasets_available():
     import importlib

     return importlib.util.find_spec("datasets") is not None


+def is_protobuf_available():
+    if importlib.util.find_spec("google") is None:
+        return False
+    return importlib.util.find_spec("google.protobuf") is not None
+
+
 def is_paddle_cuda_available() -> bool:
     if is_paddle_available():
         import paddle
@@ -40,6 +134,14 @@ def is_paddle_cuda_available() -> bool:
     return False


+def is_g2p_en_available():
+    return _g2p_en_available
+
+
+def is_sentencepiece_available():
+    return _sentencepiece_available
+
+
 def is_paddle_available() -> bool:
     """check if `torch` package is installed
     Returns:
@@ -48,14 +150,14 @@ def is_paddle_available() -> bool:
     return is_package_available("paddle")


-def is_psutil_available():
-    return importlib.util.find_spec("psutil") is not None
-
-
 def is_tiktoken_available():
     return importlib.util.find_spec("tiktoken") is not None


+def is_psutil_available():
+    return importlib.util.find_spec("psutil") is not None
+
+
 def is_torch_available() -> bool:
     """check if `torch` package is installed
     Returns:
diff --git a/requirements.txt b/requirements.txt
index 0673bcc40ed2..dc7c2a06bfb3 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,4 +1,5 @@
 jieba
+blobfile
 colorlog
 colorama
 seqeval
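For reference, the helpers added to import_utils.py are lightweight capability checks; a small sketch of how they are expected to be used (assuming a PaddleNLP build that includes this change):

from paddlenlp.utils.import_utils import (
    _is_package_available,
    is_protobuf_available,
    is_sentencepiece_available,
    is_tiktoken_available,
)

# _is_package_available inspects importlib metadata rather than importing the package,
# and can optionally report the detected version.
ok, version = _is_package_available("sentencepiece", return_version=True)
print(ok, version)

# The boolean helpers are plain find_spec checks, e.g. for picking a tokenizer backend.
print(is_sentencepiece_available(), is_tiktoken_available(), is_protobuf_available())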
diff --git a/tests/transformers/auto/test_confiugration.py b/tests/transformers/auto/test_confiugration.py
index e58b793cc78a..eea37fd93a86 100644
--- a/tests/transformers/auto/test_confiugration.py
+++ b/tests/transformers/auto/test_confiugration.py
@@ -21,6 +21,9 @@
 import unittest

 from paddlenlp.transformers import AutoConfig
+from paddlenlp.transformers.auto.configuration import CONFIG_MAPPING
+from paddlenlp.transformers.bert.configuration import BertConfig
+from paddlenlp.transformers.configuration_utils import PretrainedConfig
 from paddlenlp.utils.env import CONFIG_NAME

@@ -86,6 +89,34 @@ def test_load_from_legacy_config(self):
             auto_config = AutoConfig.from_pretrained(tempdir)
             self.assertEqual(auto_config.hidden_size, number)

+    def test_new_config_registration(self):
+        class CustomConfig(PretrainedConfig):
+            model_type = "custom"
+
+            def __init__(self, attribute=1, **kwargs):
+                self.attribute = attribute
+                super().__init__(**kwargs)
+
+        try:
+            AutoConfig.register("custom", CustomConfig)
+            # Wrong model type will raise an error
+            with self.assertRaises(ValueError):
+                AutoConfig.register("model", CustomConfig)
+            # Trying to register something existing in the PaddleNLP library will raise an error
+            with self.assertRaises(ValueError):
+                AutoConfig.register("bert", BertConfig)
+
+            # Now that the config is registered, it can be used as any other config with the auto-API
+            config = CustomConfig()
+            with tempfile.TemporaryDirectory() as tmp_dir:
+                config.save_pretrained(tmp_dir)
+                new_config = AutoConfig.from_pretrained(tmp_dir)
+                self.assertIsInstance(new_config, CustomConfig)
+
+        finally:
+            if "custom" in CONFIG_MAPPING._extra_content:
+                del CONFIG_MAPPING._extra_content["custom"]
+
     def test_from_pretrained_cache_dir(self):
         model_id = "__internal_testing__/tiny-random-bert"
         with tempfile.TemporaryDirectory() as tempdir:
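The registration flow covered by test_new_config_registration reduces to the following usage (a sketch reusing the hypothetical CustomConfig / "custom" model type from the test; the save directory is a placeholder):

from paddlenlp.transformers import AutoConfig
from paddlenlp.transformers.configuration_utils import PretrainedConfig


class CustomConfig(PretrainedConfig):
    model_type = "custom"

    def __init__(self, attribute=1, **kwargs):
        self.attribute = attribute
        super().__init__(**kwargs)


# Once registered, the custom config round-trips through the auto-API.
AutoConfig.register("custom", CustomConfig)
config = CustomConfig(attribute=2)
config.save_pretrained("./custom-config")  # placeholder directory
reloaded = AutoConfig.from_pretrained("./custom-config")
assert isinstance(reloaded, CustomConfig)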
diff --git a/tests/transformers/llama/test_tokenizer.py b/tests/transformers/llama/test_tokenizer.py
index 8ba708f2ffb4..940548a7a950 100644
--- a/tests/transformers/llama/test_tokenizer.py
+++ b/tests/transformers/llama/test_tokenizer.py
@@ -13,12 +13,16 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

+import shutil
+import tempfile
 import unittest

+from paddlenlp.transformers.auto.tokenizer import AutoTokenizer
 from paddlenlp.transformers.llama.tokenizer import LlamaTokenizer
 from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer
+from paddlenlp.transformers.tokenizer_utils_fast import PretrainedTokenizerFast

-from ...transformers.test_tokenizer_common import TokenizerTesterMixin
+from ..test_tokenizer_common import TokenizerTesterMixin

 VOCAB_FILES_NAMES = {
     "vocab_file": "vocab.json",
@@ -186,7 +190,6 @@ def test_padding_if_pad_token_set_slow(self):

     def test_add_bos_token_slow(self):
         bos_token = ""
         tokenizer = self.get_tokenizer()
-
         s = "This is a simple input"
         s2 = ["This is a simple input 1", "This is a simple input 2"]

@@ -208,3 +211,73 @@ def test_pretrained_model_lists(self):
         # No max_model_input_sizes
         self.assertGreaterEqual(len(self.tokenizer_class.pretrained_resource_files_map), 1)
         self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_resource_files_map.values())[0]), 1)
+
+
+class TikTokenIntegrationTests(unittest.TestCase):
+    """
+    A class that regroups important tests to make sure that we properly handle the special tokens.
+    """
+
+    def test_tiktoken_llama(self):
+        model_path = "hf-internal-testing/llama-3-8b-internal"
+        subfolder = ""
+        test_text = "This is a test sentence."
+        test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001]
+        num_reserved_special_tokens = 256
+        special_tokens = [
+            "<|begin_of_text|>",
+            "<|end_of_text|>",
+            "<|reserved_special_token_0|>",
+            "<|reserved_special_token_1|>",
+            "<|reserved_special_token_2|>",
+            "<|reserved_special_token_3|>",
+            "<|start_header_id|>",
+            "<|end_header_id|>",
+            "<|reserved_special_token_4|>",
+            "<|eot_id|>",
+            "<|python_tag|>",  # end of turn
+        ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)]
+
+        tiktoken_tokenizer = PretrainedTokenizerFast.from_pretrained(
+            model_path,
+            subfolder=subfolder,
+            additional_special_tokens=special_tokens,
+            bos_token="<|begin_of_text|>",
+            eos_token="<|end_of_text|>",
+        )
+        tokens = tiktoken_tokenizer.tokenize("<|begin_of_text|> " + test_text)
+        self.assertEqual(tokens[0], "<|begin_of_text|>")
+
+        tiktoken_tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            subfolder=subfolder,
+            additional_special_tokens=special_tokens,
+            bos_token="<|begin_of_text|>",
+            eos_token="<|end_of_text|>",
+            add_bos_token=True,
+            add_eos_token=True,
+            use_fast=True,
+        )
+        self.assertTrue(isinstance(tiktoken_tokenizer, PretrainedTokenizerFast))
+        tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)["input_ids"]
+        self.assertEqual(tokens, test_tokens)
+        tmpdirname = tempfile.mkdtemp()
+        tiktoken_tokenizer.save_pretrained(tmpdirname)
+        tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname, use_fast=True)
+        self.assertTrue(isinstance(tokenizer_reload, PretrainedTokenizerFast))
+        tokens = tokenizer_reload.encode(test_text, add_special_tokens=True)["input_ids"]
+        self.assertEqual(tokens, test_tokens)
+        shutil.rmtree(tmpdirname)
+        tiktoken_tokenizer = AutoTokenizer.from_pretrained(
+            model_path,
+            subfolder=subfolder,
+            additional_special_tokens=special_tokens,
+            bos_token="<|begin_of_text|>",
+            eos_token="<|end_of_text|>",
+            from_slow=True,
+            add_bos_token=True,
+            add_eos_token=True,
+            use_fast=True,
+        )
+        tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)["input_ids"]
+        self.assertEqual(tokens, test_tokens)
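Taken together, the user-facing flow this test pins down looks roughly like the following (a sketch only; the checkpoint is the test fixture and may require network access, and the test additionally registers the reserved special tokens):

from paddlenlp.transformers.auto.tokenizer import AutoTokenizer

# Load the Llama-3-style tiktoken tokenizer through the auto-API with the fast backend.
tok = AutoTokenizer.from_pretrained(
    "hf-internal-testing/llama-3-8b-internal",
    bos_token="<|begin_of_text|>",
    eos_token="<|end_of_text|>",
    add_bos_token=True,
    add_eos_token=True,
    use_fast=True,
)
ids = tok.encode("This is a test sentence.", add_special_tokens=True)["input_ids"]
print(ids)  # expected to match the ids asserted in test_tiktoken_llama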