From 0fd72401ac1973696a8c0a25b558bf1c90bebd47 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Tue, 24 Sep 2024 14:10:50 +0000 Subject: [PATCH 01/21] add support of tiktoken tokenizer, refactor some code --- paddlenlp/transformers/auto/configuration.py | 50 +- paddlenlp/transformers/auto/factory.py | 126 ++++ paddlenlp/transformers/auto/tokenizer.py | 614 ++++++++++++++---- .../transformers/convert_slow_tokenizer.py | 145 ++++- paddlenlp/transformers/ernie/__init__.py | 3 + paddlenlp/transformers/llama/tokenizer.py | 22 +- .../transformers/llama/tokenizer_fast.py | 2 +- .../transformers/tokenizer_utils_base.py | 131 +++- .../transformers/tokenizer_utils_fast.py | 19 +- paddlenlp/utils/__init__.py | 2 +- paddlenlp/utils/download/__init__.py | 4 +- paddlenlp/utils/env.py | 2 +- paddlenlp/utils/import_utils.py | 53 +- tests/transformers/llama/test_tokenizer.py | 75 ++- tests/transformers/test_modeling_common.py | 5 +- 15 files changed, 1064 insertions(+), 189 deletions(-) create mode 100644 paddlenlp/transformers/auto/factory.py diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index 8407154c98ff..5db25644e6c9 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -13,13 +13,16 @@ # limitations under the License. from __future__ import annotations +import importlib import inspect import io import json import os -from collections import defaultdict +from collections import OrderedDict, defaultdict from typing import Dict, List, Type +from paddlenlp.utils.env import CONFIG_NAME + from ...utils.download import resolve_file_path from ...utils.import_utils import import_module from ...utils.log import logger @@ -29,7 +32,35 @@ __all__ = [ "AutoConfig", ] - +# CONFIG_MAPPING_NAMES = OrderedDict( +# [ +# # Add configs here +# ('albert', 'AlbertConfig'), ('bart', 'BartConfig'), ('bert', 'BertConfig'), ('bit', 'BitConfig'), ('blenderbot', 'BlenderbotConfig'), ('blip', 'BlipConfig'), ('bloom', 'BloomConfig'), ('clap', 'ClapConfig'), ('clip', 'CLIPConfig'), ('clipseg', 'CLIPSegConfig'), ('codegen', 'CodeGenConfig'), ('convbert', 'ConvBertConfig'), ('ctrl', 'CTRLConfig'), ('deberta', 'DebertaConfig'), ('distilbert', 'DistilBertConfig'), ('dpt', 'DPTConfig'), ('electra', 'ElectraConfig'), ('ernie', 'ErnieConfig'), ('ernie_m', 'ErnieMConfig'), ('fnet', 'FNetConfig'), ('funnel', 'FunnelConfig'), ('gemma', 'GemmaConfig'), ('gptj', 'GPTJConfig'), ('jamba', 'JambaConfig'), ('layoutlm', 'LayoutLMConfig'), ('layoutlmv2', 'LayoutLMv2Config'), ('llama', 'LlamaConfig'), ('luke', 'LukeConfig'), ('mamba', 'MambaConfig'), ('mbart', 'MBartConfig'), ('mistral', 'MistralConfig'), ('mixtral', 'MixtralConfig'), ('mobilebert', 'MobileBertConfig'), ('mpnet', 'MPNetConfig'), ('mt5', 'MT5Config'), ('nezha', 'NezhaConfig'), ('nystromformer', 'NystromformerConfig'), ('opt', 'OPTConfig'), ('pegasus', 'PegasusConfig'), ('prophetnet', 'ProphetNetConfig'), ('qwen2_moe', 'Qwen2MoeConfig'), ('reformer', 'ReformerConfig'), ('rembert', 'RemBertConfig'), ('roberta', 'RobertaConfig'), ('roformer', 'RoFormerConfig'), ('speecht5', 'SpeechT5Config'), ('squeezebert', 'SqueezeBertConfig'), ('t5', 'T5Config'), ('xlm', 'XLMConfig'), ('xlnet', 'XLNetConfig'), +# ] +# ) + +CONFIG_MAPPING_NAMES = OrderedDict( + [ + ('albert', 'AlbertConfig'), ('bigbird', 'BigBirdConfig'), ('blenderbot_small', 'BlenderbotSmallConfig'), ('blenderbot', 'BlenderbotConfig'), ('chatglm_v2', 'ChatGLMv2Config'), ('chatglm', 'ChatGLMConfig'), ('chineseclip', 
'ChineseCLIPTextConfig'), ('chinesebert', 'ChineseBertConfig'), ('convbert', 'ConvBertConfig'), ('ctrl', 'CTRLConfig'), ('distilbert', 'DistilBertConfig'), ('dallebart', 'DalleBartConfig'), ('electra', 'ElectraConfig'), ('ernie_vil', 'ErnieViLConfig'), ('ernie_ctm', 'ErnieCtmConfig'), ('ernie_doc', 'ErnieDocConfig'), ('ernie_gen', 'ErnieGenConfig'), ('ernie_gram', 'ErnieGramConfig'), ('ernie_layout', 'ErnieLayoutConfig'), ('ernie_m', 'ErnieMConfig'), ('ernie_code', 'ErnieCodeConfig'), ('ernie', 'ErnieConfig'), ('fnet', 'FNetConfig'), ('funnel', 'FunnelConfig'), ('llama', 'LlamaConfig'), ('layoutxlm', 'LayoutXLMConfig'), ('layoutlmv2', 'LayoutLMv2Config'), ('layoutlm', 'LayoutLMConfig'), ('luke', 'LukeConfig'), ('mbart', 'MBartConfig'), ('megatronbert', 'MegatronBertConfig'), ('mobilebert', 'MobileBertConfig'), ('mpnet', 'MPNetConfig'), ('nezha', 'NeZhaConfig'), ('nystromformer', 'NystromformerConfig'), ('ppminilm', 'PPMiniLMConfig'), ('prophetnet', 'ProphetNetConfig'), ('reformer', 'ReformerConfig'), ('rembert', 'RemBertConfig'), ('roberta', 'RobertaConfig'), ('roformerv2', 'RoFormerv2Config'), ('roformer', 'RoFormerConfig'), ('skep', 'SkepConfig'), ('squeezebert', 'SqueezeBertConfig'), ('tinybert', 'TinyBertConfig'), ('unified_transformer', 'UnifiedTransformerConfig'), ('unimo', 'UNIMOConfig'), ('xlnet', 'XLNetConfig'), ('xlm', 'XLMConfig'), ('gpt', 'GPTConfig'), ('glm', 'GLMConfig'), ('mt5', 'MT5Config'), ('t5', 'T5Config'), ('bert', 'BertConfig'), ('bart', 'BartConfig'), ('gau_alpha', 'GAUAlphaConfig'), ('codegen', 'CodeGenConfig'), ('clip', 'CLIPConfig'), ('artist', 'ArtistConfig'), ('opt', 'OPTConfig'), ('pegasus', 'PegasusConfig'), ('dpt', 'DPTConfig'), ('bit', 'BitConfig'), ('blip', 'BlipConfig'), ('bloom', 'BloomConfig'), ('qwen', 'QWenConfig'), ('mistral', 'MistralConfig'), ('mixtral', 'MixtralConfig'), ('qwen2', 'Qwen2Config'), ('qwen2_moe', 'Qwen2MoeConfig'), ('gemma', 'GemmaConfig'), ('yuan', 'YuanConfig'), ('mamba', 'MambaConfig'), ('jamba', 'JambaConfig') + ] +) + + +MODEL_NAMES_MAPPING = OrderedDict( + [ + # Add full (and cased) model names here + # Base model mapping + ('albert', 'Albert'), ('bigbird', 'BigBird'), ('blenderbot_small', 'BlenderbotSmall'), ('blenderbot', 'Blenderbot'), ('chatglm_v2', 'ChatGLMv2'), ('chatglm', 'ChatGLM'), ('chineseclip', 'ChineseCLIPText'), ('chinesebert', 'ChineseBert'), ('convbert', 'ConvBert'), ('ctrl', 'CTRL'), ('distilbert', 'DistilBert'), ('dallebart', 'DalleBart'), ('electra', 'Electra'), ('ernie_vil', 'ErnieViL'), ('ernie_ctm', 'ErnieCtm'), ('ernie_doc', 'ErnieDoc'), ('ernie_gen', 'ErnieGen'), ('ernie_gram', 'ErnieGram'), ('ernie_layout', 'ErnieLayout'), ('ernie_m', 'ErnieM'), ('ernie_code', 'ErnieCode'), ('ernie', 'Ernie'), ('fnet', 'FNet'), ('funnel', 'Funnel'), ('llama', 'Llama'), ('layoutxlm', 'LayoutXLM'), ('layoutlmv2', 'LayoutLMv2'), ('layoutlm', 'LayoutLM'), ('luke', 'Luke'), ('mbart', 'MBart'), ('megatronbert', 'MegatronBert'), ('mobilebert', 'MobileBert'), ('mpnet', 'MPNet'), ('nezha', 'NeZha'), ('nystromformer', 'Nystromformer'), ('ppminilm', 'PPMiniLM'), ('prophetnet', 'ProphetNet'), ('reformer', 'Reformer'), ('rembert', 'RemBert'), ('roberta', 'Roberta'), ('roformerv2', 'RoFormerv2'), ('roformer', 'RoFormer'), ('skep', 'Skep'), ('squeezebert', 'SqueezeBert'), ('tinybert', 'TinyBert'), ('unified_transformer', 'UnifiedTransformer'), ('unimo', 'UNIMO'), ('xlnet', 'XLNet'), ('xlm', 'XLM'), ('gpt', 'GPT'), ('glm', 'GLM'), ('mt5', 'MT5'), ('t5', 'T5'), ('bert', 'Bert'), ('bart', 'Bart'), ('gau_alpha', 'GAUAlpha'), ('codegen', 
'CodeGen'), ('clip', 'CLIP'), ('artist', 'Artist'), ('opt', 'OPT'), ('pegasus', 'Pegasus'), ('dpt', 'DPT'), ('bit', 'Bit'), ('blip', 'Blip'), ('bloom', 'Bloom'), ('qwen', 'QWen'), ('mistral', 'Mistral'), ('mixtral', 'Mixtral'), ('qwen2', 'Qwen2'), ('qwen2_moe', 'Qwen2Moe'), ('gemma', 'Gemma'), ('yuan', 'Yuan'), ('mamba', 'Mamba'), ('jamba', 'Jamba') + ] +) + + +def config_class_to_model_type(config): + """Converts a config class name to the corresponding model type""" + for key, cls in CONFIG_MAPPING_NAMES.items(): + if cls == config: + return key + return None def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]: """load the configurations of PretrainedConfig mapping: {: [, , ...], } @@ -63,6 +94,21 @@ def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]: return mappings +def model_type_to_module_name(key): + """Converts a config key to the corresponding module.""" + # Special treatment + # if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: + # key = SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] + + # if key in DEPRECATED_MODELS: + # key = f"deprecated.{key}" + # return key + + key = key.replace("-", "_") + # if key in DEPRECATED_MODELS: + # key = f"deprecated.{key}" + + return key class AutoConfig(PretrainedConfig): """ diff --git a/paddlenlp/transformers/auto/factory.py b/paddlenlp/transformers/auto/factory.py new file mode 100644 index 000000000000..ca6c15a08f46 --- /dev/null +++ b/paddlenlp/transformers/auto/factory.py @@ -0,0 +1,126 @@ + +from collections import OrderedDict +import importlib + +from paddlenlp.transformers.auto.configuration import model_type_to_module_name + + +def getattribute_from_module(module, attr): + if attr is None: + return None + if isinstance(attr, tuple): + return tuple(getattribute_from_module(module, a) for a in attr) + if hasattr(module, attr): + return getattr(module, attr) + # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("transformers") + + if module != transformers_module: + try: + return getattribute_from_module(transformers_module, attr) + except ValueError: + raise ValueError(f"Could not find {attr} neither in {module} nor in {transformers_module}!") + else: + raise ValueError(f"Could not find {attr} in {transformers_module}!") + +class _LazyAutoMapping(OrderedDict): + """ + " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. + + Args: + - config_mapping: The map model type to config class + - model_mapping: The map model type to model (or tokenizer) class + """ + + def __init__(self, config_mapping, model_mapping): + self._config_mapping = config_mapping + self._reverse_config_mapping = {v: k for k, v in config_mapping.items()} + self._model_mapping = model_mapping + self._model_mapping._model_mapping = self + self._extra_content = {} + self._modules = {} + + def __len__(self): + common_keys = set(self._config_mapping.keys()).intersection(self._model_mapping.keys()) + return len(common_keys) + len(self._extra_content) + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping: + model_name = self._model_mapping[model_type] + return self._load_attr_from_module(model_type, model_name) + + # Maybe there was several model types associated with this config. 
+ model_types = [k for k, v in self._config_mapping.items() if v == key.__name__] + for mtype in model_types: + if mtype in self._model_mapping: + model_name = self._model_mapping[mtype] + return self._load_attr_from_module(mtype, model_name) + raise KeyError(key) + + def _load_attr_from_module(self, model_type, attr): + module_name = model_type_to_module_name(model_type) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "paddlenlp.transformers") + return getattribute_from_module(self._modules[module_name], attr) + + def keys(self): + mapping_keys = [ + self._load_attr_from_module(key, name) + for key, name in self._config_mapping.items() + if key in self._model_mapping.keys() + ] + return mapping_keys + list(self._extra_content.keys()) + + def get(self, key, default): + try: + return self.__getitem__(key) + except KeyError: + return default + + def __bool__(self): + return bool(self.keys()) + + def values(self): + mapping_values = [ + self._load_attr_from_module(key, name) + for key, name in self._model_mapping.items() + if key in self._config_mapping.keys() + ] + return mapping_values + list(self._extra_content.values()) + + def items(self): + mapping_items = [ + ( + self._load_attr_from_module(key, self._config_mapping[key]), + self._load_attr_from_module(key, self._model_mapping[key]), + ) + for key in self._model_mapping.keys() + if key in self._config_mapping.keys() + ] + return mapping_items + list(self._extra_content.items()) + + def __iter__(self): + return iter(self.keys()) + + def __contains__(self, item): + if item in self._extra_content: + return True + if not hasattr(item, "__name__") or item.__name__ not in self._reverse_config_mapping: + return False + model_type = self._reverse_config_mapping[item.__name__] + return model_type in self._model_mapping + + def register(self, key, value, exist_ok=False): + """ + Register a new model in this mapping. 
+ """ + if hasattr(key, "__name__") and key.__name__ in self._reverse_config_mapping: + model_type = self._reverse_config_mapping[key.__name__] + if model_type in self._model_mapping.keys() and not exist_ok: + raise ValueError(f"'{key}' is already used by a Transformers model.") + + self._extra_content[key] = value \ No newline at end of file diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 46efa4efb7ad..650fad48e505 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -17,92 +17,269 @@ import json import os from collections import OrderedDict +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from paddlenlp.transformers.auto.configuration import ( + CONFIG_MAPPING_NAMES, + AutoConfig, + config_class_to_model_type, + model_type_to_module_name, +) +from paddlenlp.transformers.configuration_utils import PretrainedConfig +from paddlenlp.transformers.tokenizer_utils_base import TOKENIZER_CONFIG_FILE +from paddlenlp.transformers.tokenizer_utils_fast import PretrainedTokenizerFast + +from ...utils import ( + is_g2p_en_available, + is_sentencepiece_available, + is_tokenizers_available, +) from ...utils.download import resolve_file_path from ...utils.import_utils import import_module from ...utils.log import logger +from .factory import _LazyAutoMapping __all__ = [ "AutoTokenizer", ] -TOKENIZER_MAPPING_NAMES = OrderedDict( - [ - ("AlbertEnglishTokenizer", "albert"), - ("AlbertChineseTokenizer", "albert"), - ("BertJapaneseTokenizer", "bert_japanese"), - ("BigBirdTokenizer", "bigbird"), - ("BlenderbotSmallTokenizer", "blenderbot_small"), - ("BlenderbotTokenizer", "blenderbot"), - ("ChatGLMTokenizer", "chatglm"), - ("ChatGLMv2Tokenizer", "chatglm_v2"), - ("ChineseBertTokenizer", "chinesebert"), - ("ConvBertTokenizer", "convbert"), - ("CTRLTokenizer", "ctrl"), - ("DalleBartTokenizer", "dallebart"), - ("DistilBertTokenizer", "distilbert"), - ("ElectraTokenizer", "electra"), - ("ErnieCtmTokenizer", "ernie_ctm"), - ("ErnieDocTokenizer", "ernie_doc"), - ("ErnieDocBPETokenizer", "ernie_doc"), - ("ErnieGramTokenizer", "ernie_gram"), - ("ErnieLayoutTokenizer", "ernie_layout"), - ("ErnieMTokenizer", "ernie_m"), - ("ErnieCodeTokenizer", "ernie_code"), - ("ErnieTokenizer", "ernie"), - ("FNetTokenizer", "fnet"), - ("FunnelTokenizer", "funnel"), - ("LlamaTokenizer", "llama"), - ("LayoutXLMTokenizer", "layoutxlm"), - ("LayoutLMv2Tokenizer", "layoutlmv2"), - ("LayoutLMTokenizer", "layoutlm"), - ("LukeTokenizer", "luke"), - ("MBartTokenizer", "mbart"), - ("MBart50Tokenizer", "mbart"), - ("MegatronBertTokenizer", "megatronbert"), - ("MobileBertTokenizer", "mobilebert"), - ("MPNetTokenizer", "mpnet"), - ("NeZhaTokenizer", "nezha"), - ("NystromformerTokenizer", "nystromformer"), - ("PPMiniLMTokenizer", "ppminilm"), - ("ProphetNetTokenizer", "prophetnet"), - ("ReformerTokenizer", "reformer"), - ("RemBertTokenizer", "rembert"), - ("RobertaChineseTokenizer", "roberta"), - ("RobertaBPETokenizer", "roberta"), - ("RoFormerTokenizer", "roformer"), - ("RoFormerv2Tokenizer", "roformerv2"), - ("SkepTokenizer", "skep"), - ("SqueezeBertTokenizer", "squeezebert"), - ("TinyBertTokenizer", "tinybert"), - ("UnifiedTransformerTokenizer", "unified_transformer"), - ("UNIMOTokenizer", "unimo"), - ("XLNetTokenizer", "xlnet"), - ("XLMTokenizer", "xlm"), - ("GPTTokenizer", "gpt"), - ("GPTChineseTokenizer", "gpt"), - ("T5Tokenizer", "t5"), - ("BertTokenizer", "bert"), - ("BartTokenizer", "bart"), - ("GAUAlphaTokenizer", 
"gau_alpha"), - ("CodeGenTokenizer", "codegen"), - ("CLIPTokenizer", "clip"), - ("ArtistTokenizer", "artist"), - ("ChineseCLIPTokenizer", "chineseclip"), - ("ErnieViLTokenizer", "ernie_vil"), - ("PegasusChineseTokenizer", "pegasus"), - ("GLMBertTokenizer", "glm"), - ("GLMChineseTokenizer", "glm"), - ("GLMGPT2Tokenizer", "glm"), - ("BloomTokenizer", "bloom"), - ("SpeechT5Tokenizer", "speecht5"), - ("QWenTokenizer", "qwen"), - ("GemmaTokenizer", "gemma"), - ("YuanTokenizer", "yuan"), - ("MambaTokenizer", "mamba"), - ("JambaTokenizer", "jamba"), - ] -) +if is_tokenizers_available(): + from ..tokenizer_utils_fast import PretrainedTokenizerFast +else: + PretrainedTokenizerFast = None + +if False: + # This significantly improves completion suggestion performance when + # the transformers package is used with Microsoft's Pylance language server. + TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict() +else: + TOKENIZER_MAPPING_NAMES = OrderedDict( + [ + ( + "albert", + ( + "AlbertChineseTokenizer" if is_sentencepiece_available() else None, + "AlbertChineseTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("bart", ("BartTokenizer", "BartTokenizerFast")), + ( + "bert", + ( + "BertTokenizer", + "BertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")), + ( + "bloom", + ( + "BloomTokenizer", + "BloomTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "clip", + ( + "CLIPTokenizer", + "CLIPTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "codegen", + ( + "CodeGenTokenizer", + "CodeGenTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "convbert", + ( + "ConvBertTokenizer", + "ConvBertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("ctrl", ("CTRLTokenizer", None)), + ( + "distilbert", + ( + "DistilBertTokenizer", + "DistilBertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "electra", + ( + "ElectraTokenizer", + "ElectraTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "ernie", + ( + "ErnieTokenizer", + "ErnieTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)), + ( + "fnet", + ( + "FNetTokenizer", + "FNetTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "funnel", + ( + "FunnelTokenizer", + "FunnelTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "gemma", + ( + "GemmaTokenizer" if is_sentencepiece_available() else None, + "GemmaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "jamba", + ( + "JambaTokenizer" if is_sentencepiece_available() else None, + "JambaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "layoutlm", + ( + "LayoutLMTokenizer", + "LayoutLMTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "layoutlmv2", + ( + "LayoutLMv2Tokenizer", + "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "layoutxlm", + ( + "LayoutXLMTokenizer", + "LayoutXLMTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "llama", + ( + "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("luke", ("LukeTokenizer", None)), + ( + "mamba", + ( + "MambaTokenizer", + "MambaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + 
"mbart", + ( + "MBart50Tokenizer" if is_sentencepiece_available() else None, + "MBart50TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mobilebert", + ( + "MobileBertTokenizer", + "MobileBertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "mpnet", + ( + "MPNetTokenizer", + "MPNetTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "nezha", + ( + "NeZhaTokenizer", + "NeZhaTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "pegasus", + ( + "PegasusChineseTokenizer" if is_sentencepiece_available() else None, + "PegasusChineseTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("prophetnet", ("ProphetNetTokenizer", None)), + ( + "reformer", + ( + "ReformerTokenizer" if is_sentencepiece_available() else None, + "ReformerTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "rembert", + ( + "RemBertTokenizer" if is_sentencepiece_available() else None, + "RemBertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "roberta", + ( + "RobertaBPETokenizer", + "RobertaBPETokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "roformer", + ( + "RoFormerTokenizer", + "RoFormerTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)), + ( + "squeezebert", + ( + "SqueezeBertTokenizer", + "SqueezeBertTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ( + "t5", + ( + "T5Tokenizer" if is_sentencepiece_available() else None, + "T5TokenizerFast" if is_tokenizers_available() else None, + ), + ), + ("xlm", ("XLMTokenizer", None)), + ( + "xlnet", + ( + "XLNetTokenizer" if is_sentencepiece_available() else None, + "XLNetTokenizerFast" if is_tokenizers_available() else None, + ), + ), + ('bert_japanese', 'BertJapaneseTokenizer'), ('bigbird', 'BigBirdTokenizer'), ('blenderbot_small', 'BlenderbotSmallTokenizer'), ('chatglm', 'ChatGLMTokenizer'), ('chatglm_v2', 'ChatGLMv2Tokenizer'), ('chinesebert', 'ChineseBertTokenizer'), ('dallebart', 'DalleBartTokenizer'), ('ernie_ctm', 'ErnieCtmTokenizer'), ('ernie_doc', 'ErnieDocBPETokenizer'), ('ernie_gram', 'ErnieGramTokenizer'), ('ernie_layout', 'ErnieLayoutTokenizer'), ('ernie_code', 'ErnieCodeTokenizer'), ('megatronbert', 'MegatronBertTokenizer'), ('nystromformer', 'NystromformerTokenizer'), ('ppminilm', 'PPMiniLMTokenizer'), ('roformerv2', 'RoFormerv2Tokenizer'), ('skep', 'SkepTokenizer'), ('tinybert', 'TinyBertTokenizer'), ('unified_transformer', 'UnifiedTransformerTokenizer'), ('unimo', 'UNIMOTokenizer'), ('gpt', 'GPTChineseTokenizer'), ('gau_alpha', 'GAUAlphaTokenizer'), ('artist', 'ArtistTokenizer'), ('chineseclip', 'ChineseCLIPTokenizer'), ('ernie_vil', 'ErnieViLTokenizer'), ('glm', 'GLMGPT2Tokenizer'), ('qwen', 'QWenTokenizer'), ('yuan', 'YuanTokenizer'), + ] + ) + +TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) + +CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} def get_configurations(): @@ -115,6 +292,129 @@ def get_configurations(): return MAPPING_NAMES +def tokenizer_class_from_name(class_name: str): + if class_name == "PretrainedTokenizerFast": + return PretrainedTokenizerFast + + for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): + if class_name in tokenizers: + module_name = model_type_to_module_name(module_name) + print(f"module_name: {module_name}") + module = importlib.import_module(f".{module_name}", "paddlenlp.transformers") + try: + 
return getattr(module, class_name) + except AttributeError: + continue + + for config, tokenizers in TOKENIZER_MAPPING._extra_content.items(): + for tokenizer in tokenizers: + if getattr(tokenizer, "__name__", None) == class_name: + return tokenizer + + # We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main + # init and we return the proper dummy to get an appropriate error message. + main_module = importlib.import_module("transformers") + if hasattr(main_module, class_name): + return getattr(main_module, class_name) + + return None + + +def get_tokenizer_config( + pretrained_model_name_or_path: Union[str, os.PathLike], + cache_dir: Optional[Union[str, os.PathLike]] = None, + force_download: bool = False, + resume_download: Optional[bool] = None, + proxies: Optional[Dict[str, str]] = None, + token: Optional[Union[bool, str]] = None, + revision: Optional[str] = None, + local_files_only: bool = False, + subfolder: str = "", + **kwargs, +): + """ + Loads the tokenizer configuration from a pretrained model tokenizer configuration. + + Args: + pretrained_model_name_or_path (`str` or `os.PathLike`): + This can be either: + + - a string, the *model id* of a pretrained model configuration hosted inside a model repo on + huggingface.co. + - a path to a *directory* containing a configuration file saved using the + [`~PreTrainedTokenizer.save_pretrained`] method, e.g., `./my_model_directory/`. + + cache_dir (`str` or `os.PathLike`, *optional*): + Path to a directory in which a downloaded pretrained model configuration should be cached if the standard + cache should not be used. + force_download (`bool`, *optional*, defaults to `False`): + Whether or not to force to (re-)download the configuration files and override the cached versions if they + exist. + resume_download: + Deprecated and ignored. All downloads are now resumed by default when possible. + Will be removed in v5 of Transformers. + proxies (`Dict[str, str]`, *optional*): + A dictionary of proxy servers to use by protocol or endpoint, e.g., `{'http': 'foo.bar:3128', + 'http://hostname': 'foo.bar:4012'}.` The proxies are used on each request. + token (`str` or *bool*, *optional*): + The token to use as HTTP bearer authorization for remote files. If `True`, will use the token generated + when running `huggingface-cli login` (stored in `~/.huggingface`). + revision (`str`, *optional*, defaults to `"main"`): + The specific model version to use. It can be a branch name, a tag name, or a commit id, since we use a + git-based system for storing models and other artifacts on huggingface.co, so `revision` can be any + identifier allowed by git. + local_files_only (`bool`, *optional*, defaults to `False`): + If `True`, will only try to load the tokenizer configuration from local files. + subfolder (`str`, *optional*, defaults to `""`): + In case the tokenizer config is located inside a subfolder of the model repo on huggingface.co, you can + specify the folder name here. + + + + Passing `token=True` is required when you want to use a private model. + + + + Returns: + `Dict`: The configuration of the tokenizer. + + Examples: + + ```python + # Download configuration from huggingface.co and cache. + tokenizer_config = get_tokenizer_config("google-bert/bert-base-uncased") + # This model does not have a tokenizer config so the result will be an empty dict. 
+ tokenizer_config = get_tokenizer_config("FacebookAI/xlm-roberta-base") + + # Save a pretrained tokenizer locally and you can reload its config + from transformers import AutoTokenizer + + tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased") + tokenizer.save_pretrained("tokenizer-test") + tokenizer_config = get_tokenizer_config("tokenizer-test") + ```""" + + resolved_config_file = resolve_file_path( + pretrained_model_name_or_path, + TOKENIZER_CONFIG_FILE, + cache_dir=cache_dir, + force_download=force_download, + resume_download=resume_download, + proxies=proxies, + token=token, + revision=revision, + local_files_only=local_files_only, + subfolder=subfolder, + ) + if resolved_config_file is None: + logger.info("Could not locate the tokenizer configuration file, will try to use the model config instead.") + return {} + with open(resolved_config_file, encoding="utf-8") as reader: + result = json.load(reader) + + return result + + class AutoTokenizer: """ AutoClass can help you automatically retrieve the relevant model given the provided @@ -123,15 +423,15 @@ class AutoTokenizer: base tokenizer classes when created with the AutoTokenizer.from_pretrained() classmethod. """ - MAPPING_NAMES = get_configurations() - _tokenizer_mapping = MAPPING_NAMES - _name_mapping = TOKENIZER_MAPPING_NAMES - tokenizer_config_file = "tokenizer_config.json" + # MAPPING_NAMES = get_configurations() + # _tokenizer_mapping = MAPPING_NAMES + # _name_mapping = TOKENIZER_MAPPING_NAMES + # tokenizer_config_file = "tokenizer_config.json" - def __init__(self, *args, **kwargs): + def __init__(self): raise EnvironmentError( - f"{self.__class__.__name__} is designed to be instantiated " - f"using the `{self.__class__.__name__}.from_pretrained(pretrained_model_name_or_path).`" + "AutoTokenizer is designed to be instantiated " + "using the `AutoTokenizer.from_pretrained(pretrained_model_name_or_path)` method." ) @classmethod @@ -176,7 +476,7 @@ def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_ return tokenizer_class @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): """ Creates an instance of `AutoTokenizer`. 
Related resources are loaded by specifying name of a built-in pretrained model, or a community-contributed @@ -219,51 +519,103 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): print(type(tokenizer)) # """ + config = kwargs.pop("config", None) + kwargs["_from_auto"] = True + + use_fast = kwargs.pop("use_fast", True) + tokenizer_type = kwargs.pop("tokenizer_type", None) + if tokenizer_type is not None: + # TODO: Support tokenizer_type + raise NotImplementedError("tokenizer_type is not supported yet.") + + tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) + config_tokenizer_class = tokenizer_config.get("tokenizer_class") + if config_tokenizer_class is None: + if not isinstance(config, PretrainedConfig): + config = AutoConfig.from_pretrained(pretrained_model_name_or_path, **kwargs) + config_tokenizer_class = config.tokenizer_class + if config_tokenizer_class is not None: + tokenizer_class = None + print(f"config_tokenizer_class: {config_tokenizer_class}") + if use_fast and not config_tokenizer_class.endswith("Fast"): + tokenizer_class_candidate = f"{config_tokenizer_class}Fast" + tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) + if tokenizer_class is None: + tokenizer_class_candidate = config_tokenizer_class + tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) + if tokenizer_class is None: + raise ValueError( + f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported." + ) + + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + print("we have to be creative") + + # TODO: if model is an encoder decoder + + model_type = config_class_to_model_type(type(config).__name__) + if model_type is not None: + print(f"model_type is not None: {model_type}") + tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)] + print(tokenizer_class_py, tokenizer_class_fast) + if tokenizer_class_fast and (use_fast or tokenizer_class_py is None): + return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + else: + if tokenizer_class_py is not None: + return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + else: + raise ValueError( + "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed " + "in order to use this tokenizer." + ) + # Default not to use fast tokenizer - use_faster = kwargs.pop("use_faster", None) - use_fast = kwargs.pop("use_fast", None) - if use_fast is not None or use_faster is not None: - raise ValueError("use_fast is deprecated") + # use_faster = kwargs.pop("use_faster", None) + # use_fast = kwargs.pop("use_fast", None) + # if use_fast is not None or use_faster is not None: + # raise ValueError("use_fast is deprecated") - cache_dir = kwargs.get("cache_dir", None) - subfolder = kwargs.get("subfolder", "") - if subfolder is None: - subfolder = "" - from_aistudio = kwargs.get("from_aistudio", False) - from_hf_hub = kwargs.get("from_hf_hub", False) - - all_tokenizer_names = [] - for names, tokenizer_class in cls._tokenizer_mapping.items(): - for name in names: - all_tokenizer_names.append(name) - - # From built-in pretrained models - if pretrained_model_name_or_path in all_tokenizer_names: - for names, tokenizer_class in cls._tokenizer_mapping.items(): - for pattern in names: - if pattern == pretrained_model_name_or_path: - logger.info("We are using %s to load '%s'." 
% (tokenizer_class, pretrained_model_name_or_path)) - return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - - config_file = resolve_file_path( - pretrained_model_name_or_path, - cls.tokenizer_config_file, - subfolder, - cache_dir=cache_dir, - from_hf_hub=from_hf_hub, - from_aistudio=from_aistudio, - ) - if config_file is not None and os.path.exists(config_file): - tokenizer_class = cls._get_tokenizer_class_from_config( - pretrained_model_name_or_path, config_file, use_fast - ) - logger.info(f"We are using {tokenizer_class} to load '{pretrained_model_name_or_path}'.") - return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - else: - raise RuntimeError( - f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n" - f"Please make sure that '{pretrained_model_name_or_path}' is:\n" - "- a correct model-identifier of built-in pretrained models,\n" - "- or a correct model-identifier of community-contributed pretrained models,\n" - "- or the correct path to a directory containing relevant tokenizer files.\n" - ) + # cache_dir = kwargs.get("cache_dir", None) + # subfolder = kwargs.get("subfolder", "") + # if subfolder is None: + # subfolder = "" + # from_aistudio = kwargs.get("from_aistudio", False) + # from_hf_hub = kwargs.get("from_hf_hub", False) + + # all_tokenizer_names = [] + # for names, tokenizer_class in cls._tokenizer_mapping.items(): + # for name in names: + # all_tokenizer_names.append(name) + + # # From built-in pretrained models + # if pretrained_model_name_or_path in all_tokenizer_names: + # for names, tokenizer_class in cls._tokenizer_mapping.items(): + # for pattern in names: + # if pattern == pretrained_model_name_or_path: + # logger.info("We are using %s to load '%s'." % (tokenizer_class, pretrained_model_name_or_path)) + # return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + + # config_file = resolve_file_path( + # pretrained_model_name_or_path, + # cls.tokenizer_config_file, + # subfolder, + # cache_dir=cache_dir, + # from_hf_hub=from_hf_hub, + # from_aistudio=from_aistudio, + # ) + # print(f"config_file: {config_file}") + # print("cls.tokenizer_config_file: ", cls.tokenizer_config_file) + # if config_file is not None and os.path.exists(config_file): + # tokenizer_class = cls._get_tokenizer_class_from_config( + # pretrained_model_name_or_path, config_file, use_fast + # ) + # logger.info(f"We are using {tokenizer_class} to load '{pretrained_model_name_or_path}'.") + # return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + # else: + # raise RuntimeError( + # f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n" + # f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + # "- a correct model-identifier of built-in pretrained models,\n" + # "- or a correct model-identifier of community-contributed pretrained models,\n" + # "- or the correct path to a directory containing relevant tokenizer files.\n" + # ) diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py index eafa3572a450..af3cf887c791 100644 --- a/paddlenlp/transformers/convert_slow_tokenizer.py +++ b/paddlenlp/transformers/convert_slow_tokenizer.py @@ -16,7 +16,6 @@ # limitations under the License. 
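[Review note on the AutoTokenizer changes above] A minimal usage sketch of the new resolution order (tokenizer_config.json "tokenizer_class" -> model config -> TOKENIZER_MAPPING), assuming the tiny test checkpoint used elsewhere in this patch; which classes come back depends on whether `tokenizers` and `sentencepiece` are installed. Illustrative only, not part of the diff.

    from paddlenlp.transformers import AutoTokenizer

    # `use_fast` now defaults to True: a *TokenizerFast class is preferred when one
    # is registered for the model type; otherwise the slow tokenizer class is used.
    fast_tok = AutoTokenizer.from_pretrained("__internal_testing__/tiny-random-llama")
    slow_tok = AutoTokenizer.from_pretrained("__internal_testing__/tiny-random-llama", use_fast=False)

    print(type(fast_tok).__name__)  # e.g. LlamaTokenizerFast when `tokenizers` is available
    print(type(slow_tok).__name__)  # e.g. LlamaTokenizer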
from typing import Dict, List, Optional, Tuple - import tokenizers from packaging import version from tokenizers import ( @@ -26,9 +25,31 @@ decoders, normalizers, pre_tokenizers, + processors, ) from tokenizers.models import BPE, Unigram +from paddlenlp.utils.import_utils import is_protobuf_available, is_sentencepiece_available + +def import_protobuf(error_message=""): + if is_sentencepiece_available(): + from sentencepiece import sentencepiece_model_pb2 + + return sentencepiece_model_pb2 + if is_protobuf_available(): + import google.protobuf + + if version.parse(google.protobuf.__version__) < version.parse("4.0.0"): + from transformers.utils import sentencepiece_model_pb2 + else: + from transformers.utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2 + return sentencepiece_model_pb2 + else: + raise ImportError(f""" +{error_message} requires the protobuf library but it was not found in your environment. Checkout the instructions on the +installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones +that match your environment. Please note that you may need to restart your runtime after installation. +""") # Copied from transformers, adapted for tokenizers >= 0.19.0 def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str: @@ -198,16 +219,60 @@ def converted(self) -> Tokenizer: return tokenizer +# Copied from paddlenlp/transformers/gpt/tokenizer.py +def bytes_to_unicode(): + """ + Returns list of utf-8 byte and a corresponding list of unicode strings. + The reversible bpe codes work on unicode strings. + This means you need a large # of unicode characters in your vocab if you want to avoid UNKs. + When you're at something like a 10B token dataset you end up needing around 5K for decent coverage. + This is a signficant percentage of your normal, say, 32K bpe vocab. + To avoid that, we want lookup tables between utf-8 bytes and unicode strings. + And avoids mapping to whitespace/control characters the bpe code barfs on. + """ + _chr = chr + bs = ( + list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1)) + ) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8 + n) + n += 1 + cs = [_chr(n) for n in cs] + return dict(zip(bs, cs)) + +class TikTokenConverter: + """ + A general tiktoken converter. + """ -class TikTokenConverter(Converter): - def extract(self, tiktoken_file: str): - from .tiktoken_model_utils import bpe, bytes_to_unicode, load_tiktoken_bpe + def __init__( + self, + vocab_file=None, + pattern=r"""(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+""", + add_prefix_space=False, + additional_special_tokens=None, + *args, + **kwargs, + ): + super().__init__(*args) + self.vocab_file = vocab_file + self.pattern = pattern + self.add_prefix_space = add_prefix_space + self.additional_special_tokens = additional_special_tokens + + def extract_vocab_merges_from_model(self, tiktoken_url: str): + try: + from tiktoken.load import load_tiktoken_bpe + except Exception: + raise ValueError( + "`tiktoken` is required to read a `tiktoken` file. Install it with " "`pip install tiktoken`." 
+ ) - bpe_ranks = ( - self.original_tokenizer.mergeable_ranks - if hasattr(self.original_tokenizer, "mergeable_ranks") and self.original_tokenizer.mergeable_ranks - else load_tiktoken_bpe(tiktoken_file) - ) + bpe_ranks = load_tiktoken_bpe(tiktoken_url) byte_encoder = bytes_to_unicode() def token_bytes_to_string(b): @@ -219,12 +284,39 @@ def token_bytes_to_string(b): vocab[token_bytes_to_string(token)] = rank if len(token) == 1: continue - merged = tuple(bpe(bpe_ranks, token, max_rank=rank)) - if len(merged) == 2: - merges.append(tuple(map(token_bytes_to_string, merged))) - + local = [] + for index in range(1, len(token)): + piece_l, piece_r = token[:index], token[index:] + if piece_l in bpe_ranks and piece_r in bpe_ranks and (piece_l + piece_r) in bpe_ranks: + local.append((piece_l, piece_r, rank)) + local = sorted(local, key=lambda x: (bpe_ranks[x[0]], bpe_ranks[x[1]]), reverse=False) + merges.extend(local) + merges = sorted(merges, key=lambda val: val[2], reverse=False) + merges = [(token_bytes_to_string(val[0]), token_bytes_to_string(val[1])) for val in merges] return vocab, merges + def tokenizer(self): + vocab_scores, merges = self.extract_vocab_merges_from_model(self.vocab_file) + tokenizer = Tokenizer(BPE(vocab_scores, merges, fuse_unk=False)) + if hasattr(tokenizer.model, "ignore_merges"): + tokenizer.model.ignore_merges = True + return tokenizer + + def converted(self) -> Tokenizer: + tokenizer = self.tokenizer() + tokenizer.pre_tokenizer = pre_tokenizers.Sequence( + [ + pre_tokenizers.Split(Regex(self.pattern), behavior="isolated", invert=False), + pre_tokenizers.ByteLevel(add_prefix_space=self.add_prefix_space, use_regex=False), + ] + ) + tokenizer.decoder = decoders.ByteLevel() + tokenizer.add_special_tokens(self.additional_special_tokens) + + tokenizer.post_processor = processors.ByteLevel(trim_offsets=False) + + return tokenizer + class LlamaConverter(SpmConverter): handle_byte_fallback = True @@ -298,7 +390,7 @@ def pre_tokenizer(self, replacement, add_prefix_space): } -def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: +def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokenizer: """ Utilities to convert a slow tokenizer instance in a fast tokenizer instance. @@ -313,12 +405,19 @@ def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer: """ tokenizer_class_name = transformer_tokenizer.__class__.__name__ - if tokenizer_class_name not in SLOW_TO_FAST_CONVERTERS: - raise ValueError( - f"An instance of tokenizer class {tokenizer_class_name} cannot be converted in a Fast tokenizer instance. " - f"No converter was found. Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}" - ) - - converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] - - return converter_class(transformer_tokenizer).converted() + if tokenizer_class_name in SLOW_TO_FAST_CONVERTERS and not from_tiktoken: + converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] + return converter_class(transformer_tokenizer).converted() + else: + # try: + return TikTokenConverter( + vocab_file=transformer_tokenizer.vocab_file, + additional_special_tokens=transformer_tokenizer.additional_special_tokens, + ).converted() + # except Exception: + # raise ValueError( + # f"Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path " + # f"with a SentencePiece tokenizer.model file." 
+ # f"Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}" + # ) + diff --git a/paddlenlp/transformers/ernie/__init__.py b/paddlenlp/transformers/ernie/__init__.py index 97043fd7ba68..bda886444126 100644 --- a/paddlenlp/transformers/ernie/__init__.py +++ b/paddlenlp/transformers/ernie/__init__.py @@ -11,3 +11,6 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +from .configuration import * +from .modeling import * +from .tokenizer import * \ No newline at end of file diff --git a/paddlenlp/transformers/llama/tokenizer.py b/paddlenlp/transformers/llama/tokenizer.py index 2bae61e67b4e..d890d703d649 100644 --- a/paddlenlp/transformers/llama/tokenizer.py +++ b/paddlenlp/transformers/llama/tokenizer.py @@ -20,6 +20,8 @@ import numpy as np import sentencepiece as spm +from paddlenlp.transformers.convert_slow_tokenizer import import_protobuf + from ...utils.log import logger from .. import PretrainedTokenizer from ..tokenizer_utils_base import BatchEncoding, EncodedInput, PaddingStrategy @@ -72,7 +74,8 @@ def __init__( self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.decode_with_prefix_space = decode_with_prefix_space - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + # self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False)) self.sp_model.Load(vocab_file) @property @@ -99,6 +102,23 @@ def bos_token_id(self) -> Optional[int]: @property def eos_token_id(self) -> Optional[int]: return self.sp_model.eos_id() + + def get_spm_processor(self, from_slow=False): + tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) + if from_slow: # no dependency on protobuf + tokenizer.Load(self.vocab_file) + return tokenizer + + with open(self.vocab_file, "rb") as f: + sp_model = f.read() + model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)") + model = model_pb2.ModelProto.FromString(sp_model) + normalizer_spec = model_pb2.NormalizerSpec() + normalizer_spec.add_dummy_prefix = False + model.normalizer_spec.MergeFrom(normalizer_spec) + sp_model = model.SerializeToString() + tokenizer.LoadFromSerializedProto(sp_model) + return tokenizer def get_vocab(self): """Returns vocab as a dict""" diff --git a/paddlenlp/transformers/llama/tokenizer_fast.py b/paddlenlp/transformers/llama/tokenizer_fast.py index 1543e14b61b1..13416c92c270 100644 --- a/paddlenlp/transformers/llama/tokenizer_fast.py +++ b/paddlenlp/transformers/llama/tokenizer_fast.py @@ -24,7 +24,7 @@ __all__ = ["LlamaTokenizerFast"] -VOCAB_FILES_NAMES = {"vocab_file": "spiece.bpe.model", "tokenizer_file": "tokenizer.json"} +VOCAB_FILES_NAMES = {"vocab_file": "tokenizer.model", "tokenizer_file": "tokenizer.json"} B_INST, E_INST = "[INST]", "[/INST]" B_SYS, E_SYS = "<>\n", "\n<>\n\n" diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index 6af5cc29e5d4..069f8de3a173 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -41,9 +41,22 @@ from ..utils.download import resolve_file_path from ..utils.env import CHAT_TEMPLATE_CONFIG_NAME, TOKENIZER_CONFIG_NAME -from ..utils.import_utils import is_tokenizers_available +from ..utils.import_utils import is_protobuf_available, is_tokenizers_available from 
..utils.log import logger + +def import_protobuf_decode_error(error_message=""): + if is_protobuf_available(): + from google.protobuf.message import DecodeError + + return DecodeError + else: + raise ImportError(f""" +{error_message} requires the protobuf library but it was not found in your environment. Checkout the instructions on the +installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones +that match your environment. Please note that you may need to restart your runtime after installation. +""") + if is_tokenizers_available(): from tokenizers import AddedToken from tokenizers import Encoding as EncodingFast @@ -132,7 +145,7 @@ class TensorType(ExplicitEnum): SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json" ADDED_TOKENS_FILE = "added_tokens.json" TOKENIZER_CONFIG_FILE = "tokenizer_config.json" - +FULL_TOKENIZER_FILE = "tokenizer.json" def to_py_obj(obj): """ @@ -1361,6 +1374,10 @@ def __init__(self, **kwargs): self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) + # By default, cleaning tokenization spaces for both fast and slow tokenizers + self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True) + + # By default, do not split special tokens for both fast and slow tokenizers self.split_special_tokens = kwargs.pop("split_special_tokens", False) @@ -1423,10 +1440,13 @@ def _set_processor_class(self, processor_class: str): self._processor_class = processor_class def __repr__(self) -> str: + added_tokens_decoder_rep = "\n\t".join([f"{k}: {v.__repr__()}," for k, v in self.added_tokens_decoder.items()]) return ( - f"{'PretrainedTokenizer'}(name_or_path='{self.name_or_path}', " - f"vocab_size={self.vocab_size}, model_max_len={self.model_max_length}, " - f"padding_side='{self.padding_side}', truncation_side='{self.truncation_side}', special_tokens={self.special_tokens_map_extended})" + f"{self.__class__.__name__}(name_or_path='{self.name_or_path}'," + f" vocab_size={self.vocab_size}, model_max_length={self.model_max_length}, is_fast={self.is_fast}," + f" padding_side='{self.padding_side}', truncation_side='{self.truncation_side}'," + f" special_tokens={self.special_tokens_map}, clean_up_tokenization_spaces={self.clean_up_tokenization_spaces}), " + " added_tokens_decoder={\n\t" + added_tokens_decoder_rep + "\n}" ) def get_vocab(self) -> Dict[str, int]: @@ -1483,28 +1503,30 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): tokenizer = BertTokenizer.from_pretrained('./my_bert/') """ - pretrained_model_name_or_path = str(pretrained_model_name_or_path) cache_dir = kwargs.pop("cache_dir", None) from_hf_hub = kwargs.pop("from_hf_hub", False) from_aistudio = kwargs.pop("from_aistudio", False) subfolder = kwargs.pop("subfolder", "") return_tokenizer_file_dir = kwargs.pop("return_tokenizer_file_dir", False) - if subfolder is None: - subfolder = "" - + pretrained_model_name_or_path = str(pretrained_model_name_or_path) vocab_files = {} init_configuration = {} + is_local = os.path.isdir(pretrained_model_name_or_path) + + additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, - "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME, + "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME, # what's this + # "tokenizer_file": FULL_TOKENIZER_FILE, } + print(f"cls = {cls}") + print(f"cls.resource_files_name1s = {cls.resource_files_names}") vocab_files_target 
= {**cls.resource_files_names, **additional_files_names} - # From HF Hub or AI Studio if from_hf_hub or from_aistudio: # Only include the necessary resource files specified by the tokenizer cls @@ -1528,12 +1550,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # Assuming from community-contributed pretrained models for file_id, file_name in vocab_files_target.items(): vocab_files[file_id] = file_name - + print("vocab_files: ", vocab_files) resolved_vocab_files = {} + for file_id, file_path in vocab_files.items(): + print(f"file_id: {file_id}, file_path: {file_path}") if file_path is None or os.path.isfile(file_path): resolved_vocab_files[file_id] = file_path continue + print(f"Try resolving {file_id} from {pretrained_model_name_or_path}, {file_path}") resolved_vocab_files[file_id] = resolve_file_path( pretrained_model_name_or_path, [file_path], @@ -1542,11 +1567,72 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): from_aistudio=from_aistudio, from_hf_hub=from_hf_hub, ) - for file_id, file_path in resolved_vocab_files.items(): if resolved_vocab_files[file_id] is not None: cache_dir = os.path.dirname(resolved_vocab_files[file_id]) break + return cls._from_pretrained( + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *args, + cache_dir=cache_dir, + return_tokenizer_file_dir=return_tokenizer_file_dir, + from_hf_hub=from_hf_hub, + **kwargs, + ) + @classmethod + def _from_pretrained( + cls, + resolved_vocab_files, + pretrained_model_name_or_path, + init_configuration, + *init_inputs, + cache_dir=None, + return_tokenizer_file_dir=False, + from_hf_hub=False, + **kwargs, + ): + """ + Instantiate a `PretrainedTokenizer` from a predefined tokenizer class. + + Args: + pretrained_model_name_or_path (str): + The model name or path to instantiate the tokenizer from. + *init_inputs (tuple): + Positional arguments to be passed to the tokenizer class `__init__` method. + cache_dir (str, optional): + Directory to cache the downloaded vocabulary files. + return_tokenizer_file_dir (bool, optional): + Whether to return the directory path of the tokenizer files. + from_hf_hub (bool, optional): + Whether to load from Huggingface Hub. + from_aistudio (bool, optional): + Whether to load from AI Studio. + **kwargs (dict): + Additional keyword arguments to be passed to the tokenizer class `__init__` method. + + Returns: + PretrainedTokenizer: An instance of `PretrainedTokenizer`. + str: The directory path of the tokenizer files if `return_tokenizer_file_dir` is `True`. 
+ + """ + print("sdvcsdvsdvsvd",cls,resolved_vocab_files,pretrained_model_name_or_path,init_configuration,init_inputs,cache_dir,return_tokenizer_file_dir,from_hf_hub,kwargs) + from_slow = kwargs.get("from_slow", False) + has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None + print(f"from_slow: {from_slow}, has_tokenizer_file: {has_tokenizer_file}") + if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None: + slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( + copy.deepcopy(resolved_vocab_files), + pretrained_model_name_or_path, + copy.deepcopy(init_configuration), + *init_inputs, + cache_dir=cache_dir, + **(copy.deepcopy(kwargs)), + ) + else: + slow_tokenizer = None + print(f"slow_tokenizer: {slow_tokenizer}") tokenizer_config_file_dir_list = set() for k, v in resolved_vocab_files.items(): @@ -1555,6 +1641,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): tokenizer_config_file_dir_list = list(tokenizer_config_file_dir_list) # TODO: check this assert len(tokenizer_config_file_dir_list) > 0, "All tokenizer files should be in the same directory." + # Prepare tokenizer initialization kwargs # Did we saved some inputs and kwargs to reload ? has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None @@ -1565,6 +1652,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): else: init_kwargs = init_configuration + if slow_tokenizer is not None: + init_kwargs["__slow_tokenizer"] = slow_tokenizer + init_kwargs["name_or_path"] = pretrained_model_name_or_path + pass_added_tokens_file = False # Handle tokenizer serialization of added and special tokens added_tokens_decoder: Dict[int, AddedToken] = {} @@ -1584,11 +1675,11 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): pass_added_tokens_file = True # position args are stored in kwargs, maybe better not include - init_args = init_kwargs.pop("init_args", ()) + # init_args = init_kwargs.pop("init_args", ()) init_kwargs.pop("init_class", None) # Update with newly provided args and kwargs - init_args = init_args if not args else args + # init_args = init_args if not args else args init_kwargs.update(kwargs) def convert_added_tokens(obj): @@ -1632,7 +1723,15 @@ def convert_added_tokens(obj): init_kwargs.pop("tokenizer_file") # TODO(guosheng): avoid reduplication of position args and key word args - tokenizer = cls(*init_args, **init_kwargs) + try: + tokenizer = cls(*init_inputs, **init_kwargs) + except import_protobuf_decode_error(): + logger.info( + "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead." 
+ "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).", + ) + return False + chat_template = init_kwargs.pop("chat_template", None) if chat_template is not None: tokenizer.init_chat_template(chat_template) diff --git a/paddlenlp/transformers/tokenizer_utils_fast.py b/paddlenlp/transformers/tokenizer_utils_fast.py index d6a854fdd667..26fa0d6b4747 100644 --- a/paddlenlp/transformers/tokenizer_utils_fast.py +++ b/paddlenlp/transformers/tokenizer_utils_fast.py @@ -35,7 +35,7 @@ WordPieceTrainer, ) -from ..utils.env import ADDED_TOKENS_NAME, FULL_TOKENIZER_NAME +from ..utils.env import ADDED_TOKENS_NAME, FULL_TOKENIZER_NAME, TIKTOKEN_VOCAB_FILE from .convert_slow_tokenizer import convert_slow_tokenizer from .tokenizer_utils import ChatTemplateMixin, PretrainedTokenizer from .tokenizer_utils_base import ( @@ -60,8 +60,7 @@ "WordPiece": WordPieceTrainer, } -VOCAB_FILES_NAMES = {"tokenizer_file": FULL_TOKENIZER_NAME} - +VOCAB_FILES_NAMES = {"tokenizer_file": FULL_TOKENIZER_NAME, "vocab_file": TIKTOKEN_VOCAB_FILE} class PretrainedTokenizerFast(ChatTemplateMixin, PretrainedTokenizerBase): """ @@ -97,13 +96,19 @@ def __init__(self, *args, **kwargs): elif fast_tokenizer_file is not None and not from_slow: # We have a serialization from tokenizers which let us directly build the backend fast_tokenizer = TokenizerFast.from_file(fast_tokenizer_file) - elif slow_tokenizer is not None: + elif slow_tokenizer: # We need to convert a slow tokenizer to build the backend fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) - elif self.slow_tokenizer_class is not None: + elif self.slow_tokenizer_class is not None and slow_tokenizer is not False: # We need to create and convert a slow tokenizer to build the backend slow_tokenizer = self.slow_tokenizer_class(*args, **kwargs) fast_tokenizer = convert_slow_tokenizer(slow_tokenizer) + elif not slow_tokenizer: + # We try to load with tiktoken + self.vocab_file = kwargs.get("vocab_file", None) + self.additional_special_tokens = kwargs.get("additional_special_tokens", []) + fast_tokenizer = convert_slow_tokenizer(self, from_tiktoken=True) + slow_tokenizer = None else: raise ValueError( "Couldn't instantiate the backend tokenizer from one of: \n" @@ -605,6 +610,8 @@ def _encode_plus( split_special_tokens=split_special_tokens, **kwargs, ) + print(batched_input) + print(batched_output) # Return tensor is None, then we can remove the leading batch axis # Overflowing tokens are returned as a batch of output so we keep them in this case @@ -618,7 +625,7 @@ def _encode_plus( ) self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose) - + print(f"final output = {batched_output}") return batched_output def convert_tokens_to_string(self, tokens: List[str]) -> str: diff --git a/paddlenlp/utils/__init__.py b/paddlenlp/utils/__init__.py index 7f52ac762a00..a8c4dc487a0e 100644 --- a/paddlenlp/utils/__init__.py +++ b/paddlenlp/utils/__init__.py @@ -18,7 +18,7 @@ from .batch_sampler import * from .env import CONFIG_NAME, GENERATION_CONFIG_NAME, LEGACY_CONFIG_NAME -from .import_utils import install_package, uninstall_package +from .import_utils import * from .infohub import infohub from .initializer import to from .serialization import load_torch diff --git a/paddlenlp/utils/download/__init__.py b/paddlenlp/utils/download/__init__.py index 6f5dad5c8889..eadf41a47838 100644 --- a/paddlenlp/utils/download/__init__.py +++ b/paddlenlp/utils/download/__init__.py @@ -142,7 +142,9 @@ def resolve_file_path( elif index < 
len(filenames) - 1: continue else: - raise FileNotFoundError(f"please make sure one of the {filenames} under the dir {repo_id}") + pass + # Temporary workaround + # raise FileNotFoundError(f"please make sure one of the {filenames} under the dir {repo_id}") # check cache for filename in filenames: diff --git a/paddlenlp/utils/env.py b/paddlenlp/utils/env.py index f57380fb4698..d1fbbb1a60ba 100644 --- a/paddlenlp/utils/env.py +++ b/paddlenlp/utils/env.py @@ -74,7 +74,7 @@ def _get_bool_env(env_key: str, default_value: str) -> bool: GENERATION_CONFIG_NAME = "generation_config.json" # Fast tokenizers (provided by HuggingFace tokenizer's library) can be saved in a single file FULL_TOKENIZER_NAME = "tokenizer.json" - +TIKTOKEN_VOCAB_FILE = "tokenizer.model" LORA_CONFIG_NAME = "lora_config.json" LORA_WEIGHTS_NAME = "lora_model_state.pdparams" diff --git a/paddlenlp/utils/import_utils.py b/paddlenlp/utils/import_utils.py index 3da810b7b0b7..cccb867830a9 100644 --- a/paddlenlp/utils/import_utils.py +++ b/paddlenlp/utils/import_utils.py @@ -18,18 +18,64 @@ import shutil import site import sys -from typing import Optional, Type +from typing import Optional, Tuple, Type, Union import pip from paddlenlp.utils.log import logger +# TODO: This doesn't work for all packages (`bs4`, `faiss`, etc.) Talk to Sylvain to see how to do with it better. +def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]: + # Check if the package spec exists and grab its version to avoid importing a local directory + package_exists = importlib.util.find_spec(pkg_name) is not None + package_version = "N/A" + if package_exists: + try: + # Primary method to get the package version + package_version = importlib.metadata.version(pkg_name) + except importlib.metadata.PackageNotFoundError: + # Fallback method: Only for "torch" and versions containing "dev" + if pkg_name == "torch": + try: + package = importlib.import_module(pkg_name) + temp_version = getattr(package, "__version__", "N/A") + # Check if the version contains "dev" + if "dev" in temp_version: + package_version = temp_version + package_exists = True + else: + package_exists = False + except ImportError: + # If the package can't be imported, it's not available + package_exists = False + else: + # For packages other than "torch", don't attempt the fallback and set as not available + package_exists = False + logger.debug(f"Detected {pkg_name} version: {package_version}") + if return_version: + return package_exists, package_version + else: + return package_exists + +_g2p_en_available = _is_package_available("g2p_en") +_sentencepiece_available = _is_package_available("sentencepiece") +_sklearn_available = importlib.util.find_spec("sklearn") is not None +if _sklearn_available: + try: + importlib.metadata.version("scikit-learn") + except importlib.metadata.PackageNotFoundError: + _sklearn_available = False + def is_datasets_available(): import importlib return importlib.util.find_spec("datasets") is not None +def is_protobuf_available(): + if importlib.util.find_spec("google") is None: + return False + return importlib.util.find_spec("google.protobuf") is not None def is_paddle_cuda_available() -> bool: if is_paddle_available(): @@ -39,6 +85,11 @@ def is_paddle_cuda_available() -> bool: else: return False +def is_g2p_en_available(): + return _g2p_en_available + +def is_sentencepiece_available(): + return _sentencepiece_available def is_paddle_available() -> bool: """check if `torch` package is installed diff --git 
a/tests/transformers/llama/test_tokenizer.py b/tests/transformers/llama/test_tokenizer.py index 8ba708f2ffb4..d88bd07a9a91 100644 --- a/tests/transformers/llama/test_tokenizer.py +++ b/tests/transformers/llama/test_tokenizer.py @@ -13,10 +13,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +import shutil +import tempfile import unittest -from paddlenlp.transformers.llama.tokenizer import LlamaTokenizer +from paddlenlp.transformers.auto.tokenizer import AutoTokenizer +from paddlenlp.transformers.llama.tokenizer import Llama3Tokenizer, LlamaTokenizer from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer +from paddlenlp.transformers.tokenizer_utils_fast import PretrainedTokenizerFast from ...transformers.test_tokenizer_common import TokenizerTesterMixin @@ -34,6 +38,7 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def get_tokenizer(self, **kwargs) -> PretrainedTokenizer: tokenizer = LlamaTokenizer.from_pretrained("__internal_testing__/tiny-random-llama", **kwargs) + print(f"tokenizer = {tokenizer}") tokenizer.pad_token = tokenizer.unk_token return tokenizer @@ -186,7 +191,6 @@ def test_padding_if_pad_token_set_slow(self): def test_add_bos_token_slow(self): bos_token = "" tokenizer = self.get_tokenizer() - s = "This is a simple input" s2 = ["This is a simple input 1", "This is a simple input 2"] @@ -208,3 +212,70 @@ def test_pretrained_model_lists(self): # No max_model_input_sizes self.assertGreaterEqual(len(self.tokenizer_class.pretrained_resource_files_map), 1) self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_resource_files_map.values())[0]), 1) + +class TikTokenIntegrationTests(unittest.TestCase): + """ + A class that regroups important test to make sure that we properly handle the special tokens. + """ + def test_tiktoken_llama(self): + model_path = "hf-internal-testing/Llama3-Instruct-Internal" + test_text = "This is a test sentence." 
+ test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001] + num_reserved_special_tokens = 256 + special_tokens = [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|reserved_special_token_2|>", + "<|reserved_special_token_3|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|reserved_special_token_4|>", + "<|eot_id|>", + "<|python_tag|>", # end of turn + ] + [f"<|reserved_special_token_{i}|>" for i in range(5, num_reserved_special_tokens - 5)] + + tiktoken_tokenizer = PretrainedTokenizerFast.from_pretrained( + model_path, + additional_special_tokens=special_tokens, + bos_token="<|begin_of_text|>", + eos_token="<|end_of_text|>", + from_hf_hub=True, + ) + tokens = tiktoken_tokenizer.tokenize("<|begin_of_text|> " + test_text) + self.assertEqual(tokens[0], "<|begin_of_text|>") + + tiktoken_tokenizer = AutoTokenizer.from_pretrained( + model_path, + additional_special_tokens=special_tokens, + bos_token="<|begin_of_text|>", + eos_token="<|end_of_text|>", + add_bos_token=True, + add_eos_token=True, + from_hf_hub=True + ) + self.assertTrue(isinstance(tiktoken_tokenizer, PretrainedTokenizerFast)) + tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)['input_ids'] + self.assertEqual(tokens, test_tokens) + tmpdirname = tempfile.mkdtemp() + tiktoken_tokenizer.save_pretrained(tmpdirname) + print(f"RELOADING >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>") + tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname) + + self.assertTrue(isinstance(tokenizer_reload, PretrainedTokenizerFast)) + tokens = tokenizer_reload.encode(test_text, add_special_tokens=True)['input_ids'] + self.assertEqual(tokens, test_tokens) + shutil.rmtree(tmpdirname) + # tiktoken_tokenizer = AutoTokenizer.from_pretrained( + # model_path, + # additional_special_tokens=special_tokens, + # bos_token="<|begin_of_text|>", + # eos_token="<|end_of_text|>", + # from_slow=True, + # add_bos_token=True, + # add_eos_token=True, + # from_hf_hub=True, + # ) + # tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)['input_ids'] + # self.assertEqual(tokens, test_tokens) \ No newline at end of file diff --git a/tests/transformers/test_modeling_common.py b/tests/transformers/test_modeling_common.py index 8813c6dee754..3f7ed82158a9 100644 --- a/tests/transformers/test_modeling_common.py +++ b/tests/transformers/test_modeling_common.py @@ -928,9 +928,8 @@ def tearDown(self): def test_to_static_use_top_k(self): tokenizer = self.TokenizerClass.from_pretrained(self.internal_testing_model) - if tokenizer.__class__.__name__ == "LlamaTokenizer": + if "LlamaTokenizer" in tokenizer.__class__.__name__: tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "" - model = self.CausalLMClass.from_pretrained(self.internal_testing_model) model_kwargs = tokenizer( self.article, @@ -1008,7 +1007,7 @@ def test_to_static_use_top_k(self): def test_to_static_use_top_p(self): tokenizer = self.TokenizerClass.from_pretrained(self.internal_testing_model) - if tokenizer.__class__.__name__ == "LlamaTokenizer": + if "LlamaTokenizer" in tokenizer.__class__.__name__: tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "" model = self.CausalLMClass.from_pretrained(self.internal_testing_model) From 9004ac9a40536628725fb8245b1a601669b10d19 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Fri, 27 Sep 2024 10:57:46 +0000 Subject: [PATCH 02/21] add support of tiktoken tokenizer, refactor some code --- 
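Usage sketch for reviewers (illustrative only): the snippet below shows how the tiktoken fallback wired up in this series is expected to be exercised, assuming a hub checkpoint that ships a tiktoken-style tokenizer.model but no tokenizer.json. The repo id is a placeholder, the kwargs mirror the new llama tokenizer test, and the full additional_special_tokens list from that test is elided here.

from paddlenlp.transformers.auto.tokenizer import AutoTokenizer

# Placeholder repo id (not a real checkpoint); assumes only a tiktoken-style
# tokenizer.model is present, so the SentencePiece load fails and the
# TikToken conversion path builds the fast tokenizer instead.
tokenizer = AutoTokenizer.from_pretrained(
    "some-org/llama3-style-checkpoint",
    bos_token="<|begin_of_text|>",
    eos_token="<|end_of_text|>",
    add_bos_token=True,
    add_eos_token=True,
    from_hf_hub=True,
)
ids = tokenizer.encode("This is a test sentence.", add_special_tokens=True)["input_ids"]
print(ids)  # expected to include the bos/eos ids around the text tokens
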
paddlenlp/transformers/auto/configuration.py | 229 ++++++++++++++++-- paddlenlp/transformers/auto/factory.py | 18 +- paddlenlp/transformers/auto/tokenizer.py | 126 ++++------ paddlenlp/transformers/configuration_utils.py | 8 +- .../transformers/convert_slow_tokenizer.py | 42 ++-- .../transformers/tokenizer_utils_base.py | 31 ++- .../transformers/tokenizer_utils_fast.py | 1 + paddlenlp/utils/download/__init__.py | 12 +- paddlenlp/utils/import_utils.py | 58 +++++ tests/transformers/llama/test_tokenizer.py | 42 ++-- tests/transformers/test_chat_template.py | 8 +- 11 files changed, 414 insertions(+), 161 deletions(-) diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index 5db25644e6c9..d29abc7ea114 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -32,16 +32,83 @@ __all__ = [ "AutoConfig", ] -# CONFIG_MAPPING_NAMES = OrderedDict( -# [ -# # Add configs here -# ('albert', 'AlbertConfig'), ('bart', 'BartConfig'), ('bert', 'BertConfig'), ('bit', 'BitConfig'), ('blenderbot', 'BlenderbotConfig'), ('blip', 'BlipConfig'), ('bloom', 'BloomConfig'), ('clap', 'ClapConfig'), ('clip', 'CLIPConfig'), ('clipseg', 'CLIPSegConfig'), ('codegen', 'CodeGenConfig'), ('convbert', 'ConvBertConfig'), ('ctrl', 'CTRLConfig'), ('deberta', 'DebertaConfig'), ('distilbert', 'DistilBertConfig'), ('dpt', 'DPTConfig'), ('electra', 'ElectraConfig'), ('ernie', 'ErnieConfig'), ('ernie_m', 'ErnieMConfig'), ('fnet', 'FNetConfig'), ('funnel', 'FunnelConfig'), ('gemma', 'GemmaConfig'), ('gptj', 'GPTJConfig'), ('jamba', 'JambaConfig'), ('layoutlm', 'LayoutLMConfig'), ('layoutlmv2', 'LayoutLMv2Config'), ('llama', 'LlamaConfig'), ('luke', 'LukeConfig'), ('mamba', 'MambaConfig'), ('mbart', 'MBartConfig'), ('mistral', 'MistralConfig'), ('mixtral', 'MixtralConfig'), ('mobilebert', 'MobileBertConfig'), ('mpnet', 'MPNetConfig'), ('mt5', 'MT5Config'), ('nezha', 'NezhaConfig'), ('nystromformer', 'NystromformerConfig'), ('opt', 'OPTConfig'), ('pegasus', 'PegasusConfig'), ('prophetnet', 'ProphetNetConfig'), ('qwen2_moe', 'Qwen2MoeConfig'), ('reformer', 'ReformerConfig'), ('rembert', 'RemBertConfig'), ('roberta', 'RobertaConfig'), ('roformer', 'RoFormerConfig'), ('speecht5', 'SpeechT5Config'), ('squeezebert', 'SqueezeBertConfig'), ('t5', 'T5Config'), ('xlm', 'XLMConfig'), ('xlnet', 'XLNetConfig'), -# ] -# ) CONFIG_MAPPING_NAMES = OrderedDict( [ - ('albert', 'AlbertConfig'), ('bigbird', 'BigBirdConfig'), ('blenderbot_small', 'BlenderbotSmallConfig'), ('blenderbot', 'BlenderbotConfig'), ('chatglm_v2', 'ChatGLMv2Config'), ('chatglm', 'ChatGLMConfig'), ('chineseclip', 'ChineseCLIPTextConfig'), ('chinesebert', 'ChineseBertConfig'), ('convbert', 'ConvBertConfig'), ('ctrl', 'CTRLConfig'), ('distilbert', 'DistilBertConfig'), ('dallebart', 'DalleBartConfig'), ('electra', 'ElectraConfig'), ('ernie_vil', 'ErnieViLConfig'), ('ernie_ctm', 'ErnieCtmConfig'), ('ernie_doc', 'ErnieDocConfig'), ('ernie_gen', 'ErnieGenConfig'), ('ernie_gram', 'ErnieGramConfig'), ('ernie_layout', 'ErnieLayoutConfig'), ('ernie_m', 'ErnieMConfig'), ('ernie_code', 'ErnieCodeConfig'), ('ernie', 'ErnieConfig'), ('fnet', 'FNetConfig'), ('funnel', 'FunnelConfig'), ('llama', 'LlamaConfig'), ('layoutxlm', 'LayoutXLMConfig'), ('layoutlmv2', 'LayoutLMv2Config'), ('layoutlm', 'LayoutLMConfig'), ('luke', 'LukeConfig'), ('mbart', 'MBartConfig'), ('megatronbert', 'MegatronBertConfig'), ('mobilebert', 'MobileBertConfig'), ('mpnet', 'MPNetConfig'), ('nezha', 'NeZhaConfig'), 
('nystromformer', 'NystromformerConfig'), ('ppminilm', 'PPMiniLMConfig'), ('prophetnet', 'ProphetNetConfig'), ('reformer', 'ReformerConfig'), ('rembert', 'RemBertConfig'), ('roberta', 'RobertaConfig'), ('roformerv2', 'RoFormerv2Config'), ('roformer', 'RoFormerConfig'), ('skep', 'SkepConfig'), ('squeezebert', 'SqueezeBertConfig'), ('tinybert', 'TinyBertConfig'), ('unified_transformer', 'UnifiedTransformerConfig'), ('unimo', 'UNIMOConfig'), ('xlnet', 'XLNetConfig'), ('xlm', 'XLMConfig'), ('gpt', 'GPTConfig'), ('glm', 'GLMConfig'), ('mt5', 'MT5Config'), ('t5', 'T5Config'), ('bert', 'BertConfig'), ('bart', 'BartConfig'), ('gau_alpha', 'GAUAlphaConfig'), ('codegen', 'CodeGenConfig'), ('clip', 'CLIPConfig'), ('artist', 'ArtistConfig'), ('opt', 'OPTConfig'), ('pegasus', 'PegasusConfig'), ('dpt', 'DPTConfig'), ('bit', 'BitConfig'), ('blip', 'BlipConfig'), ('bloom', 'BloomConfig'), ('qwen', 'QWenConfig'), ('mistral', 'MistralConfig'), ('mixtral', 'MixtralConfig'), ('qwen2', 'Qwen2Config'), ('qwen2_moe', 'Qwen2MoeConfig'), ('gemma', 'GemmaConfig'), ('yuan', 'YuanConfig'), ('mamba', 'MambaConfig'), ('jamba', 'JambaConfig') + ("albert", "AlbertConfig"), + ("bigbird", "BigBirdConfig"), + ("blenderbot_small", "BlenderbotSmallConfig"), + ("blenderbot", "BlenderbotConfig"), + ("chatglm_v2", "ChatGLMv2Config"), + ("chatglm", "ChatGLMConfig"), + ("chineseclip", "ChineseCLIPTextConfig"), + ("chinesebert", "ChineseBertConfig"), + ("convbert", "ConvBertConfig"), + ("ctrl", "CTRLConfig"), + ("distilbert", "DistilBertConfig"), + ("dallebart", "DalleBartConfig"), + ("electra", "ElectraConfig"), + ("ernie_vil", "ErnieViLConfig"), + ("ernie_ctm", "ErnieCtmConfig"), + ("ernie_doc", "ErnieDocConfig"), + ("ernie_gen", "ErnieGenConfig"), + ("ernie_gram", "ErnieGramConfig"), + ("ernie_layout", "ErnieLayoutConfig"), + ("ernie_m", "ErnieMConfig"), + ("ernie_code", "ErnieCodeConfig"), + ("ernie", "ErnieConfig"), + ("fnet", "FNetConfig"), + ("funnel", "FunnelConfig"), + ("llama", "LlamaConfig"), + ("layoutxlm", "LayoutXLMConfig"), + ("layoutlmv2", "LayoutLMv2Config"), + ("layoutlm", "LayoutLMConfig"), + ("luke", "LukeConfig"), + ("mbart", "MBartConfig"), + ("megatronbert", "MegatronBertConfig"), + ("mobilebert", "MobileBertConfig"), + ("mpnet", "MPNetConfig"), + ("nezha", "NeZhaConfig"), + ("nystromformer", "NystromformerConfig"), + ("ppminilm", "PPMiniLMConfig"), + ("prophetnet", "ProphetNetConfig"), + ("reformer", "ReformerConfig"), + ("rembert", "RemBertConfig"), + ("roberta", "RobertaConfig"), + ("roformerv2", "RoFormerv2Config"), + ("roformer", "RoFormerConfig"), + ("skep", "SkepConfig"), + ("squeezebert", "SqueezeBertConfig"), + ("tinybert", "TinyBertConfig"), + ("unified_transformer", "UnifiedTransformerConfig"), + ("unimo", "UNIMOConfig"), + ("xlnet", "XLNetConfig"), + ("xlm", "XLMConfig"), + ("gpt", "GPTConfig"), + ("glm", "GLMConfig"), + ("mt5", "MT5Config"), + ("t5", "T5Config"), + ("bert", "BertConfig"), + ("bart", "BartConfig"), + ("gau_alpha", "GAUAlphaConfig"), + ("codegen", "CodeGenConfig"), + ("clip", "CLIPConfig"), + ("artist", "ArtistConfig"), + ("opt", "OPTConfig"), + ("pegasus", "PegasusConfig"), + ("dpt", "DPTConfig"), + ("bit", "BitConfig"), + ("blip", "BlipConfig"), + ("bloom", "BloomConfig"), + ("qwen", "QWenConfig"), + ("mistral", "MistralConfig"), + ("mixtral", "MixtralConfig"), + ("qwen2", "Qwen2Config"), + ("qwen2_moe", "Qwen2MoeConfig"), + ("gemma", "GemmaConfig"), + ("yuan", "YuanConfig"), + ("mamba", "MambaConfig"), + ("jamba", "JambaConfig"), ] ) @@ -50,7 +117,80 @@ [ # Add full (and cased) 
model names here # Base model mapping - ('albert', 'Albert'), ('bigbird', 'BigBird'), ('blenderbot_small', 'BlenderbotSmall'), ('blenderbot', 'Blenderbot'), ('chatglm_v2', 'ChatGLMv2'), ('chatglm', 'ChatGLM'), ('chineseclip', 'ChineseCLIPText'), ('chinesebert', 'ChineseBert'), ('convbert', 'ConvBert'), ('ctrl', 'CTRL'), ('distilbert', 'DistilBert'), ('dallebart', 'DalleBart'), ('electra', 'Electra'), ('ernie_vil', 'ErnieViL'), ('ernie_ctm', 'ErnieCtm'), ('ernie_doc', 'ErnieDoc'), ('ernie_gen', 'ErnieGen'), ('ernie_gram', 'ErnieGram'), ('ernie_layout', 'ErnieLayout'), ('ernie_m', 'ErnieM'), ('ernie_code', 'ErnieCode'), ('ernie', 'Ernie'), ('fnet', 'FNet'), ('funnel', 'Funnel'), ('llama', 'Llama'), ('layoutxlm', 'LayoutXLM'), ('layoutlmv2', 'LayoutLMv2'), ('layoutlm', 'LayoutLM'), ('luke', 'Luke'), ('mbart', 'MBart'), ('megatronbert', 'MegatronBert'), ('mobilebert', 'MobileBert'), ('mpnet', 'MPNet'), ('nezha', 'NeZha'), ('nystromformer', 'Nystromformer'), ('ppminilm', 'PPMiniLM'), ('prophetnet', 'ProphetNet'), ('reformer', 'Reformer'), ('rembert', 'RemBert'), ('roberta', 'Roberta'), ('roformerv2', 'RoFormerv2'), ('roformer', 'RoFormer'), ('skep', 'Skep'), ('squeezebert', 'SqueezeBert'), ('tinybert', 'TinyBert'), ('unified_transformer', 'UnifiedTransformer'), ('unimo', 'UNIMO'), ('xlnet', 'XLNet'), ('xlm', 'XLM'), ('gpt', 'GPT'), ('glm', 'GLM'), ('mt5', 'MT5'), ('t5', 'T5'), ('bert', 'Bert'), ('bart', 'Bart'), ('gau_alpha', 'GAUAlpha'), ('codegen', 'CodeGen'), ('clip', 'CLIP'), ('artist', 'Artist'), ('opt', 'OPT'), ('pegasus', 'Pegasus'), ('dpt', 'DPT'), ('bit', 'Bit'), ('blip', 'Blip'), ('bloom', 'Bloom'), ('qwen', 'QWen'), ('mistral', 'Mistral'), ('mixtral', 'Mixtral'), ('qwen2', 'Qwen2'), ('qwen2_moe', 'Qwen2Moe'), ('gemma', 'Gemma'), ('yuan', 'Yuan'), ('mamba', 'Mamba'), ('jamba', 'Jamba') + ("albert", "Albert"), + ("bigbird", "BigBird"), + ("blenderbot_small", "BlenderbotSmall"), + ("blenderbot", "Blenderbot"), + ("chatglm_v2", "ChatGLMv2"), + ("chatglm", "ChatGLM"), + ("chineseclip", "ChineseCLIPText"), + ("chinesebert", "ChineseBert"), + ("convbert", "ConvBert"), + ("ctrl", "CTRL"), + ("distilbert", "DistilBert"), + ("dallebart", "DalleBart"), + ("electra", "Electra"), + ("ernie_vil", "ErnieViL"), + ("ernie_ctm", "ErnieCtm"), + ("ernie_doc", "ErnieDoc"), + ("ernie_gen", "ErnieGen"), + ("ernie_gram", "ErnieGram"), + ("ernie_layout", "ErnieLayout"), + ("ernie_m", "ErnieM"), + ("ernie_code", "ErnieCode"), + ("ernie", "Ernie"), + ("fnet", "FNet"), + ("funnel", "Funnel"), + ("llama", "Llama"), + ("layoutxlm", "LayoutXLM"), + ("layoutlmv2", "LayoutLMv2"), + ("layoutlm", "LayoutLM"), + ("luke", "Luke"), + ("mbart", "MBart"), + ("megatronbert", "MegatronBert"), + ("mobilebert", "MobileBert"), + ("mpnet", "MPNet"), + ("nezha", "NeZha"), + ("nystromformer", "Nystromformer"), + ("ppminilm", "PPMiniLM"), + ("prophetnet", "ProphetNet"), + ("reformer", "Reformer"), + ("rembert", "RemBert"), + ("roberta", "Roberta"), + ("roformerv2", "RoFormerv2"), + ("roformer", "RoFormer"), + ("skep", "Skep"), + ("squeezebert", "SqueezeBert"), + ("tinybert", "TinyBert"), + ("unified_transformer", "UnifiedTransformer"), + ("unimo", "UNIMO"), + ("xlnet", "XLNet"), + ("xlm", "XLM"), + ("gpt", "GPT"), + ("glm", "GLM"), + ("mt5", "MT5"), + ("t5", "T5"), + ("bert", "Bert"), + ("bart", "Bart"), + ("gau_alpha", "GAUAlpha"), + ("codegen", "CodeGen"), + ("clip", "CLIP"), + ("artist", "Artist"), + ("opt", "OPT"), + ("pegasus", "Pegasus"), + ("dpt", "DPT"), + ("bit", "Bit"), + ("blip", "Blip"), + ("bloom", "Bloom"), + ("qwen", 
"QWen"), + ("mistral", "Mistral"), + ("mixtral", "Mixtral"), + ("qwen2", "Qwen2"), + ("qwen2_moe", "Qwen2Moe"), + ("gemma", "Gemma"), + ("yuan", "Yuan"), + ("mamba", "Mamba"), + ("jamba", "Jamba"), ] ) @@ -62,6 +202,61 @@ def config_class_to_model_type(config): return key return None + +class _LazyConfigMapping(OrderedDict): + """ + A dictionary that lazily load its values when they are requested. + """ + + def __init__(self, mapping): + self._mapping = mapping + self._extra_content = {} + self._modules = {} + + def __getitem__(self, key): + if key in self._extra_content: + return self._extra_content[key] + if key not in self._mapping: + raise KeyError(key) + value = self._mapping[key] + module_name = model_type_to_module_name(key) + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models") + if hasattr(self._modules[module_name], value): + return getattr(self._modules[module_name], value) + + # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the + # object at the top level. + transformers_module = importlib.import_module("transformers") + return getattr(transformers_module, value) + + def keys(self): + return list(self._mapping.keys()) + list(self._extra_content.keys()) + + def values(self): + return [self[k] for k in self._mapping.keys()] + list(self._extra_content.values()) + + def items(self): + return [(k, self[k]) for k in self._mapping.keys()] + list(self._extra_content.items()) + + def __iter__(self): + return iter(list(self._mapping.keys()) + list(self._extra_content.keys())) + + def __contains__(self, item): + return item in self._mapping or item in self._extra_content + + def register(self, key, value, exist_ok=False): + """ + Register a new configuration in this mapping. 
+ """ + if key in self._mapping.keys() and not exist_ok: + raise ValueError(f"'{key}' is already used by a Transformers config, pick another name.") + self._extra_content[key] = value + + +CONFIG_MAPPING = _LazyConfigMapping(CONFIG_MAPPING_NAMES) + + def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]: """load the configurations of PretrainedConfig mapping: {: [, , ...], } @@ -94,22 +289,13 @@ def get_configurations() -> Dict[str, List[Type[PretrainedConfig]]]: return mappings + def model_type_to_module_name(key): """Converts a config key to the corresponding module.""" - # Special treatment - # if key in SPECIAL_MODEL_TYPE_TO_MODULE_NAME: - # key = SPECIAL_MODEL_TYPE_TO_MODULE_NAME[key] - - # if key in DEPRECATED_MODELS: - # key = f"deprecated.{key}" - # return key - key = key.replace("-", "_") - # if key in DEPRECATED_MODELS: - # key = f"deprecated.{key}" - return key + class AutoConfig(PretrainedConfig): """ AutoConfig is a generic config class that will be instantiated as one of the @@ -243,6 +429,11 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar if config_class is cls: return cls.from_file(config_file) return config_class.from_pretrained(config_file, *model_args, **kwargs) + elif config_file is None: + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True): + if pattern in str(pretrained_model_name_or_path): + return CONFIG_MAPPING[pattern].from_dict(config_dict, **unused_kwargs) else: raise RuntimeError( f"Can't load config for '{pretrained_model_name_or_path}'.\n" diff --git a/paddlenlp/transformers/auto/factory.py b/paddlenlp/transformers/auto/factory.py index ca6c15a08f46..fa084838dec2 100644 --- a/paddlenlp/transformers/auto/factory.py +++ b/paddlenlp/transformers/auto/factory.py @@ -1,6 +1,19 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. -from collections import OrderedDict import importlib +from collections import OrderedDict from paddlenlp.transformers.auto.configuration import model_type_to_module_name @@ -24,6 +37,7 @@ def getattribute_from_module(module, attr): else: raise ValueError(f"Could not find {attr} in {transformers_module}!") + class _LazyAutoMapping(OrderedDict): """ " A mapping config to object (model or tokenizer for instance) that will load keys and values when it is accessed. 
@@ -123,4 +137,4 @@ def register(self, key, value, exist_ok=False): if model_type in self._model_mapping.keys() and not exist_ok: raise ValueError(f"'{key}' is already used by a Transformers model.") - self._extra_content[key] = value \ No newline at end of file + self._extra_content[key] = value diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 650fad48e505..e9f59fb1338d 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -17,7 +17,7 @@ import json import os from collections import OrderedDict -from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union +from typing import Dict, Optional, Tuple, Union from paddlenlp.transformers.auto.configuration import ( CONFIG_MAPPING_NAMES, @@ -29,11 +29,7 @@ from paddlenlp.transformers.tokenizer_utils_base import TOKENIZER_CONFIG_FILE from paddlenlp.transformers.tokenizer_utils_fast import PretrainedTokenizerFast -from ...utils import ( - is_g2p_en_available, - is_sentencepiece_available, - is_tokenizers_available, -) +from ...utils import is_sentencepiece_available, is_tokenizers_available from ...utils.download import resolve_file_path from ...utils.import_utils import import_module from ...utils.log import logger @@ -43,11 +39,6 @@ "AutoTokenizer", ] -if is_tokenizers_available(): - from ..tokenizer_utils_fast import PretrainedTokenizerFast -else: - PretrainedTokenizerFast = None - if False: # This significantly improves completion suggestion performance when # the transformers package is used with Microsoft's Pylance language server. @@ -67,7 +58,7 @@ "bert", ( "BertTokenizer", - "BertTokenizerFast" if is_tokenizers_available() else None, + None, ), ), ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")), @@ -96,7 +87,7 @@ "convbert", ( "ConvBertTokenizer", - "ConvBertTokenizerFast" if is_tokenizers_available() else None, + None, ), ), ("ctrl", ("CTRLTokenizer", None)), @@ -104,7 +95,7 @@ "distilbert", ( "DistilBertTokenizer", - "DistilBertTokenizerFast" if is_tokenizers_available() else None, + None, ), ), ( @@ -189,15 +180,15 @@ ( "mbart", ( + "MBartTokenizer" if is_sentencepiece_available() else None, "MBart50Tokenizer" if is_sentencepiece_available() else None, - "MBart50TokenizerFast" if is_tokenizers_available() else None, ), ), ( "mobilebert", ( "MobileBertTokenizer", - "MobileBertTokenizerFast" if is_tokenizers_available() else None, + None, ), ), ( @@ -233,7 +224,7 @@ "rembert", ( "RemBertTokenizer" if is_sentencepiece_available() else None, - "RemBertTokenizerFast" if is_tokenizers_available() else None, + None, ), ), ( @@ -255,7 +246,7 @@ "squeezebert", ( "SqueezeBertTokenizer", - "SqueezeBertTokenizerFast" if is_tokenizers_available() else None, + None, ), ), ( @@ -273,7 +264,34 @@ "XLNetTokenizerFast" if is_tokenizers_available() else None, ), ), - ('bert_japanese', 'BertJapaneseTokenizer'), ('bigbird', 'BigBirdTokenizer'), ('blenderbot_small', 'BlenderbotSmallTokenizer'), ('chatglm', 'ChatGLMTokenizer'), ('chatglm_v2', 'ChatGLMv2Tokenizer'), ('chinesebert', 'ChineseBertTokenizer'), ('dallebart', 'DalleBartTokenizer'), ('ernie_ctm', 'ErnieCtmTokenizer'), ('ernie_doc', 'ErnieDocBPETokenizer'), ('ernie_gram', 'ErnieGramTokenizer'), ('ernie_layout', 'ErnieLayoutTokenizer'), ('ernie_code', 'ErnieCodeTokenizer'), ('megatronbert', 'MegatronBertTokenizer'), ('nystromformer', 'NystromformerTokenizer'), ('ppminilm', 'PPMiniLMTokenizer'), ('roformerv2', 'RoFormerv2Tokenizer'), ('skep', 'SkepTokenizer'), ('tinybert', 
'TinyBertTokenizer'), ('unified_transformer', 'UnifiedTransformerTokenizer'), ('unimo', 'UNIMOTokenizer'), ('gpt', 'GPTChineseTokenizer'), ('gau_alpha', 'GAUAlphaTokenizer'), ('artist', 'ArtistTokenizer'), ('chineseclip', 'ChineseCLIPTokenizer'), ('ernie_vil', 'ErnieViLTokenizer'), ('glm', 'GLMGPT2Tokenizer'), ('qwen', 'QWenTokenizer'), ('yuan', 'YuanTokenizer'), + ("bert_japanese", "BertJapaneseTokenizer"), + ("bigbird", "BigBirdTokenizer"), + ("blenderbot_small", "BlenderbotSmallTokenizer"), + ("chatglm", "ChatGLMTokenizer"), + ("chatglm_v2", "ChatGLMv2Tokenizer"), + ("chinesebert", "ChineseBertTokenizer"), + ("dallebart", "DalleBartTokenizer"), + ("ernie_ctm", "ErnieCtmTokenizer"), + ("ernie_doc", "ErnieDocBPETokenizer"), + ("ernie_gram", "ErnieGramTokenizer"), + ("ernie_layout", "ErnieLayoutTokenizer"), + ("ernie_code", "ErnieCodeTokenizer"), + ("megatronbert", "MegatronBertTokenizer"), + ("nystromformer", "NystromformerTokenizer"), + ("ppminilm", "PPMiniLMTokenizer"), + ("roformerv2", "RoFormerv2Tokenizer"), + ("skep", "SkepTokenizer"), + ("tinybert", "TinyBertTokenizer"), + ("unified_transformer", "UnifiedTransformerTokenizer"), + ("unimo", "UNIMOTokenizer"), + ("gpt", "GPTChineseTokenizer"), + ("gau_alpha", "GAUAlphaTokenizer"), + ("artist", "ArtistTokenizer"), + ("chineseclip", "ChineseCLIPTokenizer"), + ("ernie_vil", "ErnieViLTokenizer"), + ("glm", "GLMGPT2Tokenizer"), + ("qwen", "QWenTokenizer"), + ("yuan", "YuanTokenizer"), ] ) @@ -299,12 +317,15 @@ def tokenizer_class_from_name(class_name: str): for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): if class_name in tokenizers: module_name = model_type_to_module_name(module_name) - print(f"module_name: {module_name}") module = importlib.import_module(f".{module_name}", "paddlenlp.transformers") try: return getattr(module, class_name) except AttributeError: - continue + try: + module = importlib.import_module(f".{module_name}.tokenizer", "paddlenlp.transformers") + return getattr(module, class_name) + except AttributeError: + continue for config, tokenizers in TOKENIZER_MAPPING._extra_content.items(): for tokenizer in tokenizers: @@ -313,7 +334,7 @@ def tokenizer_class_from_name(class_name: str): # We did not fine the class, but maybe it's because a dep is missing. In that case, the class will be in the main # init and we return the proper dummy to get an appropriate error message. - main_module = importlib.import_module("transformers") + main_module = importlib.import_module("paddlenlp") if hasattr(main_module, class_name): return getattr(main_module, class_name) @@ -423,11 +444,6 @@ class AutoTokenizer: base tokenizer classes when created with the AutoTokenizer.from_pretrained() classmethod. 
""" - # MAPPING_NAMES = get_configurations() - # _tokenizer_mapping = MAPPING_NAMES - # _name_mapping = TOKENIZER_MAPPING_NAMES - # tokenizer_config_file = "tokenizer_config.json" - def __init__(self): raise EnvironmentError( "AutoTokenizer is designed to be instantiated " @@ -536,7 +552,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): config_tokenizer_class = config.tokenizer_class if config_tokenizer_class is not None: tokenizer_class = None - print(f"config_tokenizer_class: {config_tokenizer_class}") if use_fast and not config_tokenizer_class.endswith("Fast"): tokenizer_class_candidate = f"{config_tokenizer_class}Fast" tokenizer_class = tokenizer_class_from_name(tokenizer_class_candidate) @@ -547,17 +562,13 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): raise ValueError( f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported." ) - return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) - print("we have to be creative") # TODO: if model is an encoder decoder model_type = config_class_to_model_type(type(config).__name__) if model_type is not None: - print(f"model_type is not None: {model_type}") tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)] - print(tokenizer_class_py, tokenizer_class_fast) if tokenizer_class_fast and (use_fast or tokenizer_class_py is None): return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) else: @@ -568,54 +579,3 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed " "in order to use this tokenizer." ) - - # Default not to use fast tokenizer - # use_faster = kwargs.pop("use_faster", None) - # use_fast = kwargs.pop("use_fast", None) - # if use_fast is not None or use_faster is not None: - # raise ValueError("use_fast is deprecated") - - # cache_dir = kwargs.get("cache_dir", None) - # subfolder = kwargs.get("subfolder", "") - # if subfolder is None: - # subfolder = "" - # from_aistudio = kwargs.get("from_aistudio", False) - # from_hf_hub = kwargs.get("from_hf_hub", False) - - # all_tokenizer_names = [] - # for names, tokenizer_class in cls._tokenizer_mapping.items(): - # for name in names: - # all_tokenizer_names.append(name) - - # # From built-in pretrained models - # if pretrained_model_name_or_path in all_tokenizer_names: - # for names, tokenizer_class in cls._tokenizer_mapping.items(): - # for pattern in names: - # if pattern == pretrained_model_name_or_path: - # logger.info("We are using %s to load '%s'." 
% (tokenizer_class, pretrained_model_name_or_path)) - # return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - - # config_file = resolve_file_path( - # pretrained_model_name_or_path, - # cls.tokenizer_config_file, - # subfolder, - # cache_dir=cache_dir, - # from_hf_hub=from_hf_hub, - # from_aistudio=from_aistudio, - # ) - # print(f"config_file: {config_file}") - # print("cls.tokenizer_config_file: ", cls.tokenizer_config_file) - # if config_file is not None and os.path.exists(config_file): - # tokenizer_class = cls._get_tokenizer_class_from_config( - # pretrained_model_name_or_path, config_file, use_fast - # ) - # logger.info(f"We are using {tokenizer_class} to load '{pretrained_model_name_or_path}'.") - # return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) - # else: - # raise RuntimeError( - # f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n" - # f"Please make sure that '{pretrained_model_name_or_path}' is:\n" - # "- a correct model-identifier of built-in pretrained models,\n" - # "- or a correct model-identifier of community-contributed pretrained models,\n" - # "- or the correct path to a directory containing relevant tokenizer files.\n" - # ) diff --git a/paddlenlp/transformers/configuration_utils.py b/paddlenlp/transformers/configuration_utils.py index ebb905a68f89..c1fd2e0c530f 100644 --- a/paddlenlp/transformers/configuration_utils.py +++ b/paddlenlp/transformers/configuration_utils.py @@ -826,7 +826,8 @@ def get_config_dict( # Get config dict associated with the base config file config_dict, kwargs = cls._get_config_dict(pretrained_model_name_or_path, **kwargs) - + if config_dict is None: + return {}, kwargs # That config file may point us toward another config file to use. if "configuration_files" in config_dict: original_kwargs["cache_dir"] = os.path.join(cache_dir, pretrained_model_name_or_path, subfolder) @@ -875,9 +876,8 @@ def _get_config_dict( from_aistudio=from_aistudio, from_hf_hub=from_hf_hub, ) - assert ( - resolved_config_file is not None - ), f"please make sure one of the {filenames} under {pretrained_model_name_or_path}" + if resolved_config_file is None: + return None, kwargs try: logger.info(f"Loading configuration file {resolved_config_file}") # Load config dict diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py index af3cf887c791..ee8ea49b83e6 100644 --- a/paddlenlp/transformers/convert_slow_tokenizer.py +++ b/paddlenlp/transformers/convert_slow_tokenizer.py @@ -29,7 +29,11 @@ ) from tokenizers.models import BPE, Unigram -from paddlenlp.utils.import_utils import is_protobuf_available, is_sentencepiece_available +from paddlenlp.utils.import_utils import ( + is_protobuf_available, + is_sentencepiece_available, +) + def import_protobuf(error_message=""): if is_sentencepiece_available(): @@ -42,14 +46,19 @@ def import_protobuf(error_message=""): if version.parse(google.protobuf.__version__) < version.parse("4.0.0"): from transformers.utils import sentencepiece_model_pb2 else: - from transformers.utils import sentencepiece_model_pb2_new as sentencepiece_model_pb2 + from transformers.utils import ( + sentencepiece_model_pb2_new as sentencepiece_model_pb2, + ) return sentencepiece_model_pb2 else: - raise ImportError(f""" + raise ImportError( + f""" {error_message} requires the protobuf library but it was not found in your environment. 
Checkout the instructions on the installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones that match your environment. Please note that you may need to restart your runtime after installation. -""") +""" + ) + # Copied from transformers, adapted for tokenizers >= 0.19.0 def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str: @@ -219,6 +228,7 @@ def converted(self) -> Tokenizer: return tokenizer + # Copied from paddlenlp/transformers/gpt/tokenizer.py def bytes_to_unicode(): """ @@ -244,6 +254,7 @@ def bytes_to_unicode(): cs = [_chr(n) for n in cs] return dict(zip(bs, cs)) + class TikTokenConverter: """ A general tiktoken converter. @@ -409,15 +420,14 @@ def convert_slow_tokenizer(transformer_tokenizer, from_tiktoken=False) -> Tokeni converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name] return converter_class(transformer_tokenizer).converted() else: - # try: - return TikTokenConverter( - vocab_file=transformer_tokenizer.vocab_file, - additional_special_tokens=transformer_tokenizer.additional_special_tokens, - ).converted() - # except Exception: - # raise ValueError( - # f"Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path " - # f"with a SentencePiece tokenizer.model file." - # f"Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}" - # ) - + try: + return TikTokenConverter( + vocab_file=transformer_tokenizer.vocab_file, + additional_special_tokens=transformer_tokenizer.additional_special_tokens, + ).converted() + except Exception: + raise ValueError( + f"Converting from Tiktoken failed, if a converter for SentencePiece is available, provide a model path " + f"with a SentencePiece tokenizer.model file." + f"Currently available slow->fast convertors: {list(SLOW_TO_FAST_CONVERTERS.keys())}" + ) diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index 069f8de3a173..508c342dcf42 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -51,11 +51,14 @@ def import_protobuf_decode_error(error_message=""): return DecodeError else: - raise ImportError(f""" + raise ImportError( + f""" {error_message} requires the protobuf library but it was not found in your environment. Checkout the instructions on the installation page of its repo: https://github.com/protocolbuffers/protobuf/tree/master/python#installation and follow the ones that match your environment. Please note that you may need to restart your runtime after installation. -""") +""" + ) + if is_tokenizers_available(): from tokenizers import AddedToken @@ -147,6 +150,7 @@ class TensorType(ExplicitEnum): TOKENIZER_CONFIG_FILE = "tokenizer_config.json" FULL_TOKENIZER_FILE = "tokenizer.json" + def to_py_obj(obj): """ Convert a Paddle tensor, Numpy array or python list to a python list. 
@@ -1377,7 +1381,6 @@ def __init__(self, **kwargs): # By default, cleaning tokenization spaces for both fast and slow tokenizers self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True) - # By default, do not split special tokens for both fast and slow tokenizers self.split_special_tokens = kwargs.pop("split_special_tokens", False) @@ -1502,7 +1505,6 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # Load from local directory path tokenizer = BertTokenizer.from_pretrained('./my_bert/') """ - cache_dir = kwargs.pop("cache_dir", None) from_hf_hub = kwargs.pop("from_hf_hub", False) from_aistudio = kwargs.pop("from_aistudio", False) @@ -1513,8 +1515,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): vocab_files = {} init_configuration = {} - is_local = os.path.isdir(pretrained_model_name_or_path) - + # is_local = os.path.isdir(pretrained_model_name_or_path) additional_files_names = { "added_tokens_file": ADDED_TOKENS_FILE, @@ -1581,6 +1582,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): from_hf_hub=from_hf_hub, **kwargs, ) + @classmethod def _from_pretrained( cls, @@ -1617,10 +1619,8 @@ def _from_pretrained( str: The directory path of the tokenizer files if `return_tokenizer_file_dir` is `True`. """ - print("sdvcsdvsdvsvd",cls,resolved_vocab_files,pretrained_model_name_or_path,init_configuration,init_inputs,cache_dir,return_tokenizer_file_dir,from_hf_hub,kwargs) from_slow = kwargs.get("from_slow", False) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None - print(f"from_slow: {from_slow}, has_tokenizer_file: {has_tokenizer_file}") if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None: slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( copy.deepcopy(resolved_vocab_files), @@ -1632,8 +1632,6 @@ def _from_pretrained( ) else: slow_tokenizer = None - print(f"slow_tokenizer: {slow_tokenizer}") - tokenizer_config_file_dir_list = set() for k, v in resolved_vocab_files.items(): if v is not None and os.path.isfile(v): @@ -1649,12 +1647,14 @@ def _from_pretrained( if tokenizer_config_file is not None: with io.open(tokenizer_config_file, encoding="utf-8") as f: init_kwargs = json.load(f) + init_kwargs.pop("tokenizer_class", None) else: init_kwargs = init_configuration if slow_tokenizer is not None: init_kwargs["__slow_tokenizer"] = slow_tokenizer init_kwargs["name_or_path"] = pretrained_model_name_or_path + init_kwargs["from_slow"] = from_slow pass_added_tokens_file = False # Handle tokenizer serialization of added and special tokens @@ -1675,11 +1675,9 @@ def _from_pretrained( pass_added_tokens_file = True # position args are stored in kwargs, maybe better not include - # init_args = init_kwargs.pop("init_args", ()) init_kwargs.pop("init_class", None) # Update with newly provided args and kwargs - # init_args = init_args if not args else args init_kwargs.update(kwargs) def convert_added_tokens(obj): @@ -1731,7 +1729,14 @@ def convert_added_tokens(obj): "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).", ) return False - + except RuntimeError as e: + if "sentencepiece_processor.cc" in str(e): + logger.info( + "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead." 
+ "(SentencePiece RuntimeError: Tried to load SPM model with non-SPM vocab file).", + ) + return False + chat_template = init_kwargs.pop("chat_template", None) if chat_template is not None: tokenizer.init_chat_template(chat_template) diff --git a/paddlenlp/transformers/tokenizer_utils_fast.py b/paddlenlp/transformers/tokenizer_utils_fast.py index 26fa0d6b4747..59491e96dec6 100644 --- a/paddlenlp/transformers/tokenizer_utils_fast.py +++ b/paddlenlp/transformers/tokenizer_utils_fast.py @@ -62,6 +62,7 @@ VOCAB_FILES_NAMES = {"tokenizer_file": FULL_TOKENIZER_NAME, "vocab_file": TIKTOKEN_VOCAB_FILE} + class PretrainedTokenizerFast(ChatTemplateMixin, PretrainedTokenizerBase): """ Base class for all fast tokenizers (wrapping HuggingFace tokenizers library). diff --git a/paddlenlp/utils/download/__init__.py b/paddlenlp/utils/download/__init__.py index eadf41a47838..935fee61cd85 100644 --- a/paddlenlp/utils/download/__init__.py +++ b/paddlenlp/utils/download/__init__.py @@ -213,6 +213,9 @@ def resolve_file_path( elif from_hf_hub: log_endpoint = "Huggingface Hub" for filename in filenames: + print( + f"params: repo_id={repo_id}, filename={filename}, subfolder={subfolder}, repo_type={repo_type}, revision={revision}, token={token}, endpoint={endpoint}, from_bos={from_bos}, from_aistudio={from_aistudio}, from_hf_hub={from_hf_hub}" + ) download_kwargs["filename"] = filename is_available = bos_aistudio_hf_file_exist( repo_id, @@ -237,6 +240,9 @@ def resolve_file_path( download_kwargs["url"] = url for filename in filenames: download_kwargs["filename"] = filename + print( + f"params: repo_id={repo_id}, filename={filename}, subfolder={subfolder}, repo_type={repo_type}, revision={revision}, token={token}, endpoint={endpoint}, from_bos={from_bos}, from_aistudio={from_aistudio}, from_hf_hub={from_hf_hub}" + ) is_available = bos_aistudio_hf_file_exist( repo_id, filename, @@ -274,7 +280,8 @@ def resolve_file_path( f"'{log_endpoint}' for available revisions." ) except EntryNotFoundError: - raise EnvironmentError(f"Does not appear one of the {filenames} in {repo_id}.") + return None + # raise EnvironmentError(f"Does not appear one of the {filenames} in {repo_id}.") except HTTPError as err: raise EnvironmentError(f"There was a specific connection error when trying to load {repo_id}:\n{err}") except ValueError: @@ -312,6 +319,7 @@ def bos_aistudio_hf_file_exist( if subfolder is None: subfolder = "" filename = os.path.join(subfolder, filename) + print(f"filename = {filename}") if from_aistudio: out = aistudio_hub_file_exists( repo_id=repo_id, @@ -329,6 +337,7 @@ def bos_aistudio_hf_file_exist( revision=revision, token=token, ) + print(f"out = {out}") else: out = bos_file_exists( repo_id=repo_id, @@ -338,6 +347,7 @@ def bos_aistudio_hf_file_exist( token=token, # donot need token endpoint=endpoint, ) + return out diff --git a/paddlenlp/utils/import_utils.py b/paddlenlp/utils/import_utils.py index cccb867830a9..b8a2ffec626a 100644 --- a/paddlenlp/utils/import_utils.py +++ b/paddlenlp/utils/import_utils.py @@ -67,6 +67,50 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ _sklearn_available = False +# TODO: This doesn't work for all packages (`bs4`, `faiss`, etc.) Talk to Sylvain to see how to do with it better. 
+def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]: + # Check if the package spec exists and grab its version to avoid importing a local directory + package_exists = importlib.util.find_spec(pkg_name) is not None + package_version = "N/A" + if package_exists: + try: + # Primary method to get the package version + package_version = importlib.metadata.version(pkg_name) + except importlib.metadata.PackageNotFoundError: + # Fallback method: Only for "torch" and versions containing "dev" + if pkg_name == "torch": + try: + package = importlib.import_module(pkg_name) + temp_version = getattr(package, "__version__", "N/A") + # Check if the version contains "dev" + if "dev" in temp_version: + package_version = temp_version + package_exists = True + else: + package_exists = False + except ImportError: + # If the package can't be imported, it's not available + package_exists = False + else: + # For packages other than "torch", don't attempt the fallback and set as not available + package_exists = False + logger.debug(f"Detected {pkg_name} version: {package_version}") + if return_version: + return package_exists, package_version + else: + return package_exists + + +_g2p_en_available = _is_package_available("g2p_en") +_sentencepiece_available = _is_package_available("sentencepiece") +_sklearn_available = importlib.util.find_spec("sklearn") is not None +if _sklearn_available: + try: + importlib.metadata.version("scikit-learn") + except importlib.metadata.PackageNotFoundError: + _sklearn_available = False + + def is_datasets_available(): import importlib @@ -77,6 +121,12 @@ def is_protobuf_available(): return False return importlib.util.find_spec("google.protobuf") is not None +def is_protobuf_available(): + if importlib.util.find_spec("google") is None: + return False + return importlib.util.find_spec("google.protobuf") is not None + + def is_paddle_cuda_available() -> bool: if is_paddle_available(): import paddle @@ -91,6 +141,14 @@ def is_g2p_en_available(): def is_sentencepiece_available(): return _sentencepiece_available +def is_g2p_en_available(): + return _g2p_en_available + + +def is_sentencepiece_available(): + return _sentencepiece_available + + def is_paddle_available() -> bool: """check if `torch` package is installed Returns: diff --git a/tests/transformers/llama/test_tokenizer.py b/tests/transformers/llama/test_tokenizer.py index d88bd07a9a91..a4c5cb4d6d68 100644 --- a/tests/transformers/llama/test_tokenizer.py +++ b/tests/transformers/llama/test_tokenizer.py @@ -18,7 +18,7 @@ import unittest from paddlenlp.transformers.auto.tokenizer import AutoTokenizer -from paddlenlp.transformers.llama.tokenizer import Llama3Tokenizer, LlamaTokenizer +from paddlenlp.transformers.llama.tokenizer import LlamaTokenizer from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer from paddlenlp.transformers.tokenizer_utils_fast import PretrainedTokenizerFast @@ -213,12 +213,15 @@ def test_pretrained_model_lists(self): self.assertGreaterEqual(len(self.tokenizer_class.pretrained_resource_files_map), 1) self.assertGreaterEqual(len(list(self.tokenizer_class.pretrained_resource_files_map.values())[0]), 1) + class TikTokenIntegrationTests(unittest.TestCase): """ A class that regroups important test to make sure that we properly handle the special tokens. 
""" + def test_tiktoken_llama(self): - model_path = "hf-internal-testing/Llama3-Instruct-Internal" + model_path = "hf-internal-testing/llama-3-8b-internal" + subfolder = "original" test_text = "This is a test sentence." test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001] num_reserved_special_tokens = 256 @@ -238,6 +241,7 @@ def test_tiktoken_llama(self): tiktoken_tokenizer = PretrainedTokenizerFast.from_pretrained( model_path, + subfolder=subfolder, additional_special_tokens=special_tokens, bos_token="<|begin_of_text|>", eos_token="<|end_of_text|>", @@ -248,34 +252,34 @@ def test_tiktoken_llama(self): tiktoken_tokenizer = AutoTokenizer.from_pretrained( model_path, + subfolder=subfolder, additional_special_tokens=special_tokens, bos_token="<|begin_of_text|>", eos_token="<|end_of_text|>", add_bos_token=True, add_eos_token=True, - from_hf_hub=True + from_hf_hub=True, ) self.assertTrue(isinstance(tiktoken_tokenizer, PretrainedTokenizerFast)) - tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)['input_ids'] + tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)["input_ids"] self.assertEqual(tokens, test_tokens) tmpdirname = tempfile.mkdtemp() tiktoken_tokenizer.save_pretrained(tmpdirname) - print(f"RELOADING >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>") tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname) - self.assertTrue(isinstance(tokenizer_reload, PretrainedTokenizerFast)) - tokens = tokenizer_reload.encode(test_text, add_special_tokens=True)['input_ids'] + tokens = tokenizer_reload.encode(test_text, add_special_tokens=True)["input_ids"] self.assertEqual(tokens, test_tokens) shutil.rmtree(tmpdirname) - # tiktoken_tokenizer = AutoTokenizer.from_pretrained( - # model_path, - # additional_special_tokens=special_tokens, - # bos_token="<|begin_of_text|>", - # eos_token="<|end_of_text|>", - # from_slow=True, - # add_bos_token=True, - # add_eos_token=True, - # from_hf_hub=True, - # ) - # tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)['input_ids'] - # self.assertEqual(tokens, test_tokens) \ No newline at end of file + tiktoken_tokenizer = AutoTokenizer.from_pretrained( + model_path, + subfolder=subfolder, + additional_special_tokens=special_tokens, + bos_token="<|begin_of_text|>", + eos_token="<|end_of_text|>", + from_slow=True, + add_bos_token=True, + add_eos_token=True, + from_hf_hub=True, + ) + tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)["input_ids"] + self.assertEqual(tokens, test_tokens) diff --git a/tests/transformers/test_chat_template.py b/tests/transformers/test_chat_template.py index 4e443b54a2e2..a76ba40fa543 100644 --- a/tests/transformers/test_chat_template.py +++ b/tests/transformers/test_chat_template.py @@ -97,7 +97,7 @@ def test_inference_template(self): class ChatTemplateIntegrationTest(unittest.TestCase): def test_linlyai_chinese_llama_2_chat_template(self): - tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b") + tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b", use_fast=False) query = "你好" final_query = tokenizer.apply_chat_template(query, tokenize=False) expected_query = f"### Instruction:{query} ### Response:" @@ -110,7 +110,7 @@ def test_linlyai_chinese_llama_2_chat_template(self): self.assertEqual(final_query, expected_query) def test_linlyai_chinese_llama_2_chat_template_with_none_saved(self): - tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b") + tokenizer = 
AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b", use_fast=False) tokenizer.chat_template = None with tempfile.TemporaryDirectory() as tempdir: tokenizer.save_pretrained(tempdir) @@ -182,7 +182,7 @@ def get_common_prefix(self, tokenizer): def test_prefix(self): prompt = "欢迎使用 PaddleNLP 大模型开发套件" - tokenizer = AutoTokenizer.from_pretrained(self.model_name) + tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=False) result = tokenizer.apply_chat_template(prompt, tokenize=False) result_ids = tokenizer(result, add_special_tokens=False)["input_ids"] @@ -230,7 +230,7 @@ def test_must_have_system(self): def test_at_least_one_turn(self): query = [["你好", "您好,我是个人人工智能助手"], ["今天吃啥", "你可以选择不同的菜系"]] - tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b") + tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b", use_fast=False) # tokenizer.init_chat_template(self.chat_template_config_file) # get all query sentence From d004c3329020a27b8f4ec5f6c6ac082a9ebb1d61 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Fri, 27 Sep 2024 14:28:54 +0000 Subject: [PATCH 03/21] clean code & add blobfile to requirements.txt --- paddlenlp/transformers/auto/configuration.py | 171 ++++++------ paddlenlp/transformers/auto/factory.py | 11 +- paddlenlp/transformers/auto/tokenizer.py | 256 +++--------------- .../transformers/convert_slow_tokenizer.py | 1 + paddlenlp/transformers/ernie/__init__.py | 2 +- paddlenlp/transformers/llama/tokenizer.py | 2 +- .../transformers/tokenizer_utils_base.py | 33 +-- .../transformers/tokenizer_utils_fast.py | 3 - paddlenlp/utils/download/__init__.py | 10 +- paddlenlp/utils/import_utils.py | 19 +- requirements.txt | 1 + tests/transformers/llama/test_tokenizer.py | 1 - 12 files changed, 153 insertions(+), 357 deletions(-) diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index d29abc7ea114..35aaf2f198fe 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -21,8 +21,6 @@ from collections import OrderedDict, defaultdict from typing import Dict, List, Type -from paddlenlp.utils.env import CONFIG_NAME - from ...utils.download import resolve_file_path from ...utils.import_utils import import_module from ...utils.log import logger @@ -36,161 +34,176 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ ("albert", "AlbertConfig"), + ("artist", "ArtistConfig"), + ("bart", "BartConfig"), + ("bert", "BertConfig"), ("bigbird", "BigBirdConfig"), - ("blenderbot_small", "BlenderbotSmallConfig"), + ("bit", "BitConfig"), ("blenderbot", "BlenderbotConfig"), - ("chatglm_v2", "ChatGLMv2Config"), + ("blenderbot_small", "BlenderbotSmallConfig"), + ("blip", "BlipConfig"), + ("blip2", "Blip2Config"), + ("bloom", "BloomConfig"), ("chatglm", "ChatGLMConfig"), - ("chineseclip", "ChineseCLIPTextConfig"), + ("chatglm_v2", "ChatGLMv2Config"), ("chinesebert", "ChineseBertConfig"), + ("chineseclip", "ChineseCLIPConfig"), + ("clap", "ClapConfig"), + ("clip", "CLIPConfig"), + ("codegen", "CodeGenConfig"), ("convbert", "ConvBertConfig"), ("ctrl", "CTRLConfig"), - ("distilbert", "DistilBertConfig"), ("dallebart", "DalleBartConfig"), + ("deberta", "DebertaConfig"), + ("debertav2", "DebertaV2Config"), + ("distilbert", "DistilBertConfig"), + ("dpt", "DPTConfig"), ("electra", "ElectraConfig"), - ("ernie_vil", "ErnieViLConfig"), + ("ernie", "ErnieConfig"), + ("ernie_code", "ErnieCodeConfig"), ("ernie_ctm", "ErnieCtmConfig"), ("ernie_doc", "ErnieDocConfig"), - ("ernie_gen", 
"ErnieGenConfig"), ("ernie_gram", "ErnieGramConfig"), ("ernie_layout", "ErnieLayoutConfig"), ("ernie_m", "ErnieMConfig"), - ("ernie_code", "ErnieCodeConfig"), - ("ernie", "ErnieConfig"), + ("ernie_vil", "ErnieViLConfig"), ("fnet", "FNetConfig"), ("funnel", "FunnelConfig"), - ("llama", "LlamaConfig"), - ("layoutxlm", "LayoutXLMConfig"), - ("layoutlmv2", "LayoutLMv2Config"), + ("gau_alpha", "GAUAlphaConfig"), + ("gemma", "GemmaConfig"), + ("glm", "GLMConfig"), + ("gpt", "GPTConfig"), + ("gptj", "GPTJConfig"), + ("jamba", "JambaConfig"), ("layoutlm", "LayoutLMConfig"), + ("layoutlmv2", "LayoutLMv2Config"), + ("layoutxlm", "LayoutXLMConfig"), + ("llama", "LlamaConfig"), ("luke", "LukeConfig"), + ("mamba", "MambaConfig"), ("mbart", "MBartConfig"), ("megatronbert", "MegatronBertConfig"), + ("minigpt4", "MiniGPT4Config"), + ("mistral", "MistralConfig"), + ("mixtral", "MixtralConfig"), ("mobilebert", "MobileBertConfig"), ("mpnet", "MPNetConfig"), + ("mt5", "MT5Config"), ("nezha", "NeZhaConfig"), ("nystromformer", "NystromformerConfig"), + ("opt", "OPTConfig"), + ("pegasus", "PegasusConfig"), ("ppminilm", "PPMiniLMConfig"), ("prophetnet", "ProphetNetConfig"), + ("qwen", "QWenConfig"), + ("qwen2", "Qwen2Config"), + ("qwen2_moe", "Qwen2MoeConfig"), ("reformer", "ReformerConfig"), ("rembert", "RemBertConfig"), ("roberta", "RobertaConfig"), - ("roformerv2", "RoFormerv2Config"), ("roformer", "RoFormerConfig"), + ("roformerv2", "RoFormerv2Config"), + ("rw", "RWConfig"), ("skep", "SkepConfig"), + ("speecht5", "SpeechT5Config"), ("squeezebert", "SqueezeBertConfig"), + ("t5", "T5Config"), ("tinybert", "TinyBertConfig"), ("unified_transformer", "UnifiedTransformerConfig"), ("unimo", "UNIMOConfig"), - ("xlnet", "XLNetConfig"), + ("visualglm", "VisualGLMConfig"), ("xlm", "XLMConfig"), - ("gpt", "GPTConfig"), - ("glm", "GLMConfig"), - ("mt5", "MT5Config"), - ("t5", "T5Config"), - ("bert", "BertConfig"), - ("bart", "BartConfig"), - ("gau_alpha", "GAUAlphaConfig"), - ("codegen", "CodeGenConfig"), - ("clip", "CLIPConfig"), - ("artist", "ArtistConfig"), - ("opt", "OPTConfig"), - ("pegasus", "PegasusConfig"), - ("dpt", "DPTConfig"), - ("bit", "BitConfig"), - ("blip", "BlipConfig"), - ("bloom", "BloomConfig"), - ("qwen", "QWenConfig"), - ("mistral", "MistralConfig"), - ("mixtral", "MixtralConfig"), - ("qwen2", "Qwen2Config"), - ("qwen2_moe", "Qwen2MoeConfig"), - ("gemma", "GemmaConfig"), + ("xlnet", "XLNetConfig"), ("yuan", "YuanConfig"), - ("mamba", "MambaConfig"), - ("jamba", "JambaConfig"), ] ) MODEL_NAMES_MAPPING = OrderedDict( + # Base model mapping [ - # Add full (and cased) model names here - # Base model mapping ("albert", "Albert"), + ("artist", "Artist"), + ("bart", "Bart"), + ("bert", "Bert"), ("bigbird", "BigBird"), - ("blenderbot_small", "BlenderbotSmall"), + ("bit", "Bit"), ("blenderbot", "Blenderbot"), - ("chatglm_v2", "ChatGLMv2"), + ("blenderbot_small", "BlenderbotSmall"), + ("blip", "Blip"), + ("blip2", "Blip2"), + ("bloom", "Bloom"), ("chatglm", "ChatGLM"), - ("chineseclip", "ChineseCLIPText"), + ("chatglm_v2", "ChatGLMv2"), ("chinesebert", "ChineseBert"), + ("chineseclip", "ChineseCLIPText"), + ("clap", "CLAP"), + ("clip", "CLIP"), + ("codegen", "CodeGen"), ("convbert", "ConvBert"), ("ctrl", "CTRL"), - ("distilbert", "DistilBert"), ("dallebart", "DalleBart"), + ("deberta", "Deberta"), + ("debertav2", "DebertaV2"), + ("distilbert", "DistilBert"), + ("dpt", "DPT"), ("electra", "Electra"), - ("ernie_vil", "ErnieViL"), + ("ernie", "Ernie"), + ("ernie_code", "ErnieCode"), ("ernie_ctm", "ErnieCtm"), 
("ernie_doc", "ErnieDoc"), - ("ernie_gen", "ErnieGen"), ("ernie_gram", "ErnieGram"), ("ernie_layout", "ErnieLayout"), ("ernie_m", "ErnieM"), - ("ernie_code", "ErnieCode"), - ("ernie", "Ernie"), + ("ernie_vil", "ErnieViL"), ("fnet", "FNet"), ("funnel", "Funnel"), - ("llama", "Llama"), - ("layoutxlm", "LayoutXLM"), - ("layoutlmv2", "LayoutLMv2"), + ("gau_alpha", "GAUAlpha"), + ("gemma", "Gemma"), + ("glm", "GLM"), + ("gpt", "GPT"), + ("gptj", "GPTJ"), + ("jamba", "Jamba"), ("layoutlm", "LayoutLM"), + ("layoutlmv2", "LayoutLMv2"), + ("layoutxlm", "LayoutXLM"), + ("llama", "Llama"), ("luke", "Luke"), + ("mamba", "Mamba"), ("mbart", "MBart"), ("megatronbert", "MegatronBert"), + ("minigpt4", "MiniGPT4"), + ("mistral", "Mistral"), + ("mixtral", "Mixtral"), ("mobilebert", "MobileBert"), ("mpnet", "MPNet"), + ("mt5", "MT5"), ("nezha", "NeZha"), ("nystromformer", "Nystromformer"), + ("opt", "OPT"), + ("pegasus", "Pegasus"), ("ppminilm", "PPMiniLM"), ("prophetnet", "ProphetNet"), + ("qwen", "QWen"), + ("qwen2", "Qwen2"), + ("qwen2_moe", "Qwen2Moe"), ("reformer", "Reformer"), ("rembert", "RemBert"), ("roberta", "Roberta"), - ("roformerv2", "RoFormerv2"), ("roformer", "RoFormer"), + ("roformerv2", "RoFormerv2"), + ("rw", "RW"), ("skep", "Skep"), + ("speecht5", "SpeechT5"), ("squeezebert", "SqueezeBert"), + ("t5", "T5"), ("tinybert", "TinyBert"), ("unified_transformer", "UnifiedTransformer"), ("unimo", "UNIMO"), - ("xlnet", "XLNet"), + ("visualglm", "VisualGLM"), ("xlm", "XLM"), - ("gpt", "GPT"), - ("glm", "GLM"), - ("mt5", "MT5"), - ("t5", "T5"), - ("bert", "Bert"), - ("bart", "Bart"), - ("gau_alpha", "GAUAlpha"), - ("codegen", "CodeGen"), - ("clip", "CLIP"), - ("artist", "Artist"), - ("opt", "OPT"), - ("pegasus", "Pegasus"), - ("dpt", "DPT"), - ("bit", "Bit"), - ("blip", "Blip"), - ("bloom", "Bloom"), - ("qwen", "QWen"), - ("mistral", "Mistral"), - ("mixtral", "Mixtral"), - ("qwen2", "Qwen2"), - ("qwen2_moe", "Qwen2Moe"), - ("gemma", "Gemma"), + ("xlnet", "XLNet"), ("yuan", "Yuan"), - ("mamba", "Mamba"), - ("jamba", "Jamba"), ] ) @@ -431,6 +444,8 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar return config_class.from_pretrained(config_file, *model_args, **kwargs) elif config_file is None: config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + # Fallback: use pattern matching on the string. + # We go from longer names to shorter names to catch roberta before bert (for instance) for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True): if pattern in str(pretrained_model_name_or_path): return CONFIG_MAPPING[pattern].from_dict(config_dict, **unused_kwargs) diff --git a/paddlenlp/transformers/auto/factory.py b/paddlenlp/transformers/auto/factory.py index fa084838dec2..edffb27913b3 100644 --- a/paddlenlp/transformers/auto/factory.py +++ b/paddlenlp/transformers/auto/factory.py @@ -1,4 +1,5 @@ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -27,15 +28,15 @@ def getattribute_from_module(module, attr): return getattr(module, attr) # Some of the mappings have entries model_type -> object of another model type. In that case we try to grab the # object at the top level. 
- transformers_module = importlib.import_module("transformers") + paddlenlp_module = importlib.import_module("paddlenlp") - if module != transformers_module: + if module != paddlenlp_module: try: - return getattribute_from_module(transformers_module, attr) + return getattribute_from_module(paddlenlp_module, attr) except ValueError: - raise ValueError(f"Could not find {attr} neither in {module} nor in {transformers_module}!") + raise ValueError(f"Could not find {attr} neither in {module} nor in {paddlenlp_module}!") else: - raise ValueError(f"Could not find {attr} in {transformers_module}!") + raise ValueError(f"Could not find {attr} in {paddlenlp_module}!") class _LazyAutoMapping(OrderedDict): diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index e9f59fb1338d..26036d4c4e8f 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -17,7 +17,7 @@ import json import os from collections import OrderedDict -from typing import Dict, Optional, Tuple, Union +from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union from paddlenlp.transformers.auto.configuration import ( CONFIG_MAPPING_NAMES, @@ -29,7 +29,7 @@ from paddlenlp.transformers.tokenizer_utils_base import TOKENIZER_CONFIG_FILE from paddlenlp.transformers.tokenizer_utils_fast import PretrainedTokenizerFast -from ...utils import is_sentencepiece_available, is_tokenizers_available +from ...utils import is_tokenizers_available from ...utils.download import resolve_file_path from ...utils.import_utils import import_module from ...utils.log import logger @@ -39,231 +39,57 @@ "AutoTokenizer", ] -if False: +if TYPE_CHECKING: # This significantly improves completion suggestion performance when # the transformers package is used with Microsoft's Pylance language server. 
TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict() else: TOKENIZER_MAPPING_NAMES = OrderedDict( [ - ( - "albert", - ( - "AlbertChineseTokenizer" if is_sentencepiece_available() else None, - "AlbertChineseTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("bart", ("BartTokenizer", "BartTokenizerFast")), - ( - "bert", - ( - "BertTokenizer", - None, - ), - ), - ("blenderbot", ("BlenderbotTokenizer", "BlenderbotTokenizerFast")), - ( - "bloom", - ( - "BloomTokenizer", - "BloomTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "clip", - ( - "CLIPTokenizer", - "CLIPTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "codegen", - ( - "CodeGenTokenizer", - "CodeGenTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "convbert", - ( - "ConvBertTokenizer", - None, - ), - ), - ("ctrl", ("CTRLTokenizer", None)), - ( - "distilbert", - ( - "DistilBertTokenizer", - None, - ), - ), - ( - "electra", - ( - "ElectraTokenizer", - "ElectraTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "ernie", - ( - "ErnieTokenizer", - "ErnieTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("ernie_m", ("ErnieMTokenizer" if is_sentencepiece_available() else None, None)), - ( - "fnet", - ( - "FNetTokenizer", - "FNetTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "funnel", - ( - "FunnelTokenizer", - "FunnelTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "gemma", - ( - "GemmaTokenizer" if is_sentencepiece_available() else None, - "GemmaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "jamba", - ( - "JambaTokenizer" if is_sentencepiece_available() else None, - "JambaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "layoutlm", - ( - "LayoutLMTokenizer", - "LayoutLMTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "layoutlmv2", - ( - "LayoutLMv2Tokenizer", - "LayoutLMv2TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "layoutxlm", - ( - "LayoutXLMTokenizer", - "LayoutXLMTokenizerFast" if is_tokenizers_available() else None, - ), - ), + ("albert", "AlbertTokenizer"), + ("bart", "BartTokenizer"), + ("bert", "BertTokenizer"), + ("blenderbot", "BlenderbotTokenizer"), + ("bloom", "BloomTokenizer"), + ("clip", "CLIPTokenizer"), + ("codegen", "CodeGenTokenizer"), + ("convbert", "ConvBertTokenizer"), + ("ctrl", "CTRLTokenizer"), + ("distilbert", "DistilBertTokenizer"), + ("electra", "ElectraTokenizer"), + ("ernie", "ErnieTokenizer"), + ("ernie_m", "ErnieMTokenizer"), + ("fnet", "FNetTokenizer"), + ("funnel", "FunnelTokenizer"), + ("gemma", "GemmaTokenizer"), + ("jamba", "JambaTokenizer"), + ("layoutlm", "LayoutLMTokenizer"), + ("layoutlmv2", "LayoutLMv2Tokenizer"), + ("layoutxlm", "LayoutXLMTokenizer"), ( "llama", ( - "LlamaTokenizer" if is_sentencepiece_available() else None, + "LlamaTokenizer", "LlamaTokenizerFast" if is_tokenizers_available() else None, ), ), - ("luke", ("LukeTokenizer", None)), - ( - "mamba", - ( - "MambaTokenizer", - "MambaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "mbart", - ( - "MBartTokenizer" if is_sentencepiece_available() else None, - "MBart50Tokenizer" if is_sentencepiece_available() else None, - ), - ), - ( - "mobilebert", - ( - "MobileBertTokenizer", - None, - ), - ), - ( - "mpnet", - ( - "MPNetTokenizer", - "MPNetTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "nezha", - ( - 
"NeZhaTokenizer", - "NeZhaTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "pegasus", - ( - "PegasusChineseTokenizer" if is_sentencepiece_available() else None, - "PegasusChineseTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("prophetnet", ("ProphetNetTokenizer", None)), - ( - "reformer", - ( - "ReformerTokenizer" if is_sentencepiece_available() else None, - "ReformerTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "rembert", - ( - "RemBertTokenizer" if is_sentencepiece_available() else None, - None, - ), - ), - ( - "roberta", - ( - "RobertaBPETokenizer", - "RobertaBPETokenizerFast" if is_tokenizers_available() else None, - ), - ), - ( - "roformer", - ( - "RoFormerTokenizer", - "RoFormerTokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("speecht5", ("SpeechT5Tokenizer" if is_sentencepiece_available() else None, None)), - ( - "squeezebert", - ( - "SqueezeBertTokenizer", - None, - ), - ), - ( - "t5", - ( - "T5Tokenizer" if is_sentencepiece_available() else None, - "T5TokenizerFast" if is_tokenizers_available() else None, - ), - ), - ("xlm", ("XLMTokenizer", None)), - ( - "xlnet", - ( - "XLNetTokenizer" if is_sentencepiece_available() else None, - "XLNetTokenizerFast" if is_tokenizers_available() else None, - ), - ), + ("luke", "LukeTokenizer"), + ("mamba", "MambaTokenizer"), + ("mbart", "MBartTokenizer"), + ("mobilebert", "MobileBertTokenizer"), + ("mpnet", "MPNetTokenizer"), + ("nezha", "NeZhaTokenizer"), + ("pegasus", "PegasusChineseTokenizer"), + ("prophetnet", "ProphetNetTokenizer"), + ("reformer", "ReformerTokenizer"), + ("rembert", "RemBertTokenizer"), + ("roberta", "RobertaBPETokenizer"), + ("roformer", "RoFormerTokenizer"), + ("speecht5", "SpeechT5Tokenizer"), + ("squeezebert", "SqueezeBertTokenizer"), + ("t5", "T5Tokenizer"), + ("xlm", "XLMTokenizer"), + ("xlnet", "XLNetTokenizer"), ("bert_japanese", "BertJapaneseTokenizer"), ("bigbird", "BigBirdTokenizer"), ("blenderbot_small", "BlenderbotSmallTokenizer"), diff --git a/paddlenlp/transformers/convert_slow_tokenizer.py b/paddlenlp/transformers/convert_slow_tokenizer.py index ee8ea49b83e6..adc3c52130e6 100644 --- a/paddlenlp/transformers/convert_slow_tokenizer.py +++ b/paddlenlp/transformers/convert_slow_tokenizer.py @@ -16,6 +16,7 @@ # limitations under the License. from typing import Dict, List, Optional, Tuple + import tokenizers from packaging import version from tokenizers import ( diff --git a/paddlenlp/transformers/ernie/__init__.py b/paddlenlp/transformers/ernie/__init__.py index bda886444126..91cb3725f5fe 100644 --- a/paddlenlp/transformers/ernie/__init__.py +++ b/paddlenlp/transformers/ernie/__init__.py @@ -13,4 +13,4 @@ # limitations under the License. 
from .configuration import * from .modeling import * -from .tokenizer import * \ No newline at end of file +from .tokenizer import * diff --git a/paddlenlp/transformers/llama/tokenizer.py b/paddlenlp/transformers/llama/tokenizer.py index d890d703d649..ba8071430735 100644 --- a/paddlenlp/transformers/llama/tokenizer.py +++ b/paddlenlp/transformers/llama/tokenizer.py @@ -102,7 +102,7 @@ def bos_token_id(self) -> Optional[int]: @property def eos_token_id(self) -> Optional[int]: return self.sp_model.eos_id() - + def get_spm_processor(self, from_slow=False): tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) if from_slow: # no dependency on protobuf diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index 508c342dcf42..56ffdebbb893 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1521,11 +1521,9 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, - "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME, # what's this + "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME, # what's this # "tokenizer_file": FULL_TOKENIZER_FILE, } - print(f"cls = {cls}") - print(f"cls.resource_files_name1s = {cls.resource_files_names}") vocab_files_target = {**cls.resource_files_names, **additional_files_names} # From HF Hub or AI Studio @@ -1551,15 +1549,12 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): # Assuming from community-contributed pretrained models for file_id, file_name in vocab_files_target.items(): vocab_files[file_id] = file_name - print("vocab_files: ", vocab_files) resolved_vocab_files = {} - + for file_id, file_path in vocab_files.items(): - print(f"file_id: {file_id}, file_path: {file_path}") if file_path is None or os.path.isfile(file_path): resolved_vocab_files[file_id] = file_path continue - print(f"Try resolving {file_id} from {pretrained_model_name_or_path}, {file_path}") resolved_vocab_files[file_id] = resolve_file_path( pretrained_model_name_or_path, [file_path], @@ -1595,30 +1590,6 @@ def _from_pretrained( from_hf_hub=False, **kwargs, ): - """ - Instantiate a `PretrainedTokenizer` from a predefined tokenizer class. - - Args: - pretrained_model_name_or_path (str): - The model name or path to instantiate the tokenizer from. - *init_inputs (tuple): - Positional arguments to be passed to the tokenizer class `__init__` method. - cache_dir (str, optional): - Directory to cache the downloaded vocabulary files. - return_tokenizer_file_dir (bool, optional): - Whether to return the directory path of the tokenizer files. - from_hf_hub (bool, optional): - Whether to load from Huggingface Hub. - from_aistudio (bool, optional): - Whether to load from AI Studio. - **kwargs (dict): - Additional keyword arguments to be passed to the tokenizer class `__init__` method. - - Returns: - PretrainedTokenizer: An instance of `PretrainedTokenizer`. - str: The directory path of the tokenizer files if `return_tokenizer_file_dir` is `True`. 
- - """ from_slow = kwargs.get("from_slow", False) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None: diff --git a/paddlenlp/transformers/tokenizer_utils_fast.py b/paddlenlp/transformers/tokenizer_utils_fast.py index 59491e96dec6..18af9d66365a 100644 --- a/paddlenlp/transformers/tokenizer_utils_fast.py +++ b/paddlenlp/transformers/tokenizer_utils_fast.py @@ -611,8 +611,6 @@ def _encode_plus( split_special_tokens=split_special_tokens, **kwargs, ) - print(batched_input) - print(batched_output) # Return tensor is None, then we can remove the leading batch axis # Overflowing tokens are returned as a batch of output so we keep them in this case @@ -626,7 +624,6 @@ def _encode_plus( ) self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose) - print(f"final output = {batched_output}") return batched_output def convert_tokens_to_string(self, tokens: List[str]) -> str: diff --git a/paddlenlp/utils/download/__init__.py b/paddlenlp/utils/download/__init__.py index 935fee61cd85..366cf48a428a 100644 --- a/paddlenlp/utils/download/__init__.py +++ b/paddlenlp/utils/download/__init__.py @@ -142,7 +142,7 @@ def resolve_file_path( elif index < len(filenames) - 1: continue else: - pass + pass # 临时解决方案 # raise FileNotFoundError(f"please make sure one of the {filenames} under the dir {repo_id}") @@ -213,9 +213,6 @@ def resolve_file_path( elif from_hf_hub: log_endpoint = "Huggingface Hub" for filename in filenames: - print( - f"params: repo_id={repo_id}, filename={filename}, subfolder={subfolder}, repo_type={repo_type}, revision={revision}, token={token}, endpoint={endpoint}, from_bos={from_bos}, from_aistudio={from_aistudio}, from_hf_hub={from_hf_hub}" - ) download_kwargs["filename"] = filename is_available = bos_aistudio_hf_file_exist( repo_id, @@ -240,9 +237,6 @@ def resolve_file_path( download_kwargs["url"] = url for filename in filenames: download_kwargs["filename"] = filename - print( - f"params: repo_id={repo_id}, filename={filename}, subfolder={subfolder}, repo_type={repo_type}, revision={revision}, token={token}, endpoint={endpoint}, from_bos={from_bos}, from_aistudio={from_aistudio}, from_hf_hub={from_hf_hub}" - ) is_available = bos_aistudio_hf_file_exist( repo_id, filename, @@ -319,7 +313,6 @@ def bos_aistudio_hf_file_exist( if subfolder is None: subfolder = "" filename = os.path.join(subfolder, filename) - print(f"filename = {filename}") if from_aistudio: out = aistudio_hub_file_exists( repo_id=repo_id, @@ -337,7 +330,6 @@ def bos_aistudio_hf_file_exist( revision=revision, token=token, ) - print(f"out = {out}") else: out = bos_file_exists( repo_id=repo_id, diff --git a/paddlenlp/utils/import_utils.py b/paddlenlp/utils/import_utils.py index b8a2ffec626a..2c3796214a7f 100644 --- a/paddlenlp/utils/import_utils.py +++ b/paddlenlp/utils/import_utils.py @@ -24,6 +24,7 @@ from paddlenlp.utils.log import logger + # TODO: This doesn't work for all packages (`bs4`, `faiss`, etc.) Talk to Sylvain to see how to do with it better. 
def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[Tuple[bool, str], bool]: # Check if the package spec exists and grab its version to avoid importing a local directory @@ -57,6 +58,7 @@ def _is_package_available(pkg_name: str, return_version: bool = False) -> Union[ else: return package_exists + _g2p_en_available = _is_package_available("g2p_en") _sentencepiece_available = _is_package_available("sentencepiece") _sklearn_available = importlib.util.find_spec("sklearn") is not None @@ -116,10 +118,6 @@ def is_datasets_available(): return importlib.util.find_spec("datasets") is not None -def is_protobuf_available(): - if importlib.util.find_spec("google") is None: - return False - return importlib.util.find_spec("google.protobuf") is not None def is_protobuf_available(): if importlib.util.find_spec("google") is None: @@ -135,11 +133,6 @@ def is_paddle_cuda_available() -> bool: else: return False -def is_g2p_en_available(): - return _g2p_en_available - -def is_sentencepiece_available(): - return _sentencepiece_available def is_g2p_en_available(): return _g2p_en_available @@ -157,14 +150,14 @@ def is_paddle_available() -> bool: return is_package_available("paddle") -def is_psutil_available(): - return importlib.util.find_spec("psutil") is not None - - def is_tiktoken_available(): return importlib.util.find_spec("tiktoken") is not None +def is_psutil_available(): + return importlib.util.find_spec("psutil") is not None + + def is_torch_available() -> bool: """check if `torch` package is installed Returns: diff --git a/requirements.txt b/requirements.txt index 0673bcc40ed2..dc7c2a06bfb3 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,5 @@ jieba +blobfile colorlog colorama seqeval diff --git a/tests/transformers/llama/test_tokenizer.py b/tests/transformers/llama/test_tokenizer.py index a4c5cb4d6d68..fdbe4b4c9a49 100644 --- a/tests/transformers/llama/test_tokenizer.py +++ b/tests/transformers/llama/test_tokenizer.py @@ -38,7 +38,6 @@ class LlamaTokenizationTest(TokenizerTesterMixin, unittest.TestCase): def get_tokenizer(self, **kwargs) -> PretrainedTokenizer: tokenizer = LlamaTokenizer.from_pretrained("__internal_testing__/tiny-random-llama", **kwargs) - print(f"tokenizer = {tokenizer}") tokenizer.pad_token = tokenizer.unk_token return tokenizer From 0b61d11a8ee2e0159dd2c9ef5473ea5c5bfa692e Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Sat, 28 Sep 2024 05:26:58 +0000 Subject: [PATCH 04/21] Don't allow multiple Class in a --- .../paddlenlp.transformers.fnet.tokenizer.po | 2 +- ...ddlenlp.transformers.reformer.tokenizer.po | 2 +- .../paddlenlp.transformers.t5.tokenizer.po | 2 +- paddlenlp/transformers/__init__.py | 4 + paddlenlp/transformers/albert/tokenizer.py | 330 +--------------- .../transformers/albert_chinese/__init__.py | 13 + .../transformers/albert_chinese/tokenizer.py | 105 ++++++ .../transformers/albert_english/__init__.py | 13 + .../transformers/albert_english/tokenizer.py | 263 +++++++++++++ paddlenlp/transformers/auto/configuration.py | 6 + paddlenlp/transformers/auto/tokenizer.py | 10 +- paddlenlp/transformers/bigbird/tokenizer.py | 2 +- paddlenlp/transformers/fnet/tokenizer.py | 3 +- paddlenlp/transformers/mbart/tokenizer.py | 322 +--------------- paddlenlp/transformers/mbart50/__init__.py | 13 + paddlenlp/transformers/mbart50/tokenizer.py | 353 ++++++++++++++++++ paddlenlp/transformers/reformer/tokenizer.py | 2 +- paddlenlp/transformers/t5/tokenizer.py | 2 +- tests/transformers/albert/test_tokenizer.py | 6 +- 19 files changed, 792 
insertions(+), 661 deletions(-) create mode 100644 paddlenlp/transformers/albert_chinese/__init__.py create mode 100644 paddlenlp/transformers/albert_chinese/tokenizer.py create mode 100644 paddlenlp/transformers/albert_english/__init__.py create mode 100644 paddlenlp/transformers/albert_english/tokenizer.py create mode 100644 paddlenlp/transformers/mbart50/__init__.py create mode 100644 paddlenlp/transformers/mbart50/tokenizer.py diff --git a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po index 79004b0383ca..f3fe10d3a3eb 100644 --- a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po +++ b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po @@ -26,7 +26,7 @@ msgid "Tokenization class for FNet model." msgstr "" #: of paddlenlp.transformers.fnet.tokenizer.FNetTokenizer:1 -msgid "基类::class:`paddlenlp.transformers.albert.tokenizer.AlbertEnglishTokenizer`" +msgid "基类::class:`paddlenlp.transformers.albert.albert_english.AlbertEnglishTokenizer`" msgstr "" #: of paddlenlp.transformers.fnet.tokenizer.FNetTokenizer:1 diff --git a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.reformer.tokenizer.po b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.reformer.tokenizer.po index a1dadaf14a90..efec03c6bf05 100644 --- a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.reformer.tokenizer.po +++ b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.reformer.tokenizer.po @@ -22,7 +22,7 @@ msgid "tokenizer" msgstr "" #: of paddlenlp.transformers.reformer.tokenizer.ReformerTokenizer:1 -msgid "基类::class:`paddlenlp.transformers.albert.tokenizer.AlbertEnglishTokenizer`" +msgid "基类::class:`paddlenlp.transformers.albert_english.tokenizer.AlbertEnglishTokenizer`" msgstr "" #: of paddlenlp.transformers.reformer.tokenizer.ReformerTokenizer:1 diff --git a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.t5.tokenizer.po b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.t5.tokenizer.po index 2023df559055..b809e680fcd4 100644 --- a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.t5.tokenizer.po +++ b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.t5.tokenizer.po @@ -22,7 +22,7 @@ msgid "tokenizer" msgstr "" #: of paddlenlp.transformers.t5.tokenizer.T5Tokenizer:1 -msgid "基类::class:`paddlenlp.transformers.albert.tokenizer.AlbertEnglishTokenizer`" +msgid "基类::class:`paddlenlp.transformers.albert_english.tokenizer.AlbertEnglishTokenizer`" msgstr "" #: of paddlenlp.transformers.t5.tokenizer.T5Tokenizer:1 diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index c8bf3a0aecde..be5ebba7d7e3 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -64,6 +64,9 @@ from .albert.configuration import * from .albert.modeling import * from .albert.tokenizer import * +from .albert_chinese.tokenizer import * + +from .albert_english.tokenizer import * from .bit.modeling import * from .bit.configuration import * from .bit.image_processing import * @@ -141,6 +144,7 @@ from .mbart.modeling import * from .mbart.tokenizer import * from .mbart.configuration import * +from .mbart50.tokenizer import * from .megatronbert.modeling import * from .megatronbert.tokenizer import * from .megatronbert.configuration import * diff --git a/paddlenlp/transformers/albert/tokenizer.py b/paddlenlp/transformers/albert/tokenizer.py index a7d80d0b2457..903d36361e73 100644 --- 
a/paddlenlp/transformers/albert/tokenizer.py +++ b/paddlenlp/transformers/albert/tokenizer.py @@ -14,13 +14,9 @@ # limitations under the License. """Tokenization class for ALBERT model.""" -import os -import unicodedata -from shutil import copyfile - -import sentencepiece as spm - -from .. import PretrainedTokenizer, BertTokenizer, AddedToken +from .. import AddedToken, PretrainedTokenizer +from ..albert_chinese.tokenizer import AlbertChineseTokenizer +from ..albert_english.tokenizer import AlbertEnglishTokenizer __all__ = ["AlbertTokenizer"] @@ -479,323 +475,3 @@ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def save_resources(self, save_directory): return self.tokenizer.save_resources(save_directory) - - -class AlbertEnglishTokenizer(PretrainedTokenizer): - resource_files_names = { - "sentencepiece_model_file": "spiece.model", - } - - pretrained_resource_files_map = { - "sentencepiece_model_file": { - "albert-base-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v1.spiece.model", - "albert-large-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v1.spiece.model", - "albert-xlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v1.spiece.model", - "albert-xxlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v1.spiece.model", - "albert-base-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v2.spiece.model", - "albert-large-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v2.spiece.model", - "albert-xlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v2.spiece.model", - "albert-xxlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v2.spiece.model", - }, - } - - pretrained_init_configuration = { - "albert-base-v1": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-large-v1": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-xlarge-v1": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-xxlarge-v1": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-base-v2": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-large-v2": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-xlarge-v2": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-xxlarge-v2": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - } - max_model_input_sizes = { - "albert-base-v1": 512, - "albert-large-v1": 512, - "albert-xlarge-v1": 512, - "albert-xxlarge-v1": 512, - "albert-base-v2": 512, - "albert-large-v2": 512, - "albert-xlarge-v2": 512, - "albert-xxlarge-v2": 512, - } - - def __init__( - self, - sentencepiece_model_file, - do_lower_case=True, - remove_space=True, - keep_accents=False, - bos_token="[CLS]", - eos_token="[SEP]", - unk_token="", - sep_token="[SEP]", - pad_token="", - cls_token="[CLS]", - mask_token="[MASK]", - sp_model_kwargs=None, - 
**kwargs - ): - - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - self.do_lower_case = do_lower_case - self.remove_space = remove_space - self.keep_accents = keep_accents - self.sentencepiece_model_file = sentencepiece_model_file - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(sentencepiece_model_file) - - @property - def vocab_size(self): - return len(self.sp_model) - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.sentencepiece_model_file) - - def preprocess_text(self, inputs): - if self.remove_space: - outputs = " ".join(inputs.strip().split()) - else: - outputs = inputs - outputs = outputs.replace("``", '"').replace("''", '"') - - if not self.keep_accents: - outputs = unicodedata.normalize("NFKD", outputs) - outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) - if self.do_lower_case: - outputs = outputs.lower() - - return outputs - - def _tokenize(self, text): - """Tokenize a string.""" - text = self.preprocess_text(text) - pieces = self.sp_model.encode(text, out_type=str) - new_pieces = [] - for piece in pieces: - if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): - cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) - if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: - if len(cur_pieces[0]) == 1: - cur_pieces = cur_pieces[1:] - else: - cur_pieces[0] = cur_pieces[0][1:] - cur_pieces.append(piece[-1]) - new_pieces.extend(cur_pieces) - else: - new_pieces.append(piece) - - return new_pieces - - def _convert_token_to_id(self, token): - """Converts a token (str) to an id using the vocab.""" - return self.sp_model.PieceToId(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) to a token (str) using the vocab.""" - return self.sp_model.IdToPiece(index) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() - return out_string - - def num_special_tokens_to_add(self, pair=False): - token_ids_0 = [] - token_ids_1 = [] - return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - return cls + token_ids_0 + sep + token_ids_1 + sep - - def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): - if offset_mapping_1 is None: - return [(0, 0)] + offset_mapping_0 + [(0, 0)] - - return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] - - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." 
- ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - - def save_resources(self, save_directory): - for name, file_name in self.resource_files_names.items(): - save_path = os.path.join(save_directory, file_name) - if os.path.abspath(self.sentencepiece_model_file) != os.path.abspath(save_path) and os.path.isfile( - self.sentencepiece_model_file - ): - copyfile(self.sentencepiece_model_file, save_path) - elif not os.path.isfile(self.sentencepiece_model_file): - with open(save_path, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - -class AlbertChineseTokenizer(BertTokenizer): - resource_files_names = {"vocab_file": "vocab.txt"} - pretrained_resource_files_map = { - "vocab_file": { - "albert-chinese-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-tiny.vocab.txt", - "albert-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-small.vocab.txt", - "albert-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-base.vocab.txt", - "albert-chinese-large": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-large.vocab.txt", - "albert-chinese-xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xlarge.vocab.txt", - "albert-chinese-xxlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xxlarge.vocab.txt", - } - } - pretrained_init_configuration = { - "albert-chinese-tiny": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-small": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-base": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-large": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-xlarge": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-xxlarge": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - } - max_model_input_sizes = { - "albert-chinese-tiny": 512, - "albert-chinese-small": 512, - "albert-chinese-base": 512, - "albert-chinese-large": 512, - "albert-chinese-xlarge": 512, - "albert-chinese-xxlarge": 512, - } - - def __init__( - self, - vocab_file, - do_lower_case=True, - do_basic_tokenize=True, - never_split=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]", - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs - ): - super(AlbertChineseTokenizer, self).__init__( - vocab_file, - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) diff --git 
a/paddlenlp/transformers/albert_chinese/__init__.py b/paddlenlp/transformers/albert_chinese/__init__.py new file mode 100644 index 000000000000..97043fd7ba68 --- /dev/null +++ b/paddlenlp/transformers/albert_chinese/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlenlp/transformers/albert_chinese/tokenizer.py b/paddlenlp/transformers/albert_chinese/tokenizer.py new file mode 100644 index 000000000000..9cdd86a41554 --- /dev/null +++ b/paddlenlp/transformers/albert_chinese/tokenizer.py @@ -0,0 +1,105 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for ALBERT model.""" + +from .. 
import BertTokenizer + +__all__ = ["AlbertChineseTokenizer"] + +SPIECE_UNDERLINE = "▁" + + +class AlbertChineseTokenizer(BertTokenizer): + resource_files_names = {"vocab_file": "vocab.txt"} + pretrained_resource_files_map = { + "vocab_file": { + "albert-chinese-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-tiny.vocab.txt", + "albert-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-small.vocab.txt", + "albert-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-base.vocab.txt", + "albert-chinese-large": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-large.vocab.txt", + "albert-chinese-xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xlarge.vocab.txt", + "albert-chinese-xxlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xxlarge.vocab.txt", + } + } + pretrained_init_configuration = { + "albert-chinese-tiny": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-small": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-base": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-large": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-xlarge": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-xxlarge": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + } + max_model_input_sizes = { + "albert-chinese-tiny": 512, + "albert-chinese-small": 512, + "albert-chinese-base": 512, + "albert-chinese-large": 512, + "albert-chinese-xlarge": 512, + "albert-chinese-xxlarge": 512, + } + + def __init__( + self, + vocab_file, + do_lower_case=True, + do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super(AlbertChineseTokenizer, self).__init__( + vocab_file, + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) diff --git a/paddlenlp/transformers/albert_english/__init__.py b/paddlenlp/transformers/albert_english/__init__.py new file mode 100644 index 000000000000..97043fd7ba68 --- /dev/null +++ b/paddlenlp/transformers/albert_english/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
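A minimal usage sketch (illustration only, not part of the patch) for the split-out ALBERT tokenizer: it assumes the new albert_chinese module is re-exported from paddlenlp.transformers as in the updated __init__.py earlier in this patch, and that the "albert-chinese-base" weights listed in the class's pretrained map are reachable.

# Hypothetical usage of the relocated AlbertChineseTokenizer; the class name and the
# "albert-chinese-base" resources come from the new albert_chinese/tokenizer.py above.
from paddlenlp.transformers import AlbertChineseTokenizer

tokenizer = AlbertChineseTokenizer.from_pretrained("albert-chinese-base")
# BertTokenizer-style WordPiece tokenization is inherited unchanged by the subclass.
encoded = tokenizer("欢迎使用 PaddleNLP")
print(encoded["input_ids"])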
diff --git a/paddlenlp/transformers/albert_english/tokenizer.py b/paddlenlp/transformers/albert_english/tokenizer.py new file mode 100644 index 000000000000..e2b4f2a63b02 --- /dev/null +++ b/paddlenlp/transformers/albert_english/tokenizer.py @@ -0,0 +1,263 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""Tokenization class for ALBERT model.""" + +import os +import unicodedata +from shutil import copyfile + +import sentencepiece as spm + +from .. import PretrainedTokenizer + +__all__ = ["AlbertEnglishTokenizer"] + +SPIECE_UNDERLINE = "▁" + + +class AlbertEnglishTokenizer(PretrainedTokenizer): + resource_files_names = { + "sentencepiece_model_file": "spiece.model", + } + + pretrained_resource_files_map = { + "sentencepiece_model_file": { + "albert-base-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v1.spiece.model", + "albert-large-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v1.spiece.model", + "albert-xlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v1.spiece.model", + "albert-xxlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v1.spiece.model", + "albert-base-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v2.spiece.model", + "albert-large-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v2.spiece.model", + "albert-xlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v2.spiece.model", + "albert-xxlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v2.spiece.model", + }, + } + + pretrained_init_configuration = { + "albert-base-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-large-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xlarge-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xxlarge-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-base-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-large-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xlarge-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xxlarge-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + } + max_model_input_sizes = { + "albert-base-v1": 512, + "albert-large-v1": 512, 
+ "albert-xlarge-v1": 512, + "albert-xxlarge-v1": 512, + "albert-base-v2": 512, + "albert-large-v2": 512, + "albert-xlarge-v2": 512, + "albert-xxlarge-v2": 512, + } + + def __init__( + self, + sentencepiece_model_file, + do_lower_case=True, + remove_space=True, + keep_accents=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs=None, + **kwargs + ): + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.sentencepiece_model_file = sentencepiece_model_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(sentencepiece_model_file) + + @property + def vocab_size(self): + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.sentencepiece_model_file) + + def preprocess_text(self, inputs): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if self.do_lower_case: + outputs = outputs.lower() + + return outputs + + def _tokenize(self, text): + """Tokenize a string.""" + text = self.preprocess_text(text) + pieces = self.sp_model.encode(text, out_type=str) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + def _convert_token_to_id(self, token): + """Converts a token (str) to an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + token_ids_0 + sep + token_ids_1 + sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + if offset_mapping_1 is None: + return [(0, 0)] + 
offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_resources(self, save_directory): + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(self.sentencepiece_model_file) != os.path.abspath(save_path) and os.path.isfile( + self.sentencepiece_model_file + ): + copyfile(self.sentencepiece_model_file, save_path) + elif not os.path.isfile(self.sentencepiece_model_file): + with open(save_path, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index 35aaf2f198fe..11c9ec139a62 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -34,6 +34,8 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ ("albert", "AlbertConfig"), + ("albert_chinese", "AlbertConfig"), + ("albert_english", "AlbertConfig"), ("artist", "ArtistConfig"), ("bart", "BartConfig"), ("bert", "BertConfig"), @@ -82,6 +84,7 @@ ("luke", "LukeConfig"), ("mamba", "MambaConfig"), ("mbart", "MBartConfig"), + ("mbart50", "MBartConfig"), ("megatronbert", "MegatronBertConfig"), ("minigpt4", "MiniGPT4Config"), ("mistral", "MistralConfig"), @@ -123,6 +126,8 @@ # Base model mapping [ ("albert", "Albert"), + ("albert_chinese", "AlbertChinese"), + ("albert_english", "AlbertEnglish"), ("artist", "Artist"), ("bart", "Bart"), ("bert", "Bert"), @@ -171,6 +176,7 @@ ("luke", "Luke"), ("mamba", "Mamba"), ("mbart", "MBart"), + ("mbart50", "MBart50"), ("megatronbert", "MegatronBert"), ("minigpt4", "MiniGPT4"), ("mistral", "Mistral"), diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 26036d4c4e8f..13bb26820573 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -47,6 +47,8 @@ TOKENIZER_MAPPING_NAMES = OrderedDict( [ ("albert", "AlbertTokenizer"), + ("albert_chinese", "AlbertChineseTokenizer"), + ("albert_english", "AlbertEnglishTokenizer"), ("bart", "BartTokenizer"), ("bert", "BertTokenizer"), ("blenderbot", "BlenderbotTokenizer"), @@ -76,6 +78,7 @@ ("luke", "LukeTokenizer"), ("mamba", "MambaTokenizer"), ("mbart", "MBartTokenizer"), + ("mbart50", "MBart50Tokenizer"), ("mobilebert", "MobileBertTokenizer"), ("mpnet", "MPNetTokenizer"), ("nezha", "NeZhaTokenizer"), @@ -143,15 +146,18 @@ def tokenizer_class_from_name(class_name: str): for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): if class_name in tokenizers: module_name = 
model_type_to_module_name(module_name) - module = importlib.import_module(f".{module_name}", "paddlenlp.transformers") + print(f"module_name: {module_name}") try: + module = importlib.import_module(f".{module_name}", "paddlenlp.transformers") return getattr(module, class_name) except AttributeError: try: + print(f"module: {module}") module = importlib.import_module(f".{module_name}.tokenizer", "paddlenlp.transformers") + return getattr(module, class_name) except AttributeError: - continue + raise ValueError(f"Tokenizer class {class_name} is not currently imported.") for config, tokenizers in TOKENIZER_MAPPING._extra_content.items(): for tokenizer in tokenizers: diff --git a/paddlenlp/transformers/bigbird/tokenizer.py b/paddlenlp/transformers/bigbird/tokenizer.py index 9bc5ee70ef4a..6f73fbef6a75 100644 --- a/paddlenlp/transformers/bigbird/tokenizer.py +++ b/paddlenlp/transformers/bigbird/tokenizer.py @@ -21,7 +21,7 @@ from paddlenlp.data.vocab import Vocab -from ..albert.tokenizer import AlbertEnglishTokenizer +from ..albert_english.tokenizer import AlbertEnglishTokenizer __all__ = ["BigBirdTokenizer"] diff --git a/paddlenlp/transformers/fnet/tokenizer.py b/paddlenlp/transformers/fnet/tokenizer.py index 36456a4aee4b..a43274b86232 100644 --- a/paddlenlp/transformers/fnet/tokenizer.py +++ b/paddlenlp/transformers/fnet/tokenizer.py @@ -17,7 +17,8 @@ import sentencepiece as spm -from ..albert.tokenizer import AddedToken, AlbertEnglishTokenizer +from ..albert.tokenizer import AddedToken +from ..albert_english.tokenizer import AlbertEnglishTokenizer __all__ = ["FNetTokenizer"] diff --git a/paddlenlp/transformers/mbart/tokenizer.py b/paddlenlp/transformers/mbart/tokenizer.py index 163031e178e0..9f25eaba241f 100644 --- a/paddlenlp/transformers/mbart/tokenizer.py +++ b/paddlenlp/transformers/mbart/tokenizer.py @@ -19,7 +19,7 @@ from .. 
import AddedToken, PretrainedTokenizer -__all__ = ["MBartTokenizer", "MBart50Tokenizer"] +__all__ = ["MBartTokenizer"] MBART_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "mbart-large-cc25": 1024, @@ -309,323 +309,3 @@ def set_tgt_lang_special_tokens(self, tgt_lang): self.cur_lang_code_id = self.lang_code_to_id[tgt_lang] self.prefix_tokens = [] self.suffix_tokens = [self.eos_token_id, self.cur_lang_code_id] - - -class MBart50Tokenizer(PretrainedTokenizer): - resource_files_names = { - "vocab_file": "sentencepiece.bpe.model", - } - pretrained_resource_files_map = { - "vocab_file": { - "mbart-large-50-one-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-one-to-many-mmt.sentencepiece.bpe.model", - "mbart-large-50-many-to-one-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-one-mmt.sentencepiece.bpe.model", - "mbart-large-50-many-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-many-mmt.sentencepiece.bpe.model", - } - } - pretrained_init_configuration = { - "mbart-large-50-one-to-many-mmt": {}, - "mbart-large-50-many-to-one-mmt": {}, - "mbart-large-50-many-to-many-mmt": {}, - } - max_model_input_sizes = MBART50_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids"] - - FAIRSEQ_LANGUAGE_CODES = [ - "ar_AR", - "cs_CZ", - "de_DE", - "en_XX", - "es_XX", - "et_EE", - "fi_FI", - "fr_XX", - "gu_IN", - "hi_IN", - "it_IT", - "ja_XX", - "kk_KZ", - "ko_KR", - "lt_LT", - "lv_LV", - "my_MM", - "ne_NP", - "nl_XX", - "ro_RO", - "ru_RU", - "si_LK", - "tr_TR", - "vi_VN", - "zh_CN", - "af_ZA", - "az_AZ", - "bn_IN", - "fa_IR", - "he_IL", - "hr_HR", - "id_ID", - "ka_GE", - "km_KH", - "mk_MK", - "ml_IN", - "mn_MN", - "mr_IN", - "pl_PL", - "ps_AF", - "pt_XX", - "sv_SE", - "sw_KE", - "ta_IN", - "te_IN", - "th_TH", - "tl_XX", - "uk_UA", - "ur_PK", - "xh_ZA", - "gl_ES", - "sl_SI", - ] - - def __init__( - self, - vocab_file, - src_lang=None, - tgt_lang=None, - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - sp_model_kwargs=None, - additional_special_tokens=None, - **kwargs - ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - self._build_special_tokens_map_extended(mask_token=mask_token) - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.vocab_file = vocab_file - self.sp_model.Load(str(vocab_file)) - self.fairseq_offset = 1 - self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} - self.sp_model_size = len(self.sp_model) - self.lang_code_to_id = { - code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(self.FAIRSEQ_LANGUAGE_CODES) - } - self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset - self.fairseq_tokens_to_ids.update(self.lang_code_to_id) - self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - self.src_lang = src_lang if src_lang is not None else "en_XX" - self.tgt_lang = tgt_lang - # Get `special_tokens_map` after `_wrap_init()` - self.eos_token_id = self.fairseq_tokens_to_ids[eos_token] - self.unk_token_id = self.fairseq_tokens_to_ids[unk_token] - self.set_src_lang_special_tokens(self.src_lang) - self._additional_special_tokens = list(self.lang_code_to_id.keys()) - - if additional_special_tokens is not None: - # Only add those 
special tokens if they are not already there. - self._additional_special_tokens.extend( - [t for t in additional_special_tokens if t not in self._additional_special_tokens] - ) - - def __call__( - self, - text, - text_pair=None, - max_length=None, - stride=0, - is_split_into_words=False, - padding=None, - truncation="longest_first", - return_position_ids=False, - return_token_type_ids=False, - return_attention_mask=True, - return_length=False, - return_overflowing_tokens=False, - return_special_tokens_mask=False, - **kwargs - ): - if "pad_to_max_seq_len" in kwargs and padding is None: - pad_to_max_seq_len = kwargs.pop("pad_to_max_seq_len") - padding = "max_length" if pad_to_max_seq_len else False - elif padding is None: - padding = False - - if "max_seq_len" in kwargs and max_length is None: - max_length = kwargs["max_seq_len"] - - if "truncation_strategy" in kwargs and kwargs["truncation_strategy"] != "longest_first": - truncation = kwargs["truncation_strategy"] - - return super(MBart50Tokenizer, self).__call__( - text=text, - text_pair=text_pair, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - padding=padding, - truncation=truncation, - return_position_ids=return_position_ids, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_length=return_length, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - **kwargs, - ) - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - state["sp_model_proto"] = self.sp_model.serialized_model_proto() - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.LoadFromSerializedProto(self.sp_model_proto) - - def save_resources(self, save_directory): - for name, file_name in self.resource_files_names.items(): - save_path = os.path.join(save_directory, file_name) - if os.path.abspath(self.vocab_file) != os.path.abspath(save_path) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, save_path) - elif not os.path.isfile(self.vocab_file): - with open(save_path, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text): - return self.sp_model.encode(text, out_type=str) - - @property - def vocab_size(self): - """ - Returns the size of vocabulary. - - Returns: - int: The sum of size of vocabulary and the size of speical tokens. - - """ - - return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 - - def _convert_token_to_id(self, token): - """ - Converts a token (str) in an id using the vocab. - """ - if token in self.fairseq_tokens_to_ids: - return self.fairseq_tokens_to_ids[token] - spm_id = self.sp_model.PieceToId(token) - - return spm_id + self.fairseq_offset if spm_id else self.unk_token_id - - def _convert_id_to_token(self, index): - """ - Converts an index (integer) in a token (str) using the vocab. 
- """ - if index in self.fairseq_ids_to_tokens: - return self.fairseq_ids_to_tokens[index] - return self.sp_model.IdToPiece(index - self.fairseq_offset) - - def convert_tokens_to_string(self, tokens): - """ - Converts a sequence of tokens (strings for sub-words) in a single string. - """ - out_string = "".join(tokens).replace("▁", " ").strip() - return out_string - - def convert_ids_to_string(self, ids): - """ - Converts a sequence of tokens (strings for sub-words) in a single string. - """ - tokens = self.convert_ids_to_tokens(ids) - out_string = "".join(tokens).replace("▁", " ").strip() - return out_string - - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): - """ - Retrieve sequence ids from a token list that has no special tokens added. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An MBART50 sequence has the following format, where ``X`` represents the sequence: - - - ``input_ids`` (for encoder) ``[src_lang_code] X [eos]`` - - ``labels``: (for decoder) ``[tgt_lang_code] X [eos]`` - - BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a - separator. - """ - if token_ids_1 is None: - return self.prefix_tokens + token_ids_0 + self.suffix_tokens - # We don't expect to process pairs, but leave the pair logic for API consistency - return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens - - def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): - """ - Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. - - Should be overridden in a subclass if the model has a special way of building those. - - Args: - offset_mapping_0 (List[tuple]): - List of char offsets to which the special tokens will be added. - offset_mapping_1 (List[tuple], optional): - Optional second list of char offsets for offset mapping pairs. - - Returns: - List[tuple]: List of char offsets with the appropriate offsets of special tokens. - """ - if offset_mapping_1 is None: - return [(0, 0)] + offset_mapping_0 + [(0, 0)] - - return [(0, 0)] + offset_mapping_0 + offset_mapping_1 + [(0, 0)] - - def set_src_lang_special_tokens(self, src_lang): - """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" - self.cur_lang_code_id = self.lang_code_to_id[src_lang] - self.prefix_tokens = [self.cur_lang_code_id] - self.suffix_tokens = [self.eos_token_id] - - def set_tgt_lang_special_tokens(self, tgt_lang): - """Reset the special tokens to the target language setting. 
prefix=[tgt_lang_code] and suffix=[eos].""" - self.cur_lang_code_id = self.lang_code_to_id[tgt_lang] - self.prefix_tokens = [self.cur_lang_code_id] - self.suffix_tokens = [self.eos_token_id] - - def _build_translation_inputs(self, raw_inputs, return_tensors, src_lang, tgt_lang, **extra_kwargs): - """Used by translation pipeline, to prepare inputs for the generate function""" - if src_lang is None or tgt_lang is None: - raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") - self.src_lang = src_lang - inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) - tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) - inputs["forced_bos_token_id"] = tgt_lang_id - return inputs diff --git a/paddlenlp/transformers/mbart50/__init__.py b/paddlenlp/transformers/mbart50/__init__.py new file mode 100644 index 000000000000..97043fd7ba68 --- /dev/null +++ b/paddlenlp/transformers/mbart50/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlenlp/transformers/mbart50/tokenizer.py b/paddlenlp/transformers/mbart50/tokenizer.py new file mode 100644 index 000000000000..68d0520c548c --- /dev/null +++ b/paddlenlp/transformers/mbart50/tokenizer.py @@ -0,0 +1,353 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +from shutil import copyfile + +import sentencepiece as spm + +from .. 
import AddedToken, PretrainedTokenizer + +__all__ = ["MBart50Tokenizer"] + +MBART_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "mbart-large-cc25": 1024, + "mbart-large-en-ro": 1024, +} + +MBART50_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { + "mbart-large-50-one-to-many-mmt": 1024, + "mbart-large-50-many-to-one-mmt": 1024, + "mbart-large-50-many-to-many-mmt": 1024, +} + + +class MBart50Tokenizer(PretrainedTokenizer): + resource_files_names = { + "vocab_file": "sentencepiece.bpe.model", + } + pretrained_resource_files_map = { + "vocab_file": { + "mbart-large-50-one-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-one-to-many-mmt.sentencepiece.bpe.model", + "mbart-large-50-many-to-one-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-one-mmt.sentencepiece.bpe.model", + "mbart-large-50-many-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-many-mmt.sentencepiece.bpe.model", + } + } + pretrained_init_configuration = { + "mbart-large-50-one-to-many-mmt": {}, + "mbart-large-50-many-to-one-mmt": {}, + "mbart-large-50-many-to-many-mmt": {}, + } + max_model_input_sizes = MBART50_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids"] + + FAIRSEQ_LANGUAGE_CODES = [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI", + ] + + def __init__( + self, + vocab_file, + src_lang=None, + tgt_lang=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sp_model_kwargs=None, + additional_special_tokens=None, + **kwargs + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self._build_special_tokens_map_extended(mask_token=mask_token) + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.vocab_file = vocab_file + self.sp_model.Load(str(vocab_file)) + self.fairseq_offset = 1 + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + self.sp_model_size = len(self.sp_model) + self.lang_code_to_id = { + code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(self.FAIRSEQ_LANGUAGE_CODES) + } + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + self.fairseq_tokens_to_ids.update(self.lang_code_to_id) + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + self.src_lang = src_lang if src_lang is not None else "en_XX" + self.tgt_lang = tgt_lang + # Get `special_tokens_map` after `_wrap_init()` + self.eos_token_id = self.fairseq_tokens_to_ids[eos_token] + self.unk_token_id = self.fairseq_tokens_to_ids[unk_token] + self.set_src_lang_special_tokens(self.src_lang) + self._additional_special_tokens = list(self.lang_code_to_id.keys()) + + if additional_special_tokens is not None: + # Only add those special tokens if they are not already there. 
+ self._additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in self._additional_special_tokens] + ) + + def __call__( + self, + text, + text_pair=None, + max_length=None, + stride=0, + is_split_into_words=False, + padding=None, + truncation="longest_first", + return_position_ids=False, + return_token_type_ids=False, + return_attention_mask=True, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + **kwargs + ): + if "pad_to_max_seq_len" in kwargs and padding is None: + pad_to_max_seq_len = kwargs.pop("pad_to_max_seq_len") + padding = "max_length" if pad_to_max_seq_len else False + elif padding is None: + padding = False + + if "max_seq_len" in kwargs and max_length is None: + max_length = kwargs["max_seq_len"] + + if "truncation_strategy" in kwargs and kwargs["truncation_strategy"] != "longest_first": + truncation = kwargs["truncation_strategy"] + + return super(MBart50Tokenizer, self).__call__( + text=text, + text_pair=text_pair, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + padding=padding, + truncation=truncation, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def save_resources(self, save_directory): + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(self.vocab_file) != os.path.abspath(save_path) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, save_path) + elif not os.path.isfile(self.vocab_file): + with open(save_path, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + return self.sp_model.encode(text, out_type=str) + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. + + """ + + return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 + + def _convert_token_to_id(self, token): + """ + Converts a token (str) in an id using the vocab. + """ + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """ + Converts an index (integer) in a token (str) using the vocab. 
+ """ + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + out_string = "".join(tokens).replace("▁", " ").strip() + return out_string + + def convert_ids_to_string(self, ids): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + tokens = self.convert_ids_to_tokens(ids) + out_string = "".join(tokens).replace("▁", " ").strip() + return out_string + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieve sequence ids from a token list that has no special tokens added. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An MBART50 sequence has the following format, where ``X`` represents the sequence: + + - ``input_ids`` (for encoder) ``[src_lang_code] X [eos]`` + - ``labels``: (for decoder) ``[tgt_lang_code] X [eos]`` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + offset_mapping_1 + [(0, 0)] + + def set_src_lang_special_tokens(self, src_lang): + """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.lang_code_to_id[src_lang] + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + def set_tgt_lang_special_tokens(self, tgt_lang): + """Reset the special tokens to the target language setting. 
prefix=[tgt_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.lang_code_to_id[tgt_lang] + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + def _build_translation_inputs(self, raw_inputs, return_tensors, src_lang, tgt_lang, **extra_kwargs): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) + tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs diff --git a/paddlenlp/transformers/reformer/tokenizer.py b/paddlenlp/transformers/reformer/tokenizer.py index 6944bc258423..15e046a6591c 100644 --- a/paddlenlp/transformers/reformer/tokenizer.py +++ b/paddlenlp/transformers/reformer/tokenizer.py @@ -17,7 +17,7 @@ import sentencepiece as spm -from ..albert.tokenizer import AlbertEnglishTokenizer +from ..albert_english.tokenizer import AlbertEnglishTokenizer __all__ = ["ReformerTokenizer"] diff --git a/paddlenlp/transformers/t5/tokenizer.py b/paddlenlp/transformers/t5/tokenizer.py index 4fc3d60c7cfe..9066d1cbb748 100644 --- a/paddlenlp/transformers/t5/tokenizer.py +++ b/paddlenlp/transformers/t5/tokenizer.py @@ -18,7 +18,7 @@ import sentencepiece as spm -from ..albert.tokenizer import AlbertEnglishTokenizer +from ..albert_english.tokenizer import AlbertEnglishTokenizer __all__ = [ "T5Tokenizer", diff --git a/tests/transformers/albert/test_tokenizer.py b/tests/transformers/albert/test_tokenizer.py index 34c3f7302621..f6d24f1afe94 100644 --- a/tests/transformers/albert/test_tokenizer.py +++ b/tests/transformers/albert/test_tokenizer.py @@ -16,10 +16,8 @@ import os import unittest -from paddlenlp.transformers.albert.tokenizer import ( - AlbertChineseTokenizer, - AlbertEnglishTokenizer, -) +from paddlenlp.transformers.albert_chinese.tokenizer import AlbertChineseTokenizer +from paddlenlp.transformers.albert_english.tokenizer import AlbertEnglishTokenizer from paddlenlp.transformers.bert.tokenizer import BasicTokenizer, WordpieceTokenizer from ...testing_utils import get_tests_dir, slow From aad6750fa24396edfc461e6f3ed81f0cfeac26c3 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Sat, 28 Sep 2024 06:37:34 +0000 Subject: [PATCH 05/21] update docstring, add a RuntimeError when AutoTokenizer failed to load from pretrained, update method to get attr from a module --- paddlenlp/transformers/auto/factory.py | 18 +++++++++++++++++- paddlenlp/transformers/auto/tokenizer.py | 9 ++++++++- 2 files changed, 25 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/auto/factory.py b/paddlenlp/transformers/auto/factory.py index edffb27913b3..960ed741c655 100644 --- a/paddlenlp/transformers/auto/factory.py +++ b/paddlenlp/transformers/auto/factory.py @@ -79,7 +79,23 @@ def __getitem__(self, key): def _load_attr_from_module(self, model_type, attr): module_name = model_type_to_module_name(model_type) if module_name not in self._modules: - self._modules[module_name] = importlib.import_module(f".{module_name}", "paddlenlp.transformers") + if "Tokenizer" in model_type: + try: + self._modules[module_name] = importlib.import_module( + f".{module_name}.tokenizer", "paddlenlp.transformers" + ) + except ImportError: + pass + if module_name not in self._modules: + if "Config" in model_type: + try: + self._modules[module_name] 
= importlib.import_module( + f".{module_name}.configuration", "paddlenlp.transformers" + ) + except ImportError: + pass + if module_name not in self._modules: + self._modules[module_name] = importlib.import_module(f".{module_name}", "paddlenlp.transformers") return getattribute_from_module(self._modules[module_name], attr) def keys(self): diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 13bb26820573..3fc9add59cb2 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -338,7 +338,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): - Name of a community-contributed pretrained model. - Local directory path which contains tokenizer related resources and tokenizer config file ("tokenizer_config.json"). - *args (tuple): position arguments for model `__init__`. If provided, + *inputs (tuple): position arguments for model `__init__`. If provided, use these as position argument values for tokenizer initialization. **kwargs (dict): keyword arguments for model `__init__`. If provided, use these to update pre-defined keyword argument values for tokenizer @@ -411,3 +411,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed " "in order to use this tokenizer." ) + raise RuntimeError( + f"Can't load tokenizer for '{pretrained_model_name_or_path}'.\n" + f"Please make sure that '{pretrained_model_name_or_path}' is:\n" + "- a correct model-identifier of built-in pretrained models,\n" + "- or a correct model-identifier of community-contributed pretrained models,\n" + "- or the correct path to a directory containing relevant tokenizer files.\n" + ) From 04dff4d29599ff7e6948c2cafbc84d17bff0d113 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Sat, 28 Sep 2024 10:22:30 +0000 Subject: [PATCH 06/21] update albert_english/__init__.py and mbart/__init__.py --- paddlenlp/transformers/albert/__init__.py | 2 ++ paddlenlp/transformers/albert_chinese/__init__.py | 2 ++ paddlenlp/transformers/albert_english/__init__.py | 2 ++ paddlenlp/transformers/auto/configuration.py | 6 ------ paddlenlp/transformers/auto/tokenizer.py | 10 +++++++++- paddlenlp/transformers/mbart/__init__.py | 2 ++ paddlenlp/transformers/mbart50/__init__.py | 2 ++ 7 files changed, 19 insertions(+), 7 deletions(-) diff --git a/paddlenlp/transformers/albert/__init__.py b/paddlenlp/transformers/albert/__init__.py index 97043fd7ba68..86ecd1322521 100644 --- a/paddlenlp/transformers/albert/__init__.py +++ b/paddlenlp/transformers/albert/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .tokenizer import * diff --git a/paddlenlp/transformers/albert_chinese/__init__.py b/paddlenlp/transformers/albert_chinese/__init__.py index 97043fd7ba68..86ecd1322521 100644 --- a/paddlenlp/transformers/albert_chinese/__init__.py +++ b/paddlenlp/transformers/albert_chinese/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from .tokenizer import * diff --git a/paddlenlp/transformers/albert_english/__init__.py b/paddlenlp/transformers/albert_english/__init__.py index 97043fd7ba68..86ecd1322521 100644 --- a/paddlenlp/transformers/albert_english/__init__.py +++ b/paddlenlp/transformers/albert_english/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .tokenizer import * diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index 11c9ec139a62..35aaf2f198fe 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -34,8 +34,6 @@ CONFIG_MAPPING_NAMES = OrderedDict( [ ("albert", "AlbertConfig"), - ("albert_chinese", "AlbertConfig"), - ("albert_english", "AlbertConfig"), ("artist", "ArtistConfig"), ("bart", "BartConfig"), ("bert", "BertConfig"), @@ -84,7 +82,6 @@ ("luke", "LukeConfig"), ("mamba", "MambaConfig"), ("mbart", "MBartConfig"), - ("mbart50", "MBartConfig"), ("megatronbert", "MegatronBertConfig"), ("minigpt4", "MiniGPT4Config"), ("mistral", "MistralConfig"), @@ -126,8 +123,6 @@ # Base model mapping [ ("albert", "Albert"), - ("albert_chinese", "AlbertChinese"), - ("albert_english", "AlbertEnglish"), ("artist", "Artist"), ("bart", "Bart"), ("bert", "Bert"), @@ -176,7 +171,6 @@ ("luke", "Luke"), ("mamba", "Mamba"), ("mbart", "MBart"), - ("mbart50", "MBart50"), ("megatronbert", "MegatronBert"), ("minigpt4", "MiniGPT4"), ("mistral", "Mistral"), diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 3fc9add59cb2..3d2c8ed5c29b 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -400,7 +400,15 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): model_type = config_class_to_model_type(type(config).__name__) if model_type is not None: - tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)] + tokenizer_class_py = TOKENIZER_MAPPING[type(config)] + if isinstance(tokenizer_class_py, (list, tuple)): + if len(tokenizer_class_py) == 2: + tokenizer_class_fast = tokenizer_class_py[1] + tokenizer_class_py = tokenizer_class_py[0] + else: + tokenizer_class_fast = None + else: + tokenizer_class_fast = None if tokenizer_class_fast and (use_fast or tokenizer_class_py is None): return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) else: diff --git a/paddlenlp/transformers/mbart/__init__.py b/paddlenlp/transformers/mbart/__init__.py index 97043fd7ba68..86ecd1322521 100644 --- a/paddlenlp/transformers/mbart/__init__.py +++ b/paddlenlp/transformers/mbart/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +from .tokenizer import * diff --git a/paddlenlp/transformers/mbart50/__init__.py b/paddlenlp/transformers/mbart50/__init__.py index 97043fd7ba68..1c70e3d07945 100644 --- a/paddlenlp/transformers/mbart50/__init__.py +++ b/paddlenlp/transformers/mbart50/__init__.py @@ -11,3 +11,5 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+ +from .tokenizer import MBart50Tokenizer From 6475a83d2d309487cf9bf389f735e8c59c79c4b0 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Sat, 28 Sep 2024 11:45:57 +0000 Subject: [PATCH 07/21] fix typo, rm redundent notations --- .../source/paddlenlp.transformers.fnet.tokenizer.po | 2 +- paddlenlp/transformers/__init__.py | 1 - paddlenlp/utils/download/__init__.py | 3 --- 3 files changed, 1 insertion(+), 5 deletions(-) diff --git a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po index f3fe10d3a3eb..f4ff6d216744 100644 --- a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po +++ b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po @@ -26,7 +26,7 @@ msgid "Tokenization class for FNet model." msgstr "" #: of paddlenlp.transformers.fnet.tokenizer.FNetTokenizer:1 -msgid "基类::class:`paddlenlp.transformers.albert.albert_english.AlbertEnglishTokenizer`" +msgid "基类::class:`paddlenlp.transformers.albert_english.tokenizer.AlbertEnglishTokenizer`" msgstr "" #: of paddlenlp.transformers.fnet.tokenizer.FNetTokenizer:1 diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index be5ebba7d7e3..cd629e76bc1b 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -65,7 +65,6 @@ from .albert.modeling import * from .albert.tokenizer import * from .albert_chinese.tokenizer import * - from .albert_english.tokenizer import * from .bit.modeling import * from .bit.configuration import * diff --git a/paddlenlp/utils/download/__init__.py b/paddlenlp/utils/download/__init__.py index 366cf48a428a..e65333203f77 100644 --- a/paddlenlp/utils/download/__init__.py +++ b/paddlenlp/utils/download/__init__.py @@ -143,8 +143,6 @@ def resolve_file_path( continue else: pass - # 临时解决方案 - # raise FileNotFoundError(f"please make sure one of the {filenames} under the dir {repo_id}") # check cache for filename in filenames: @@ -275,7 +273,6 @@ def resolve_file_path( ) except EntryNotFoundError: return None - # raise EnvironmentError(f"Does not appear one of the {filenames} in {repo_id}.") except HTTPError as err: raise EnvironmentError(f"There was a specific connection error when trying to load {repo_id}:\n{err}") except ValueError: From dea3ad40f6842155de846bf71a5f32c02c125f40 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Fri, 11 Oct 2024 14:55:06 +0000 Subject: [PATCH 08/21] some changes... --- .../transformers/albert_chinese/__init__.py | 2 +- .../transformers/albert_english/__init__.py | 2 +- paddlenlp/transformers/auto/configuration.py | 4 ++-- paddlenlp/transformers/auto/tokenizer.py | 19 ++++++++----------- paddlenlp/transformers/fnet/tokenizer.py | 2 +- paddlenlp/transformers/mbart50/__init__.py | 2 +- paddlenlp/utils/download/__init__.py | 4 ++-- tests/transformers/llama/test_tokenizer.py | 2 +- 8 files changed, 17 insertions(+), 20 deletions(-) diff --git a/paddlenlp/transformers/albert_chinese/__init__.py b/paddlenlp/transformers/albert_chinese/__init__.py index 86ecd1322521..e16c5219a42b 100644 --- a/paddlenlp/transformers/albert_chinese/__init__.py +++ b/paddlenlp/transformers/albert_chinese/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. 
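With MBart50Tokenizer split out into its own paddlenlp.transformers.mbart50 package above, a minimal usage sketch follows. It assumes the built-in "mbart-large-50-many-to-many-mmt" resources resolve through the pretrained_resource_files_map shown earlier; the sample sentences and language pair are illustrative only, not part of the patch.

from paddlenlp.transformers.mbart50.tokenizer import MBart50Tokenizer

# Downloads the sentencepiece model registered for this identifier and
# initializes the tokenizer with English as the source language.
tokenizer = MBart50Tokenizer.from_pretrained(
    "mbart-large-50-many-to-many-mmt", src_lang="en_XX"
)

# Encoder inputs follow the [src_lang_code] X [eos] layout described in
# build_inputs_with_special_tokens.
encoded = tokenizer("Hello world")
print(encoded["input_ids"])

# Switch the prefix/suffix to the target-language setting before encoding labels.
tokenizer.set_tgt_lang_special_tokens("ro_RO")
labels = tokenizer("Buna ziua")["input_ids"]

AutoTokenizer would be expected to dispatch to this same class through the mbart50 entry added to TOKENIZER_MAPPING_NAMES, provided the saved tokenizer config names it; that path is not exercised in this sketch.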
diff --git a/paddlenlp/transformers/albert_english/__init__.py b/paddlenlp/transformers/albert_english/__init__.py index 86ecd1322521..e16c5219a42b 100644 --- a/paddlenlp/transformers/albert_english/__init__.py +++ b/paddlenlp/transformers/albert_english/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index 35aaf2f198fe..36aedf7ee90e 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -234,13 +234,13 @@ def __getitem__(self, key): value = self._mapping[key] module_name = model_type_to_module_name(key) if module_name not in self._modules: - self._modules[module_name] = importlib.import_module(f".{module_name}", "transformers.models") + self._modules[module_name] = importlib.import_module(f".{module_name}", "paddlenlp.transformers") if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value) # Some of the mappings have entries model_type -> config of another model type. In that case we try to grab the # object at the top level. - transformers_module = importlib.import_module("transformers") + transformers_module = importlib.import_module("paddlenlp") return getattr(transformers_module, value) def keys(self): diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 3d2c8ed5c29b..2874e2de9e48 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -19,20 +19,19 @@ from collections import OrderedDict from typing import TYPE_CHECKING, Dict, Optional, Tuple, Union -from paddlenlp.transformers.auto.configuration import ( +from ...utils import is_tokenizers_available +from ...utils.download import resolve_file_path +from ...utils.import_utils import import_module +from ...utils.log import logger +from ..configuration_utils import PretrainedConfig +from ..tokenizer_utils_base import TOKENIZER_CONFIG_FILE +from ..tokenizer_utils_fast import PretrainedTokenizerFast +from .configuration import ( CONFIG_MAPPING_NAMES, AutoConfig, config_class_to_model_type, model_type_to_module_name, ) -from paddlenlp.transformers.configuration_utils import PretrainedConfig -from paddlenlp.transformers.tokenizer_utils_base import TOKENIZER_CONFIG_FILE -from paddlenlp.transformers.tokenizer_utils_fast import PretrainedTokenizerFast - -from ...utils import is_tokenizers_available -from ...utils.download import resolve_file_path -from ...utils.import_utils import import_module -from ...utils.log import logger from .factory import _LazyAutoMapping __all__ = [ @@ -40,8 +39,6 @@ ] if TYPE_CHECKING: - # This significantly improves completion suggestion performance when - # the transformers package is used with Microsoft's Pylance language server. 
TOKENIZER_MAPPING_NAMES: OrderedDict[str, Tuple[Optional[str], Optional[str]]] = OrderedDict() else: TOKENIZER_MAPPING_NAMES = OrderedDict( diff --git a/paddlenlp/transformers/fnet/tokenizer.py b/paddlenlp/transformers/fnet/tokenizer.py index a43274b86232..786207c83db7 100644 --- a/paddlenlp/transformers/fnet/tokenizer.py +++ b/paddlenlp/transformers/fnet/tokenizer.py @@ -17,8 +17,8 @@ import sentencepiece as spm -from ..albert.tokenizer import AddedToken from ..albert_english.tokenizer import AlbertEnglishTokenizer +from ..tokenizer_utils_base import AddedToken __all__ = ["FNetTokenizer"] diff --git a/paddlenlp/transformers/mbart50/__init__.py b/paddlenlp/transformers/mbart50/__init__.py index 1c70e3d07945..c4ea9058ff63 100644 --- a/paddlenlp/transformers/mbart50/__init__.py +++ b/paddlenlp/transformers/mbart50/__init__.py @@ -1,4 +1,4 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/paddlenlp/utils/download/__init__.py b/paddlenlp/utils/download/__init__.py index e65333203f77..a508f1177070 100644 --- a/paddlenlp/utils/download/__init__.py +++ b/paddlenlp/utils/download/__init__.py @@ -142,7 +142,7 @@ def resolve_file_path( elif index < len(filenames) - 1: continue else: - pass + raise FileNotFoundError(f"please make sure one of the {filenames} under the dir {repo_id}") # check cache for filename in filenames: @@ -272,7 +272,7 @@ def resolve_file_path( f"'{log_endpoint}' for available revisions." ) except EntryNotFoundError: - return None + raise EnvironmentError(f"Does not appear one of the {filenames} in {repo_id}.") except HTTPError as err: raise EnvironmentError(f"There was a specific connection error when trying to load {repo_id}:\n{err}") except ValueError: diff --git a/tests/transformers/llama/test_tokenizer.py b/tests/transformers/llama/test_tokenizer.py index fdbe4b4c9a49..b221aacb4e9b 100644 --- a/tests/transformers/llama/test_tokenizer.py +++ b/tests/transformers/llama/test_tokenizer.py @@ -22,7 +22,7 @@ from paddlenlp.transformers.tokenizer_utils import PretrainedTokenizer from paddlenlp.transformers.tokenizer_utils_fast import PretrainedTokenizerFast -from ...transformers.test_tokenizer_common import TokenizerTesterMixin +from ..test_tokenizer_common import TokenizerTesterMixin VOCAB_FILES_NAMES = { "vocab_file": "vocab.json", From f5ae7943b2f3039ea5fdfaf2cb1e97a1c0e54ee5 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Fri, 11 Oct 2024 17:03:11 +0000 Subject: [PATCH 09/21] AutoTokenizer will not load TokenzierFast by default --- paddlenlp/transformers/auto/configuration.py | 4 ++++ paddlenlp/transformers/auto/tokenizer.py | 2 +- tests/transformers/llama/test_tokenizer.py | 4 +++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index 36aedf7ee90e..8912a4d95174 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -213,6 +213,10 @@ def config_class_to_model_type(config): for key, cls in CONFIG_MAPPING_NAMES.items(): if cls == config: return key + # if key not found check in extra content + for key, cls in CONFIG_MAPPING._extra_content.items(): + if cls.__name__ == config: + return key return None diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py 
index 2874e2de9e48..bc8974055ef6 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -367,7 +367,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): config = kwargs.pop("config", None) kwargs["_from_auto"] = True - use_fast = kwargs.pop("use_fast", True) + use_fast = kwargs.pop("use_fast", False) tokenizer_type = kwargs.pop("tokenizer_type", None) if tokenizer_type is not None: # TODO: Support tokenizer_type diff --git a/tests/transformers/llama/test_tokenizer.py b/tests/transformers/llama/test_tokenizer.py index b221aacb4e9b..ad567815abfa 100644 --- a/tests/transformers/llama/test_tokenizer.py +++ b/tests/transformers/llama/test_tokenizer.py @@ -258,13 +258,14 @@ def test_tiktoken_llama(self): add_bos_token=True, add_eos_token=True, from_hf_hub=True, + use_fast=True, ) self.assertTrue(isinstance(tiktoken_tokenizer, PretrainedTokenizerFast)) tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)["input_ids"] self.assertEqual(tokens, test_tokens) tmpdirname = tempfile.mkdtemp() tiktoken_tokenizer.save_pretrained(tmpdirname) - tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname) + tokenizer_reload = AutoTokenizer.from_pretrained(tmpdirname, use_fast=True) self.assertTrue(isinstance(tokenizer_reload, PretrainedTokenizerFast)) tokens = tokenizer_reload.encode(test_text, add_special_tokens=True)["input_ids"] self.assertEqual(tokens, test_tokens) @@ -279,6 +280,7 @@ def test_tiktoken_llama(self): add_bos_token=True, add_eos_token=True, from_hf_hub=True, + use_fast=True, ) tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)["input_ids"] self.assertEqual(tokens, test_tokens) From ce684a1a4fabddf95c29abcf77b061dfc83faa04 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Fri, 11 Oct 2024 17:51:47 +0000 Subject: [PATCH 10/21] Add test for external config --- paddlenlp/transformers/auto/configuration.py | 35 +++++++++++++++++-- tests/transformers/auto/test_confiugration.py | 31 ++++++++++++++++ 2 files changed, 63 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/auto/configuration.py b/paddlenlp/transformers/auto/configuration.py index 8912a4d95174..ff89b81d5cc2 100644 --- a/paddlenlp/transformers/auto/configuration.py +++ b/paddlenlp/transformers/auto/configuration.py @@ -238,7 +238,9 @@ def __getitem__(self, key): value = self._mapping[key] module_name = model_type_to_module_name(key) if module_name not in self._modules: - self._modules[module_name] = importlib.import_module(f".{module_name}", "paddlenlp.transformers") + self._modules[module_name] = importlib.import_module( + f".{module_name}.configuration", "paddlenlp.transformers" + ) if hasattr(self._modules[module_name], value): return getattr(self._modules[module_name], value) @@ -440,14 +442,24 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar from_hf_hub=from_hf_hub, from_aistudio=from_aistudio, ) - if config_file is not None and os.path.exists(config_file): + config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) + if "model_type" in config_dict: + try: + config_class = CONFIG_MAPPING[config_dict["model_type"]] + except KeyError: + raise ValueError( + f"The checkpoint you are trying to load has model type `{config_dict['model_type']}` " + "but Transformers does not recognize this architecture. This could be because of an " + "issue with the checkpoint, or because your version of Transformers is out of date." 
+ ) + return config_class.from_dict(config_dict, **unused_kwargs) + elif "model_type" not in config_dict and config_file is not None and os.path.exists(config_file): config_class = cls._get_config_class_from_config(pretrained_model_name_or_path, config_file) logger.info("We are using %s to load '%s'." % (config_class, pretrained_model_name_or_path)) if config_class is cls: return cls.from_file(config_file) return config_class.from_pretrained(config_file, *model_args, **kwargs) elif config_file is None: - config_dict, unused_kwargs = PretrainedConfig.get_config_dict(pretrained_model_name_or_path, **kwargs) # Fallback: use pattern matching on the string. # We go from longer names to shorter names to catch roberta before bert (for instance) for pattern in sorted(CONFIG_MAPPING.keys(), key=len, reverse=True): @@ -461,3 +473,20 @@ def from_pretrained(cls, pretrained_model_name_or_path: str, *model_args, **kwar "- or a correct model-identifier of community-contributed pretrained models,\n" "- or the correct path to a directory containing relevant config files.\n" ) + + @staticmethod + def register(model_type, config, exist_ok=False): + """ + Register a new configuration for this class. + + Args: + model_type (`str`): The model type like "bert" or "gpt". + config ([`PretrainedConfig`]): The config to register. + """ + if issubclass(config, PretrainedConfig) and config.model_type != model_type: + raise ValueError( + "The config you are passing has a `model_type` attribute that is not consistent with the model type " + f"you passed (config has {config.model_type} and you passed {model_type}. Fix one of those so they " + "match!" + ) + CONFIG_MAPPING.register(model_type, config, exist_ok=exist_ok) diff --git a/tests/transformers/auto/test_confiugration.py b/tests/transformers/auto/test_confiugration.py index e58b793cc78a..676d174412f5 100644 --- a/tests/transformers/auto/test_confiugration.py +++ b/tests/transformers/auto/test_confiugration.py @@ -21,6 +21,9 @@ import unittest from paddlenlp.transformers import AutoConfig +from paddlenlp.transformers.auto.configuration import CONFIG_MAPPING +from paddlenlp.transformers.bert.configuration import BertConfig +from paddlenlp.transformers.configuration_utils import PretrainedConfig from paddlenlp.utils.env import CONFIG_NAME @@ -86,6 +89,34 @@ def test_load_from_legacy_config(self): auto_config = AutoConfig.from_pretrained(tempdir) self.assertEqual(auto_config.hidden_size, number) + def test_new_config_registration(self): + class CustomConfig(PretrainedConfig): + model_type = "custom" + + def __init__(self, attribute=1, **kwargs): + self.attribute = attribute + super().__init__(**kwargs) + + try: + AutoConfig.register("custom", CustomConfig) + # Wrong model type will raise an error + with self.assertRaises(ValueError): + AutoConfig.register("model", CustomConfig) + # Trying to register something existing in the Transformers library will raise an error + with self.assertRaises(ValueError): + AutoConfig.register("bert", BertConfig) + + # Now that the config is registered, it can be used as any other config with the auto-API + config = CustomConfig() + with tempfile.TemporaryDirectory() as tmp_dir: + config.save_pretrained(tmp_dir) + new_config = AutoConfig.from_pretrained(tmp_dir) + self.assertIsInstance(new_config, CustomConfig) + + finally: + if "custom" in CONFIG_MAPPING._extra_content: + del CONFIG_MAPPING._extra_content["custom"] + def test_from_pretrained_cache_dir(self): model_id = "__internal_testing__/tiny-random-bert" with 
tempfile.TemporaryDirectory() as tempdir: From 75368d5dd20a87f645e51d74cc53ca3774081304 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Sat, 12 Oct 2024 01:23:40 +0000 Subject: [PATCH 11/21] revert unnecrssary changes --- tests/transformers/test_chat_template.py | 8 ++++---- tests/transformers/test_modeling_common.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/tests/transformers/test_chat_template.py b/tests/transformers/test_chat_template.py index a76ba40fa543..4e443b54a2e2 100644 --- a/tests/transformers/test_chat_template.py +++ b/tests/transformers/test_chat_template.py @@ -97,7 +97,7 @@ def test_inference_template(self): class ChatTemplateIntegrationTest(unittest.TestCase): def test_linlyai_chinese_llama_2_chat_template(self): - tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b", use_fast=False) + tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b") query = "你好" final_query = tokenizer.apply_chat_template(query, tokenize=False) expected_query = f"### Instruction:{query} ### Response:" @@ -110,7 +110,7 @@ def test_linlyai_chinese_llama_2_chat_template(self): self.assertEqual(final_query, expected_query) def test_linlyai_chinese_llama_2_chat_template_with_none_saved(self): - tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b", use_fast=False) + tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b") tokenizer.chat_template = None with tempfile.TemporaryDirectory() as tempdir: tokenizer.save_pretrained(tempdir) @@ -182,7 +182,7 @@ def get_common_prefix(self, tokenizer): def test_prefix(self): prompt = "欢迎使用 PaddleNLP 大模型开发套件" - tokenizer = AutoTokenizer.from_pretrained(self.model_name, use_fast=False) + tokenizer = AutoTokenizer.from_pretrained(self.model_name) result = tokenizer.apply_chat_template(prompt, tokenize=False) result_ids = tokenizer(result, add_special_tokens=False)["input_ids"] @@ -230,7 +230,7 @@ def test_must_have_system(self): def test_at_least_one_turn(self): query = [["你好", "您好,我是个人人工智能助手"], ["今天吃啥", "你可以选择不同的菜系"]] - tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b", use_fast=False) + tokenizer = AutoTokenizer.from_pretrained("linly-ai/chinese-llama-2-7b") # tokenizer.init_chat_template(self.chat_template_config_file) # get all query sentence diff --git a/tests/transformers/test_modeling_common.py b/tests/transformers/test_modeling_common.py index 4e56a63fb220..af1a4845459f 100644 --- a/tests/transformers/test_modeling_common.py +++ b/tests/transformers/test_modeling_common.py @@ -929,7 +929,7 @@ def tearDown(self): @unittest.skip("Paddle enable PIR API in Python") def test_to_static_use_top_k(self): tokenizer = self.TokenizerClass.from_pretrained(self.internal_testing_model) - if "LlamaTokenizer" in tokenizer.__class__.__name__: + if tokenizer.__class__.__name__ == "LlamaTokenizer": tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "" model = self.CausalLMClass.from_pretrained(self.internal_testing_model) model_kwargs = tokenizer( @@ -1009,7 +1009,7 @@ def test_to_static_use_top_k(self): @unittest.skip("Paddle enable PIR API in Python") def test_to_static_use_top_p(self): tokenizer = self.TokenizerClass.from_pretrained(self.internal_testing_model) - if "LlamaTokenizer" in tokenizer.__class__.__name__: + if tokenizer.__class__.__name__ == "LlamaTokenizer": tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "" model = self.CausalLMClass.from_pretrained(self.internal_testing_model) From 
469ffbf917cfc62e6dd61a742570f2f20a3e068d Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Sat, 12 Oct 2024 09:27:07 +0800 Subject: [PATCH 12/21] Update test_modeling_common.py --- tests/transformers/test_modeling_common.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/transformers/test_modeling_common.py b/tests/transformers/test_modeling_common.py index af1a4845459f..51e8745fcb33 100644 --- a/tests/transformers/test_modeling_common.py +++ b/tests/transformers/test_modeling_common.py @@ -931,6 +931,7 @@ def test_to_static_use_top_k(self): tokenizer = self.TokenizerClass.from_pretrained(self.internal_testing_model) if tokenizer.__class__.__name__ == "LlamaTokenizer": tokenizer.pad_token = tokenizer.eos_token if tokenizer.eos_token else "" + model = self.CausalLMClass.from_pretrained(self.internal_testing_model) model_kwargs = tokenizer( self.article, From ee33fba76361e74546595d7604b93c7c3b05064a Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Sat, 12 Oct 2024 02:59:52 +0000 Subject: [PATCH 13/21] fix --- paddlenlp/transformers/auto/tokenizer.py | 10 +++++----- paddlenlp/utils/download/__init__.py | 1 - tests/transformers/auto/test_confiugration.py | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index bc8974055ef6..5f4d8e514e3c 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -321,7 +321,7 @@ def _get_tokenizer_class_from_config(cls, pretrained_model_name_or_path, config_ return tokenizer_class @classmethod - def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): + def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): """ Creates an instance of `AutoTokenizer`. Related resources are loaded by specifying name of a built-in pretrained model, or a community-contributed @@ -335,7 +335,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): - Name of a community-contributed pretrained model. - Local directory path which contains tokenizer related resources and tokenizer config file ("tokenizer_config.json"). - *inputs (tuple): position arguments for model `__init__`. If provided, + *model_args (tuple): position arguments for model `__init__`. If provided, use these as position argument values for tokenizer initialization. **kwargs (dict): keyword arguments for model `__init__`. If provided, use these to update pre-defined keyword argument values for tokenizer @@ -391,7 +391,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): raise ValueError( f"Tokenizer class {tokenizer_class_candidate} does not exist or is not currently imported." 
) - return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) # TODO: if model is an encoder decoder @@ -407,10 +407,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs): else: tokenizer_class_fast = None if tokenizer_class_fast and (use_fast or tokenizer_class_py is None): - return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) else: if tokenizer_class_py is not None: - return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs) + return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) else: raise ValueError( "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed " diff --git a/paddlenlp/utils/download/__init__.py b/paddlenlp/utils/download/__init__.py index a508f1177070..6f5dad5c8889 100644 --- a/paddlenlp/utils/download/__init__.py +++ b/paddlenlp/utils/download/__init__.py @@ -336,7 +336,6 @@ def bos_aistudio_hf_file_exist( token=token, # donot need token endpoint=endpoint, ) - return out diff --git a/tests/transformers/auto/test_confiugration.py b/tests/transformers/auto/test_confiugration.py index 676d174412f5..eea37fd93a86 100644 --- a/tests/transformers/auto/test_confiugration.py +++ b/tests/transformers/auto/test_confiugration.py @@ -102,7 +102,7 @@ def __init__(self, attribute=1, **kwargs): # Wrong model type will raise an error with self.assertRaises(ValueError): AutoConfig.register("model", CustomConfig) - # Trying to register something existing in the Transformers library will raise an error + # Trying to register something existing in the PaddleNLP library will raise an error with self.assertRaises(ValueError): AutoConfig.register("bert", BertConfig) From 353fb4195c0e1701db8f6c9905a63e4a86ad910d Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Thu, 17 Oct 2024 20:40:38 +0800 Subject: [PATCH 14/21] rm redundent print --- paddlenlp/transformers/auto/tokenizer.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 5f4d8e514e3c..de388dcf4128 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -143,13 +143,11 @@ def tokenizer_class_from_name(class_name: str): for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): if class_name in tokenizers: module_name = model_type_to_module_name(module_name) - print(f"module_name: {module_name}") try: module = importlib.import_module(f".{module_name}", "paddlenlp.transformers") return getattr(module, class_name) except AttributeError: try: - print(f"module: {module}") module = importlib.import_module(f".{module_name}.tokenizer", "paddlenlp.transformers") return getattr(module, class_name) From d279d8d399000d3234dd9720383a835d812bf827 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Thu, 17 Oct 2024 15:44:56 +0000 Subject: [PATCH 15/21] revert some changes --- .../paddlenlp.transformers.fnet.tokenizer.po | 2 +- ...ddlenlp.transformers.reformer.tokenizer.po | 2 +- .../paddlenlp.transformers.t5.tokenizer.po | 2 +- paddlenlp/transformers/__init__.py | 3 - paddlenlp/transformers/albert/tokenizer.py | 330 +++++++++++++++- .../transformers/albert_chinese/__init__.py | 15 - 
.../transformers/albert_chinese/tokenizer.py | 105 ------ .../transformers/albert_english/__init__.py | 15 - .../transformers/albert_english/tokenizer.py | 263 ------------- paddlenlp/transformers/bigbird/tokenizer.py | 2 +- paddlenlp/transformers/fnet/tokenizer.py | 3 +- paddlenlp/transformers/mbart/tokenizer.py | 322 +++++++++++++++- paddlenlp/transformers/mbart50/__init__.py | 15 - paddlenlp/transformers/mbart50/tokenizer.py | 353 ------------------ paddlenlp/transformers/reformer/tokenizer.py | 2 +- paddlenlp/transformers/t5/tokenizer.py | 2 +- tests/transformers/albert/test_tokenizer.py | 6 +- 17 files changed, 659 insertions(+), 783 deletions(-) delete mode 100644 paddlenlp/transformers/albert_chinese/__init__.py delete mode 100644 paddlenlp/transformers/albert_chinese/tokenizer.py delete mode 100644 paddlenlp/transformers/albert_english/__init__.py delete mode 100644 paddlenlp/transformers/albert_english/tokenizer.py delete mode 100644 paddlenlp/transformers/mbart50/__init__.py delete mode 100644 paddlenlp/transformers/mbart50/tokenizer.py diff --git a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po index f4ff6d216744..79004b0383ca 100644 --- a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po +++ b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.fnet.tokenizer.po @@ -26,7 +26,7 @@ msgid "Tokenization class for FNet model." msgstr "" #: of paddlenlp.transformers.fnet.tokenizer.FNetTokenizer:1 -msgid "基类::class:`paddlenlp.transformers.albert_english.tokenizer.AlbertEnglishTokenizer`" +msgid "基类::class:`paddlenlp.transformers.albert.tokenizer.AlbertEnglishTokenizer`" msgstr "" #: of paddlenlp.transformers.fnet.tokenizer.FNetTokenizer:1 diff --git a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.reformer.tokenizer.po b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.reformer.tokenizer.po index efec03c6bf05..a1dadaf14a90 100644 --- a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.reformer.tokenizer.po +++ b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.reformer.tokenizer.po @@ -22,7 +22,7 @@ msgid "tokenizer" msgstr "" #: of paddlenlp.transformers.reformer.tokenizer.ReformerTokenizer:1 -msgid "基类::class:`paddlenlp.transformers.albert_english.tokenizer.AlbertEnglishTokenizer`" +msgid "基类::class:`paddlenlp.transformers.albert.tokenizer.AlbertEnglishTokenizer`" msgstr "" #: of paddlenlp.transformers.reformer.tokenizer.ReformerTokenizer:1 diff --git a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.t5.tokenizer.po b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.t5.tokenizer.po index b809e680fcd4..2023df559055 100644 --- a/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.t5.tokenizer.po +++ b/docs/locale/en/LC_MESSAGES/source/paddlenlp.transformers.t5.tokenizer.po @@ -22,7 +22,7 @@ msgid "tokenizer" msgstr "" #: of paddlenlp.transformers.t5.tokenizer.T5Tokenizer:1 -msgid "基类::class:`paddlenlp.transformers.albert_english.tokenizer.AlbertEnglishTokenizer`" +msgid "基类::class:`paddlenlp.transformers.albert.tokenizer.AlbertEnglishTokenizer`" msgstr "" #: of paddlenlp.transformers.t5.tokenizer.T5Tokenizer:1 diff --git a/paddlenlp/transformers/__init__.py b/paddlenlp/transformers/__init__.py index cd629e76bc1b..c8bf3a0aecde 100644 --- a/paddlenlp/transformers/__init__.py +++ b/paddlenlp/transformers/__init__.py @@ -64,8 +64,6 @@ from .albert.configuration import * from .albert.modeling import * 
from .albert.tokenizer import * -from .albert_chinese.tokenizer import * -from .albert_english.tokenizer import * from .bit.modeling import * from .bit.configuration import * from .bit.image_processing import * @@ -143,7 +141,6 @@ from .mbart.modeling import * from .mbart.tokenizer import * from .mbart.configuration import * -from .mbart50.tokenizer import * from .megatronbert.modeling import * from .megatronbert.tokenizer import * from .megatronbert.configuration import * diff --git a/paddlenlp/transformers/albert/tokenizer.py b/paddlenlp/transformers/albert/tokenizer.py index 903d36361e73..a7d80d0b2457 100644 --- a/paddlenlp/transformers/albert/tokenizer.py +++ b/paddlenlp/transformers/albert/tokenizer.py @@ -14,9 +14,13 @@ # limitations under the License. """Tokenization class for ALBERT model.""" -from .. import AddedToken, PretrainedTokenizer -from ..albert_chinese.tokenizer import AlbertChineseTokenizer -from ..albert_english.tokenizer import AlbertEnglishTokenizer +import os +import unicodedata +from shutil import copyfile + +import sentencepiece as spm + +from .. import PretrainedTokenizer, BertTokenizer, AddedToken __all__ = ["AlbertTokenizer"] @@ -475,3 +479,323 @@ def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): def save_resources(self, save_directory): return self.tokenizer.save_resources(save_directory) + + +class AlbertEnglishTokenizer(PretrainedTokenizer): + resource_files_names = { + "sentencepiece_model_file": "spiece.model", + } + + pretrained_resource_files_map = { + "sentencepiece_model_file": { + "albert-base-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v1.spiece.model", + "albert-large-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v1.spiece.model", + "albert-xlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v1.spiece.model", + "albert-xxlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v1.spiece.model", + "albert-base-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v2.spiece.model", + "albert-large-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v2.spiece.model", + "albert-xlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v2.spiece.model", + "albert-xxlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v2.spiece.model", + }, + } + + pretrained_init_configuration = { + "albert-base-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-large-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xlarge-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xxlarge-v1": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-base-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-large-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xlarge-v2": { + "do_lower_case": True, + "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + "albert-xxlarge-v2": { + "do_lower_case": True, 
+ "remove_space": True, + "keep_accents": False, + "unk_token": "", + "pad_token": "", + }, + } + max_model_input_sizes = { + "albert-base-v1": 512, + "albert-large-v1": 512, + "albert-xlarge-v1": 512, + "albert-xxlarge-v1": 512, + "albert-base-v2": 512, + "albert-large-v2": 512, + "albert-xlarge-v2": 512, + "albert-xxlarge-v2": 512, + } + + def __init__( + self, + sentencepiece_model_file, + do_lower_case=True, + remove_space=True, + keep_accents=False, + bos_token="[CLS]", + eos_token="[SEP]", + unk_token="", + sep_token="[SEP]", + pad_token="", + cls_token="[CLS]", + mask_token="[MASK]", + sp_model_kwargs=None, + **kwargs + ): + + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + self.do_lower_case = do_lower_case + self.remove_space = remove_space + self.keep_accents = keep_accents + self.sentencepiece_model_file = sentencepiece_model_file + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(sentencepiece_model_file) + + @property + def vocab_size(self): + return len(self.sp_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + return state + + def __setstate__(self, d): + self.__dict__ = d + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.Load(self.sentencepiece_model_file) + + def preprocess_text(self, inputs): + if self.remove_space: + outputs = " ".join(inputs.strip().split()) + else: + outputs = inputs + outputs = outputs.replace("``", '"').replace("''", '"') + + if not self.keep_accents: + outputs = unicodedata.normalize("NFKD", outputs) + outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) + if self.do_lower_case: + outputs = outputs.lower() + + return outputs + + def _tokenize(self, text): + """Tokenize a string.""" + text = self.preprocess_text(text) + pieces = self.sp_model.encode(text, out_type=str) + new_pieces = [] + for piece in pieces: + if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): + cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) + if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: + if len(cur_pieces[0]) == 1: + cur_pieces = cur_pieces[1:] + else: + cur_pieces[0] = cur_pieces[0][1:] + cur_pieces.append(piece[-1]) + new_pieces.extend(cur_pieces) + else: + new_pieces.append(piece) + + return new_pieces + + def _convert_token_to_id(self, token): + """Converts a token (str) to an id using the vocab.""" + return self.sp_model.PieceToId(token) + + def _convert_id_to_token(self, index): + """Converts an index (integer) to a token (str) using the vocab.""" + return self.sp_model.IdToPiece(index) + + def convert_tokens_to_string(self, tokens): + """Converts a sequence of tokens (strings for sub-words) in a single string.""" + out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() + return out_string + + def num_special_tokens_to_add(self, pair=False): + token_ids_0 = [] + token_ids_1 = [] + return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + sep = [self.sep_token_id] + cls = [self.cls_token_id] + if token_ids_1 is None: + return cls + token_ids_0 + sep + return cls + 
token_ids_0 + sep + token_ids_1 + sep + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + + if already_has_special_tokens: + if token_ids_1 is not None: + raise ValueError( + "You should not supply a second sequence if the provided sequence of " + "ids is already formatted with special tokens for the model." + ) + return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) + + if token_ids_1 is not None: + return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] + return [1] + ([0] * len(token_ids_0)) + [1] + + def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): + sep = [self.sep_token_id] + cls = [self.cls_token_id] + + if token_ids_1 is None: + return len(cls + token_ids_0 + sep) * [0] + return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] + + def save_resources(self, save_directory): + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(self.sentencepiece_model_file) != os.path.abspath(save_path) and os.path.isfile( + self.sentencepiece_model_file + ): + copyfile(self.sentencepiece_model_file, save_path) + elif not os.path.isfile(self.sentencepiece_model_file): + with open(save_path, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + +class AlbertChineseTokenizer(BertTokenizer): + resource_files_names = {"vocab_file": "vocab.txt"} + pretrained_resource_files_map = { + "vocab_file": { + "albert-chinese-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-tiny.vocab.txt", + "albert-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-small.vocab.txt", + "albert-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-base.vocab.txt", + "albert-chinese-large": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-large.vocab.txt", + "albert-chinese-xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xlarge.vocab.txt", + "albert-chinese-xxlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xxlarge.vocab.txt", + } + } + pretrained_init_configuration = { + "albert-chinese-tiny": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-small": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-base": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-large": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-xlarge": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + "albert-chinese-xxlarge": { + "do_lower_case": False, + "unk_token": "[UNK]", + "pad_token": "[PAD]", + }, + } + max_model_input_sizes = { + "albert-chinese-tiny": 512, + "albert-chinese-small": 512, + "albert-chinese-base": 512, + "albert-chinese-large": 512, + "albert-chinese-xlarge": 512, + "albert-chinese-xxlarge": 512, + } + + def __init__( + self, + vocab_file, + do_lower_case=True, 
+ do_basic_tokenize=True, + never_split=None, + unk_token="[UNK]", + sep_token="[SEP]", + pad_token="[PAD]", + cls_token="[CLS]", + mask_token="[MASK]", + tokenize_chinese_chars=True, + strip_accents=None, + **kwargs + ): + super(AlbertChineseTokenizer, self).__init__( + vocab_file, + do_lower_case=do_lower_case, + do_basic_tokenize=do_basic_tokenize, + never_split=never_split, + unk_token=unk_token, + sep_token=sep_token, + pad_token=pad_token, + cls_token=cls_token, + mask_token=mask_token, + tokenize_chinese_chars=tokenize_chinese_chars, + strip_accents=strip_accents, + **kwargs, + ) diff --git a/paddlenlp/transformers/albert_chinese/__init__.py b/paddlenlp/transformers/albert_chinese/__init__.py deleted file mode 100644 index e16c5219a42b..000000000000 --- a/paddlenlp/transformers/albert_chinese/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .tokenizer import * diff --git a/paddlenlp/transformers/albert_chinese/tokenizer.py b/paddlenlp/transformers/albert_chinese/tokenizer.py deleted file mode 100644 index 9cdd86a41554..000000000000 --- a/paddlenlp/transformers/albert_chinese/tokenizer.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization class for ALBERT model.""" - -from .. 
import BertTokenizer - -__all__ = ["AlbertChineseTokenizer"] - -SPIECE_UNDERLINE = "▁" - - -class AlbertChineseTokenizer(BertTokenizer): - resource_files_names = {"vocab_file": "vocab.txt"} - pretrained_resource_files_map = { - "vocab_file": { - "albert-chinese-tiny": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-tiny.vocab.txt", - "albert-chinese-small": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-small.vocab.txt", - "albert-chinese-base": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-base.vocab.txt", - "albert-chinese-large": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-large.vocab.txt", - "albert-chinese-xlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xlarge.vocab.txt", - "albert-chinese-xxlarge": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-chinese-xxlarge.vocab.txt", - } - } - pretrained_init_configuration = { - "albert-chinese-tiny": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-small": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-base": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-large": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-xlarge": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - "albert-chinese-xxlarge": { - "do_lower_case": False, - "unk_token": "[UNK]", - "pad_token": "[PAD]", - }, - } - max_model_input_sizes = { - "albert-chinese-tiny": 512, - "albert-chinese-small": 512, - "albert-chinese-base": 512, - "albert-chinese-large": 512, - "albert-chinese-xlarge": 512, - "albert-chinese-xxlarge": 512, - } - - def __init__( - self, - vocab_file, - do_lower_case=True, - do_basic_tokenize=True, - never_split=None, - unk_token="[UNK]", - sep_token="[SEP]", - pad_token="[PAD]", - cls_token="[CLS]", - mask_token="[MASK]", - tokenize_chinese_chars=True, - strip_accents=None, - **kwargs - ): - super(AlbertChineseTokenizer, self).__init__( - vocab_file, - do_lower_case=do_lower_case, - do_basic_tokenize=do_basic_tokenize, - never_split=never_split, - unk_token=unk_token, - sep_token=sep_token, - pad_token=pad_token, - cls_token=cls_token, - mask_token=mask_token, - tokenize_chinese_chars=tokenize_chinese_chars, - strip_accents=strip_accents, - **kwargs, - ) diff --git a/paddlenlp/transformers/albert_english/__init__.py b/paddlenlp/transformers/albert_english/__init__.py deleted file mode 100644 index e16c5219a42b..000000000000 --- a/paddlenlp/transformers/albert_english/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from .tokenizer import * diff --git a/paddlenlp/transformers/albert_english/tokenizer.py b/paddlenlp/transformers/albert_english/tokenizer.py deleted file mode 100644 index e2b4f2a63b02..000000000000 --- a/paddlenlp/transformers/albert_english/tokenizer.py +++ /dev/null @@ -1,263 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -"""Tokenization class for ALBERT model.""" - -import os -import unicodedata -from shutil import copyfile - -import sentencepiece as spm - -from .. import PretrainedTokenizer - -__all__ = ["AlbertEnglishTokenizer"] - -SPIECE_UNDERLINE = "▁" - - -class AlbertEnglishTokenizer(PretrainedTokenizer): - resource_files_names = { - "sentencepiece_model_file": "spiece.model", - } - - pretrained_resource_files_map = { - "sentencepiece_model_file": { - "albert-base-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v1.spiece.model", - "albert-large-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v1.spiece.model", - "albert-xlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v1.spiece.model", - "albert-xxlarge-v1": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v1.spiece.model", - "albert-base-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-base-v2.spiece.model", - "albert-large-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-large-v2.spiece.model", - "albert-xlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xlarge-v2.spiece.model", - "albert-xxlarge-v2": "https://bj.bcebos.com/paddlenlp/models/transformers/albert/albert-xxlarge-v2.spiece.model", - }, - } - - pretrained_init_configuration = { - "albert-base-v1": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-large-v1": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-xlarge-v1": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-xxlarge-v1": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-base-v2": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-large-v2": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-xlarge-v2": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - "albert-xxlarge-v2": { - "do_lower_case": True, - "remove_space": True, - "keep_accents": False, - "unk_token": "", - "pad_token": "", - }, - } - max_model_input_sizes = { - "albert-base-v1": 
512, - "albert-large-v1": 512, - "albert-xlarge-v1": 512, - "albert-xxlarge-v1": 512, - "albert-base-v2": 512, - "albert-large-v2": 512, - "albert-xlarge-v2": 512, - "albert-xxlarge-v2": 512, - } - - def __init__( - self, - sentencepiece_model_file, - do_lower_case=True, - remove_space=True, - keep_accents=False, - bos_token="[CLS]", - eos_token="[SEP]", - unk_token="", - sep_token="[SEP]", - pad_token="", - cls_token="[CLS]", - mask_token="[MASK]", - sp_model_kwargs=None, - **kwargs - ): - - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - self.do_lower_case = do_lower_case - self.remove_space = remove_space - self.keep_accents = keep_accents - self.sentencepiece_model_file = sentencepiece_model_file - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(sentencepiece_model_file) - - @property - def vocab_size(self): - return len(self.sp_model) - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - return state - - def __setstate__(self, d): - self.__dict__ = d - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.Load(self.sentencepiece_model_file) - - def preprocess_text(self, inputs): - if self.remove_space: - outputs = " ".join(inputs.strip().split()) - else: - outputs = inputs - outputs = outputs.replace("``", '"').replace("''", '"') - - if not self.keep_accents: - outputs = unicodedata.normalize("NFKD", outputs) - outputs = "".join([c for c in outputs if not unicodedata.combining(c)]) - if self.do_lower_case: - outputs = outputs.lower() - - return outputs - - def _tokenize(self, text): - """Tokenize a string.""" - text = self.preprocess_text(text) - pieces = self.sp_model.encode(text, out_type=str) - new_pieces = [] - for piece in pieces: - if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit(): - cur_pieces = self.sp_model.EncodeAsPieces(piece[:-1].replace(SPIECE_UNDERLINE, "")) - if piece[0] != SPIECE_UNDERLINE and cur_pieces[0][0] == SPIECE_UNDERLINE: - if len(cur_pieces[0]) == 1: - cur_pieces = cur_pieces[1:] - else: - cur_pieces[0] = cur_pieces[0][1:] - cur_pieces.append(piece[-1]) - new_pieces.extend(cur_pieces) - else: - new_pieces.append(piece) - - return new_pieces - - def _convert_token_to_id(self, token): - """Converts a token (str) to an id using the vocab.""" - return self.sp_model.PieceToId(token) - - def _convert_id_to_token(self, index): - """Converts an index (integer) to a token (str) using the vocab.""" - return self.sp_model.IdToPiece(index) - - def convert_tokens_to_string(self, tokens): - """Converts a sequence of tokens (strings for sub-words) in a single string.""" - out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip() - return out_string - - def num_special_tokens_to_add(self, pair=False): - token_ids_0 = [] - token_ids_1 = [] - return len(self.build_inputs_with_special_tokens(token_ids_0, token_ids_1 if pair else None)) - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - sep = [self.sep_token_id] - cls = [self.cls_token_id] - if token_ids_1 is None: - return cls + token_ids_0 + sep - return cls + token_ids_0 + sep + token_ids_1 + sep - - def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): - if 
offset_mapping_1 is None: - return [(0, 0)] + offset_mapping_0 + [(0, 0)] - - return [(0, 0)] + offset_mapping_0 + [(0, 0)] + offset_mapping_1 + [(0, 0)] - - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): - - if already_has_special_tokens: - if token_ids_1 is not None: - raise ValueError( - "You should not supply a second sequence if the provided sequence of " - "ids is already formatted with special tokens for the model." - ) - return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0)) - - if token_ids_1 is not None: - return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1] - return [1] + ([0] * len(token_ids_0)) + [1] - - def create_token_type_ids_from_sequences(self, token_ids_0, token_ids_1=None): - sep = [self.sep_token_id] - cls = [self.cls_token_id] - - if token_ids_1 is None: - return len(cls + token_ids_0 + sep) * [0] - return len(cls + token_ids_0 + sep) * [0] + len(token_ids_1 + sep) * [1] - - def save_resources(self, save_directory): - for name, file_name in self.resource_files_names.items(): - save_path = os.path.join(save_directory, file_name) - if os.path.abspath(self.sentencepiece_model_file) != os.path.abspath(save_path) and os.path.isfile( - self.sentencepiece_model_file - ): - copyfile(self.sentencepiece_model_file, save_path) - elif not os.path.isfile(self.sentencepiece_model_file): - with open(save_path, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) diff --git a/paddlenlp/transformers/bigbird/tokenizer.py b/paddlenlp/transformers/bigbird/tokenizer.py index 6f73fbef6a75..9bc5ee70ef4a 100644 --- a/paddlenlp/transformers/bigbird/tokenizer.py +++ b/paddlenlp/transformers/bigbird/tokenizer.py @@ -21,7 +21,7 @@ from paddlenlp.data.vocab import Vocab -from ..albert_english.tokenizer import AlbertEnglishTokenizer +from ..albert.tokenizer import AlbertEnglishTokenizer __all__ = ["BigBirdTokenizer"] diff --git a/paddlenlp/transformers/fnet/tokenizer.py b/paddlenlp/transformers/fnet/tokenizer.py index 786207c83db7..36456a4aee4b 100644 --- a/paddlenlp/transformers/fnet/tokenizer.py +++ b/paddlenlp/transformers/fnet/tokenizer.py @@ -17,8 +17,7 @@ import sentencepiece as spm -from ..albert_english.tokenizer import AlbertEnglishTokenizer -from ..tokenizer_utils_base import AddedToken +from ..albert.tokenizer import AddedToken, AlbertEnglishTokenizer __all__ = ["FNetTokenizer"] diff --git a/paddlenlp/transformers/mbart/tokenizer.py b/paddlenlp/transformers/mbart/tokenizer.py index 9f25eaba241f..163031e178e0 100644 --- a/paddlenlp/transformers/mbart/tokenizer.py +++ b/paddlenlp/transformers/mbart/tokenizer.py @@ -19,7 +19,7 @@ from .. 
import AddedToken, PretrainedTokenizer -__all__ = ["MBartTokenizer"] +__all__ = ["MBartTokenizer", "MBart50Tokenizer"] MBART_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { "mbart-large-cc25": 1024, @@ -309,3 +309,323 @@ def set_tgt_lang_special_tokens(self, tgt_lang): self.cur_lang_code_id = self.lang_code_to_id[tgt_lang] self.prefix_tokens = [] self.suffix_tokens = [self.eos_token_id, self.cur_lang_code_id] + + +class MBart50Tokenizer(PretrainedTokenizer): + resource_files_names = { + "vocab_file": "sentencepiece.bpe.model", + } + pretrained_resource_files_map = { + "vocab_file": { + "mbart-large-50-one-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-one-to-many-mmt.sentencepiece.bpe.model", + "mbart-large-50-many-to-one-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-one-mmt.sentencepiece.bpe.model", + "mbart-large-50-many-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-many-mmt.sentencepiece.bpe.model", + } + } + pretrained_init_configuration = { + "mbart-large-50-one-to-many-mmt": {}, + "mbart-large-50-many-to-one-mmt": {}, + "mbart-large-50-many-to-many-mmt": {}, + } + max_model_input_sizes = MBART50_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES + model_input_names = ["input_ids"] + + FAIRSEQ_LANGUAGE_CODES = [ + "ar_AR", + "cs_CZ", + "de_DE", + "en_XX", + "es_XX", + "et_EE", + "fi_FI", + "fr_XX", + "gu_IN", + "hi_IN", + "it_IT", + "ja_XX", + "kk_KZ", + "ko_KR", + "lt_LT", + "lv_LV", + "my_MM", + "ne_NP", + "nl_XX", + "ro_RO", + "ru_RU", + "si_LK", + "tr_TR", + "vi_VN", + "zh_CN", + "af_ZA", + "az_AZ", + "bn_IN", + "fa_IR", + "he_IL", + "hr_HR", + "id_ID", + "ka_GE", + "km_KH", + "mk_MK", + "ml_IN", + "mn_MN", + "mr_IN", + "pl_PL", + "ps_AF", + "pt_XX", + "sv_SE", + "sw_KE", + "ta_IN", + "te_IN", + "th_TH", + "tl_XX", + "uk_UA", + "ur_PK", + "xh_ZA", + "gl_ES", + "sl_SI", + ] + + def __init__( + self, + vocab_file, + src_lang=None, + tgt_lang=None, + bos_token="", + eos_token="", + sep_token="", + cls_token="", + unk_token="", + pad_token="", + mask_token="", + sp_model_kwargs=None, + additional_special_tokens=None, + **kwargs + ): + self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs + + mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token + self._build_special_tokens_map_extended(mask_token=mask_token) + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.vocab_file = vocab_file + self.sp_model.Load(str(vocab_file)) + self.fairseq_offset = 1 + self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} + self.sp_model_size = len(self.sp_model) + self.lang_code_to_id = { + code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(self.FAIRSEQ_LANGUAGE_CODES) + } + self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + self.fairseq_tokens_to_ids.update(self.lang_code_to_id) + self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} + self.src_lang = src_lang if src_lang is not None else "en_XX" + self.tgt_lang = tgt_lang + # Get `special_tokens_map` after `_wrap_init()` + self.eos_token_id = self.fairseq_tokens_to_ids[eos_token] + self.unk_token_id = self.fairseq_tokens_to_ids[unk_token] + self.set_src_lang_special_tokens(self.src_lang) + self._additional_special_tokens = list(self.lang_code_to_id.keys()) + + if additional_special_tokens is not None: + # Only add those 
special tokens if they are not already there. + self._additional_special_tokens.extend( + [t for t in additional_special_tokens if t not in self._additional_special_tokens] + ) + + def __call__( + self, + text, + text_pair=None, + max_length=None, + stride=0, + is_split_into_words=False, + padding=None, + truncation="longest_first", + return_position_ids=False, + return_token_type_ids=False, + return_attention_mask=True, + return_length=False, + return_overflowing_tokens=False, + return_special_tokens_mask=False, + **kwargs + ): + if "pad_to_max_seq_len" in kwargs and padding is None: + pad_to_max_seq_len = kwargs.pop("pad_to_max_seq_len") + padding = "max_length" if pad_to_max_seq_len else False + elif padding is None: + padding = False + + if "max_seq_len" in kwargs and max_length is None: + max_length = kwargs["max_seq_len"] + + if "truncation_strategy" in kwargs and kwargs["truncation_strategy"] != "longest_first": + truncation = kwargs["truncation_strategy"] + + return super(MBart50Tokenizer, self).__call__( + text=text, + text_pair=text_pair, + max_length=max_length, + stride=stride, + is_split_into_words=is_split_into_words, + padding=padding, + truncation=truncation, + return_position_ids=return_position_ids, + return_token_type_ids=return_token_type_ids, + return_attention_mask=return_attention_mask, + return_length=return_length, + return_overflowing_tokens=return_overflowing_tokens, + return_special_tokens_mask=return_special_tokens_mask, + **kwargs, + ) + + def __getstate__(self): + state = self.__dict__.copy() + state["sp_model"] = None + state["sp_model_proto"] = self.sp_model.serialized_model_proto() + return state + + def __setstate__(self, d): + self.__dict__ = d + + # for backward compatibility + if not hasattr(self, "sp_model_kwargs"): + self.sp_model_kwargs = {} + + self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) + self.sp_model.LoadFromSerializedProto(self.sp_model_proto) + + def save_resources(self, save_directory): + for name, file_name in self.resource_files_names.items(): + save_path = os.path.join(save_directory, file_name) + if os.path.abspath(self.vocab_file) != os.path.abspath(save_path) and os.path.isfile(self.vocab_file): + copyfile(self.vocab_file, save_path) + elif not os.path.isfile(self.vocab_file): + with open(save_path, "wb") as fi: + content_spiece_model = self.sp_model.serialized_model_proto() + fi.write(content_spiece_model) + + def get_vocab(self): + vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} + vocab.update(self.added_tokens_encoder) + return vocab + + def _tokenize(self, text): + return self.sp_model.encode(text, out_type=str) + + @property + def vocab_size(self): + """ + Returns the size of vocabulary. + + Returns: + int: The sum of size of vocabulary and the size of speical tokens. + + """ + + return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 + + def _convert_token_to_id(self, token): + """ + Converts a token (str) in an id using the vocab. + """ + if token in self.fairseq_tokens_to_ids: + return self.fairseq_tokens_to_ids[token] + spm_id = self.sp_model.PieceToId(token) + + return spm_id + self.fairseq_offset if spm_id else self.unk_token_id + + def _convert_id_to_token(self, index): + """ + Converts an index (integer) in a token (str) using the vocab. 
+ """ + if index in self.fairseq_ids_to_tokens: + return self.fairseq_ids_to_tokens[index] + return self.sp_model.IdToPiece(index - self.fairseq_offset) + + def convert_tokens_to_string(self, tokens): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + out_string = "".join(tokens).replace("▁", " ").strip() + return out_string + + def convert_ids_to_string(self, ids): + """ + Converts a sequence of tokens (strings for sub-words) in a single string. + """ + tokens = self.convert_ids_to_tokens(ids) + out_string = "".join(tokens).replace("▁", " ").strip() + return out_string + + def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): + """ + Retrieve sequence ids from a token list that has no special tokens added. + """ + + if already_has_special_tokens: + return super().get_special_tokens_mask( + token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True + ) + + prefix_ones = [1] * len(self.prefix_tokens) + suffix_ones = [1] * len(self.suffix_tokens) + if token_ids_1 is None: + return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones + return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones + + def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): + """ + Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and + adding special tokens. An MBART50 sequence has the following format, where ``X`` represents the sequence: + + - ``input_ids`` (for encoder) ``[src_lang_code] X [eos]`` + - ``labels``: (for decoder) ``[tgt_lang_code] X [eos]`` + + BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a + separator. + """ + if token_ids_1 is None: + return self.prefix_tokens + token_ids_0 + self.suffix_tokens + # We don't expect to process pairs, but leave the pair logic for API consistency + return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens + + def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): + """ + Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. + + Should be overridden in a subclass if the model has a special way of building those. + + Args: + offset_mapping_0 (List[tuple]): + List of char offsets to which the special tokens will be added. + offset_mapping_1 (List[tuple], optional): + Optional second list of char offsets for offset mapping pairs. + + Returns: + List[tuple]: List of char offsets with the appropriate offsets of special tokens. + """ + if offset_mapping_1 is None: + return [(0, 0)] + offset_mapping_0 + [(0, 0)] + + return [(0, 0)] + offset_mapping_0 + offset_mapping_1 + [(0, 0)] + + def set_src_lang_special_tokens(self, src_lang): + """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.lang_code_to_id[src_lang] + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + def set_tgt_lang_special_tokens(self, tgt_lang): + """Reset the special tokens to the target language setting. 
prefix=[tgt_lang_code] and suffix=[eos].""" + self.cur_lang_code_id = self.lang_code_to_id[tgt_lang] + self.prefix_tokens = [self.cur_lang_code_id] + self.suffix_tokens = [self.eos_token_id] + + def _build_translation_inputs(self, raw_inputs, return_tensors, src_lang, tgt_lang, **extra_kwargs): + """Used by translation pipeline, to prepare inputs for the generate function""" + if src_lang is None or tgt_lang is None: + raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") + self.src_lang = src_lang + inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) + tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) + inputs["forced_bos_token_id"] = tgt_lang_id + return inputs diff --git a/paddlenlp/transformers/mbart50/__init__.py b/paddlenlp/transformers/mbart50/__init__.py deleted file mode 100644 index c4ea9058ff63..000000000000 --- a/paddlenlp/transformers/mbart50/__init__.py +++ /dev/null @@ -1,15 +0,0 @@ -# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from .tokenizer import MBart50Tokenizer diff --git a/paddlenlp/transformers/mbart50/tokenizer.py b/paddlenlp/transformers/mbart50/tokenizer.py deleted file mode 100644 index 68d0520c548c..000000000000 --- a/paddlenlp/transformers/mbart50/tokenizer.py +++ /dev/null @@ -1,353 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -from shutil import copyfile - -import sentencepiece as spm - -from .. 
import AddedToken, PretrainedTokenizer - -__all__ = ["MBart50Tokenizer"] - -MBART_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "mbart-large-cc25": 1024, - "mbart-large-en-ro": 1024, -} - -MBART50_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = { - "mbart-large-50-one-to-many-mmt": 1024, - "mbart-large-50-many-to-one-mmt": 1024, - "mbart-large-50-many-to-many-mmt": 1024, -} - - -class MBart50Tokenizer(PretrainedTokenizer): - resource_files_names = { - "vocab_file": "sentencepiece.bpe.model", - } - pretrained_resource_files_map = { - "vocab_file": { - "mbart-large-50-one-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-one-to-many-mmt.sentencepiece.bpe.model", - "mbart-large-50-many-to-one-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-one-mmt.sentencepiece.bpe.model", - "mbart-large-50-many-to-many-mmt": "https://bj.bcebos.com/paddlenlp/models/transformers/mbart50/mbart-large-50-many-to-many-mmt.sentencepiece.bpe.model", - } - } - pretrained_init_configuration = { - "mbart-large-50-one-to-many-mmt": {}, - "mbart-large-50-many-to-one-mmt": {}, - "mbart-large-50-many-to-many-mmt": {}, - } - max_model_input_sizes = MBART50_PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES - model_input_names = ["input_ids"] - - FAIRSEQ_LANGUAGE_CODES = [ - "ar_AR", - "cs_CZ", - "de_DE", - "en_XX", - "es_XX", - "et_EE", - "fi_FI", - "fr_XX", - "gu_IN", - "hi_IN", - "it_IT", - "ja_XX", - "kk_KZ", - "ko_KR", - "lt_LT", - "lv_LV", - "my_MM", - "ne_NP", - "nl_XX", - "ro_RO", - "ru_RU", - "si_LK", - "tr_TR", - "vi_VN", - "zh_CN", - "af_ZA", - "az_AZ", - "bn_IN", - "fa_IR", - "he_IL", - "hr_HR", - "id_ID", - "ka_GE", - "km_KH", - "mk_MK", - "ml_IN", - "mn_MN", - "mr_IN", - "pl_PL", - "ps_AF", - "pt_XX", - "sv_SE", - "sw_KE", - "ta_IN", - "te_IN", - "th_TH", - "tl_XX", - "uk_UA", - "ur_PK", - "xh_ZA", - "gl_ES", - "sl_SI", - ] - - def __init__( - self, - vocab_file, - src_lang=None, - tgt_lang=None, - bos_token="", - eos_token="", - sep_token="", - cls_token="", - unk_token="", - pad_token="", - mask_token="", - sp_model_kwargs=None, - additional_special_tokens=None, - **kwargs - ): - self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs - - mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token - self._build_special_tokens_map_extended(mask_token=mask_token) - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.vocab_file = vocab_file - self.sp_model.Load(str(vocab_file)) - self.fairseq_offset = 1 - self.fairseq_tokens_to_ids = {"": 0, "": 1, "": 2, "": 3} - self.sp_model_size = len(self.sp_model) - self.lang_code_to_id = { - code: self.sp_model_size + i + self.fairseq_offset for i, code in enumerate(self.FAIRSEQ_LANGUAGE_CODES) - } - self.fairseq_tokens_to_ids[""] = len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset - self.fairseq_tokens_to_ids.update(self.lang_code_to_id) - self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()} - self.src_lang = src_lang if src_lang is not None else "en_XX" - self.tgt_lang = tgt_lang - # Get `special_tokens_map` after `_wrap_init()` - self.eos_token_id = self.fairseq_tokens_to_ids[eos_token] - self.unk_token_id = self.fairseq_tokens_to_ids[unk_token] - self.set_src_lang_special_tokens(self.src_lang) - self._additional_special_tokens = list(self.lang_code_to_id.keys()) - - if additional_special_tokens is not None: - # Only add those special tokens if they are not already there. 
- self._additional_special_tokens.extend( - [t for t in additional_special_tokens if t not in self._additional_special_tokens] - ) - - def __call__( - self, - text, - text_pair=None, - max_length=None, - stride=0, - is_split_into_words=False, - padding=None, - truncation="longest_first", - return_position_ids=False, - return_token_type_ids=False, - return_attention_mask=True, - return_length=False, - return_overflowing_tokens=False, - return_special_tokens_mask=False, - **kwargs - ): - if "pad_to_max_seq_len" in kwargs and padding is None: - pad_to_max_seq_len = kwargs.pop("pad_to_max_seq_len") - padding = "max_length" if pad_to_max_seq_len else False - elif padding is None: - padding = False - - if "max_seq_len" in kwargs and max_length is None: - max_length = kwargs["max_seq_len"] - - if "truncation_strategy" in kwargs and kwargs["truncation_strategy"] != "longest_first": - truncation = kwargs["truncation_strategy"] - - return super(MBart50Tokenizer, self).__call__( - text=text, - text_pair=text_pair, - max_length=max_length, - stride=stride, - is_split_into_words=is_split_into_words, - padding=padding, - truncation=truncation, - return_position_ids=return_position_ids, - return_token_type_ids=return_token_type_ids, - return_attention_mask=return_attention_mask, - return_length=return_length, - return_overflowing_tokens=return_overflowing_tokens, - return_special_tokens_mask=return_special_tokens_mask, - **kwargs, - ) - - def __getstate__(self): - state = self.__dict__.copy() - state["sp_model"] = None - state["sp_model_proto"] = self.sp_model.serialized_model_proto() - return state - - def __setstate__(self, d): - self.__dict__ = d - - # for backward compatibility - if not hasattr(self, "sp_model_kwargs"): - self.sp_model_kwargs = {} - - self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model.LoadFromSerializedProto(self.sp_model_proto) - - def save_resources(self, save_directory): - for name, file_name in self.resource_files_names.items(): - save_path = os.path.join(save_directory, file_name) - if os.path.abspath(self.vocab_file) != os.path.abspath(save_path) and os.path.isfile(self.vocab_file): - copyfile(self.vocab_file, save_path) - elif not os.path.isfile(self.vocab_file): - with open(save_path, "wb") as fi: - content_spiece_model = self.sp_model.serialized_model_proto() - fi.write(content_spiece_model) - - def get_vocab(self): - vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)} - vocab.update(self.added_tokens_encoder) - return vocab - - def _tokenize(self, text): - return self.sp_model.encode(text, out_type=str) - - @property - def vocab_size(self): - """ - Returns the size of vocabulary. - - Returns: - int: The sum of size of vocabulary and the size of speical tokens. - - """ - - return len(self.sp_model) + len(self.lang_code_to_id) + self.fairseq_offset + 1 - - def _convert_token_to_id(self, token): - """ - Converts a token (str) in an id using the vocab. - """ - if token in self.fairseq_tokens_to_ids: - return self.fairseq_tokens_to_ids[token] - spm_id = self.sp_model.PieceToId(token) - - return spm_id + self.fairseq_offset if spm_id else self.unk_token_id - - def _convert_id_to_token(self, index): - """ - Converts an index (integer) in a token (str) using the vocab. 
- """ - if index in self.fairseq_ids_to_tokens: - return self.fairseq_ids_to_tokens[index] - return self.sp_model.IdToPiece(index - self.fairseq_offset) - - def convert_tokens_to_string(self, tokens): - """ - Converts a sequence of tokens (strings for sub-words) in a single string. - """ - out_string = "".join(tokens).replace("▁", " ").strip() - return out_string - - def convert_ids_to_string(self, ids): - """ - Converts a sequence of tokens (strings for sub-words) in a single string. - """ - tokens = self.convert_ids_to_tokens(ids) - out_string = "".join(tokens).replace("▁", " ").strip() - return out_string - - def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False): - """ - Retrieve sequence ids from a token list that has no special tokens added. - """ - - if already_has_special_tokens: - return super().get_special_tokens_mask( - token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True - ) - - prefix_ones = [1] * len(self.prefix_tokens) - suffix_ones = [1] * len(self.suffix_tokens) - if token_ids_1 is None: - return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones - return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones - - def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None): - """ - Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and - adding special tokens. An MBART50 sequence has the following format, where ``X`` represents the sequence: - - - ``input_ids`` (for encoder) ``[src_lang_code] X [eos]`` - - ``labels``: (for decoder) ``[tgt_lang_code] X [eos]`` - - BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a - separator. - """ - if token_ids_1 is None: - return self.prefix_tokens + token_ids_0 + self.suffix_tokens - # We don't expect to process pairs, but leave the pair logic for API consistency - return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens - - def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None): - """ - Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. - - Should be overridden in a subclass if the model has a special way of building those. - - Args: - offset_mapping_0 (List[tuple]): - List of char offsets to which the special tokens will be added. - offset_mapping_1 (List[tuple], optional): - Optional second list of char offsets for offset mapping pairs. - - Returns: - List[tuple]: List of char offsets with the appropriate offsets of special tokens. - """ - if offset_mapping_1 is None: - return [(0, 0)] + offset_mapping_0 + [(0, 0)] - - return [(0, 0)] + offset_mapping_0 + offset_mapping_1 + [(0, 0)] - - def set_src_lang_special_tokens(self, src_lang): - """Reset the special tokens to the source lang setting. prefix=[src_lang_code] and suffix=[eos].""" - self.cur_lang_code_id = self.lang_code_to_id[src_lang] - self.prefix_tokens = [self.cur_lang_code_id] - self.suffix_tokens = [self.eos_token_id] - - def set_tgt_lang_special_tokens(self, tgt_lang): - """Reset the special tokens to the target language setting. 
prefix=[tgt_lang_code] and suffix=[eos].""" - self.cur_lang_code_id = self.lang_code_to_id[tgt_lang] - self.prefix_tokens = [self.cur_lang_code_id] - self.suffix_tokens = [self.eos_token_id] - - def _build_translation_inputs(self, raw_inputs, return_tensors, src_lang, tgt_lang, **extra_kwargs): - """Used by translation pipeline, to prepare inputs for the generate function""" - if src_lang is None or tgt_lang is None: - raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model") - self.src_lang = src_lang - inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs) - tgt_lang_id = self.convert_tokens_to_ids(tgt_lang) - inputs["forced_bos_token_id"] = tgt_lang_id - return inputs diff --git a/paddlenlp/transformers/reformer/tokenizer.py b/paddlenlp/transformers/reformer/tokenizer.py index 15e046a6591c..6944bc258423 100644 --- a/paddlenlp/transformers/reformer/tokenizer.py +++ b/paddlenlp/transformers/reformer/tokenizer.py @@ -17,7 +17,7 @@ import sentencepiece as spm -from ..albert_english.tokenizer import AlbertEnglishTokenizer +from ..albert.tokenizer import AlbertEnglishTokenizer __all__ = ["ReformerTokenizer"] diff --git a/paddlenlp/transformers/t5/tokenizer.py b/paddlenlp/transformers/t5/tokenizer.py index 9066d1cbb748..4fc3d60c7cfe 100644 --- a/paddlenlp/transformers/t5/tokenizer.py +++ b/paddlenlp/transformers/t5/tokenizer.py @@ -18,7 +18,7 @@ import sentencepiece as spm -from ..albert_english.tokenizer import AlbertEnglishTokenizer +from ..albert.tokenizer import AlbertEnglishTokenizer __all__ = [ "T5Tokenizer", diff --git a/tests/transformers/albert/test_tokenizer.py b/tests/transformers/albert/test_tokenizer.py index f6d24f1afe94..34c3f7302621 100644 --- a/tests/transformers/albert/test_tokenizer.py +++ b/tests/transformers/albert/test_tokenizer.py @@ -16,8 +16,10 @@ import os import unittest -from paddlenlp.transformers.albert_chinese.tokenizer import AlbertChineseTokenizer -from paddlenlp.transformers.albert_english.tokenizer import AlbertEnglishTokenizer +from paddlenlp.transformers.albert.tokenizer import ( + AlbertChineseTokenizer, + AlbertEnglishTokenizer, +) from paddlenlp.transformers.bert.tokenizer import BasicTokenizer, WordpieceTokenizer from ...testing_utils import get_tests_dir, slow From e367332bcc40b22498ed471c9dbfa42a5bdb2073 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Thu, 17 Oct 2024 16:20:49 +0000 Subject: [PATCH 16/21] fix problem in TOKENIZER_MAPPING_NAMES --- paddlenlp/transformers/albert/tokenizer.py | 2 +- paddlenlp/transformers/auto/tokenizer.py | 24 ++++++++++++++++------ tests/transformers/llama/test_tokenizer.py | 5 +---- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/paddlenlp/transformers/albert/tokenizer.py b/paddlenlp/transformers/albert/tokenizer.py index a7d80d0b2457..84ef1bae176c 100644 --- a/paddlenlp/transformers/albert/tokenizer.py +++ b/paddlenlp/transformers/albert/tokenizer.py @@ -20,7 +20,7 @@ import sentencepiece as spm -from .. import PretrainedTokenizer, BertTokenizer, AddedToken +from .. 
import AddedToken, BertTokenizer, PretrainedTokenizer __all__ = ["AlbertTokenizer"] diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index de388dcf4128..0ab5d27f29eb 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -43,9 +43,7 @@ else: TOKENIZER_MAPPING_NAMES = OrderedDict( [ - ("albert", "AlbertTokenizer"), - ("albert_chinese", "AlbertChineseTokenizer"), - ("albert_english", "AlbertEnglishTokenizer"), + ("albert", (("AlbertTokenizer", "AlbertChineseTokenizer", "AlbertEnglishTokenizer"),)), ("bart", "BartTokenizer"), ("bert", "BertTokenizer"), ("blenderbot", "BlenderbotTokenizer"), @@ -74,8 +72,7 @@ ), ("luke", "LukeTokenizer"), ("mamba", "MambaTokenizer"), - ("mbart", "MBartTokenizer"), - ("mbart50", "MBart50Tokenizer"), + ("mbart", (("MBartTokenizer", "MBart50Tokenizer"),)), ("mobilebert", "MobileBertTokenizer"), ("mpnet", "MPNetTokenizer"), ("nezha", "NeZhaTokenizer"), @@ -141,7 +138,22 @@ def tokenizer_class_from_name(class_name: str): return PretrainedTokenizerFast for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): - if class_name in tokenizers: + all_tokenizers = [] + if isinstance(tokenizers, tuple): + if len(tokenizers) == 2: + tokenizer_slow, tokenizer_fast = tokenizers + else: + tokenizer_slow = tokenizers[0] + tokenizer_fast = None + if isinstance(tokenizer_slow, tuple): + all_tokenizers.extend(tokenizer_slow) + else: + all_tokenizers.append(tokenizer_slow) + if tokenizer_fast is not None: + all_tokenizers.append(tokenizer_fast) + else: + all_tokenizers.append(tokenizers) + if class_name in all_tokenizers: module_name = model_type_to_module_name(module_name) try: module = importlib.import_module(f".{module_name}", "paddlenlp.transformers") diff --git a/tests/transformers/llama/test_tokenizer.py b/tests/transformers/llama/test_tokenizer.py index ad567815abfa..940548a7a950 100644 --- a/tests/transformers/llama/test_tokenizer.py +++ b/tests/transformers/llama/test_tokenizer.py @@ -220,7 +220,7 @@ class TikTokenIntegrationTests(unittest.TestCase): def test_tiktoken_llama(self): model_path = "hf-internal-testing/llama-3-8b-internal" - subfolder = "original" + subfolder = "" test_text = "This is a test sentence." 
test_tokens = [128000, 2028, 374, 264, 1296, 11914, 13, 128001] num_reserved_special_tokens = 256 @@ -244,7 +244,6 @@ def test_tiktoken_llama(self): additional_special_tokens=special_tokens, bos_token="<|begin_of_text|>", eos_token="<|end_of_text|>", - from_hf_hub=True, ) tokens = tiktoken_tokenizer.tokenize("<|begin_of_text|> " + test_text) self.assertEqual(tokens[0], "<|begin_of_text|>") @@ -257,7 +256,6 @@ def test_tiktoken_llama(self): eos_token="<|end_of_text|>", add_bos_token=True, add_eos_token=True, - from_hf_hub=True, use_fast=True, ) self.assertTrue(isinstance(tiktoken_tokenizer, PretrainedTokenizerFast)) @@ -279,7 +277,6 @@ def test_tiktoken_llama(self): from_slow=True, add_bos_token=True, add_eos_token=True, - from_hf_hub=True, use_fast=True, ) tokens = tiktoken_tokenizer.encode(test_text, add_special_tokens=True)["input_ids"] From a422932a17efd6c62d3b257bd9bddeb0c9ceafef Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Fri, 18 Oct 2024 01:11:12 +0000 Subject: [PATCH 17/21] try fix --- paddlenlp/transformers/albert/tokenizer.py | 2 +- paddlenlp/transformers/auto/tokenizer.py | 24 ++- paddlenlp/transformers/mbart/__init__.py | 1 + tests/transformers/mbart50/__init__.py | 13 -- tests/transformers/mbart50/test_tokenizer.py | 191 ------------------- 5 files changed, 13 insertions(+), 218 deletions(-) delete mode 100644 tests/transformers/mbart50/__init__.py delete mode 100644 tests/transformers/mbart50/test_tokenizer.py diff --git a/paddlenlp/transformers/albert/tokenizer.py b/paddlenlp/transformers/albert/tokenizer.py index 84ef1bae176c..d4ff3d4e0ffd 100644 --- a/paddlenlp/transformers/albert/tokenizer.py +++ b/paddlenlp/transformers/albert/tokenizer.py @@ -22,7 +22,7 @@ from .. import AddedToken, BertTokenizer, PretrainedTokenizer -__all__ = ["AlbertTokenizer"] +__all__ = ["AlbertTokenizer", "AlbertChineseTokenizer", "AlbertEnglishTokenizer"] SPIECE_UNDERLINE = "▁" diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 0ab5d27f29eb..dc7148da423e 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -43,7 +43,7 @@ else: TOKENIZER_MAPPING_NAMES = OrderedDict( [ - ("albert", (("AlbertTokenizer", "AlbertChineseTokenizer", "AlbertEnglishTokenizer"),)), + ("albert", (("AlbertTokenizer", "AlbertChineseTokenizer", "AlbertEnglishTokenizer"), None)), ("bart", "BartTokenizer"), ("bert", "BertTokenizer"), ("blenderbot", "BlenderbotTokenizer"), @@ -72,7 +72,7 @@ ), ("luke", "LukeTokenizer"), ("mamba", "MambaTokenizer"), - ("mbart", (("MBartTokenizer", "MBart50Tokenizer"),)), + ("mbart", (("MBartTokenizer", "MBart50Tokenizer"), None)), ("mobilebert", "MobileBertTokenizer"), ("mpnet", "MPNetTokenizer"), ("nezha", "NeZhaTokenizer"), @@ -140,11 +140,7 @@ def tokenizer_class_from_name(class_name: str): for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): all_tokenizers = [] if isinstance(tokenizers, tuple): - if len(tokenizers) == 2: - tokenizer_slow, tokenizer_fast = tokenizers - else: - tokenizer_slow = tokenizers[0] - tokenizer_fast = None + (tokenizer_slow, tokenizer_fast) = tokenizers if isinstance(tokenizer_slow, tuple): all_tokenizers.extend(tokenizer_slow) else: @@ -409,18 +405,20 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): if model_type is not None: tokenizer_class_py = TOKENIZER_MAPPING[type(config)] if isinstance(tokenizer_class_py, (list, tuple)): - if len(tokenizer_class_py) == 2: - tokenizer_class_fast = tokenizer_class_py[1] - 
tokenizer_class_py = tokenizer_class_py[0] - else: - tokenizer_class_fast = None + (tokenizer_class_py, tokenizer_class_fast) = tokenizer_class_py else: tokenizer_class_fast = None if tokenizer_class_fast and (use_fast or tokenizer_class_py is None): return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) else: if tokenizer_class_py is not None: - return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + if isinstance(tokenizer_class_py, str): + return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + else: + # Use the first tokenizer class in the list + return tokenizer_class_py[0].from_pretrained( + pretrained_model_name_or_path, *model_args, **kwargs + ) else: raise ValueError( "This tokenizer cannot be instantiated. Please make sure you have `sentencepiece` installed " diff --git a/paddlenlp/transformers/mbart/__init__.py b/paddlenlp/transformers/mbart/__init__.py index 86ecd1322521..ebdc0b0919be 100644 --- a/paddlenlp/transformers/mbart/__init__.py +++ b/paddlenlp/transformers/mbart/__init__.py @@ -12,4 +12,5 @@ # See the License for the specific language governing permissions and # limitations under the License. +from .configuration import * from .tokenizer import * diff --git a/tests/transformers/mbart50/__init__.py b/tests/transformers/mbart50/__init__.py deleted file mode 100644 index 97043fd7ba68..000000000000 --- a/tests/transformers/mbart50/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/tests/transformers/mbart50/test_tokenizer.py b/tests/transformers/mbart50/test_tokenizer.py deleted file mode 100644 index 6e52858afe47..000000000000 --- a/tests/transformers/mbart50/test_tokenizer.py +++ /dev/null @@ -1,191 +0,0 @@ -# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. -# Copyright 2021 The HuggingFace Team. All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
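
The from_pretrained hunk above chooses between the fast and slow classes registered for a model type, falling back to the first slow class when several share one entry. The following is a condensed sketch of that policy under the same (slow, fast) convention; the sentinel strings stand in for real tokenizer classes, and the actual code calls .from_pretrained on whatever it selects.

    def select_tokenizer_class(mapping_entry, use_fast=True):
        # Sketch of the selection policy: an entry is either a single class or a
        # (slow, fast) pair whose slow slot may bundle several classes; use_fast
        # prefers the fast class when one is registered.
        if isinstance(mapping_entry, (list, tuple)):
            tokenizer_class_py, tokenizer_class_fast = mapping_entry
        else:
            tokenizer_class_py, tokenizer_class_fast = mapping_entry, None
        if tokenizer_class_fast is not None and (use_fast or tokenizer_class_py is None):
            return tokenizer_class_fast
        if isinstance(tokenizer_class_py, (list, tuple)):
            # Several slow tokenizers share this model type; use the first one.
            return tokenizer_class_py[0]
        return tokenizer_class_py

    assert select_tokenizer_class(("SlowTok", "FastTok"), use_fast=True) == "FastTok"
    assert select_tokenizer_class((("SlowA", "SlowB"), None), use_fast=True) == "SlowA"
    assert select_tokenizer_class("OnlySlowTok") == "OnlySlowTok"
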
- -import tempfile -import unittest - -from paddlenlp.transformers import SPIECE_UNDERLINE, MBart50Tokenizer -from paddlenlp.transformers.mbart.modeling import shift_tokens_right - -from ...testing_utils import get_tests_dir, nested_simplify -from ..test_tokenizer_common import TokenizerTesterMixin - -SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") - -EN_CODE = 250004 -RO_CODE = 250020 - - -class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase): - tokenizer_class = MBart50Tokenizer - test_sentencepiece = True - - test_offsets = False - - def setUp(self): - super().setUp() - - # We have a SentencePiece fixture for testing - tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) - tokenizer.save_pretrained(self.tmpdirname) - - def test_convert_token_and_id(self): - """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" - token = "" - token_id = 0 - - self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) - self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) - - def test_get_vocab(self): - vocab_keys = list(self.get_tokenizer().get_vocab().keys()) - - self.assertEqual(vocab_keys[0], "") - self.assertEqual(vocab_keys[1], "") - self.assertEqual(vocab_keys[-1], "") - self.assertEqual(len(vocab_keys), 1_054) - - def test_vocab_size(self): - self.assertEqual(self.get_tokenizer().vocab_size, 1_054) - - def test_full_tokenizer(self): - tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) - - tokens = tokenizer.tokenize("This is a test") - self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) - - self.assertListEqual( - tokenizer.convert_tokens_to_ids(tokens), - [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]], - ) - - tokens = tokenizer.tokenize("I was born in 92000, and this is falsé.") - self.assertListEqual( - tokens, - # fmt: off - [ - SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", - SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", - SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", - SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", - SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", - "." - ], - # fmt: on - ) - ids = tokenizer.convert_tokens_to_ids(tokens) - self.assertListEqual( - ids, - [ - value + tokenizer.fairseq_offset - for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4] - ], - ) - - back_tokens = tokenizer.convert_ids_to_tokens(ids) - self.assertListEqual( - back_tokens, - # fmt: off - [ - SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", - SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", - SPIECE_UNDERLINE + "", "", "2", "0", "0", "0", ",", - SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", - SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", - "", "." 
- ], - # fmt: on - ) - - -class MBart50OneToManyIntegrationTest(unittest.TestCase): - checkpoint_name = "mbart-large-50-one-to-many-mmt" - src_text = [ - " UN Chief Says There Is No Military Solution in Syria", - """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", - ] - tgt_text = [ - "Şeful ONU declară că nu există o soluţie militară în Siria", - "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei" - ' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor' - " face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", - ] - expected_src_tokens = [EN_CODE, 8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2] - - @classmethod - def setUpClass(cls): - cls.tokenizer: MBart50Tokenizer = MBart50Tokenizer.from_pretrained( - cls.checkpoint_name, src_lang="en_XX", tgt_lang="ro_RO" - ) - cls.pad_token_id = 1 - return cls - - def check_language_codes(self): - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ar_AR"], 250001) - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["en_EN"], 250004) - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ro_RO"], 250020) - self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["mr_IN"], 250038) - - def test_tokenizer_decode_ignores_language_codes(self): - self.assertIn(RO_CODE, self.tokenizer.all_special_ids) - generated_ids = [RO_CODE, 884, 9019, 96, 9, 916, 86792, 36, 18743, 15596, 5, 2] - result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) - expected_romanian = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) - self.assertEqual(result, expected_romanian) - self.assertNotIn(self.tokenizer.eos_token, result) - - def test_tokenizer_truncation(self): - src_text = ["this is gunna be a long sentence " * 20] - assert isinstance(src_text[0], str) - desired_max_length = 10 - ids = self.tokenizer(src_text, max_length=desired_max_length, truncation=True).input_ids[0] - self.assertEqual(ids[0], EN_CODE) - self.assertEqual(ids[-1], 2) - self.assertEqual(len(ids), desired_max_length) - - def test_mask_token(self): - self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["", "ar_AR"]), [250053, 250001]) - - def test_special_tokens_unaffacted_by_save_load(self): - tmpdirname = tempfile.mkdtemp() - original_special_tokens = self.tokenizer.fairseq_tokens_to_ids - self.tokenizer.save_pretrained(tmpdirname) - new_tok = MBart50Tokenizer.from_pretrained(tmpdirname) - self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens) - - def test_seq2seq_max_target_length(self): - batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pd") - targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pd") - labels = targets["input_ids"] - batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) - - self.assertEqual(batch.input_ids.shape[1], 3) - self.assertEqual(batch.decoder_input_ids.shape[1], 10) - - def test_tokenizer_translation(self): - inputs = self.tokenizer._build_translation_inputs( - "A test", return_tensors="pd", src_lang="en_XX", tgt_lang="ar_AR" - ) - - self.assertEqual( - nested_simplify(inputs), - { - # en_XX, A, test, EOS - 
"input_ids": [[250004, 62, 3034, 2]], - "attention_mask": [[1, 1, 1, 1]], - # ar_AR - "forced_bos_token_id": 250001, - }, - ) From d46655c5b66de900e0670ed6a252e7a73bea2c0d Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Mon, 21 Oct 2024 10:30:18 +0000 Subject: [PATCH 18/21] update --- paddlenlp/transformers/auto/tokenizer.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index dc7148da423e..9004a48747d6 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -107,13 +107,14 @@ ("tinybert", "TinyBertTokenizer"), ("unified_transformer", "UnifiedTransformerTokenizer"), ("unimo", "UNIMOTokenizer"), - ("gpt", "GPTChineseTokenizer"), + ("gpt", (("GPTTokenizer", "GPTChineseTokenizer"), None)), ("gau_alpha", "GAUAlphaTokenizer"), ("artist", "ArtistTokenizer"), ("chineseclip", "ChineseCLIPTokenizer"), ("ernie_vil", "ErnieViLTokenizer"), ("glm", "GLMGPT2Tokenizer"), ("qwen", "QWenTokenizer"), + ("qwen2", "Qwen2Tokenizer"), ("yuan", "YuanTokenizer"), ] ) From 3412f5095a566f3bf09c80f05113011c38ee6a9b Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Tue, 22 Oct 2024 07:56:50 +0000 Subject: [PATCH 19/21] fix --- paddlenlp/transformers/llama/tokenizer.py | 6 +- .../transformers/tokenizer_utils_base.py | 5 +- tests/transformers/mbart50/__init__.py | 13 ++ tests/transformers/mbart50/test_tokenizer.py | 191 ++++++++++++++++++ 4 files changed, 210 insertions(+), 5 deletions(-) create mode 100644 tests/transformers/mbart50/__init__.py create mode 100644 tests/transformers/mbart50/test_tokenizer.py diff --git a/paddlenlp/transformers/llama/tokenizer.py b/paddlenlp/transformers/llama/tokenizer.py index eae57e4bc7d3..be688206e2ad 100644 --- a/paddlenlp/transformers/llama/tokenizer.py +++ b/paddlenlp/transformers/llama/tokenizer.py @@ -72,9 +72,7 @@ def __init__( self.add_bos_token = add_bos_token self.add_eos_token = add_eos_token self.decode_with_prefix_space = decode_with_prefix_space - # self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs) - self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False)) - self.sp_model.Load(vocab_file) + self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", True)) @property def vocab_size(self): @@ -101,7 +99,7 @@ def bos_token_id(self) -> Optional[int]: def eos_token_id(self) -> Optional[int]: return self.sp_model.eos_id() - def get_spm_processor(self, from_slow=False): + def get_spm_processor(self, from_slow=True): tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs) if from_slow: # no dependency on protobuf tokenizer.Load(self.vocab_file) diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index 213a41325a17..aa9cdb5e11e2 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1600,7 +1600,10 @@ def _from_pretrained( from_hf_hub=False, **kwargs, ): - from_slow = kwargs.get("from_slow", False) + if cls.__name__.endswith("Fast"): + from_slow = kwargs.get("from_slow", False) + else: + from_slow = kwargs.get("from_slow", True) has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None: slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained( diff --git a/tests/transformers/mbart50/__init__.py b/tests/transformers/mbart50/__init__.py new file mode 100644 index 
000000000000..97043fd7ba68 --- /dev/null +++ b/tests/transformers/mbart50/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/tests/transformers/mbart50/test_tokenizer.py b/tests/transformers/mbart50/test_tokenizer.py new file mode 100644 index 000000000000..6e52858afe47 --- /dev/null +++ b/tests/transformers/mbart50/test_tokenizer.py @@ -0,0 +1,191 @@ +# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +# Copyright 2021 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import tempfile +import unittest + +from paddlenlp.transformers import SPIECE_UNDERLINE, MBart50Tokenizer +from paddlenlp.transformers.mbart.modeling import shift_tokens_right + +from ...testing_utils import get_tests_dir, nested_simplify +from ..test_tokenizer_common import TokenizerTesterMixin + +SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model") + +EN_CODE = 250004 +RO_CODE = 250020 + + +class MBart50TokenizationTest(TokenizerTesterMixin, unittest.TestCase): + tokenizer_class = MBart50Tokenizer + test_sentencepiece = True + + test_offsets = False + + def setUp(self): + super().setUp() + + # We have a SentencePiece fixture for testing + tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) + tokenizer.save_pretrained(self.tmpdirname) + + def test_convert_token_and_id(self): + """Test ``_convert_token_to_id`` and ``_convert_id_to_token``.""" + token = "" + token_id = 0 + + self.assertEqual(self.get_tokenizer()._convert_token_to_id(token), token_id) + self.assertEqual(self.get_tokenizer()._convert_id_to_token(token_id), token) + + def test_get_vocab(self): + vocab_keys = list(self.get_tokenizer().get_vocab().keys()) + + self.assertEqual(vocab_keys[0], "") + self.assertEqual(vocab_keys[1], "") + self.assertEqual(vocab_keys[-1], "") + self.assertEqual(len(vocab_keys), 1_054) + + def test_vocab_size(self): + self.assertEqual(self.get_tokenizer().vocab_size, 1_054) + + def test_full_tokenizer(self): + tokenizer = MBart50Tokenizer(SAMPLE_VOCAB, src_lang="en_XX", tgt_lang="ro_RO", keep_accents=True) + + tokens = tokenizer.tokenize("This is a test") + self.assertListEqual(tokens, ["▁This", "▁is", "▁a", "▁t", "est"]) + + self.assertListEqual( + tokenizer.convert_tokens_to_ids(tokens), + [value + tokenizer.fairseq_offset for value in [285, 46, 10, 170, 382]], + ) + + tokens = tokenizer.tokenize("I was born in 92000, and this 
is falsé.") + self.assertListEqual( + tokens, + # fmt: off + [ + SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", "9", "2", "0", "0", "0", ",", + SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", "é", + "." + ], + # fmt: on + ) + ids = tokenizer.convert_tokens_to_ids(tokens) + self.assertListEqual( + ids, + [ + value + tokenizer.fairseq_offset + for value in [8, 21, 84, 55, 24, 19, 7, 2, 602, 347, 347, 347, 3, 12, 66, 46, 72, 80, 6, 2, 4] + ], + ) + + back_tokens = tokenizer.convert_ids_to_tokens(ids) + self.assertListEqual( + back_tokens, + # fmt: off + [ + SPIECE_UNDERLINE + "I", SPIECE_UNDERLINE + "was", + SPIECE_UNDERLINE + "b", "or", "n", SPIECE_UNDERLINE + "in", + SPIECE_UNDERLINE + "", "", "2", "0", "0", "0", ",", + SPIECE_UNDERLINE + "and", SPIECE_UNDERLINE + "this", + SPIECE_UNDERLINE + "is", SPIECE_UNDERLINE + "f", "al", "s", + "", "." + ], + # fmt: on + ) + + +class MBart50OneToManyIntegrationTest(unittest.TestCase): + checkpoint_name = "mbart-large-50-one-to-many-mmt" + src_text = [ + " UN Chief Says There Is No Military Solution in Syria", + """ Secretary-General Ban Ki-moon says his response to Russia's stepped up military support for Syria is that "there is no military solution" to the nearly five-year conflict and more weapons will only worsen the violence and misery for millions of people.""", + ] + tgt_text = [ + "Şeful ONU declară că nu există o soluţie militară în Siria", + "Secretarul General Ban Ki-moon declară că răspunsul său la intensificarea sprijinului militar al Rusiei" + ' pentru Siria este că "nu există o soluţie militară" la conflictul de aproape cinci ani şi că noi arme nu vor' + " face decât să înrăutăţească violenţele şi mizeria pentru milioane de oameni.", + ] + expected_src_tokens = [EN_CODE, 8274, 127873, 25916, 7, 8622, 2071, 438, 67485, 53, 187895, 23, 51712, 2] + + @classmethod + def setUpClass(cls): + cls.tokenizer: MBart50Tokenizer = MBart50Tokenizer.from_pretrained( + cls.checkpoint_name, src_lang="en_XX", tgt_lang="ro_RO" + ) + cls.pad_token_id = 1 + return cls + + def check_language_codes(self): + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ar_AR"], 250001) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["en_EN"], 250004) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["ro_RO"], 250020) + self.assertEqual(self.tokenizer.fairseq_tokens_to_ids["mr_IN"], 250038) + + def test_tokenizer_decode_ignores_language_codes(self): + self.assertIn(RO_CODE, self.tokenizer.all_special_ids) + generated_ids = [RO_CODE, 884, 9019, 96, 9, 916, 86792, 36, 18743, 15596, 5, 2] + result = self.tokenizer.decode(generated_ids, skip_special_tokens=True) + expected_romanian = self.tokenizer.decode(generated_ids[1:], skip_special_tokens=True) + self.assertEqual(result, expected_romanian) + self.assertNotIn(self.tokenizer.eos_token, result) + + def test_tokenizer_truncation(self): + src_text = ["this is gunna be a long sentence " * 20] + assert isinstance(src_text[0], str) + desired_max_length = 10 + ids = self.tokenizer(src_text, max_length=desired_max_length, truncation=True).input_ids[0] + self.assertEqual(ids[0], EN_CODE) + self.assertEqual(ids[-1], 2) + self.assertEqual(len(ids), desired_max_length) + + def test_mask_token(self): + self.assertListEqual(self.tokenizer.convert_tokens_to_ids(["", "ar_AR"]), [250053, 250001]) + + def test_special_tokens_unaffacted_by_save_load(self): + tmpdirname 
= tempfile.mkdtemp() + original_special_tokens = self.tokenizer.fairseq_tokens_to_ids + self.tokenizer.save_pretrained(tmpdirname) + new_tok = MBart50Tokenizer.from_pretrained(tmpdirname) + self.assertDictEqual(new_tok.fairseq_tokens_to_ids, original_special_tokens) + + def test_seq2seq_max_target_length(self): + batch = self.tokenizer(self.src_text, padding=True, truncation=True, max_length=3, return_tensors="pd") + targets = self.tokenizer(self.tgt_text, padding=True, truncation=True, max_length=10, return_tensors="pd") + labels = targets["input_ids"] + batch["decoder_input_ids"] = shift_tokens_right(labels, self.tokenizer.pad_token_id) + + self.assertEqual(batch.input_ids.shape[1], 3) + self.assertEqual(batch.decoder_input_ids.shape[1], 10) + + def test_tokenizer_translation(self): + inputs = self.tokenizer._build_translation_inputs( + "A test", return_tensors="pd", src_lang="en_XX", tgt_lang="ar_AR" + ) + + self.assertEqual( + nested_simplify(inputs), + { + # en_XX, A, test, EOS + "input_ids": [[250004, 62, 3034, 2]], + "attention_mask": [[1, 1, 1, 1]], + # ar_AR + "forced_bos_token_id": 250001, + }, + ) From 19521f92bcb86b07ec3670ee59dc09618406d440 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Wed, 23 Oct 2024 17:46:10 +0800 Subject: [PATCH 20/21] rm redundent comment, resolve complicate --- paddlenlp/transformers/tokenizer_utils_base.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/paddlenlp/transformers/tokenizer_utils_base.py b/paddlenlp/transformers/tokenizer_utils_base.py index aa9cdb5e11e2..e044e7e5830b 100644 --- a/paddlenlp/transformers/tokenizer_utils_base.py +++ b/paddlenlp/transformers/tokenizer_utils_base.py @@ -1389,7 +1389,7 @@ def __init__(self, **kwargs): self.model_input_names = kwargs.pop("model_input_names", self.model_input_names) # By default, cleaning tokenization spaces for both fast and slow tokenizers - self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", True) + self.clean_up_tokenization_spaces = kwargs.pop("clean_up_tokenization_spaces", False) # By default, do not split special tokens for both fast and slow tokenizers self.split_special_tokens = kwargs.pop("split_special_tokens", False) @@ -1531,8 +1531,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs): "added_tokens_file": ADDED_TOKENS_FILE, "special_tokens_map_file": SPECIAL_TOKENS_MAP_FILE, "tokenizer_config_file": TOKENIZER_CONFIG_FILE, - "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME, # what's this - # "tokenizer_file": FULL_TOKENIZER_FILE, + "chat_template_file": CHAT_TEMPLATE_CONFIG_NAME, } vocab_files_target = {**cls.resource_files_names, **additional_files_names} From d2d7eeb6b20455ca1e6e632ccc3de7e3f5850b22 Mon Sep 17 00:00:00 2001 From: lvdongyi Date: Fri, 25 Oct 2024 18:58:22 +0800 Subject: [PATCH 21/21] add case of built-in tokenizers to handle CI error --- paddlenlp/transformers/auto/tokenizer.py | 65 +++++++++++++++++------- 1 file changed, 46 insertions(+), 19 deletions(-) diff --git a/paddlenlp/transformers/auto/tokenizer.py b/paddlenlp/transformers/auto/tokenizer.py index 9004a48747d6..88392d4eb110 100644 --- a/paddlenlp/transformers/auto/tokenizer.py +++ b/paddlenlp/transformers/auto/tokenizer.py @@ -43,7 +43,7 @@ else: TOKENIZER_MAPPING_NAMES = OrderedDict( [ - ("albert", (("AlbertTokenizer", "AlbertChineseTokenizer", "AlbertEnglishTokenizer"), None)), + ("albert", (("AlbertChineseTokenizer", "AlbertEnglishTokenizer"), None)), ("bart", "BartTokenizer"), ("bert", "BertTokenizer"), ("blenderbot", 
"BlenderbotTokenizer"), @@ -119,37 +119,47 @@ ] ) -TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) -CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} +def get_mapping_tokenizers(tokenizers, with_fast=True): + all_tokenizers = [] + if isinstance(tokenizers, tuple): + (tokenizer_slow, tokenizer_fast) = tokenizers + if isinstance(tokenizer_slow, tuple): + all_tokenizers.extend(tokenizer_slow) + else: + all_tokenizers.append(tokenizer_slow) + if with_fast and tokenizer_fast is not None: + all_tokenizers.append(tokenizer_fast) + else: + all_tokenizers.append(tokenizers) + return all_tokenizers def get_configurations(): MAPPING_NAMES = OrderedDict() - for key, class_name in TOKENIZER_MAPPING_NAMES.items(): - import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.tokenizer") - tokenizer_name = getattr(import_class, key) - name = tuple(tokenizer_name.pretrained_init_configuration.keys()) - MAPPING_NAMES[name] = tokenizer_name + for class_name, values in TOKENIZER_MAPPING_NAMES.items(): + all_tokenizers = get_mapping_tokenizers(values, with_fast=False) + for key in all_tokenizers: + import_class = importlib.import_module(f"paddlenlp.transformers.{class_name}.tokenizer") + tokenizer_name = getattr(import_class, key) + name = tuple(tokenizer_name.pretrained_init_configuration.keys()) + MAPPING_NAMES[name] = tokenizer_name return MAPPING_NAMES +INIT_CONFIG_MAPPING = get_configurations() + +TOKENIZER_MAPPING = _LazyAutoMapping(CONFIG_MAPPING_NAMES, TOKENIZER_MAPPING_NAMES) + +CONFIG_TO_TYPE = {v: k for k, v in CONFIG_MAPPING_NAMES.items()} + + def tokenizer_class_from_name(class_name: str): if class_name == "PretrainedTokenizerFast": return PretrainedTokenizerFast for module_name, tokenizers in TOKENIZER_MAPPING_NAMES.items(): - all_tokenizers = [] - if isinstance(tokenizers, tuple): - (tokenizer_slow, tokenizer_fast) = tokenizers - if isinstance(tokenizer_slow, tuple): - all_tokenizers.extend(tokenizer_slow) - else: - all_tokenizers.append(tokenizer_slow) - if tokenizer_fast is not None: - all_tokenizers.append(tokenizer_fast) - else: - all_tokenizers.append(tokenizers) + all_tokenizers = get_mapping_tokenizers(tokenizers) if class_name in all_tokenizers: module_name = model_type_to_module_name(module_name) try: @@ -280,6 +290,8 @@ class AutoTokenizer: base tokenizer classes when created with the AutoTokenizer.from_pretrained() classmethod. """ + _tokenizer_mapping = get_configurations() + def __init__(self): raise EnvironmentError( "AutoTokenizer is designed to be instantiated " @@ -380,6 +392,20 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): # TODO: Support tokenizer_type raise NotImplementedError("tokenizer_type is not supported yet.") + all_tokenizer_names = [] + + for names, tokenizer_class in cls._tokenizer_mapping.items(): + for name in names: + all_tokenizer_names.append(name) + + # From built-in pretrained models + if pretrained_model_name_or_path in all_tokenizer_names: + for names, tokenizer_class in cls._tokenizer_mapping.items(): + for pattern in names: + if pattern == pretrained_model_name_or_path: + logger.info("We are using %s to load '%s'." 
% (tokenizer_class, pretrained_model_name_or_path)) + return tokenizer_class.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) + tokenizer_config = get_tokenizer_config(pretrained_model_name_or_path, **kwargs) config_tokenizer_class = tokenizer_config.get("tokenizer_class") if config_tokenizer_class is None: @@ -417,6 +443,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs): return tokenizer_class_py.from_pretrained(pretrained_model_name_or_path, *model_args, **kwargs) else: # Use the first tokenizer class in the list + print("We are using %s to load '%s'." % (tokenizer_class_py[0], pretrained_model_name_or_path)) return tokenizer_class_py[0].from_pretrained( pretrained_model_name_or_path, *model_args, **kwargs )
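
Taken together, the series lets a byte-level tiktoken vocabulary of the Llama-3 style flow through the auto machinery. Below is a hedged usage sketch: the repo id, special tokens, and encode call mirror the tiktoken integration test earlier in the series, while routing the load through AutoTokenizer.from_pretrained (rather than the tokenizer class the test instantiates directly) is an assumption of this sketch.

    from paddlenlp.transformers import AutoTokenizer

    # Repo id and special-token settings mirror the tiktoken integration test;
    # going through AutoTokenizer here is illustrative, not the test's exact call.
    tokenizer = AutoTokenizer.from_pretrained(
        "hf-internal-testing/llama-3-8b-internal",
        bos_token="<|begin_of_text|>",
        eos_token="<|end_of_text|>",
        add_bos_token=True,
        add_eos_token=True,
        use_fast=True,
    )

    ids = tokenizer.encode("This is a test sentence.", add_special_tokens=True)["input_ids"]
    # With add_bos_token/add_eos_token set, the test expects the sequence to be
    # framed by ids 128000 (<|begin_of_text|>) and 128001 (<|end_of_text|>).
    print(ids)

For names that appear in a tokenizer's pretrained_init_configuration, the built-in shortcut added in the final patch dispatches to that class directly, before any tokenizer_config or model config lookup is attempted.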