[Tokenizer] Add Fast Tokenizer #8832

Merged
merged 16 commits on Aug 19, 2024

1 change: 1 addition & 0 deletions paddlenlp/transformers/__init__.py
@@ -25,6 +25,7 @@
    tokenize_special_chars,
    convert_to_unicode,
)
from .tokenizer_utils_fast import PretrainedTokenizerFast
from .processing_utils import ProcessorMixin
from .feature_extraction_utils import BatchFeature, FeatureExtractionMixin
from .image_processing_utils import ImageProcessingMixin
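
With this re-export in place, the fast tokenizer base class is importable from the package root. A minimal sanity check (a hedged sketch, assuming this branch of PaddleNLP is installed):

```python
# Hedged sanity check for the new top-level export.
from paddlenlp.transformers import PretrainedTokenizerFast

print(PretrainedTokenizerFast.__name__)  # "PretrainedTokenizerFast"
```
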
324 changes: 324 additions & 0 deletions paddlenlp/transformers/convert_slow_tokenizer.py
@@ -0,0 +1,324 @@
# coding=utf-8
# Copyright 2018 The Google AI Language Team Authors and The HuggingFace Inc. team.
# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict, List, Optional, Tuple

import tokenizers
from packaging import version
from tokenizers import (
    AddedToken,
    Regex,
    Tokenizer,
    decoders,
    normalizers,
    pre_tokenizers,
)
from tokenizers.models import BPE, Unigram


# Copied from transformers, adapted for tokenizers >= 0.19.0
def _get_prepend_scheme(add_prefix_space: bool, original_tokenizer) -> str:
    if add_prefix_space:
        prepend_scheme = "always"
        if hasattr(original_tokenizer, "legacy") and not original_tokenizer.legacy:
            prepend_scheme = "first"
    else:
        prepend_scheme = "never"
    return prepend_scheme
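
The helper reduces to three cases; a brief hedged sketch of the expected outputs (the `_Tok` stub below is hypothetical, standing in for any tokenizer exposing the `legacy` attribute checked above):

```python
# Illustrative expectations for _get_prepend_scheme; _Tok is a hypothetical stub.
class _Tok:
    def __init__(self, legacy):
        self.legacy = legacy

assert _get_prepend_scheme(True, _Tok(legacy=True)) == "always"
assert _get_prepend_scheme(True, _Tok(legacy=False)) == "first"
assert _get_prepend_scheme(False, _Tok(legacy=True)) == "never"
```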


# Extract the vocab and merges from a sentencepiece model file
class SentencePieceExtractor:
    def __init__(self, model: str):
        from sentencepiece import SentencePieceProcessor

        self.sp = SentencePieceProcessor()
        self.sp.Load(model)

    def extract(self, vocab_scores: Optional[Tuple[str, float]] = None) -> Tuple[Dict[str, int], List[Tuple]]:
        sp = self.sp
        vocab = {sp.id_to_piece(index): index for index in range(sp.GetPieceSize())}
        if vocab_scores is not None:
            vocab_scores, reverse = dict(vocab_scores), True
        else:
            vocab_scores, reverse = vocab, False

        # Merges
        merges = []
        for merge, piece_score in vocab_scores.items():
            local = []
            for index in range(1, len(merge)):
                piece_l, piece_r = merge[:index], merge[index:]
                if piece_l in vocab and piece_r in vocab:
                    local.append((piece_l, piece_r, piece_score))
            local = sorted(local, key=lambda x: (vocab[x[0]], vocab[x[1]]))
            merges.extend(local)

        merges = sorted(merges, key=lambda val: val[2], reverse=reverse)
        merges = [(val[0], val[1]) for val in merges]

        return vocab, merges


def check_number_comma(piece: str) -> bool:
    return len(piece) < 2 or piece[-1] != "," or not piece[-2].isdigit()
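
For context, a hedged usage sketch of the `SentencePieceExtractor` defined above (the `spm.model` path is hypothetical):

```python
# Hypothetical usage of SentencePieceExtractor; requires the `sentencepiece` package.
from paddlenlp.transformers.convert_slow_tokenizer import SentencePieceExtractor

extractor = SentencePieceExtractor("spm.model")  # hypothetical local SentencePiece model
vocab, merges = extractor.extract()
# `vocab` maps each piece to its id; `merges` is a rank-ordered list of (left, right) pairs.
print(len(vocab), merges[:3])
```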


class Converter:
    def __init__(self, original_tokenizer):
        self.original_tokenizer = original_tokenizer

    def converted(self) -> Tokenizer:
        raise NotImplementedError()


class SpmConverter(Converter):
    def __init__(self, *args):
        super().__init__(*args)

        from . import sentencepiece_model_pb2 as model_pb2

        m = model_pb2.ModelProto()
        if hasattr(self.original_tokenizer, "sentencepiece_model_file"):
            spm_vocab_file = self.original_tokenizer.sentencepiece_model_file
        else:
            spm_vocab_file = self.original_tokenizer.vocab_file
        with open(spm_vocab_file, "rb") as f:
            m.ParseFromString(f.read())
        self.proto = m

        if self.proto.trainer_spec.byte_fallback:
            if not getattr(self, "handle_byte_fallback", None):
                import warnings

                warnings.warn(
                    "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"
                    " which is not implemented in the fast tokenizers. In practice this means that the fast version of the"
                    " tokenizer can produce unknown tokens whereas the sentencepiece version would have converted these "
                    "unknown tokens into a sequence of byte tokens matching the original piece of text."
                )

    def vocab(self, proto):
        return [(piece.piece, piece.score) for piece in proto.pieces]

    def unk_id(self, proto):
        return proto.trainer_spec.unk_id

    def tokenizer(self, proto):
        model_type = proto.trainer_spec.model_type
        vocab_scores = self.vocab(proto)
        unk_id = self.unk_id(proto)

        if model_type == 1:
            tokenizer = Tokenizer(Unigram(vocab_scores, unk_id))
        elif model_type == 2:
            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract()
            bpe_vocab = {word: i for i, (word, score) in enumerate(vocab_scores)}
            tokenizer = Tokenizer(
                BPE(
                    bpe_vocab,
                    merges,
                    unk_token=proto.trainer_spec.unk_piece,
                    fuse_unk=True,
                )
            )
        else:
            raise Exception(
                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
            )

        return tokenizer

    def normalizer(self, proto):
        precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
        _normalizers = [
            normalizers.Strip(left=False, right=True),  # stripping is important
            normalizers.Replace(Regex(" {2,}"), "▁"),
        ]
        if not precompiled_charsmap:
            return normalizers.Sequence(_normalizers)
        else:
            return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)

    def pre_tokenizer(self, replacement, add_prefix_space):
        prepend_scheme = "always"
        if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy:
            prepend_scheme = "first"
        if version.parse(tokenizers.__version__) >= version.parse("0.19.0"):
            prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
            return pre_tokenizers.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        else:
            return pre_tokenizers.Metaspace(
                replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme
            )

    def post_processor(self):
        return None

    def decoder(self, replacement, add_prefix_space):
        if version.parse(tokenizers.__version__) >= version.parse("0.19.0"):
            prepend_scheme = _get_prepend_scheme(add_prefix_space, self.original_tokenizer)
            return decoders.Metaspace(replacement=replacement, prepend_scheme=prepend_scheme)
        else:
            return decoders.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)

    def converted(self) -> Tokenizer:
        tokenizer = self.tokenizer(self.proto)

        # Assemble the tokenizer
        normalizer = self.normalizer(self.proto)
        if normalizer is not None:
            tokenizer.normalizer = normalizer

        replacement = "▁"
        add_prefix_space = True
        pre_tokenizer = self.pre_tokenizer(replacement, add_prefix_space)
        if pre_tokenizer is not None:
            tokenizer.pre_tokenizer = pre_tokenizer

        tokenizer.decoder = self.decoder(replacement, add_prefix_space)
        post_processor = self.post_processor()
        if post_processor:
            tokenizer.post_processor = post_processor

        return tokenizer


class TikTokenConverter(Converter):
    def extract(self, tiktoken_file: str):
        from .tiktoken_model_utils import bpe, bytes_to_unicode, load_tiktoken_bpe

        bpe_ranks = (
            self.original_tokenizer.mergeable_ranks
            if hasattr(self.original_tokenizer, "mergeable_ranks") and self.original_tokenizer.mergeable_ranks
            else load_tiktoken_bpe(tiktoken_file)
        )
        byte_encoder = bytes_to_unicode()

        def token_bytes_to_string(b):
            return "".join([byte_encoder[ord(char)] for char in b.decode("latin-1")])

        merges = []
        vocab = {}
        for token, rank in bpe_ranks.items():
            vocab[token_bytes_to_string(token)] = rank
            if len(token) == 1:
                continue
            merged = tuple(bpe(bpe_ranks, token, max_rank=rank))
            if len(merged) == 2:
                merges.append(tuple(map(token_bytes_to_string, merged)))

        return vocab, merges


class LlamaConverter(SpmConverter):
    handle_byte_fallback = True

    def vocab(self, proto):
        vocab = [
            ("<unk>", 0.0),
            ("<s>", 0.0),
            ("</s>", 0.0),
        ]
        vocab += [(piece.piece, piece.score) for piece in proto.pieces[3:]]
        return vocab

    def unk_id(self, proto):
        return 0

    def decoder(self, replacement, add_prefix_space):
        return decoders.Sequence(
            [
                decoders.Replace("▁", " "),
                decoders.ByteFallback(),
                decoders.Fuse(),
                decoders.Strip(content=" ", left=1),
            ]
        )

    def tokenizer(self, proto):
        model_type = proto.trainer_spec.model_type
        vocab_scores = self.vocab(proto)
        if model_type == 1:
            if version.parse(tokenizers.__version__) < version.parse("0.14.0"):
                tokenizer = Tokenizer(Unigram(vocab_scores, 0))
            else:
                tokenizer = Tokenizer(Unigram(vocab_scores, 0, byte_fallback=True))
        elif model_type == 2:
            _, merges = SentencePieceExtractor(self.original_tokenizer.vocab_file).extract(vocab_scores)
            bpe_vocab = {word: i for i, (word, _score) in enumerate(vocab_scores)}
            tokenizer = Tokenizer(
                BPE(bpe_vocab, merges, unk_token=proto.trainer_spec.unk_piece, fuse_unk=True, byte_fallback=True)
            )
            tokenizer.add_special_tokens(
                [
                    AddedToken("<unk>", normalized=False, special=True),
                    AddedToken("<s>", normalized=False, special=True),
                    AddedToken("</s>", normalized=False, special=True),
                ]
            )
        else:
            raise Exception(
                "You're trying to run a `Unigram` model but your file was trained with a different algorithm"
            )

        return tokenizer

    def normalizer(self, proto):
        return normalizers.Sequence(
            [
                normalizers.Prepend(prepend="▁"),
                normalizers.Replace(pattern=" ", content="▁"),
            ]
        )

    def pre_tokenizer(self, replacement, add_prefix_space):
        return None


SLOW_TO_FAST_CONVERTERS = {
    "LlamaTokenizer": LlamaConverter,
}


def convert_slow_tokenizer(transformer_tokenizer) -> Tokenizer:
    """
    Utility to convert a slow tokenizer instance into a fast tokenizer instance.

    Args:
        transformer_tokenizer ([`~tokenizer_utils_base.PretrainedTokenizer`]):
            Instance of a slow tokenizer to convert into the backend tokenizer for
            [`~tokenizer_utils_base.PretrainedTokenizerFast`].

    Return:
        An instance of [`~tokenizers.Tokenizer`] to be used as the backend tokenizer of a
        [`~tokenizer_utils_base.PretrainedTokenizerFast`].
    """

    tokenizer_class_name = transformer_tokenizer.__class__.__name__
    if tokenizer_class_name not in SLOW_TO_FAST_CONVERTERS:
        raise ValueError(
            f"An instance of tokenizer class {tokenizer_class_name} cannot be converted to a fast tokenizer instance. "
            f"No converter was found. Currently available slow->fast converters: {list(SLOW_TO_FAST_CONVERTERS.keys())}"
        )

    converter_class = SLOW_TO_FAST_CONVERTERS[tokenizer_class_name]

    return converter_class(transformer_tokenizer).converted()

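End to end, the table and function above are what a fast tokenizer wrapper would call; a hedged sketch of driving the conversion directly (the checkpoint name is hypothetical):

```python
# Hedged end-to-end sketch: convert a slow LlamaTokenizer into a `tokenizers.Tokenizer`.
from paddlenlp.transformers import LlamaTokenizer
from paddlenlp.transformers.convert_slow_tokenizer import convert_slow_tokenizer

slow = LlamaTokenizer.from_pretrained("facebook/llama-7b")  # hypothetical checkpoint name
backend = convert_slow_tokenizer(slow)
print(backend.encode("Hello world").ids)
```
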
1 change: 1 addition & 0 deletions paddlenlp/transformers/llama/__init__.py
@@ -18,3 +18,4 @@
from .modeling_auto_static import *
from .modeling_pp import *
from .tokenizer import *
from .tokenizer_fast import *