
Fix special token settings in PretrainedTokenizer #2629

Merged 2 commits on Jun 24, 2022
2 changes: 1 addition & 1 deletion paddlenlp/transformers/ernie_gen/modeling.py
@@ -421,7 +421,7 @@ def save_pretrained(self, save_directory):
list(self.resource_files_names.values())[0])
paddle.save(self.state_dict(), file_name)

def _wrap_init(self, original_init, *args, **kwargs):
def _post_init(self, original_init, *args, **kwargs):
"""
It would be hooked after `__init__` to add a dict including arguments of
`__init__` as an attribute named `config` of the pretrained model instance.
2 changes: 1 addition & 1 deletion paddlenlp/transformers/model_utils.py
@@ -120,7 +120,7 @@ class is a pretrained model class adding layers on top of the base model,
pretrained_resource_files_map = {}
base_model_prefix = ""

def _wrap_init(self, original_init, *args, **kwargs):
def _post_init(self, original_init, *args, **kwargs):
"""
It would be hooked after `__init__` to add a dict including arguments of
`__init__` as an attribute named `config` of the pretrained model instance.
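In both modeling.py and model_utils.py the change is the same rename: the hook formerly called `_wrap_init` becomes `_post_init`, and it still runs after `__init__` to record the init arguments as a `config` dict on the instance. A minimal, hypothetical sketch of that contract (simplified names, not the PaddleNLP implementation):

```python
# Hypothetical, simplified illustration of the `_post_init` contract:
# after `__init__` finishes, the hook stores the init arguments so the
# model can later be re-created with the same configuration.
class ToyPretrainedModel:
    def __init__(self, vocab_size=30522, hidden_size=768):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size

    def _post_init(self, original_init, *args, **kwargs):
        # In PaddleNLP this is invoked automatically by InitTrackerMeta
        # right after `__init__`; here we call it by hand to show the effect.
        self.config = dict(kwargs)
        if args:
            self.config["init_args"] = args


model = ToyPretrainedModel(vocab_size=50000)
model._post_init(ToyPretrainedModel.__init__, vocab_size=50000)
assert model.config == {"vocab_size": 50000}
```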
26 changes: 20 additions & 6 deletions paddlenlp/transformers/tokenizer_utils.py
@@ -540,9 +540,9 @@ class PretrainedTokenizer(PretrainedTokenizerBase):

_decode_use_source_tokenizer = False

def _wrap_init(self, original_init, *args, **kwargs):
def _pre_init(self, original_init, *args, **kwargs):
"""
It would be hooked after `__init__` to add special tokens (arguments of
It would be hooked before `__init__` to add special tokens (arguments of
`__init__` whose names end with `_token`) as attributes of the tokenizer
instance.
"""
@@ -559,10 +559,24 @@ def _wrap_init(self, original_init, *args, **kwargs):
self._decode_use_source_tokenizer = False

def _build_special_tokens_map_extended(self, **kwargs):
if getattr(self, "_has_built_special_tokens", None):
return
SpecialTokensMixin.__init__(self, **kwargs)
Collaborator review comment: I don't quite follow this. Is this removing the use of SpecialTokensMixin here?

self._has_built_special_tokens = True
for key, value in kwargs.items():
if value is None:
continue
if key in self.SPECIAL_TOKENS_ATTRIBUTES:
if key == "additional_special_tokens":
assert isinstance(
value,
(list, tuple)), f"Value {value} is not a list or tuple"
assert all(
isinstance(t, (str, AddedToken)) for t in value
), "One of the tokens is not a string or an AddedToken"
setattr(self, key, value)
elif isinstance(value, (str, AddedToken)):
setattr(self, key, value)
else:
raise TypeError(
f"special token {key} has to be either str or AddedToken but got: {type(value)}"
)

@property
def vocab_size(self) -> int:
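The rewritten `_build_special_tokens_map_extended` no longer defers to `SpecialTokensMixin.__init__`; it validates each special-token kwarg itself and assigns it as an attribute. A standalone sketch of the same validation rules (illustrative helper, str-only for brevity; the real code also accepts `AddedToken`):

```python
# Illustrative sketch of the validation applied above, outside the class.
SPECIAL_TOKENS_ATTRIBUTES = [
    "bos_token", "eos_token", "unk_token", "sep_token",
    "pad_token", "cls_token", "mask_token", "additional_special_tokens",
]


def set_special_tokens(obj, **kwargs):
    for key, value in kwargs.items():
        if value is None or key not in SPECIAL_TOKENS_ATTRIBUTES:
            continue
        if key == "additional_special_tokens":
            # Must be a list/tuple of tokens.
            assert isinstance(value, (list, tuple)), f"Value {value} is not a list or tuple"
            assert all(isinstance(t, str) for t in value), "One of the tokens is not a string"
        elif not isinstance(value, str):
            raise TypeError(f"special token {key} has to be str but got: {type(value)}")
        setattr(obj, key, value)


class Holder:
    pass


h = Holder()
set_special_tokens(h, pad_token="[PAD]", additional_special_tokens=["<a>", "<b>"])
assert h.pad_token == "[PAD]" and h.additional_special_tokens == ["<a>", "<b>"]
```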
31 changes: 14 additions & 17 deletions paddlenlp/transformers/tokenizer_utils_base.py
@@ -778,23 +778,20 @@ class SpecialTokensMixin:
]

def __init__(self, verbose=True, **kwargs):
# TODO(guosheng): `SpecialTokensMixin.__init__` would be called by
# `PretrainedTokenizer._wrap_init` using args which might be str rather
# than instances of `AddedToken`, and this causes `AddedToken` settings
# made by `_build_special_tokens_map_extended` to be overwritten.
# Maybe `PretrainedTokenizer._wrap_init` should be called before init
# later.
if getattr(self, "_has_built_special_tokens", None):
return
self._bos_token = None
self._eos_token = None
self._unk_token = None
self._sep_token = None
self._pad_token = None
self._cls_token = None
self._mask_token = None
self._pad_token_type_id = 0
self._additional_special_tokens = []
# note(guosheng): Since `__init__` might be called multiple times (it is
# hooked before `PretrainedTokenizer.__init__`), we do not reset these to
# None as HF does, to avoid unintentionally overriding existing values.
self._bos_token = getattr(self, "_bos_token", None)
self._eos_token = getattr(self, "_eos_token", None)
self._unk_token = getattr(self, "_unk_token", None)
self._sep_token = getattr(self, "_sep_token", None)
self._pad_token = getattr(self, "_pad_token", None)
self._cls_token = getattr(self, "_cls_token", None)
self._mask_token = getattr(self, "_mask_token", None)
self._pad_token_type_id = getattr(self, "_pad_token_type_id", 0)
self._additional_special_tokens = getattr(self,
"_additional_special_tokens",
[])
self.verbose = verbose

# We directly set the hidden value to allow initialization with special tokens
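The switch from plain `None` assignments to `getattr(self, name, default)` is what makes this `__init__` safe to run more than once: a later call keeps whatever a hook has already set instead of wiping it. A tiny illustration of the idiom (hypothetical class, not the library code):

```python
class Reentrant:
    def __init__(self):
        # Keep an existing value if __init__ runs again; only fall back
        # to the default on the first call.
        self._pad_token = getattr(self, "_pad_token", None)


obj = Reentrant()                 # first call: _pad_token -> None
obj._pad_token = "[PAD]"          # set later, e.g. by a pre-init hook
Reentrant.__init__(obj)           # re-running __init__ ...
assert obj._pad_token == "[PAD]"  # ... no longer resets it to None
```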
34 changes: 22 additions & 12 deletions paddlenlp/transformers/utils.py
@@ -47,8 +47,9 @@ class InitTrackerMeta(type(Layer)):
"""
This metaclass wraps the `__init__` method of a class to add an `init_config`
attribute for instances of that class, and `init_config` uses a dict to track
the initial configuration. If the class has `_wrap_init` method, it would be
hooked after `__init__` and called as `_wrap_init(self, init_fn, init_args)`.
the initial configuration. If the class has `_pre_init` or `_post_init`
method, it would be hooked before or after `__init__` and called as
`_pre_init(self, init_fn, init_args)` or `_post_init(self, init_fn, init_args)`.
Since InitTrackerMeta would be used as metaclass for pretrained model classes,
which are always `Layer` and `type(Layer)` is not `type`, thus use `type(Layer)`
rather than `type` as base class for it to avoid inheritance metaclass
Expand All @@ -57,23 +58,29 @@ class InitTrackerMeta(type(Layer)):

def __init__(cls, name, bases, attrs):
init_func = cls.__init__
# If attrs has `__init__`, wrap it using accessable `_wrap_init`.
# If attrs has `__init__`, wrap it using accessible `_pre_init`/`_post_init`.
# Otherwise, no need to wrap again since the super cls has been wrapped.
# TODO: remove the duplicated tracker if using the super cls `__init__`
help_func = getattr(cls, '_wrap_init',
None) if '__init__' in attrs else None
cls.__init__ = InitTrackerMeta.init_and_track_conf(init_func, help_func)
pre_init_func = getattr(cls, '_pre_init',
None) if '__init__' in attrs else None
post_init_func = getattr(cls, '_post_init',
None) if '__init__' in attrs else None
cls.__init__ = InitTrackerMeta.init_and_track_conf(
init_func, pre_init_func, post_init_func)
super(InitTrackerMeta, cls).__init__(name, bases, attrs)

@staticmethod
def init_and_track_conf(init_func, help_func=None):
def init_and_track_conf(init_func, pre_init_func=None, post_init_func=None):
"""
wraps `init_func`, which is the `__init__` method of a class, to add an `init_config`
attribute for instances of that class.
Args:
init_func (callable): It should be the `__init__` method of a class.
help_func (callable, optional): If provided, it would be hooked after
`init_func` and called as `_wrap_init(self, init_func, *init_args, **init_args)`.
pre_init_func (callable, optional): If provided, it would be hooked before
`init_func` and called as `pre_init_func(self, init_func, *init_args, **init_kwargs)`.
Default None.
post_init_func (callable, optional): If provided, it would be hooked after
`init_func` and called as `post_init_func(self, init_func, *init_args, **init_kwargs)`.
Default None.

Returns:
Expand All @@ -82,11 +89,14 @@ def init_and_track_conf(init_func, help_func=None):

@functools.wraps(init_func)
def __impl__(self, *args, **kwargs):
# run the registered `pre_init_func` helper
if pre_init_func:
pre_init_func(self, init_func, *args, **kwargs)
# keep full configuration
init_func(self, *args, **kwargs)
# registed helper by `_wrap_init`
if help_func:
help_func(self, init_func, *args, **kwargs)
# run the registered `post_init_func` helper
if post_init_func:
post_init_func(self, init_func, *args, **kwargs)
self.init_config = kwargs
if args:
kwargs['init_args'] = args
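Taken together, the metaclass now wraps `__init__` so that `_pre_init` runs before the original `__init__` (letting tokenizers set special-token attributes first) and `_post_init` runs after it, before the call arguments are recorded in `init_config`. A condensed, hypothetical sketch of that wrapping, using plain `type` instead of `type(Layer)` and simplified bookkeeping:

```python
import functools


class InitTracker(type):
    """Simplified stand-in for InitTrackerMeta, built on plain `type`."""

    def __init__(cls, name, bases, attrs):
        if "__init__" in attrs:
            pre = getattr(cls, "_pre_init", None)
            post = getattr(cls, "_post_init", None)
            cls.__init__ = InitTracker.wrap(attrs["__init__"], pre, post)
        super().__init__(name, bases, attrs)

    @staticmethod
    def wrap(init_func, pre=None, post=None):
        @functools.wraps(init_func)
        def __impl__(self, *args, **kwargs):
            if pre:                          # e.g. set special tokens first
                pre(self, init_func, *args, **kwargs)
            init_func(self, *args, **kwargs)
            if post:                         # e.g. build a `config` dict
                post(self, init_func, *args, **kwargs)
            self.init_config = dict(kwargs, init_args=args) if args else dict(kwargs)

        return __impl__


class ToyTokenizer(metaclass=InitTracker):
    def _pre_init(self, original_init, *args, **kwargs):
        # Runs before __init__, mirroring PretrainedTokenizer._pre_init.
        self.pad_token = kwargs.get("pad_token", "[PAD]")

    def __init__(self, vocab=None, pad_token="[PAD]"):
        assert self.pad_token is not None  # already set by _pre_init
        self.vocab = vocab or {}


tok = ToyTokenizer(pad_token="<pad>")
assert tok.pad_token == "<pad>"
assert tok.init_config == {"pad_token": "<pad>"}
```

This mirrors why the PR splits the single `_wrap_init` hook into `_pre_init` and `_post_init`: special tokens must exist before `__init__` runs, while `config` tracking can only happen after it.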