
Commit 3823811
Merge pull request #6119 from RasaHQ/empty_unicode_token
Remove empty tokens from whitespace tokenizer
dakshvar22 authored Jul 2, 2020
2 parents 70823f9 + 75328fd commit 3823811
Showing 3 changed files with 34 additions and 0 deletions.
1 change: 1 addition & 0 deletions changelog/6119.bugfix.rst
@@ -0,0 +1 @@
Explicitly remove all emojis which appear as unicode characters from the output of ``regex.sub`` inside ``WhitespaceTokenizer``.
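
As a rough illustration of the behaviour this entry describes (a minimal standalone sketch, not part of the commit; it copies the character ranges from the diff below instead of importing Rasa):

import re

# Character ranges copied from the diff below; strips emoji code points that
# survive the tokenizer's whitespace/punctuation substitution.
emoji_pattern = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "\U00002702-\U000027B0"
    "\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE,
)

words = "Joselico gracias Dois 🙏🇺🇸🏦".split()
words = [emoji_pattern.sub("", w) for w in words]  # the emoji-only word becomes ""
words = [w for w in words if w]                    # drop the resulting empty token
print(words)  # ['Joselico', 'gracias', 'Dois']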
26 changes: 26 additions & 0 deletions rasa/nlu/tokenizers/whitespace_tokenizer.py
@@ -1,6 +1,7 @@
from typing import Any, Dict, List, Text

import regex
import re

from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer
from rasa.nlu.training_data import Message
@@ -24,6 +25,27 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None:

        self.case_sensitive = self.component_config["case_sensitive"]

        self.emoji_pattern = self.get_emoji_regex()

    @staticmethod
    def get_emoji_regex():
        emoji_pattern = re.compile(
            "["
            "\U0001F600-\U0001F64F"  # emoticons
            "\U0001F300-\U0001F5FF"  # symbols & pictographs
            "\U0001F680-\U0001F6FF"  # transport & map symbols
            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
            "\U00002702-\U000027B0"
            "\U000024C2-\U0001F251"
            "]+",
            flags=re.UNICODE,
        )
        return emoji_pattern

    def remove_emoji(self, text: Text) -> Text:

        return self.emoji_pattern.sub(r"", text)

    def tokenize(self, message: Message, attribute: Text) -> List[Token]:
        text = message.get(attribute)

@@ -47,8 +69,12 @@ def tokenize(self, message: Message, attribute: Text) -> List[Token]:
" ",
text,
).split()

# if we removed everything like smiles `:)`, use the whole text as 1 token
if not words:
words = [text]

words = [self.remove_emoji(w) for w in words]
words = [w for w in words if w]

return self._convert_words_to_tokens(words, text)
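
For context, a short usage sketch of the changed tokenizer (assuming the Rasa 1.x API at the time of this commit; Message, WhitespaceTokenizer, and the TEXT constant follow the existing test file rather than anything added in this diff):

from rasa.nlu.constants import TEXT
from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer
from rasa.nlu.training_data import Message

tokenizer = WhitespaceTokenizer()
message = Message("Joselico gracias Dois 🙏🇺🇸🏦")

# With this change, emoji characters are stripped from each word and any word
# that becomes empty is dropped, so no empty token reaches downstream featurizers.
tokens = tokenizer.tokenize(message, attribute=TEXT)
print([t.text for t in tokens])  # expected: ['Joselico', 'gracias', 'Dois']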
7 changes: 7 additions & 0 deletions tests/nlu/tokenizers/test_whitespace_tokenizer.py
@@ -58,6 +58,13 @@
            ],
            [(0, 82), (83, 141)],
        ),
        (
            "Joselico gracias Dois 🙏🇺🇸🏦🛠🔥⭐️🦅👑💪",
            ["Joselico", "gracias", "Dois"],
            [(0, 8), (9, 16), (17, 21)],
        ),
        (":)", [":)"], [(0, 2)]),
        ("Hi :-)", ["Hi"], [(0, 2)]),
    ],
)
def test_whitespace(text, expected_tokens, expected_indices):
