From eb6a41ea14e58b650f4be840c0df32c52e083710 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 13:47:13 +0200 Subject: [PATCH 001/239] Add cls token to whitespace tokenizer. --- rasa/nlu/constants.py | 2 ++ rasa/nlu/tokenizers/whitespace_tokenizer.py | 18 ++++++++++++++---- tests/nlu/base/test_tokenizers.py | 17 +++++++++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 9e3b5f7a2e78..ba1f8b9c9a09 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -10,6 +10,8 @@ MESSAGE_NER_FEATURES_ATTRIBUTE = "ner_features" +CLS_TOKEN = "__CLS__" + MESSAGE_ATTRIBUTES = [ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 94179ead2acb..591aca3c05ff 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -6,13 +6,12 @@ from rasa.nlu.tokenizers import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + MESSAGE_RESPONSE_ATTRIBUTE, + CLS_TOKEN, ) @@ -25,8 +24,10 @@ class WhitespaceTokenizer(Tokenizer, Component): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # text will be tokenized with case sensitive as default + # Text will be tokenized with case sensitive as default "case_sensitive": True, + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -40,6 +41,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] self.case_sensitive = self.component_config["case_sensitive"] + self.add_cls_token = self.component_config["add_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -88,9 +90,17 @@ def tokenize( running_offset = 0 tokens = [] + for word in words: word_offset = text.index(word, running_offset) word_len = len(word) running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) + + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and self.add_cls_token + ): + tokens.append(Token(CLS_TOKEN, len(text))) + return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index d6e0f78691e6..de008ef8e62f 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from unittest.mock import patch + +from rasa.nlu.constants import CLS_TOKEN from rasa.nlu.training_data import TrainingData, Message from tests.nlu import utilities from rasa.nlu import training_data @@ -77,6 +79,21 @@ def test_whitespace(): ] == [0, 83] +def test_whitespace_cls_token(): + from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer + + component_config = {"add_cls_token": True} + + tk = WhitespaceTokenizer(component_config) + + assert [t.text for t in tk.tokenize("Forecast for lunch")] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + + def test_whitespace_custom_intent_symbol(): from rasa.nlu.tokenizers.whitespace_tokenizer import 
WhitespaceTokenizer From 33fdc981965652fe0b971a62c2bd4c1b89b89881 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:01:51 +0200 Subject: [PATCH 002/239] Add cls token to spacy tokenizer. --- rasa/nlu/tokenizers/spacy_tokenizer.py | 30 +++++++++++++-------- rasa/nlu/tokenizers/whitespace_tokenizer.py | 2 +- tests/nlu/base/test_tokenizers.py | 18 +++++++++++++ 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 87443d3375de..e289dcf31c83 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -1,5 +1,5 @@ import typing -from typing import Any +from typing import Any, Dict, Text, List, Optional from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig @@ -7,14 +7,11 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, - MESSAGE_ATTRIBUTES, MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, + CLS_TOKEN, ) if typing.TYPE_CHECKING: @@ -32,6 +29,16 @@ class SpacyTokenizer(Tokenizer, Component): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + defaults = { + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False + } + + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + """Construct a new tokenizer using the SpacyTokenizer framework.""" + super(SpacyTokenizer, self).__init__(component_config) + self.add_cls_token = self.component_config["add_cls_token"] + def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: @@ -47,17 +54,18 @@ def train( MESSAGE_TOKENS_NAMES[attribute], self.tokenize(attribute_doc) ) - def get_doc(self, message, attribute): - + def get_doc(self, message: Message, attribute: Text) -> "Doc": return message.get(MESSAGE_SPACY_FEATURES_NAMES[attribute]) def process(self, message: Message, **kwargs: Any) -> None: - message.set( MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], self.tokenize(self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE)), ) - def tokenize(self, doc: "Doc") -> typing.List[Token]: - - return [Token(t.text, t.idx) for t in doc] + def tokenize(self, doc: "Doc") -> List[Token]: + tokens = [Token(t.text, t.idx) for t in doc] + if self.add_cls_token: + idx = doc[-1].idx + len(doc[-1].text) + 1 + tokens = tokens + [Token(CLS_TOKEN, idx)] + return tokens diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 591aca3c05ff..acf24d7b5334 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -101,6 +101,6 @@ def tokenize( attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] and self.add_cls_token ): - tokens.append(Token(CLS_TOKEN, len(text))) + tokens.append(Token(CLS_TOKEN, len(text) + 1)) return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index de008ef8e62f..5005f8cfb9df 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -92,6 +92,7 @@ def test_whitespace_cls_token(): "lunch", CLS_TOKEN, ] + assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13, 19] def test_whitespace_custom_intent_symbol(): @@ -207,6 +208,23 @@ def test_spacy(spacy_nlp): assert [t.offset for t in 
tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23] +def test_spacy_add_cls_token(spacy_nlp): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"add_cls_token": True} + + tk = SpacyTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19] + + def test_spacy_intent_tokenizer(spacy_nlp_component): from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer From 840a26ee3e1ee95b713052c3bc6e9ad9337f6f0c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:07:42 +0200 Subject: [PATCH 003/239] Add cls token to mitie tokenizer. --- rasa/nlu/tokenizers/mitie_tokenizer.py | 35 ++++++++++++++++++++------ tests/nlu/base/test_tokenizers.py | 17 +++++++++++++ 2 files changed, 44 insertions(+), 8 deletions(-) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index df8f61552492..5e8c654f3dcd 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -1,4 +1,4 @@ -from typing import Any, List, Text +from typing import Any, List, Text, Optional, Dict from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig @@ -6,13 +6,11 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, + MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + CLS_TOKEN, ) @@ -20,6 +18,16 @@ class MitieTokenizer(Tokenizer, Component): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + defaults = { + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False + } + + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + """Construct a new tokenizer using the SpacyTokenizer framework.""" + super(MitieTokenizer, self).__init__(component_config) + self.add_cls_token = self.component_config["add_cls_token"] + @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] @@ -35,7 +43,7 @@ def train( if example.get(attribute) is not None: example.set( MESSAGE_TOKENS_NAMES[attribute], - self.tokenize(example.get(attribute)), + self.tokenize(example.get(attribute), attribute), ) def process(self, message: Message, **kwargs: Any) -> None: @@ -44,12 +52,16 @@ def process(self, message: Message, **kwargs: Any) -> None: MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], self.tokenize(message.text) ) - def _token_from_offset(self, text, offset, encoded_sentence): + def _token_from_offset( + self, text: Text, offset: int, encoded_sentence: bytes + ) -> Token: return Token( text.decode("utf-8"), self._byte_to_char_offset(encoded_sentence, offset) ) - def tokenize(self, text: Text) -> List[Token]: + def tokenize( + self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE + ) -> List[Token]: import mitie encoded_sentence = text.encode("utf-8") @@ -58,6 +70,13 @@ def tokenize(self, text: Text) -> List[Token]: self._token_from_offset(token, offset, encoded_sentence) for token, offset in tokenized ] + + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and self.add_cls_token + ): + tokens.append(Token(CLS_TOKEN, len(text) + 1)) + return tokens @staticmethod diff --git 
a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index 5005f8cfb9df..79566099ea09 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -263,6 +263,23 @@ def test_mitie(): assert [t.offset for t in tk.tokenize(text)] == [0, 4, 13, 16, 20, 23] +def test_mitie_add_cls_token(): + from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer + + component_config = {"add_cls_token": True} + + tk = MitieTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(text)] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13, 19] + + def test_jieba(): from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer From 8c6c27089e19808c14d4aadef03fe9fe6ba401b2 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:13:38 +0200 Subject: [PATCH 004/239] Add cls token to jieba tokenizer. --- rasa/nlu/tokenizers/jieba_tokenizer.py | 18 +++++++++++++++--- rasa/nlu/tokenizers/spacy_tokenizer.py | 2 +- tests/nlu/base/test_tokenizers.py | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index d489be5c2ad0..fd34003fd0cc 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -16,8 +16,7 @@ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + CLS_TOKEN, ) logger = logging.getLogger(__name__) @@ -39,6 +38,8 @@ class JiebaTokenizer(Tokenizer, Component): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -61,6 +62,8 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.dictionary_path is not None: self.load_custom_dictionary(self.dictionary_path) + self.add_cls_token = self.component_config["add_cls_token"] + @classmethod def required_packages(cls) -> List[Text]: return ["jieba"] @@ -108,12 +111,21 @@ def preprocess_text(self, text, attribute): else: return text - def tokenize(self, text: Text, attribute=MESSAGE_TEXT_ATTRIBUTE) -> List[Token]: + def tokenize( + self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE + ) -> List[Token]: import jieba text = self.preprocess_text(text, attribute) tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] + + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and self.add_cls_token + ): + tokens.append(Token(CLS_TOKEN, len(text) + 1)) + return tokens @classmethod diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index e289dcf31c83..fac2ba1c3a32 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -67,5 +67,5 @@ def tokenize(self, doc: "Doc") -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] if self.add_cls_token: idx = doc[-1].idx + len(doc[-1].text) + 1 - tokens = tokens + [Token(CLS_TOKEN, idx)] + tokens.append(Token(CLS_TOKEN, idx)) return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index 79566099ea09..07057449ca7d 100644 --- a/tests/nlu/base/test_tokenizers.py +++ 
b/tests/nlu/base/test_tokenizers.py @@ -308,3 +308,21 @@ def test_jieba_load_dictionary(tmpdir_factory): tk.tokenize("") mock_method.assert_called_once_with(dictionary_path) + + +def test_jieba_add_cls_token(): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + component_config = {"add_cls_token": True} + + tk = JiebaTokenizer(component_config) + + assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [ + "Micheal", + "你好", + "吗", + "?", + CLS_TOKEN, + ] + + assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12] From 6b9f6db7edf84bd3ac39d03bc91849d00074d601 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:14:57 +0200 Subject: [PATCH 005/239] Add changelog entry. --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 228ef6ba8f9d..1df0dde363bf 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -28,6 +28,7 @@ Added (``rasa.core.agent.handle_channels()``). The number of workers can be set using the environment variable ``SANIC_WORKERS`` (default: 1). A value of >1 is allowed only in combination with ``RedisLockStore`` as the lock store. +- Added option ``add_cls_token`` to all tokenizers to add the token ``__CLS__`` to the end of the list of tokens. Changed ------- From 0147288da8ee89997982113845b9c14c188e095c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:47:28 +0200 Subject: [PATCH 006/239] move code from init to own file --- rasa/nlu/extractors/crf_entity_extractor.py | 2 +- rasa/nlu/featurizers/mitie_featurizer.py | 2 +- rasa/nlu/test.py | 2 +- rasa/nlu/tokenizers/__init__.py | 16 --------- rasa/nlu/tokenizers/jieba_tokenizer.py | 14 +++----- rasa/nlu/tokenizers/mitie_tokenizer.py | 14 +++----- rasa/nlu/tokenizers/spacy_tokenizer.py | 11 +++--- rasa/nlu/tokenizers/tokenizer.py | 39 +++++++++++++++++++++ rasa/nlu/tokenizers/whitespace_tokenizer.py | 14 +++----- tests/nlu/base/test_tokenizers.py | 8 ++--- 10 files changed, 62 insertions(+), 60 deletions(-) create mode 100644 rasa/nlu/tokenizers/tokenizer.py diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index a4d1d63528a2..1b7d49fa3b72 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -6,7 +6,7 @@ from rasa.nlu.config import InvalidConfigError, RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData from rasa.constants import DOCS_BASE_URL diff --git a/rasa/nlu/featurizers/mitie_featurizer.py b/rasa/nlu/featurizers/mitie_featurizer.py index 9d0dbb8f5a7c..1eda72fe2112 100644 --- a/rasa/nlu/featurizers/mitie_featurizer.py +++ b/rasa/nlu/featurizers/mitie_featurizer.py @@ -4,7 +4,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers import Featurizer -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index 9143638590c4..2990a4678548 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -32,7 +32,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Interpreter, Trainer, TrainingData from rasa.nlu.components import Component -from rasa.nlu.tokenizers import Token +from 
rasa.nlu.tokenizers.tokenizer import Token from rasa.core.constants import RESPOND_PREFIX logger = logging.getLogger(__name__) diff --git a/rasa/nlu/tokenizers/__init__.py b/rasa/nlu/tokenizers/__init__.py index 8cb8732bf097..e69de29bb2d1 100644 --- a/rasa/nlu/tokenizers/__init__.py +++ b/rasa/nlu/tokenizers/__init__.py @@ -1,16 +0,0 @@ -class Tokenizer(object): - pass - - -class Token(object): - def __init__(self, text, offset, data=None): - self.offset = offset - self.text = text - self.end = offset + len(text) - self.data = data if data else {} - - def set(self, prop, info): - self.data[prop] = info - - def get(self, prop, default=None): - return self.data.get(prop, default) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index fd34003fd0cc..422cd4296a4e 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -7,16 +7,14 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - CLS_TOKEN, ) logger = logging.getLogger(__name__) @@ -39,7 +37,7 @@ class JiebaTokenizer(Tokenizer, Component): # Symbol on which intent should be split "intent_split_symbol": "_", # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False, + "use_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -62,7 +60,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.dictionary_path is not None: self.load_custom_dictionary(self.dictionary_path) - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] @classmethod def required_packages(cls) -> List[Text]: @@ -120,11 +118,7 @@ def tokenize( tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] - if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and self.add_cls_token - ): - tokens.append(Token(CLS_TOKEN, len(text) + 1)) + self.add_cls_token(tokens, self.use_cls_token, attribute) return tokens diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 5e8c654f3dcd..2c2a074f59f3 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -2,15 +2,13 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - CLS_TOKEN, ) @@ -20,13 +18,13 @@ class MitieTokenizer(Tokenizer, Component): defaults = { # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False + "use_cls_token": False } def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(MitieTokenizer, self).__init__(component_config) - self.add_cls_token = self.component_config["add_cls_token"] + 
self.use_cls_token = self.component_config["use_cls_token"] @classmethod def required_packages(cls) -> List[Text]: @@ -71,11 +69,7 @@ def tokenize( for token, offset in tokenized ] - if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and self.add_cls_token - ): - tokens.append(Token(CLS_TOKEN, len(text) + 1)) + self.add_cls_token(tokens, self.use_cls_token, attribute) return tokens diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index fac2ba1c3a32..432eff0b7c06 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -3,7 +3,7 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( @@ -11,7 +11,6 @@ MESSAGE_TOKENS_NAMES, MESSAGE_SPACY_FEATURES_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, - CLS_TOKEN, ) if typing.TYPE_CHECKING: @@ -31,13 +30,13 @@ class SpacyTokenizer(Tokenizer, Component): defaults = { # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False + "use_cls_token": False } def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(SpacyTokenizer, self).__init__(component_config) - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -65,7 +64,5 @@ def process(self, message: Message, **kwargs: Any) -> None: def tokenize(self, doc: "Doc") -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] - if self.add_cls_token: - idx = doc[-1].idx + len(doc[-1].text) + 1 - tokens.append(Token(CLS_TOKEN, idx)) + self.add_cls_token(tokens, self.use_cls_token) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py new file mode 100644 index 000000000000..4d903822f6f1 --- /dev/null +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -0,0 +1,39 @@ +from typing import Text, List, Optional, Dict, Any + +from rasa.nlu.constants import ( + MESSAGE_RESPONSE_ATTRIBUTE, + MESSAGE_TEXT_ATTRIBUTE, + CLS_TOKEN, +) + + +class Token(object): + def __init__(self, text, offset, data=None): + self.offset = offset + self.text = text + self.end = offset + len(text) + self.data = data if data else {} + + def set(self, prop, info): + self.data[prop] = info + + def get(self, prop, default=None): + return self.data.get(prop, default) + + +class Tokenizer(object): + def add_cls_token( + self, + tokens: List[Token], + use_cls_token: bool, + attribute: Text = MESSAGE_TEXT_ATTRIBUTE, + ) -> List[Token]: + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and use_cls_token + ): + # +1 to have a space between the last token and the __cls__ token + idx = tokens[-1].offset + len(tokens[-1].text) + 1 + tokens.append(Token(CLS_TOKEN, idx)) + + return tokens diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index acf24d7b5334..20a30efe0409 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -3,15 +3,13 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from 
rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_RESPONSE_ATTRIBUTE, - CLS_TOKEN, ) @@ -27,7 +25,7 @@ class WhitespaceTokenizer(Tokenizer, Component): # Text will be tokenized with case sensitive as default "case_sensitive": True, # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False, + "use_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -41,7 +39,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] self.case_sensitive = self.component_config["case_sensitive"] - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -97,10 +95,6 @@ def tokenize( running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) - if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and self.add_cls_token - ): - tokens.append(Token(CLS_TOKEN, len(text) + 1)) + self.add_cls_token(tokens, self.use_cls_token, attribute) return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index 07057449ca7d..267f24b81e6f 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -82,7 +82,7 @@ def test_whitespace(): def test_whitespace_cls_token(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = WhitespaceTokenizer(component_config) @@ -211,7 +211,7 @@ def test_spacy(spacy_nlp): def test_spacy_add_cls_token(spacy_nlp): from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = SpacyTokenizer(component_config) @@ -266,7 +266,7 @@ def test_mitie(): def test_mitie_add_cls_token(): from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = MitieTokenizer(component_config) @@ -313,7 +313,7 @@ def test_jieba_load_dictionary(tmpdir_factory): def test_jieba_add_cls_token(): from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = JiebaTokenizer(component_config) From 15bf9074852ab2b6036a198d12c47d7b71dc7503 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:49:26 +0200 Subject: [PATCH 007/239] update changelog entry. --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 1df0dde363bf..6b0d69554203 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -28,7 +28,7 @@ Added (``rasa.core.agent.handle_channels()``). The number of workers can be set using the environment variable ``SANIC_WORKERS`` (default: 1). A value of >1 is allowed only in combination with ``RedisLockStore`` as the lock store. -- Added option ``add_cls_token`` to all tokenizers to add the token ``__CLS__`` to the end of the list of tokens. 
+- Added option ``use_cls_token`` to all tokenizers to add the token ``__CLS__`` to the end of the list of tokens. Changed ------- From 9e372c3f3d95f05768e376ddeae1babdfc7aa0f2 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 15:10:49 +0200 Subject: [PATCH 008/239] make use_cls_token a class variable of tokenizer --- rasa/nlu/tokenizers/jieba_tokenizer.py | 4 ++-- rasa/nlu/tokenizers/mitie_tokenizer.py | 4 ++-- rasa/nlu/tokenizers/spacy_tokenizer.py | 4 ++-- rasa/nlu/tokenizers/tokenizer.py | 10 +++++----- rasa/nlu/tokenizers/whitespace_tokenizer.py | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 422cd4296a4e..9185f7dfc122 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -24,7 +24,7 @@ from rasa.nlu.model import Metadata -class JiebaTokenizer(Tokenizer, Component): +class JiebaTokenizer(Component, Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -118,7 +118,7 @@ def tokenize( tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] - self.add_cls_token(tokens, self.use_cls_token, attribute) + self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 2c2a074f59f3..bf3d564dd625 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -12,7 +12,7 @@ ) -class MitieTokenizer(Tokenizer, Component): +class MitieTokenizer(Component, Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -69,7 +69,7 @@ def tokenize( for token, offset in tokenized ] - self.add_cls_token(tokens, self.use_cls_token, attribute) + self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 432eff0b7c06..1784c1e633d3 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -17,7 +17,7 @@ from spacy.tokens.doc import Doc # pytype: disable=import-error -class SpacyTokenizer(Tokenizer, Component): +class SpacyTokenizer(Component, Tokenizer): provides = [ MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES @@ -64,5 +64,5 @@ def process(self, message: Message, **kwargs: Any) -> None: def tokenize(self, doc: "Doc") -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] - self.add_cls_token(tokens, self.use_cls_token) + self.add_cls_token(tokens) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 4d903822f6f1..71d914754fb4 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -22,15 +22,15 @@ def get(self, prop, default=None): class Tokenizer(object): + def __init__(self) -> None: + self.use_cls_token = False + def add_cls_token( - self, - tokens: List[Token], - use_cls_token: bool, - attribute: Text = MESSAGE_TEXT_ATTRIBUTE, + self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE ) -> List[Token]: if ( attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and use_cls_token + and self.use_cls_token ): # +1 to have a space between the last token and the __cls__ token idx = tokens[-1].offset + len(tokens[-1].text) + 1 diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py 
b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 20a30efe0409..18333f41bd79 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -13,7 +13,7 @@ ) -class WhitespaceTokenizer(Tokenizer, Component): +class WhitespaceTokenizer(Component, Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -95,6 +95,6 @@ def tokenize( running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) - self.add_cls_token(tokens, self.use_cls_token, attribute) + self.add_cls_token(tokens, attribute) return tokens From d21c7b3b3a48e43603ac6381700a8e659bff9aea Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 15:30:46 +0200 Subject: [PATCH 009/239] tokenizer inherits from compoenent --- rasa/nlu/tokenizers/jieba_tokenizer.py | 6 +----- rasa/nlu/tokenizers/mitie_tokenizer.py | 8 +------- rasa/nlu/tokenizers/spacy_tokenizer.py | 8 +------- rasa/nlu/tokenizers/tokenizer.py | 12 +++++++++--- rasa/nlu/tokenizers/whitespace_tokenizer.py | 5 +---- tests/nlu/base/test_evaluation.py | 2 +- tests/nlu/base/test_featurizers.py | 2 +- 7 files changed, 15 insertions(+), 28 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 9185f7dfc122..c434a5a2e050 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -24,7 +24,7 @@ from rasa.nlu.model import Metadata -class JiebaTokenizer(Component, Tokenizer): +class JiebaTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -36,8 +36,6 @@ class JiebaTokenizer(Component, Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -60,8 +58,6 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.dictionary_path is not None: self.load_custom_dictionary(self.dictionary_path) - self.use_cls_token = self.component_config["use_cls_token"] - @classmethod def required_packages(cls) -> List[Text]: return ["jieba"] diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index bf3d564dd625..2c14c4be09c2 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -12,19 +12,13 @@ ) -class MitieTokenizer(Component, Tokenizer): +class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - defaults = { - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False - } - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(MitieTokenizer, self).__init__(component_config) - self.use_cls_token = self.component_config["use_cls_token"] @classmethod def required_packages(cls) -> List[Text]: diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 1784c1e633d3..3a982479c508 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -17,7 +17,7 @@ from spacy.tokens.doc import Doc # pytype: disable=import-error -class SpacyTokenizer(Component, Tokenizer): +class SpacyTokenizer(Tokenizer): provides = [ 
MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES @@ -28,15 +28,9 @@ class SpacyTokenizer(Component, Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] - defaults = { - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False - } - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(SpacyTokenizer, self).__init__(component_config) - self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 71d914754fb4..41e04c844385 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -1,5 +1,6 @@ from typing import Text, List, Optional, Dict, Any +from rasa.nlu.components import Component from rasa.nlu.constants import ( MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, @@ -21,9 +22,14 @@ def get(self, prop, default=None): return self.data.get(prop, default) -class Tokenizer(object): - def __init__(self) -> None: - self.use_cls_token = False +class Tokenizer(Component): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + super(Tokenizer, self).__init__(component_config) + + if "use_cls_token" in self.component_config: + self.use_cls_token = self.component_config["use_cls_token"] + else: + self.use_cls_token = False def add_cls_token( self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 18333f41bd79..3641fb909689 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -13,7 +13,7 @@ ) -class WhitespaceTokenizer(Component, Tokenizer): +class WhitespaceTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -24,8 +24,6 @@ class WhitespaceTokenizer(Component, Tokenizer): "intent_split_symbol": "_", # Text will be tokenized with case sensitive as default "case_sensitive": True, - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -39,7 +37,6 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] self.case_sensitive = self.component_config["case_sensitive"] - self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py index e51567cc5c17..240090bbd535 100644 --- a/tests/nlu/base/test_evaluation.py +++ b/tests/nlu/base/test_evaluation.py @@ -39,7 +39,7 @@ from rasa.nlu.test import determine_intersection from rasa.nlu.test import determine_token_labels from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu import utils import json import os diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/base/test_featurizers.py index 0da0ae0f7b79..cd0c8ce3c13a 100644 --- a/tests/nlu/base/test_featurizers.py +++ b/tests/nlu/base/test_featurizers.py @@ -3,7 +3,7 @@ import pytest from rasa.nlu import training_data 
-from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message From 562bc8bafabaa711de7a89ecfb0f9624db1e3c76 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:00:01 +0200 Subject: [PATCH 010/239] remove not needed init methods --- rasa/nlu/tokenizers/mitie_tokenizer.py | 4 ---- rasa/nlu/tokenizers/spacy_tokenizer.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 2c14c4be09c2..6b4c6b30abdc 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -16,10 +16,6 @@ class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: - """Construct a new tokenizer using the SpacyTokenizer framework.""" - super(MitieTokenizer, self).__init__(component_config) - @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 3a982479c508..ffbeff7c2efc 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -28,10 +28,6 @@ class SpacyTokenizer(Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: - """Construct a new tokenizer using the SpacyTokenizer framework.""" - super(SpacyTokenizer, self).__init__(component_config) - def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: From 95fe8da931fe7e9f17ab09971240d25c03fe593e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:06:46 +0200 Subject: [PATCH 011/239] review comment --- rasa/nlu/tokenizers/spacy_tokenizer.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index ffbeff7c2efc..432f283af1ce 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -40,7 +40,8 @@ def train( if attribute_doc is not None: example.set( - MESSAGE_TOKENS_NAMES[attribute], self.tokenize(attribute_doc) + MESSAGE_TOKENS_NAMES[attribute], + self.tokenize(attribute_doc, attribute), ) def get_doc(self, message: Message, attribute: Text) -> "Doc": @@ -49,10 +50,12 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc": def process(self, message: Message, **kwargs: Any) -> None: message.set( MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], - self.tokenize(self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE)), + self.tokenize( + self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE), MESSAGE_TEXT_ATTRIBUTE + ), ) - def tokenize(self, doc: "Doc") -> List[Token]: + def tokenize(self, doc: "Doc", attribute: Text) -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] - self.add_cls_token(tokens) + self.add_cls_token(tokens, attribute) return tokens From 3b51563dc49830f4e5f9a09ebd823c5f7eb563ef Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:26:57 +0200 Subject: [PATCH 012/239] Add use_cls_token to default dict. 
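
This change makes the shared `Tokenizer` base class fail loudly when a concrete tokenizer does not declare `use_cls_token` in its `defaults` dict. Below is a minimal standalone sketch of that pattern; the classes are simplified stand-ins (the config merge mimics, but is not, the real rasa `Component.__init__`):

from typing import Any, Dict, Optional, Text


class Tokenizer:
    # each concrete tokenizer is expected to declare this key in `defaults`
    defaults: Dict[Text, Any] = {}

    def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None:
        # simplified stand-in for Component.__init__: class defaults overlaid
        # with the pipeline-provided config
        self.component_config = {**self.defaults, **(component_config or {})}
        try:
            self.use_cls_token = self.component_config["use_cls_token"]
        except KeyError:
            raise KeyError(
                "No default value for 'use_cls_token' was set. Please, "
                "add it to the default dict of the tokenizer."
            )


class GoodTokenizer(Tokenizer):
    defaults = {"use_cls_token": True}


class ForgetfulTokenizer(Tokenizer):
    defaults = {}  # missing 'use_cls_token' -> KeyError on construction


GoodTokenizer()  # constructs fine
try:
    ForgetfulTokenizer()
except KeyError as error:
    print(error)  # surfaces the misconfiguration at construction time
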
--- rasa/nlu/tokenizers/jieba_tokenizer.py | 2 ++ rasa/nlu/tokenizers/mitie_tokenizer.py | 5 +++++ rasa/nlu/tokenizers/spacy_tokenizer.py | 5 +++++ rasa/nlu/tokenizers/tokenizer.py | 2 +- rasa/nlu/tokenizers/whitespace_tokenizer.py | 2 ++ 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index c434a5a2e050..dfbbf2cbcb9b 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -36,6 +36,8 @@ class JiebaTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 6b4c6b30abdc..ec5556d5840d 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -16,6 +16,11 @@ class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + defaults = { + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True + } + @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 432f283af1ce..9f061c2b29ec 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -28,6 +28,11 @@ class SpacyTokenizer(Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + defaults = { + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True + } + def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 41e04c844385..1b786590f010 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -29,7 +29,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: if "use_cls_token" in self.component_config: self.use_cls_token = self.component_config["use_cls_token"] else: - self.use_cls_token = False + self.use_cls_token = True def add_cls_token( self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 3641fb909689..9be597b49a9d 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -24,6 +24,8 @@ class WhitespaceTokenizer(Tokenizer): "intent_split_symbol": "_", # Text will be tokenized with case sensitive as default "case_sensitive": True, + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: From 0f165b90aded95f89d252c11802f6c193904bab0 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:57:24 +0200 Subject: [PATCH 013/239] thorw key error if use_cls_token is not set as default value. 
--- rasa/nlu/tokenizers/mitie_tokenizer.py | 3 +- rasa/nlu/tokenizers/spacy_tokenizer.py | 7 +- rasa/nlu/tokenizers/tokenizer.py | 13 +- rasa/nlu/tokenizers/whitespace_tokenizer.py | 1 - tests/nlu/tokenizers/__init__.py | 0 tests/nlu/tokenizers/test_jieba_tokenizer.py | 53 ++++++ tests/nlu/tokenizers/test_mitie_tokenizer.py | 41 +++++ tests/nlu/tokenizers/test_spacy_tokenizer.py | 65 +++++++ .../test_whitespace_tokenizer.py} | 168 ++---------------- 9 files changed, 186 insertions(+), 165 deletions(-) create mode 100644 tests/nlu/tokenizers/__init__.py create mode 100644 tests/nlu/tokenizers/test_jieba_tokenizer.py create mode 100644 tests/nlu/tokenizers/test_mitie_tokenizer.py create mode 100644 tests/nlu/tokenizers/test_spacy_tokenizer.py rename tests/nlu/{base/test_tokenizers.py => tokenizers/test_whitespace_tokenizer.py} (52%) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index ec5556d5840d..60b90938d758 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -1,6 +1,5 @@ -from typing import Any, List, Text, Optional, Dict +from typing import Any, List, Text -from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 9f061c2b29ec..0589af320787 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -1,7 +1,6 @@ import typing -from typing import Any, Dict, Text, List, Optional +from typing import Any, Text, List -from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData @@ -60,7 +59,9 @@ def process(self, message: Message, **kwargs: Any) -> None: ), ) - def tokenize(self, doc: "Doc", attribute: Text) -> List[Token]: + def tokenize( + self, doc: "Doc", attribute: Text = MESSAGE_TEXT_ATTRIBUTE + ) -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 1b786590f010..c1b41ad0bf33 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -1,3 +1,5 @@ +import logging + from typing import Text, List, Optional, Dict, Any from rasa.nlu.components import Component @@ -7,6 +9,8 @@ CLS_TOKEN, ) +logger = logging.getLogger(__name__) + class Token(object): def __init__(self, text, offset, data=None): @@ -26,10 +30,13 @@ class Tokenizer(Component): def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: super(Tokenizer, self).__init__(component_config) - if "use_cls_token" in self.component_config: + try: self.use_cls_token = self.component_config["use_cls_token"] - else: - self.use_cls_token = True + except KeyError: + raise KeyError( + "No default value for 'use_cls_token' was set. Please, " + "add it to the default dict of the tokenizer." 
+ ) def add_cls_token( self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 9be597b49a9d..c129e97c8fd9 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -1,7 +1,6 @@ import re from typing import Any, Dict, List, Text -from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData diff --git a/tests/nlu/tokenizers/__init__.py b/tests/nlu/tokenizers/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/tokenizers/test_jieba_tokenizer.py b/tests/nlu/tokenizers/test_jieba_tokenizer.py new file mode 100644 index 000000000000..7df57c5bfcd1 --- /dev/null +++ b/tests/nlu/tokenizers/test_jieba_tokenizer.py @@ -0,0 +1,53 @@ +from unittest.mock import patch + +from rasa.nlu.constants import CLS_TOKEN + + +def test_jieba(): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + component_config = {"use_cls_token": False} + + tk = JiebaTokenizer(component_config) + + assert [t.text for t in tk.tokenize("我想去吃兰州拉面")] == ["我", "想", "去", "吃", "兰州", "拉面"] + + assert [t.offset for t in tk.tokenize("我想去吃兰州拉面")] == [0, 1, 2, 3, 4, 6] + + assert [t.text for t in tk.tokenize("Micheal你好吗?")] == ["Micheal", "你好", "吗", "?"] + + assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10] + + +def test_jieba_load_dictionary(tmpdir_factory): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath + + component_config = {"dictionary_path": dictionary_path, "use_cls_token": False} + + with patch.object( + JiebaTokenizer, "load_custom_dictionary", return_value=None + ) as mock_method: + tk = JiebaTokenizer(component_config) + tk.tokenize("") + + mock_method.assert_called_once_with(dictionary_path) + + +def test_jieba_add_cls_token(): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + component_config = {"use_cls_token": True} + + tk = JiebaTokenizer(component_config) + + assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [ + "Micheal", + "你好", + "吗", + "?", + CLS_TOKEN, + ] + + assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12] diff --git a/tests/nlu/tokenizers/test_mitie_tokenizer.py b/tests/nlu/tokenizers/test_mitie_tokenizer.py new file mode 100644 index 000000000000..41774fb9a440 --- /dev/null +++ b/tests/nlu/tokenizers/test_mitie_tokenizer.py @@ -0,0 +1,41 @@ +from rasa.nlu.constants import CLS_TOKEN + + +def test_mitie(): + from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer + + component_config = {"use_cls_token": False} + + tk = MitieTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(text)] == ["Forecast", "for", "lunch"] + assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13] + + text = "hey ńöñàśçií how're you?" 
+ assert [t.text for t in tk.tokenize(text)] == [ + "hey", + "ńöñàśçií", + "how", + "'re", + "you", + "?", + ] + assert [t.offset for t in tk.tokenize(text)] == [0, 4, 13, 16, 20, 23] + + +def test_mitie_add_cls_token(): + from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer + + component_config = {"use_cls_token": True} + + tk = MitieTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(text)] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13, 19] diff --git a/tests/nlu/tokenizers/test_spacy_tokenizer.py b/tests/nlu/tokenizers/test_spacy_tokenizer.py new file mode 100644 index 000000000000..9748f4fd8fcc --- /dev/null +++ b/tests/nlu/tokenizers/test_spacy_tokenizer.py @@ -0,0 +1,65 @@ +from rasa.nlu.constants import CLS_TOKEN +from rasa.nlu import training_data + + +def test_spacy(spacy_nlp): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"use_cls_token": False} + + tk = SpacyTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "Forecast", + "for", + "lunch", + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13] + + text = "hey ńöñàśçií how're you?" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "hey", + "ńöñàśçií", + "how", + "'re", + "you", + "?", + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23] + + +def test_spacy_add_cls_token(spacy_nlp): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"use_cls_token": True} + + tk = SpacyTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19] + + +def test_spacy_intent_tokenizer(spacy_nlp_component): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"use_cls_token": False} + + td = training_data.load_data("data/examples/rasa/demo-rasa.json") + spacy_nlp_component.train(td, config=None) + spacy_tokenizer = SpacyTokenizer(component_config) + spacy_tokenizer.train(td, config=None) + + intent_tokens_exist = [ + True if example.get("intent_tokens") is not None else False + for example in td.intent_examples + ] + + # no intent tokens should have been set + assert not any(intent_tokens_exist) diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py similarity index 52% rename from tests/nlu/base/test_tokenizers.py rename to tests/nlu/tokenizers/test_whitespace_tokenizer.py index 267f24b81e6f..27c2c6b171f6 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -1,17 +1,14 @@ -# -*- coding: utf-8 -*- - -from unittest.mock import patch - from rasa.nlu.constants import CLS_TOKEN from rasa.nlu.training_data import TrainingData, Message from tests.nlu import utilities -from rasa.nlu import training_data def test_whitespace(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - tk = WhitespaceTokenizer() + component_config = {"use_cls_token": False} + + tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for lunch")] == [ "Forecast", @@ -98,7 +95,11 @@ def test_whitespace_cls_token(): def test_whitespace_custom_intent_symbol(): from 
rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"} + component_config = { + "intent_tokenization_flag": True, + "intent_split_symbol": "+", + "use_cls_token": False, + } tk = WhitespaceTokenizer(component_config) @@ -116,7 +117,7 @@ def test_whitespace_custom_intent_symbol(): def test_whitespace_with_case(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"case_sensitive": False} + component_config = {"case_sensitive": False, "use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ "forecast", @@ -124,7 +125,7 @@ def test_whitespace_with_case(): "lunch", ] - component_config = {"case_sensitive": True} + component_config = {"case_sensitive": True, "use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ "Forecast", @@ -132,7 +133,7 @@ def test_whitespace_with_case(): "LUNCH", ] - component_config = {} + component_config = {"use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ "Forecast", @@ -140,7 +141,7 @@ def test_whitespace_with_case(): "LUNCH", ] - component_config = {"case_sensitive": False} + component_config = {"case_sensitive": False, "use_cls_token": False} tk = WhitespaceTokenizer(component_config) message = Message("Forecast for LUNCH") tk.process(message) @@ -181,148 +182,3 @@ def test_whitespace_with_case(): assert examples[1].data.get("tokens")[0].text == "i" assert examples[1].data.get("tokens")[1].text == "want" assert examples[1].data.get("tokens")[2].text == "tacos" - - -def test_spacy(spacy_nlp): - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - - tk = SpacyTokenizer() - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ - "Forecast", - "for", - "lunch", - ] - assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13] - - text = "hey ńöñàśçií how're you?" 
- assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ - "hey", - "ńöñàśçií", - "how", - "'re", - "you", - "?", - ] - assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23] - - -def test_spacy_add_cls_token(spacy_nlp): - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - - component_config = {"use_cls_token": True} - - tk = SpacyTokenizer(component_config) - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ - "Forecast", - "for", - "lunch", - CLS_TOKEN, - ] - assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19] - - -def test_spacy_intent_tokenizer(spacy_nlp_component): - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - - td = training_data.load_data("data/examples/rasa/demo-rasa.json") - spacy_nlp_component.train(td, config=None) - spacy_tokenizer = SpacyTokenizer() - spacy_tokenizer.train(td, config=None) - - intent_tokens_exist = [ - True if example.get("intent_tokens") is not None else False - for example in td.intent_examples - ] - - # no intent tokens should have been set - assert not any(intent_tokens_exist) - - -def test_mitie(): - from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer - - tk = MitieTokenizer() - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(text)] == ["Forecast", "for", "lunch"] - assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13] - - text = "hey ńöñàśçií how're you?" - assert [t.text for t in tk.tokenize(text)] == [ - "hey", - "ńöñàśçií", - "how", - "'re", - "you", - "?", - ] - assert [t.offset for t in tk.tokenize(text)] == [0, 4, 13, 16, 20, 23] - - -def test_mitie_add_cls_token(): - from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer - - component_config = {"use_cls_token": True} - - tk = MitieTokenizer(component_config) - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(text)] == [ - "Forecast", - "for", - "lunch", - CLS_TOKEN, - ] - assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13, 19] - - -def test_jieba(): - from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - - tk = JiebaTokenizer() - - assert [t.text for t in tk.tokenize("我想去吃兰州拉面")] == ["我", "想", "去", "吃", "兰州", "拉面"] - - assert [t.offset for t in tk.tokenize("我想去吃兰州拉面")] == [0, 1, 2, 3, 4, 6] - - assert [t.text for t in tk.tokenize("Micheal你好吗?")] == ["Micheal", "你好", "吗", "?"] - - assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10] - - -def test_jieba_load_dictionary(tmpdir_factory): - from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - - dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath - - component_config = {"dictionary_path": dictionary_path} - - with patch.object( - JiebaTokenizer, "load_custom_dictionary", return_value=None - ) as mock_method: - tk = JiebaTokenizer(component_config) - tk.tokenize("") - - mock_method.assert_called_once_with(dictionary_path) - - -def test_jieba_add_cls_token(): - from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - - component_config = {"use_cls_token": True} - - tk = JiebaTokenizer(component_config) - - assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [ - "Micheal", - "你好", - "吗", - "?", - CLS_TOKEN, - ] - - assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12] From 174b8afc44061fb801a0463417fefa88aa86b7c7 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 15:46:08 +0200 Subject: [PATCH 014/239] Divide featurizers into sparse and dense. 
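This commit splits rasa/nlu/featurizers/ into dense_featurizer/ (MitieFeaturizer, SpacyFeaturizer) and sparse_featurizer/ (CountVectorsFeaturizer, NGramFeaturizer, RegexFeaturizer) sub-packages and moves the shared Featurizer base class into featurzier.py (filename spelled as committed). A minimal sketch of the resulting import paths, taken from the file renames and the intra-package imports in the diff below; note that registry.py and the tests in this diff import these modules without the leading `rasa.` package (`from nlu.featurizers ...`), which presumably only resolves when the rasa/ directory itself is on the import path:

    from rasa.nlu.featurizers.featurzier import Featurizer
    from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer
    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
    from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer
    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )
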
--- rasa/nlu/featurizers/__init__.py | 21 -------- .../featurizers/dense_featurizer/__init__.py | 0 .../mitie_featurizer.py | 2 +- .../spacy_featurizer.py | 2 +- rasa/nlu/featurizers/featurzier.py | 21 ++++++++ .../featurizers/sparse_featurizer/__init__.py | 0 .../count_vectors_featurizer.py | 2 +- .../ngram_featurizer.py | 2 +- .../regex_featurizer.py | 2 +- rasa/nlu/registry.py | 12 +++-- tests/nlu/base/test_components.py | 2 +- tests/nlu/base/test_featurizers.py | 50 ++++++++++++------- 12 files changed, 67 insertions(+), 49 deletions(-) create mode 100644 rasa/nlu/featurizers/dense_featurizer/__init__.py rename rasa/nlu/featurizers/{ => dense_featurizer}/mitie_featurizer.py (98%) rename rasa/nlu/featurizers/{ => dense_featurizer}/spacy_featurizer.py (98%) create mode 100644 rasa/nlu/featurizers/featurzier.py create mode 100644 rasa/nlu/featurizers/sparse_featurizer/__init__.py rename rasa/nlu/featurizers/{ => sparse_featurizer}/count_vectors_featurizer.py (99%) rename rasa/nlu/featurizers/{ => sparse_featurizer}/ngram_featurizer.py (99%) rename rasa/nlu/featurizers/{ => sparse_featurizer}/regex_featurizer.py (99%) diff --git a/rasa/nlu/featurizers/__init__.py b/rasa/nlu/featurizers/__init__.py index cb7215b1f2ec..e69de29bb2d1 100644 --- a/rasa/nlu/featurizers/__init__.py +++ b/rasa/nlu/featurizers/__init__.py @@ -1,21 +0,0 @@ -import numpy as np - -from typing import Any, Text -from rasa.nlu.training_data import Message -from rasa.nlu.components import Component -from rasa.nlu.constants import MESSAGE_VECTOR_FEATURE_NAMES, MESSAGE_TEXT_ATTRIBUTE - - -class Featurizer(Component): - @staticmethod - def _combine_with_existing_features( - message: Message, - additional_features: Any, - feature_name: Text = MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], - ) -> Any: - if message.get(feature_name) is not None: - return np.concatenate( - (message.get(feature_name), additional_features), axis=-1 - ) - else: - return additional_features diff --git a/rasa/nlu/featurizers/dense_featurizer/__init__.py b/rasa/nlu/featurizers/dense_featurizer/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rasa/nlu/featurizers/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py similarity index 98% rename from rasa/nlu/featurizers/mitie_featurizer.py rename to rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 1eda72fe2112..1c3e440681c9 100644 --- a/rasa/nlu/featurizers/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -3,7 +3,7 @@ from typing import Any, List, Text from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.featurizers import Featurizer +from rasa.nlu.featurizers.featurzier import Featurizer from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData diff --git a/rasa/nlu/featurizers/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py similarity index 98% rename from rasa/nlu/featurizers/spacy_featurizer.py rename to rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 9b772531fc15..1aa83e241930 100644 --- a/rasa/nlu/featurizers/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -3,7 +3,7 @@ from typing import Any from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.featurizers import Featurizer +from rasa.nlu.featurizers.featurzier import Featurizer from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: diff --git 
a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py new file mode 100644 index 000000000000..cb7215b1f2ec --- /dev/null +++ b/rasa/nlu/featurizers/featurzier.py @@ -0,0 +1,21 @@ +import numpy as np + +from typing import Any, Text +from rasa.nlu.training_data import Message +from rasa.nlu.components import Component +from rasa.nlu.constants import MESSAGE_VECTOR_FEATURE_NAMES, MESSAGE_TEXT_ATTRIBUTE + + +class Featurizer(Component): + @staticmethod + def _combine_with_existing_features( + message: Message, + additional_features: Any, + feature_name: Text = MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + ) -> Any: + if message.get(feature_name) is not None: + return np.concatenate( + (message.get(feature_name), additional_features), axis=-1 + ) + else: + return additional_features diff --git a/rasa/nlu/featurizers/sparse_featurizer/__init__.py b/rasa/nlu/featurizers/sparse_featurizer/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/rasa/nlu/featurizers/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py similarity index 99% rename from rasa/nlu/featurizers/count_vectors_featurizer.py rename to rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 934a2b5c8dc3..b53aba5cab29 100644 --- a/rasa/nlu/featurizers/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -7,7 +7,7 @@ from sklearn.feature_extraction.text import CountVectorizer from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.featurizers import Featurizer +from rasa.nlu.featurizers.featurzier import Featurizer from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData diff --git a/rasa/nlu/featurizers/ngram_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py similarity index 99% rename from rasa/nlu/featurizers/ngram_featurizer.py rename to rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py index 8ed675bb09cb..2412f4c9c7b3 100644 --- a/rasa/nlu/featurizers/ngram_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py @@ -11,7 +11,7 @@ from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.featurizers import Featurizer +from rasa.nlu.featurizers.featurzier import Featurizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.utils import write_json_to_file import rasa.utils.io diff --git a/rasa/nlu/featurizers/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py similarity index 99% rename from rasa/nlu/featurizers/regex_featurizer.py rename to rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index efdf7516a703..f146365924d3 100644 --- a/rasa/nlu/featurizers/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -8,7 +8,7 @@ from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.featurizers import Featurizer +from rasa.nlu.featurizers.featurzier import Featurizer from rasa.nlu.training_data import Message, TrainingData import rasa.utils.io from rasa.nlu.constants import ( diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 06643b5ff459..c7f904941d83 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -18,11 +18,13 @@ from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor from 
rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor -from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer -from rasa.nlu.featurizers.mitie_featurizer import MitieFeaturizer -from rasa.nlu.featurizers.ngram_featurizer import NGramFeaturizer -from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer -from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer +from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, +) +from nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer +from nlu.featurizers.sparse_featurizer.ngram_featurizer import NGramFeaturizer +from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer +from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer from rasa.nlu.model import Metadata from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer diff --git a/tests/nlu/base/test_components.py b/tests/nlu/base/test_components.py index c88327d049ec..ecbe453bed39 100644 --- a/tests/nlu/base/test_components.py +++ b/tests/nlu/base/test_components.py @@ -55,7 +55,7 @@ def test_find_unavailable_packages(): def test_builder_create_by_module_path(component_builder, default_config): - from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer + from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer path = "rasa.nlu.featurizers.regex_featurizer.RegexFeaturizer" component_config = {"name": path} diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/base/test_featurizers.py index cd0c8ce3c13a..ad2967cb15d9 100644 --- a/tests/nlu/base/test_featurizers.py +++ b/tests/nlu/base/test_featurizers.py @@ -21,7 +21,7 @@ ], ) def test_spacy_featurizer(sentence, expected, spacy_nlp): - from rasa.nlu.featurizers import spacy_featurizer + from nlu.featurizers.dense_featurizer import spacy_featurizer doc = spacy_nlp(sentence) vecs = spacy_featurizer.features_for_doc(doc) @@ -55,7 +55,7 @@ def test_spacy_training_sample_alignment(spacy_nlp_component): def test_spacy_intent_featurizer(spacy_nlp_component): - from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer + from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer td = training_data.load_data("data/examples/rasa/demo-rasa.json") spacy_nlp_component.train(td, config=None) @@ -78,7 +78,7 @@ def test_spacy_intent_featurizer(spacy_nlp_component): [("hey how are you today", [-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])], ) def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): - from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer + from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer doc = spacy_nlp(sentence) token_vectors = [t.vector for t in doc] @@ -95,7 +95,7 @@ def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): def test_spacy_ner_featurizer_config(spacy_nlp): - from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer + from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer sentence = "hi there friend" doc = spacy_nlp(sentence) @@ -112,7 +112,7 @@ def test_spacy_ner_featurizer_config(spacy_nlp): def test_mitie_featurizer(mitie_feature_extractor, default_config): - from rasa.nlu.featurizers.mitie_featurizer import MitieFeaturizer + from nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer mitie_component_config = {"name": "MitieFeaturizer"} 
ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) @@ -124,7 +124,7 @@ def test_mitie_featurizer(mitie_feature_extractor, default_config): def test_ngram_featurizer(spacy_nlp): - from rasa.nlu.featurizers.ngram_featurizer import NGramFeaturizer + from nlu.featurizers.sparse_featurizer.ngram_featurizer import NGramFeaturizer ftr = NGramFeaturizer({"max_number_of_ngrams": 10}) @@ -163,7 +163,7 @@ def test_ngram_featurizer(spacy_nlp): ], ) def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): - from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer + from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer patterns = [ {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, @@ -200,7 +200,7 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): ], ) def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): - from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer + from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer lookups = [ { @@ -230,7 +230,7 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): def test_spacy_featurizer_casing(spacy_nlp): - from rasa.nlu.featurizers import spacy_featurizer + from nlu.featurizers.dense_featurizer import spacy_featurizer # if this starts failing for the default model, we should think about # removing the lower casing the spacy nlp component does when it @@ -262,7 +262,9 @@ def test_spacy_featurizer_casing(spacy_nlp): ], ) def test_count_vector_featurizer(sentence, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer + from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, + ) ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) train_message = Message(sentence) @@ -288,7 +290,9 @@ def test_count_vector_featurizer(sentence, expected): def test_count_vector_featurizer_attribute_featurization( sentence, intent, response, intent_features, response_features ): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer + from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, + ) ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) train_message = Message(sentence) @@ -321,7 +325,9 @@ def test_count_vector_featurizer_attribute_featurization( def test_count_vector_featurizer_shared_vocab( sentence, intent, response, text_features, intent_features, response_features ): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer + from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, + ) ftr = CountVectorsFeaturizer( {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True} @@ -350,7 +356,9 @@ def test_count_vector_featurizer_shared_vocab( ], ) def test_count_vector_featurizer_oov_token(sentence, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer + from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, + ) ftr = CountVectorsFeaturizer( {"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"} @@ -377,7 +385,9 @@ def test_count_vector_featurizer_oov_token(sentence, expected): ], ) def test_count_vector_featurizer_oov_words(sentence, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer + from 
nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, + ) ftr = CountVectorsFeaturizer( { @@ -411,7 +421,9 @@ def test_count_vector_featurizer_oov_words(sentence, expected): ], ) def test_count_vector_featurizer_using_tokens(tokens, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer + from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, + ) ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) @@ -446,7 +458,9 @@ def test_count_vector_featurizer_using_tokens(tokens, expected): ], ) def test_count_vector_featurizer_char(sentence, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer + from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, + ) ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}) train_message = Message(sentence) @@ -462,7 +476,9 @@ def test_count_vector_featurizer_char(sentence, expected): def test_count_vector_featurizer_persist_load(tmpdir): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer + from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, + ) # set non default values to config config = { From e97acf497cfcc91d12782763e11be67727bea840 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 15:59:18 +0200 Subject: [PATCH 015/239] update count vectors featurizer --- .../count_vectors_featurizer.py | 82 +++++++++++-------- tests/nlu/featurizers/__init__.py | 0 .../{base => featurizers}/test_featurizers.py | 0 3 files changed, 46 insertions(+), 36 deletions(-) create mode 100644 tests/nlu/featurizers/__init__.py rename tests/nlu/{base => featurizers}/test_featurizers.py (100%) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index b53aba5cab29..43290e7b1ebc 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -1,8 +1,7 @@ import logging import os import re -from typing import Any, Dict, List, Optional, Text, Union -import numpy as np +from typing import Any, Dict, List, Optional, Text from sklearn.feature_extraction.text import CountVectorizer from rasa.nlu import utils @@ -14,8 +13,6 @@ logger = logging.getLogger(__name__) from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, @@ -26,9 +23,9 @@ class CountVectorsFeaturizer(Featurizer): - """Bag of words featurizer + """Bag of words featurizer. - Creates bag-of-words representation of intent features + Creates bag-of-words representation of features using sklearn's `CountVectorizer`. All tokens which consist only of digits (e.g. 123 and 99 but not ab12d) will be represented by a single feature. @@ -36,6 +33,8 @@ class CountVectorsFeaturizer(Featurizer): Set `analyzer` to 'char_wb' to use the idea of Subword Semantic Hashing from https://arxiv.org/abs/1810.07150. + + The featurizer returns a sequence. 
""" provides = [ @@ -219,7 +218,8 @@ def _get_message_text_by_attribute( """Get processed text of attribute of a message""" if message.get(attribute) is None: - # return empty string since sklearn countvectorizer does not like None object while training and predicting + # return empty string since sklearn countvectorizer does not like None + # object while training and predicting return "" tokens = self._get_message_tokens_by_attribute(message, attribute) @@ -304,13 +304,10 @@ def _check_OOV_present(self, examples): ) def _set_attribute_features( - self, - attribute: Text, - attribute_features: np.ndarray, - training_data: "TrainingData", + self, attribute: Text, attribute_features: List, training_data: "TrainingData" ): """Set computed features of the attribute to corresponding message objects""" - for i, example in enumerate(training_data.intent_examples): + for i, example in enumerate(training_data.training_examples): # create bag for each example example.set( MESSAGE_VECTOR_FEATURE_NAMES[attribute], @@ -330,7 +327,7 @@ def _get_all_attributes_processed_texts( for attribute in MESSAGE_ATTRIBUTES: attribute_texts = [ self._get_message_text_by_attribute(example, attribute) - for example in training_data.intent_examples + for example in training_data.training_examples ] self._check_OOV_present(attribute_texts) processed_attribute_texts[attribute] = attribute_texts @@ -363,7 +360,6 @@ def create_shared_vocab_vectorizers( analyzer=analyzer, vocabulary=vocabulary, ) - attribute_vectorizers = {} for attribute in MESSAGE_ATTRIBUTES: @@ -470,18 +466,31 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) def _get_featurized_attribute( self, attribute: Text, attribute_texts: List[Text] - ) -> Optional[np.ndarray]: + ) -> Optional[List]: """Return features of a particular attribute for complete data""" if self._check_attribute_vocabulary(attribute): # count vectorizer was trained - featurized_attributes = ( - self.vectorizers[attribute].transform(attribute_texts).toarray() - ) - return featurized_attributes + return self._create_sequence(attribute, attribute_texts) else: return None + @staticmethod + def _get_text_sequence(text): + return text.split() + + def _create_sequence(self, attribute: Text, attribute_texts: List[Text]) -> List: + texts = [self._get_text_sequence(text) for text in attribute_texts] + + X = [] + + for i, tokens in enumerate(texts): + x = self.vectorizers[attribute].transform(tokens) + x.sort_indices() + X.append(x) + + return X + def train( self, training_data: TrainingData, cfg: RasaNLUModelConfig = None, **kwargs: Any ) -> None: @@ -528,23 +537,20 @@ def process(self, message: Message, **kwargs: Any) -> None: "component is either not trained or " "didn't receive enough training data" ) - else: - message_text = self._get_message_text_by_attribute( - message, attribute=MESSAGE_TEXT_ATTRIBUTE - ) + return + + attribute = MESSAGE_TEXT_ATTRIBUTE + message_text = self._get_message_text_by_attribute(message, attribute=attribute) + + if self._check_attribute_vocabulary(attribute): + features = self._create_sequence(attribute, [message_text]) - bag = ( - self.vectorizers[MESSAGE_TEXT_ATTRIBUTE] - .transform([message_text]) - .toarray() - .squeeze() - ) message.set( - MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + MESSAGE_VECTOR_FEATURE_NAMES[attribute], self._combine_with_existing_features( message, - bag, - feature_name=MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + features[0], + feature_name=MESSAGE_VECTOR_FEATURE_NAMES[attribute], 
), ) @@ -571,11 +577,12 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]] if self.use_shared_vocab: # Only persist vocabulary from one attribute. Can be loaded and distributed to all attributes. - utils.json_pickle( - featurizer_file, attribute_vocabularies[MESSAGE_TEXT_ATTRIBUTE] - ) + vocab = attribute_vocabularies[MESSAGE_TEXT_ATTRIBUTE] else: - utils.json_pickle(featurizer_file, attribute_vocabularies) + vocab = attribute_vocabularies + + utils.json_pickle(featurizer_file, vocab) + return {"file": file_name} @classmethod @@ -623,6 +630,9 @@ def load( vocabulary=vocabulary, ) + for v in vectorizers.values(): + v.vocabulary_ = v.vocabulary + return cls(meta, vectorizers) else: return cls(meta) diff --git a/tests/nlu/featurizers/__init__.py b/tests/nlu/featurizers/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/featurizers/test_featurizers.py similarity index 100% rename from tests/nlu/base/test_featurizers.py rename to tests/nlu/featurizers/test_featurizers.py From fb7277bf5be94e53bd7c820f2d5c5a2f7025eaa3 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:21:55 +0200 Subject: [PATCH 016/239] add deprecation warning to ngram featurizer --- .../sparse_featurizer/ngram_featurizer.py | 426 +----------------- 1 file changed, 5 insertions(+), 421 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py index 2412f4c9c7b3..e97c5b01fdf6 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py @@ -1,433 +1,17 @@ -import time -from collections import Counter - import logging -import numpy as np -import os -import typing -import warnings -from string import punctuation + from typing import Any, Dict, List, Optional, Text -from rasa.nlu import utils -from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.featurzier import Featurizer -from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.utils import write_json_to_file -import rasa.utils.io -from rasa.nlu.constants import ( - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_FEATURE_NAMES, -) logger = logging.getLogger(__name__) -if typing.TYPE_CHECKING: - from rasa.nlu.model import Metadata - class NGramFeaturizer(Featurizer): - - provides = [MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] - - requires = [MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE]] - - defaults = { - # defines the maximum number of ngrams to collect and add - # to the featurization of a sentence - "max_number_of_ngrams": 10, - # the minimal length in characters of an ngram to be eligible - "ngram_min_length": 3, - # the maximal length in characters of an ngram to be eligible - "ngram_max_length": 17, - # the minimal number of times an ngram needs to occur in the - # training data to be considered as a feature - "ngram_min_occurrences": 5, - # during cross validation (used to detect which ngrams are most - # valuable) every intent with fever examples than this config - # value will be excluded - "min_intent_examples": 4, - } - - def __init__( - self, - component_config: Optional[Dict[Text, Any]] = None, - all_ngrams: Optional[List[Text]] = None, - best_num_ngrams: Optional[int] = None, - ): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None): super(NGramFeaturizer, 
self).__init__(component_config) - self.best_num_ngrams = best_num_ngrams - self.all_ngrams = all_ngrams - - @classmethod - def required_packages(cls) -> List[Text]: - return ["spacy", "sklearn"] - - def train( - self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any - ): - - start = time.time() - self.train_on_sentences(training_data.intent_examples) - logger.debug("Ngram collection took {} seconds".format(time.time() - start)) - - for example in training_data.training_examples: - updated = self._text_features_with_ngrams(example, self.best_num_ngrams) - example.set(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], updated) - - def process(self, message: Message, **kwargs: Any): - - updated = self._text_features_with_ngrams(message, self.best_num_ngrams) - message.set(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], updated) - - def _text_features_with_ngrams(self, message, max_ngrams): - - ngrams_to_use = self._ngrams_to_use(max_ngrams) - - if ngrams_to_use is not None: - extras = np.array(self._ngrams_in_sentence(message, ngrams_to_use)) - return self._combine_with_existing_features(message, extras) - else: - return message.get(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - - @classmethod - def load( - cls, - meta: Dict[Text, Any], - model_dir: Optional[Text] = None, - model_metadata: Optional["Metadata"] = None, - cached_component: Optional["NGramFeaturizer"] = None, - **kwargs: Any - ) -> "NGramFeaturizer": - - file_name = meta.get("file") - featurizer_file = os.path.join(model_dir, file_name) - - if os.path.exists(featurizer_file): - data = rasa.utils.io.read_json_file(featurizer_file) - return NGramFeaturizer(meta, data["all_ngrams"], data["best_num_ngrams"]) - else: - return NGramFeaturizer(meta) - - def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: - """Persist this model into the passed directory.""" - - file_name = file_name + ".json" - featurizer_file = os.path.join(model_dir, file_name) - data = {"all_ngrams": self.all_ngrams, "best_num_ngrams": self.best_num_ngrams} - - write_json_to_file(featurizer_file, data, separators=(",", ": ")) - - return {"file": file_name} - - def train_on_sentences(self, examples): - labels = [e.get("intent") for e in examples] - self.all_ngrams = self._get_best_ngrams(examples, labels) - self.best_num_ngrams = self._cross_validation(examples, labels) - - def _ngrams_to_use(self, num_ngrams): - if num_ngrams == 0 or self.all_ngrams is None: - return [] - elif num_ngrams is not None: - return self.all_ngrams[:num_ngrams] - else: - return self.all_ngrams - - def _get_best_ngrams(self, examples, labels): - """Return an ordered list of the best character ngrams.""" - - oov_strings = self._remove_in_vocab_words(examples) - ngrams = self._generate_all_ngrams( - oov_strings, self.component_config["ngram_min_length"] - ) - return self._sort_applicable_ngrams(ngrams, examples, labels) - - def _remove_in_vocab_words(self, examples): - """Automatically removes words with digits in them, that may be a - hyperlink or that _are_ in vocabulary for the nlp.""" - - new_sents = [] - for example in examples: - new_sents.append(self._remove_in_vocab_words_from_sentence(example)) - return new_sents - - @staticmethod - def _is_ngram_worthy(token): - """Decide if we should use this token for ngram counting. 
- - Excludes every word with digits in them, hyperlinks or - an assigned word vector.""" - return ( - not token.has_vector - and not token.like_url - and not token.like_num - and not token.like_email - and not token.is_punct - ) - - def _remove_in_vocab_words_from_sentence(self, example): - """Filter for words that do not have a word vector.""" - - cleaned_tokens = [ - token - for token in example.get( - MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) - if self._is_ngram_worthy(token) - ] - - # keep only out-of-vocab 'non_word' words - non_words = " ".join([t.text for t in cleaned_tokens]) - - # remove digits and extra spaces - non_words = "".join([letter for letter in non_words if not letter.isdigit()]) - non_words = " ".join([word for word in non_words.split(" ") if word != ""]) - - # add cleaned sentence to list of these sentences - return non_words - - def _intents_with_enough_examples(self, labels, examples): - """Filter examples where we do not have a min number of examples.""" - - min_intent_examples = self.component_config["min_intent_examples"] - usable_labels = [] - - for label in np.unique(labels): - lab_sents = np.array(examples)[np.array(labels) == label] - if len(lab_sents) < min_intent_examples: - continue - usable_labels.append(label) - - return usable_labels - - def _rank_ngrams_using_cv(self, examples, labels, list_of_ngrams): - from sklearn import linear_model - - X = np.array(self._ngrams_in_sentences(examples, list_of_ngrams)) - y = self.encode_labels(labels) - - clf = linear_model.RandomizedLogisticRegression(C=1) - clf.fit(X, y) - - # sort the ngrams according to the classification score - scores = clf.scores_ - sorted_idxs = sorted(enumerate(scores), key=lambda x: -1 * x[1]) - sorted_ngrams = [list_of_ngrams[i[0]] for i in sorted_idxs] - - return sorted_ngrams - - def _sort_applicable_ngrams(self, ngrams_list, examples, labels): - """Given an intent classification problem and a list of ngrams, - - creates ordered list of most useful ngrams.""" - - if not ngrams_list: - return [] - - # make sure we have enough labeled instances for cv - usable_labels = self._intents_with_enough_examples(labels, examples) - - mask = [label in usable_labels for label in labels] - if any(mask) and len(usable_labels) >= 2: - try: - examples = np.array(examples)[mask] - labels = np.array(labels)[mask] - - return self._rank_ngrams_using_cv(examples, labels, ngrams_list) - except ValueError as e: - if "needs samples of at least 2 classes" in str(e): - # we got unlucky during the random - # sampling :( and selected a slice that - # only contains one class - return [] - else: - raise e - else: - # there is no example we can use for the cross validation - return [] - - def _ngrams_in_sentences(self, examples, ngrams): - """Given a set of sentences, returns a feature vector for each sentence. - - The first $k$ elements are from the `intent_features`, - the rest are {1,0} elements denoting whether an ngram is in sentence.""" - - all_vectors = [] - for example in examples: - presence_vector = self._ngrams_in_sentence(example, ngrams) - all_vectors.append(presence_vector) - return all_vectors - - def _ngrams_in_sentence(self, example, ngrams): - """Given a set of sentences, return a vector indicating ngram presence. 
- - The vector will return 1 entries if the corresponding ngram is - present in the sentence and 0 if it is not.""" - - cleaned_sentence = self._remove_in_vocab_words_from_sentence(example) - presence_vector = np.zeros(len(ngrams)) - idx_array = [ - idx for idx in range(len(ngrams)) if ngrams[idx] in cleaned_sentence - ] - presence_vector[idx_array] = 1 - return presence_vector - - def _generate_all_ngrams(self, list_of_strings, ngram_min_length): - """Takes a list of strings and generates all character ngrams. - - Generated ngrams are at least 3 characters (and at most 17), - occur at least 5 times and occur independently of longer - superset ngrams at least once.""" - - features = {} - counters = {ngram_min_length - 1: Counter()} - max_length = self.component_config["ngram_max_length"] - - for n in range(ngram_min_length, max_length): - candidates = [] - features[n] = [] - counters[n] = Counter() - - # generate all possible n length ngrams - for text in list_of_strings: - text = text.replace(punctuation, " ") - for word in text.lower().split(" "): - cands = [word[i : i + n] for i in range(len(word) - n)] - for cand in cands: - counters[n][cand] += 1 - if cand not in candidates: - candidates.append(cand) - - min_count = self.component_config["ngram_min_occurrences"] - # iterate over these candidates picking only the applicable ones - for can in candidates: - if counters[n][can] >= min_count: - features[n].append(can) - begin = can[:-1] - end = can[1:] - if n >= ngram_min_length: - if ( - counters[n - 1][begin] == counters[n][can] - and begin in features[n - 1] - ): - features[n - 1].remove(begin) - if ( - counters[n - 1][end] == counters[n][can] - and end in features[n - 1] - ): - features[n - 1].remove(end) - - return [item for sublist in list(features.values()) for item in sublist] - - @staticmethod - def _collect_features(examples): - if examples: - collected_features = [ - e.get(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - for e in examples - if e.get(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - is not None - ] - else: - collected_features = [] - - if collected_features: - return np.stack(collected_features) - else: - return None - - def _append_ngram_features(self, examples, existing_features, max_ngrams): - ngrams_to_use = self._ngrams_to_use(max_ngrams) - extras = np.array(self._ngrams_in_sentences(examples, ngrams_to_use)) - if existing_features is not None: - return np.hstack((existing_features, extras)) - else: - return extras - - @staticmethod - def _num_cv_splits(y): - return min(10, np.min(np.bincount(y))) if y.size > 0 else 0 - - @staticmethod - def encode_labels(labels): - from sklearn import preprocessing - - intent_encoder = preprocessing.LabelEncoder() - intent_encoder.fit(labels) - return intent_encoder.transform(labels) - - def _score_ngram_selection( - self, examples, y, existing_text_features, cv_splits, max_ngrams - ): - from sklearn.model_selection import cross_val_score - from sklearn.linear_model import LogisticRegression - - if existing_text_features is None: - return 0.0 - - clf = LogisticRegression(class_weight="balanced") - - no_ngrams_X = self._append_ngram_features( - examples, existing_text_features, max_ngrams + logger.warning( + "DEPRECATION warning: Using `NGramFeaturizer` is deprecated. " + "Please use `CountVectorsFeaturizer`." ) - return np.mean(cross_val_score(clf, no_ngrams_X, y, cv=cv_splits)) - - @staticmethod - def _generate_test_points(max_ngrams): - """Generate a list of increasing numbers. 
- - They are used to take the best n ngrams and evaluate them. This n - is varied to find the best number of ngrams to use. This function - defines the number of ngrams that get tested.""" - - possible_ngrams = np.linspace(0, max_ngrams, 8) - return np.unique(list(map(int, np.floor(possible_ngrams)))) - - def _cross_validation(self, examples, labels) -> int: - """Choose the best number of ngrams to include in bow. - - Given an intent classification problem and a set of ordered ngrams - (ordered in terms of importance by pick_applicable_ngrams) we - choose the best number of ngrams to include in our bow vecs - by cross validation.""" - - max_ngrams = self.component_config["max_number_of_ngrams"] - - if not self.all_ngrams: - logger.debug("Found no ngrams. Using existing features.") - return 0 - - existing_text_features = self._collect_features(examples) - - y = self.encode_labels(labels) - cv_splits = self._num_cv_splits(y) - - if cv_splits >= 3: - logger.debug( - "Started ngram cross-validation to find b" - "est number of ngrams to use..." - ) - - scores = [] - num_ngrams = self._generate_test_points(max_ngrams) - for n in num_ngrams: - score = self._score_ngram_selection( - examples, y, existing_text_features, cv_splits, max_ngrams=n - ) - scores.append(score) - logger.debug( - "Evaluating usage of {} ngrams. Score: {}".format(n, score) - ) - - n_top = num_ngrams[np.argmax(scores)] - logger.info("Best score with {} ngrams: {}".format(n_top, np.max(scores))) - return n_top.item() - else: - warnings.warn( - "Can't cross-validate ngram featurizer. " - "There aren't enough examples per intent " - "(at least 3)" - ) - return max_ngrams From c5c9a387ccf6f27b4a63652f321b1587af6d81a0 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:22:31 +0200 Subject: [PATCH 017/239] adapt combine with exising feature method --- .../dense_featurizer/mitie_featurizer.py | 4 ++-- .../dense_featurizer/spacy_featurizer.py | 4 ++-- rasa/nlu/featurizers/featurzier.py | 15 ++++++++++++++- .../sparse_featurizer/count_vectors_featurizer.py | 4 ++-- .../sparse_featurizer/regex_featurizer.py | 2 +- 5 files changed, 21 insertions(+), 8 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 1c3e440681c9..71a8e9be9f2b 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -60,7 +60,7 @@ def train( ) example.set( MESSAGE_VECTOR_FEATURE_NAMES[attribute], - self._combine_with_existing_features( + self._combine_with_existing_dense_features( example, features, MESSAGE_VECTOR_FEATURE_NAMES[attribute] ), ) @@ -74,7 +74,7 @@ def process(self, message: Message, **kwargs: Any) -> None: ) message.set( MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], - self._combine_with_existing_features( + self._combine_with_existing_dense_features( message, features, MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ), ) diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 1aa83e241930..5665401d7341 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -79,7 +79,7 @@ def _set_spacy_ner_features(self, message: Message): ner_features = np.array([t.vector for t in doc]) else: ner_features = np.array([[] for t in doc]) - combined_features = self._combine_with_existing_features( + 
combined_features = self._combine_with_existing_dense_features( message, ner_features, MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_ENTITIES_ATTRIBUTE], @@ -94,7 +94,7 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): message_attribute_doc = self.get_doc(message, attribute) if message_attribute_doc is not None: fs = features_for_doc(message_attribute_doc) - features = self._combine_with_existing_features( + features = self._combine_with_existing_dense_features( message, fs, MESSAGE_VECTOR_FEATURE_NAMES[attribute] ) message.set(MESSAGE_VECTOR_FEATURE_NAMES[attribute], features) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index cb7215b1f2ec..2fa725a629b0 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -8,7 +8,7 @@ class Featurizer(Component): @staticmethod - def _combine_with_existing_features( + def _combine_with_existing_dense_features( message: Message, additional_features: Any, feature_name: Text = MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], @@ -19,3 +19,16 @@ def _combine_with_existing_features( ) else: return additional_features + + @staticmethod + def _combine_with_existing_sparse_features( + message: Message, + additional_features: Any, + feature_name: Text = MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + ) -> Any: + if message.get(feature_name) is not None: + from scipy.sparse import hstack + + return hstack([message.get(feature_name), additional_features]) + else: + return additional_features diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 43290e7b1ebc..78685b1534e4 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -311,7 +311,7 @@ def _set_attribute_features( # create bag for each example example.set( MESSAGE_VECTOR_FEATURE_NAMES[attribute], - self._combine_with_existing_features( + self._combine_with_existing_sparse_features( example, attribute_features[i], MESSAGE_VECTOR_FEATURE_NAMES[attribute], @@ -547,7 +547,7 @@ def process(self, message: Message, **kwargs: Any) -> None: message.set( MESSAGE_VECTOR_FEATURE_NAMES[attribute], - self._combine_with_existing_features( + self._combine_with_existing_sparse_features( message, features[0], feature_name=MESSAGE_VECTOR_FEATURE_NAMES[attribute], diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index f146365924d3..aac0f5dc8029 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -56,7 +56,7 @@ def process(self, message: Message, **kwargs: Any) -> None: def _text_features_with_regex(self, message): if self.known_patterns: extras = self.features_for_patterns(message) - return self._combine_with_existing_features(message, extras) + return self._combine_with_existing_sparse_features(message, extras) else: return message.get(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) From 7e1cd8df600a679fab75ce0ebbc02faa9ed357c7 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 17:21:58 +0200 Subject: [PATCH 018/239] restructure tests --- ...rs.py => test_count_vectors_featurizer.py} | 246 ------------------ .../nlu/featurizers/test_mitie_featurizer.py | 16 ++ .../nlu/featurizers/test_regex_featurizer.py | 86 ++++++ 
.../nlu/featurizers/test_spacy_featurizer.py | 130 +++++++++ 4 files changed, 232 insertions(+), 246 deletions(-) rename tests/nlu/featurizers/{test_featurizers.py => test_count_vectors_featurizer.py} (51%) create mode 100644 tests/nlu/featurizers/test_mitie_featurizer.py create mode 100644 tests/nlu/featurizers/test_regex_featurizer.py create mode 100644 tests/nlu/featurizers/test_spacy_featurizer.py diff --git a/tests/nlu/featurizers/test_featurizers.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py similarity index 51% rename from tests/nlu/featurizers/test_featurizers.py rename to tests/nlu/featurizers/test_count_vectors_featurizer.py index ad2967cb15d9..b5f5e20b2ab6 100644 --- a/tests/nlu/featurizers/test_featurizers.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -1,255 +1,9 @@ -# -*- coding: utf-8 - import numpy as np import pytest -from rasa.nlu import training_data from rasa.nlu.tokenizers.tokenizer import Token -from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer -from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message from rasa.nlu.training_data import TrainingData -from rasa.nlu.config import RasaNLUModelConfig - - -@pytest.mark.parametrize( - "sentence, expected", - [ - ( - "hey how are you today", - [-0.19649599, 0.32493639, -0.37408298, -0.10622784, 0.062756], - ) - ], -) -def test_spacy_featurizer(sentence, expected, spacy_nlp): - from nlu.featurizers.dense_featurizer import spacy_featurizer - - doc = spacy_nlp(sentence) - vecs = spacy_featurizer.features_for_doc(doc) - assert np.allclose(doc.vector[:5], expected, atol=1e-5) - assert np.allclose(vecs, doc.vector, atol=1e-5) - - -def test_spacy_training_sample_alignment(spacy_nlp_component): - from spacy.tokens import Doc - - m1 = Message.build(text="I have a feeling", intent="feeling") - m2 = Message.build(text="", intent="feeling") - m3 = Message.build(text="I am the last message", intent="feeling") - td = TrainingData(training_examples=[m1, m2, m3]) - - attribute_docs = spacy_nlp_component.docs_for_training_data(td) - - assert isinstance(attribute_docs["text"][0], Doc) - assert isinstance(attribute_docs["text"][1], Doc) - assert isinstance(attribute_docs["text"][2], Doc) - - assert [t.text for t in attribute_docs["text"][0]] == ["i", "have", "a", "feeling"] - assert [t.text for t in attribute_docs["text"][1]] == [] - assert [t.text for t in attribute_docs["text"][2]] == [ - "i", - "am", - "the", - "last", - "message", - ] - - -def test_spacy_intent_featurizer(spacy_nlp_component): - from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer - - td = training_data.load_data("data/examples/rasa/demo-rasa.json") - spacy_nlp_component.train(td, config=None) - spacy_featurizer = SpacyFeaturizer() - spacy_featurizer.train(td, config=None) - - intent_features_exist = np.array( - [ - True if example.get("intent_features") is not None else False - for example in td.intent_examples - ] - ) - - # no intent features should have been set - assert not any(intent_features_exist) - - -@pytest.mark.parametrize( - "sentence, expected", - [("hey how are you today", [-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])], -) -def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): - from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer - - doc = spacy_nlp(sentence) - token_vectors = [t.vector for t in doc] - spacy_config = {"ner_feature_vectors": True} - ftr = SpacyFeaturizer.create(spacy_config, 
RasaNLUModelConfig()) - greet = {"intent": "greet", "text_features": [0.5]} - message = Message(sentence, greet) - message.set("spacy_doc", doc) - ftr._set_spacy_features(message) - ftr._set_spacy_ner_features(message) - vecs = message.get("ner_features")[0][:5] - assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) - assert np.allclose(vecs, expected, atol=1e-4) - - -def test_spacy_ner_featurizer_config(spacy_nlp): - from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer - - sentence = "hi there friend" - doc = spacy_nlp(sentence) - spacy_config = {"ner_feature_vectors": False} - ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) - greet = {"intent": "greet", "text_features": [0.5]} - message = Message(sentence, greet) - message.set("spacy_doc", doc) - ftr._set_spacy_features(message) - ftr._set_spacy_ner_features(message) - vecs = np.array(message.get("ner_features")) - assert vecs.shape[0] == len(doc) - assert vecs.shape[1] == 0 - - -def test_mitie_featurizer(mitie_feature_extractor, default_config): - from nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer - - mitie_component_config = {"name": "MitieFeaturizer"} - ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) - sentence = "Hey how are you today" - tokens = MitieTokenizer().tokenize(sentence) - vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor) - expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - assert np.allclose(vecs[:5], expected, atol=1e-5) - - -def test_ngram_featurizer(spacy_nlp): - from nlu.featurizers.sparse_featurizer.ngram_featurizer import NGramFeaturizer - - ftr = NGramFeaturizer({"max_number_of_ngrams": 10}) - - # ensures that during random sampling of the ngram CV we don't end up - # with a one-class-split - repetition_factor = 5 - - greet = {"intent": "greet", "text_features": [0.5]} - goodbye = {"intent": "goodbye", "text_features": [0.5]} - labeled_sentences = [ - Message("heyheyheyhey", greet), - Message("howdyheyhowdy", greet), - Message("heyhey howdyheyhowdy", greet), - Message("howdyheyhowdy heyhey", greet), - Message("astalavistasista", goodbye), - Message("astalavistasista sistala", goodbye), - Message("sistala astalavistasista", goodbye), - ] * repetition_factor - - for m in labeled_sentences: - m.set("spacy_doc", spacy_nlp(m.text)) - - ftr.min_intent_examples_for_ngram_classification = 2 - ftr.train_on_sentences(labeled_sentences) - assert len(ftr.all_ngrams) > 0 - assert ftr.best_num_ngrams > 0 - - -@pytest.mark.parametrize( - "sentence, expected, labeled_tokens", - [ - ("hey how are you today", [0.0, 1.0, 0.0], [0]), - ("hey 456 how are you", [1.0, 1.0, 0.0], [1, 0]), - ("blah balh random eh", [0.0, 0.0, 0.0], []), - ("a 1 digit number", [1.0, 0.0, 1.0], [1, 1]), - ], -) -def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): - from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer - - patterns = [ - {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, - {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, - {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, - ] - ftr = RegexFeaturizer(known_patterns=patterns) - - # adds tokens to the message - tokenizer = SpacyTokenizer() - message = Message(sentence) - message.set("spacy_doc", spacy_nlp(sentence)) - tokenizer.process(message) - - result = ftr.features_for_patterns(message) - assert np.allclose(result, expected, atol=1e-10) - - # the tokenizer should have added 
tokens - assert len(message.get("tokens", [])) > 0 - # the number of regex matches on each token should match - for i, token in enumerate(message.get("tokens")): - token_matches = token.get("pattern").values() - num_matches = sum(token_matches) - assert num_matches == labeled_tokens.count(i) - - -@pytest.mark.parametrize( - "sentence, expected, labeled_tokens", - [ - ("lemonade and mapo tofu", [1, 1], [0.0, 2.0, 3.0]), - ("a cup of tea", [1, 0], [3.0]), - ("Is burrito my favorite food?", [0, 1], [1.0]), - ("I want club?mate", [1, 0], [2.0, 3.0]), - ], -) -def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): - from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer - - lookups = [ - { - "name": "drinks", - "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"], - }, - {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}, - ] - ftr = RegexFeaturizer(lookup_tables=lookups) - - # adds tokens to the message - tokenizer = SpacyTokenizer() - message = Message(sentence) - message.set("spacy_doc", spacy_nlp(sentence)) - tokenizer.process(message) - - result = ftr.features_for_patterns(message) - assert np.allclose(result, expected, atol=1e-10) - - # the tokenizer should have added tokens - assert len(message.get("tokens", [])) > 0 - # the number of regex matches on each token should match - for i, token in enumerate(message.get("tokens")): - token_matches = token.get("pattern").values() - num_matches = sum(token_matches) - assert num_matches == labeled_tokens.count(i) - - -def test_spacy_featurizer_casing(spacy_nlp): - from nlu.featurizers.dense_featurizer import spacy_featurizer - - # if this starts failing for the default model, we should think about - # removing the lower casing the spacy nlp component does when it - # retrieves vectors. For compressed spacy models (e.g. models - # ending in _sm) this test will most likely fail. 
- - td = training_data.load_data("data/examples/rasa/demo-rasa.json") - for e in td.intent_examples: - doc = spacy_nlp(e.text) - doc_capitalized = spacy_nlp(e.text.capitalize()) - - vecs = spacy_featurizer.features_for_doc(doc) - vecs_capitalized = spacy_featurizer.features_for_doc(doc_capitalized) - - assert np.allclose( - vecs, vecs_capitalized, atol=1e-5 - ), "Vectors are unequal for texts '{}' and '{}'".format( - e.text, e.text.capitalize() - ) @pytest.mark.parametrize( diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py new file mode 100644 index 000000000000..61861eeddfbf --- /dev/null +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -0,0 +1,16 @@ +import numpy as np + +from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer +from rasa.nlu.config import RasaNLUModelConfig + + +def test_mitie_featurizer(mitie_feature_extractor, default_config): + from nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer + + mitie_component_config = {"name": "MitieFeaturizer"} + ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) + sentence = "Hey how are you today" + tokens = MitieTokenizer().tokenize(sentence) + vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor) + expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) + assert np.allclose(vecs[:5], expected, atol=1e-5) diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py new file mode 100644 index 000000000000..30b3b62c3c6f --- /dev/null +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -0,0 +1,86 @@ +import numpy as np +import pytest + +from rasa.nlu import training_data +from rasa.nlu.tokenizers.tokenizer import Token +from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer +from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer +from rasa.nlu.training_data import Message +from rasa.nlu.training_data import TrainingData +from rasa.nlu.config import RasaNLUModelConfig + + +@pytest.mark.parametrize( + "sentence, expected, labeled_tokens", + [ + ("hey how are you today", [0.0, 1.0, 0.0], [0]), + ("hey 456 how are you", [1.0, 1.0, 0.0], [1, 0]), + ("blah balh random eh", [0.0, 0.0, 0.0], []), + ("a 1 digit number", [1.0, 0.0, 1.0], [1, 1]), + ], +) +def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): + from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer + + patterns = [ + {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, + {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, + {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, + ] + ftr = RegexFeaturizer(known_patterns=patterns) + + # adds tokens to the message + tokenizer = SpacyTokenizer() + message = Message(sentence) + message.set("spacy_doc", spacy_nlp(sentence)) + tokenizer.process(message) + + result = ftr.features_for_patterns(message) + assert np.allclose(result, expected, atol=1e-10) + + # the tokenizer should have added tokens + assert len(message.get("tokens", [])) > 0 + # the number of regex matches on each token should match + for i, token in enumerate(message.get("tokens")): + token_matches = token.get("pattern").values() + num_matches = sum(token_matches) + assert num_matches == labeled_tokens.count(i) + + +@pytest.mark.parametrize( + "sentence, expected, labeled_tokens", + [ + ("lemonade and mapo tofu", [1, 1], [0.0, 2.0, 3.0]), + ("a cup of tea", [1, 0], [3.0]), + ("Is burrito my favorite 
food?", [0, 1], [1.0]), + ("I want club?mate", [1, 0], [2.0, 3.0]), + ], +) +def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): + from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer + + lookups = [ + { + "name": "drinks", + "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"], + }, + {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}, + ] + ftr = RegexFeaturizer(lookup_tables=lookups) + + # adds tokens to the message + tokenizer = SpacyTokenizer() + message = Message(sentence) + message.set("spacy_doc", spacy_nlp(sentence)) + tokenizer.process(message) + + result = ftr.features_for_patterns(message) + assert np.allclose(result, expected, atol=1e-10) + + # the tokenizer should have added tokens + assert len(message.get("tokens", [])) > 0 + # the number of regex matches on each token should match + for i, token in enumerate(message.get("tokens")): + token_matches = token.get("pattern").values() + num_matches = sum(token_matches) + assert num_matches == labeled_tokens.count(i) diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py new file mode 100644 index 000000000000..19fbe0034c1e --- /dev/null +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -0,0 +1,130 @@ +import numpy as np +import pytest + +from rasa.nlu import training_data +from rasa.nlu.training_data import Message +from rasa.nlu.training_data import TrainingData +from rasa.nlu.config import RasaNLUModelConfig + + +@pytest.mark.parametrize( + "sentence, expected", + [ + ( + "hey how are you today", + [-0.19649599, 0.32493639, -0.37408298, -0.10622784, 0.062756], + ) + ], +) +def test_spacy_featurizer(sentence, expected, spacy_nlp): + from nlu.featurizers.dense_featurizer import spacy_featurizer + + doc = spacy_nlp(sentence) + vecs = spacy_featurizer.features_for_doc(doc) + assert np.allclose(doc.vector[:5], expected, atol=1e-5) + assert np.allclose(vecs, doc.vector, atol=1e-5) + + +def test_spacy_training_sample_alignment(spacy_nlp_component): + from spacy.tokens import Doc + + m1 = Message.build(text="I have a feeling", intent="feeling") + m2 = Message.build(text="", intent="feeling") + m3 = Message.build(text="I am the last message", intent="feeling") + td = TrainingData(training_examples=[m1, m2, m3]) + + attribute_docs = spacy_nlp_component.docs_for_training_data(td) + + assert isinstance(attribute_docs["text"][0], Doc) + assert isinstance(attribute_docs["text"][1], Doc) + assert isinstance(attribute_docs["text"][2], Doc) + + assert [t.text for t in attribute_docs["text"][0]] == ["i", "have", "a", "feeling"] + assert [t.text for t in attribute_docs["text"][1]] == [] + assert [t.text for t in attribute_docs["text"][2]] == [ + "i", + "am", + "the", + "last", + "message", + ] + + +def test_spacy_intent_featurizer(spacy_nlp_component): + from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + + td = training_data.load_data("data/examples/rasa/demo-rasa.json") + spacy_nlp_component.train(td, config=None) + spacy_featurizer = SpacyFeaturizer() + spacy_featurizer.train(td, config=None) + + intent_features_exist = np.array( + [ + True if example.get("intent_features") is not None else False + for example in td.intent_examples + ] + ) + + # no intent features should have been set + assert not any(intent_features_exist) + + +@pytest.mark.parametrize( + "sentence, expected", + [("hey how are you today", [-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])], +) +def 
test_spacy_ner_featurizer(sentence, expected, spacy_nlp): + from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + + doc = spacy_nlp(sentence) + token_vectors = [t.vector for t in doc] + spacy_config = {"ner_feature_vectors": True} + ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) + greet = {"intent": "greet", "text_features": [0.5]} + message = Message(sentence, greet) + message.set("spacy_doc", doc) + ftr._set_spacy_features(message) + ftr._set_spacy_ner_features(message) + vecs = message.get("ner_features")[0][:5] + assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) + assert np.allclose(vecs, expected, atol=1e-4) + + +def test_spacy_ner_featurizer_config(spacy_nlp): + from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + + sentence = "hi there friend" + doc = spacy_nlp(sentence) + spacy_config = {"ner_feature_vectors": False} + ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) + greet = {"intent": "greet", "text_features": [0.5]} + message = Message(sentence, greet) + message.set("spacy_doc", doc) + ftr._set_spacy_features(message) + ftr._set_spacy_ner_features(message) + vecs = np.array(message.get("ner_features")) + assert vecs.shape[0] == len(doc) + assert vecs.shape[1] == 0 + + +def test_spacy_featurizer_casing(spacy_nlp): + from nlu.featurizers.dense_featurizer import spacy_featurizer + + # if this starts failing for the default model, we should think about + # removing the lower casing the spacy nlp component does when it + # retrieves vectors. For compressed spacy models (e.g. models + # ending in _sm) this test will most likely fail. + + td = training_data.load_data("data/examples/rasa/demo-rasa.json") + for e in td.intent_examples: + doc = spacy_nlp(e.text) + doc_capitalized = spacy_nlp(e.text.capitalize()) + + vecs = spacy_featurizer.features_for_doc(doc) + vecs_capitalized = spacy_featurizer.features_for_doc(doc_capitalized) + + assert np.allclose( + vecs, vecs_capitalized, atol=1e-5 + ), "Vectors are unequal for texts '{}' and '{}'".format( + e.text, e.text.capitalize() + ) From 53033deefa74fd5a712b76b51c6d44a10ca01c72 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Sun, 20 Oct 2019 14:54:37 +0200 Subject: [PATCH 019/239] Disable cls token use in default pipeline. 
--- examples/formbot/config.yml | 1 + examples/restaurantbot/config.yml | 1 + rasa/nlu/registry.py | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/formbot/config.yml b/examples/formbot/config.yml index 3aa0e7577759..3cf3f4a14fe5 100644 --- a/examples/formbot/config.yml +++ b/examples/formbot/config.yml @@ -2,6 +2,7 @@ language: en pipeline: - name: WhitespaceTokenizer + use_cls_token: False - name: CRFEntityExtractor - name: EntitySynonymMapper - name: CountVectorsFeaturizer diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml index 58e9f0be7209..fcb2086a50e1 100644 --- a/examples/restaurantbot/config.yml +++ b/examples/restaurantbot/config.yml @@ -3,6 +3,7 @@ language: en pipeline: - name: "SpacyNLP" - name: "SpacyTokenizer" + use_cls_token: False - name: "SpacyFeaturizer" - name: "SklearnIntentClassifier" - name: "CRFEntityExtractor" diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 06643b5ff459..7a3e28b71c73 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -105,7 +105,7 @@ registered_pipeline_templates = { "pretrained_embeddings_spacy": [ {"name": "SpacyNLP"}, - {"name": "SpacyTokenizer"}, + {"name": "SpacyTokenizer", "use_cls_token": False}, {"name": "SpacyFeaturizer"}, {"name": "RegexFeaturizer"}, {"name": "CRFEntityExtractor"}, @@ -114,7 +114,7 @@ ], "keyword": [{"name": "KeywordIntentClassifier"}], "supervised_embeddings": [ - {"name": "WhitespaceTokenizer"}, + {"name": "WhitespaceTokenizer", "use_cls_token": False}, {"name": "RegexFeaturizer"}, {"name": "CRFEntityExtractor"}, {"name": "EntitySynonymMapper"}, From 1bc0af007481a4fa22ce522aafceca7b61804c3c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Sun, 20 Oct 2019 14:57:11 +0200 Subject: [PATCH 020/239] correct type --- rasa/nlu/tokenizers/mitie_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 60b90938d758..19c3686c5d58 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -45,7 +45,7 @@ def process(self, message: Message, **kwargs: Any) -> None: ) def _token_from_offset( - self, text: Text, offset: int, encoded_sentence: bytes + self, text: bytes, offset: int, encoded_sentence: bytes ) -> Token: return Token( text.decode("utf-8"), self._byte_to_char_offset(encoded_sentence, offset) From 55b2c74acdd0d50bced48324f1d8332e8a83c9d4 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 09:39:50 +0200 Subject: [PATCH 021/239] fix tests --- tests/nlu/base/test_config.py | 2 +- tests/nlu/base/test_featurizers.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/nlu/base/test_config.py b/tests/nlu/base/test_config.py index be729075adb3..f6453e49404e 100644 --- a/tests/nlu/base/test_config.py +++ b/tests/nlu/base/test_config.py @@ -65,7 +65,7 @@ def test_set_attr_on_component(): cfg = config.load("sample_configs/config_pretrained_embeddings_spacy.yml") cfg.set_component_attr(6, C=324) - assert cfg.for_component(1) == {"name": "SpacyTokenizer"} + assert cfg.for_component(1) == {"name": "SpacyTokenizer", "use_cls_token": False} assert cfg.for_component(6) == {"name": "SklearnIntentClassifier", "C": 324} diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/base/test_featurizers.py index cd0c8ce3c13a..d67c905deab6 100644 --- a/tests/nlu/base/test_featurizers.py +++ b/tests/nlu/base/test_featurizers.py @@ -117,7 +117,8 @@ def 
test_mitie_featurizer(mitie_feature_extractor, default_config): mitie_component_config = {"name": "MitieFeaturizer"} ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) sentence = "Hey how are you today" - tokens = MitieTokenizer().tokenize(sentence) + mitie_component_config = {"name": "MitieTokenizer", "use_cls_token": False} + tokens = MitieTokenizer(mitie_component_config).tokenize(sentence) vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor) expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) assert np.allclose(vecs[:5], expected, atol=1e-5) @@ -212,7 +213,8 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): ftr = RegexFeaturizer(lookup_tables=lookups) # adds tokens to the message - tokenizer = SpacyTokenizer() + component_config = {"name": "SpacyTokenizer", "use_cls_token": False} + tokenizer = SpacyTokenizer(component_config) message = Message(sentence) message.set("spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) From 213b808e2d3509cf8e01e819fb2cd52c7420472f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 10:05:06 +0200 Subject: [PATCH 022/239] fix tokenizer tests. --- tests/nlu/featurizers/test_mitie_featurizer.py | 3 ++- tests/nlu/featurizers/test_regex_featurizer.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index 61861eeddfbf..c408d08680a2 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -10,7 +10,8 @@ def test_mitie_featurizer(mitie_feature_extractor, default_config): mitie_component_config = {"name": "MitieFeaturizer"} ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) sentence = "Hey how are you today" - tokens = MitieTokenizer().tokenize(sentence) + mitie_component_config = {"name": "MitieTokenizer", "use_cls_token": False} + tokens = MitieTokenizer(mitie_component_config).tokenize(sentence) vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor) expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) assert np.allclose(vecs[:5], expected, atol=1e-5) diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 30b3b62c3c6f..7a3455ca85ee 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -69,7 +69,8 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): ftr = RegexFeaturizer(lookup_tables=lookups) # adds tokens to the message - tokenizer = SpacyTokenizer() + component_config = {"name": "SpacyTokenizer", "use_cls_token": False} + tokenizer = SpacyTokenizer(component_config) message = Message(sentence) message.set("spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) From 0f577e01bc263fc01ff77c2a8dfc41160c17a2a1 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 10:23:54 +0200 Subject: [PATCH 023/239] add sparse and dense feature names to constants --- .../embedding_intent_classifier.py | 12 +++--- .../classifiers/sklearn_intent_classifier.py | 7 +++- rasa/nlu/constants.py | 19 +++++---- .../dense_featurizer/mitie_featurizer.py | 20 +++++---- .../dense_featurizer/spacy_featurizer.py | 42 +++---------------- rasa/nlu/featurizers/featurzier.py | 13 ++++-- .../count_vectors_featurizer.py | 13 +++--- .../sparse_featurizer/regex_featurizer.py | 16 ++++--- 
.../selectors/embedding_response_selector.py | 6 +-- rasa/nlu/utils/spacy_utils.py | 2 +- 10 files changed, 71 insertions(+), 79 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 8e17c42c3016..cff51980ccf8 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -12,7 +12,7 @@ from rasa.nlu.constants import ( MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_FEATURE_NAMES, + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, ) import tensorflow as tf @@ -50,7 +50,7 @@ class EmbeddingIntentClassifier(Component): provides = ["intent", "intent_ranking"] - requires = [MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] # default properties (DOC MARKER - don't remove) defaults = { @@ -335,7 +335,9 @@ def _create_session_data( for e in training_data.intent_examples: if e.get(attribute): - X.append(e.get(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE])) + X.append( + e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + ) label_ids.append(label_id_dict[e.get(attribute)]) X = np.array(X) @@ -475,7 +477,7 @@ def preprocess_train_data(self, training_data): training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE, - attribute_feature_name=MESSAGE_VECTOR_FEATURE_NAMES[ + attribute_feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ MESSAGE_INTENT_ATTRIBUTE ], ) @@ -607,7 +609,7 @@ def predict_label(self, message): # get features (bag of words) for a message # noinspection PyPep8Naming X = message.get( - MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ).reshape(1, -1) # load tf graph and session diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index a562f4f8077a..9eab651f48b0 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -10,7 +10,10 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import MESSAGE_VECTOR_FEATURE_NAMES, MESSAGE_TEXT_ATTRIBUTE +from rasa.nlu.constants import ( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + MESSAGE_TEXT_ATTRIBUTE, +) logger = logging.getLogger(__name__) @@ -23,7 +26,7 @@ class SklearnIntentClassifier(Component): provides = ["intent", "intent_ranking"] - requires = [MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] defaults = { # C parameter of the svm - cross validation will select the best value diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index ba1f8b9c9a09..5fe2f5ac2f3c 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -8,10 +8,9 @@ MESSAGE_ENTITIES_ATTRIBUTE = "entities" -MESSAGE_NER_FEATURES_ATTRIBUTE = "ner_features" - CLS_TOKEN = "__CLS__" + MESSAGE_ATTRIBUTES = [ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, @@ -22,14 +21,18 @@ MESSAGE_TEXT_ATTRIBUTE: "tokens", MESSAGE_INTENT_ATTRIBUTE: "intent_tokens", MESSAGE_RESPONSE_ATTRIBUTE: "response_tokens", - MESSAGE_ENTITIES_ATTRIBUTE: "tokens", } -MESSAGE_VECTOR_FEATURE_NAMES = { - MESSAGE_TEXT_ATTRIBUTE: "text_features", - MESSAGE_INTENT_ATTRIBUTE: "intent_features", - MESSAGE_RESPONSE_ATTRIBUTE: "response_features", - 
MESSAGE_ENTITIES_ATTRIBUTE: "ner_features", +MESSAGE_VECTOR_SPARSE_FEATURE_NAMES = { + MESSAGE_TEXT_ATTRIBUTE: "text_sparse_features", + MESSAGE_INTENT_ATTRIBUTE: "intent_sparse_features", + MESSAGE_RESPONSE_ATTRIBUTE: "response_sparse_features", +} + +MESSAGE_VECTOR_DENSE_FEATURE_NAMES = { + MESSAGE_TEXT_ATTRIBUTE: "text_dense_features", + MESSAGE_INTENT_ATTRIBUTE: "intent_dense_features", + MESSAGE_RESPONSE_ATTRIBUTE: "response_dense_features", } MESSAGE_SPACY_FEATURES_NAMES = { diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 71a8e9be9f2b..9f94be854db5 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -11,20 +11,18 @@ import mitie from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, ) class MitieFeaturizer(Featurizer): provides = [ - MESSAGE_VECTOR_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] + for attribute in MESSAGE_ATTRIBUTES ] requires = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + [ @@ -59,9 +57,11 @@ def train( attribute_tokens, mitie_feature_extractor ) example.set( - MESSAGE_VECTOR_FEATURE_NAMES[attribute], + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], self._combine_with_existing_dense_features( - example, features, MESSAGE_VECTOR_FEATURE_NAMES[attribute] + example, + features, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], ), ) @@ -73,9 +73,11 @@ def process(self, message: Message, **kwargs: Any) -> None: mitie_feature_extractor, ) message.set( - MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], self._combine_with_existing_dense_features( - message, features, MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + message, + features, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], ), ) diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 5665401d7341..3ac51234b484 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -11,15 +11,9 @@ from spacy.tokens import Doc from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, - MESSAGE_ATTRIBUTES, MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, - MESSAGE_ENTITIES_ATTRIBUTE, - MESSAGE_NER_FEATURES_ATTRIBUTE, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, ) @@ -37,23 +31,15 @@ def features_for_doc(doc: "Doc") -> np.ndarray: class SpacyFeaturizer(Featurizer): provides = [ - MESSAGE_VECTOR_FEATURE_NAMES[attribute] + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES - ] + [MESSAGE_NER_FEATURES_ATTRIBUTE] + ] requires = [ MESSAGE_SPACY_FEATURES_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] - defaults = {"ner_feature_vectors": False} - - def __init__(self, component_config=None, known_patterns=None, lookup_tables=None): - - super(SpacyFeaturizer, self).__init__(component_config) - - self.ner_feature_vectors = self.component_config["ner_feature_vectors"] 
- def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: @@ -61,7 +47,6 @@ def train( for example in training_data.intent_examples: for attribute in SPACY_FEATURIZABLE_ATTRIBUTES: self._set_spacy_features(example, attribute) - self._set_spacy_ner_features(example) def get_doc(self, message, attribute): @@ -70,23 +55,6 @@ def get_doc(self, message, attribute): def process(self, message: Message, **kwargs: Any) -> None: self._set_spacy_features(message) - self._set_spacy_ner_features(message) - - def _set_spacy_ner_features(self, message: Message): - """If we want to use spacy as an NER featurizer, set token vectors""" - doc = message.get(MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - if self.ner_feature_vectors: - ner_features = np.array([t.vector for t in doc]) - else: - ner_features = np.array([[] for t in doc]) - combined_features = self._combine_with_existing_dense_features( - message, - ner_features, - MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_ENTITIES_ATTRIBUTE], - ) - message.set( - MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_ENTITIES_ATTRIBUTE], combined_features - ) def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): """Adds the spacy word vectors to the messages features.""" @@ -95,6 +63,6 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): if message_attribute_doc is not None: fs = features_for_doc(message_attribute_doc) features = self._combine_with_existing_dense_features( - message, fs, MESSAGE_VECTOR_FEATURE_NAMES[attribute] + message, fs, MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] ) - message.set(MESSAGE_VECTOR_FEATURE_NAMES[attribute], features) + message.set(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], features) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index 2fa725a629b0..47073e1be7ac 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -3,7 +3,10 @@ from typing import Any, Text from rasa.nlu.training_data import Message from rasa.nlu.components import Component -from rasa.nlu.constants import MESSAGE_VECTOR_FEATURE_NAMES, MESSAGE_TEXT_ATTRIBUTE +from rasa.nlu.constants import ( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + MESSAGE_TEXT_ATTRIBUTE, +) class Featurizer(Component): @@ -11,7 +14,9 @@ class Featurizer(Component): def _combine_with_existing_dense_features( message: Message, additional_features: Any, - feature_name: Text = MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + feature_name: Text = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ + MESSAGE_TEXT_ATTRIBUTE + ], ) -> Any: if message.get(feature_name) is not None: return np.concatenate( @@ -24,7 +29,9 @@ def _combine_with_existing_dense_features( def _combine_with_existing_sparse_features( message: Message, additional_features: Any, - feature_name: Text = MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + feature_name: Text = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ + MESSAGE_TEXT_ATTRIBUTE + ], ) -> Any: if message.get(feature_name) is not None: from scipy.sparse import hstack diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 78685b1534e4..b08cc80f3545 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -17,7 +17,7 @@ MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + 
MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, ) @@ -38,7 +38,8 @@ class CountVectorsFeaturizer(Featurizer): """ provides = [ - MESSAGE_VECTOR_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute] + for attribute in MESSAGE_ATTRIBUTES ] requires = [] @@ -310,11 +311,11 @@ def _set_attribute_features( for i, example in enumerate(training_data.training_examples): # create bag for each example example.set( - MESSAGE_VECTOR_FEATURE_NAMES[attribute], + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], self._combine_with_existing_sparse_features( example, attribute_features[i], - MESSAGE_VECTOR_FEATURE_NAMES[attribute], + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], ), ) @@ -546,11 +547,11 @@ def process(self, message: Message, **kwargs: Any) -> None: features = self._create_sequence(attribute, [message_text]) message.set( - MESSAGE_VECTOR_FEATURE_NAMES[attribute], + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], self._combine_with_existing_sparse_features( message, features[0], - feature_name=MESSAGE_VECTOR_FEATURE_NAMES[attribute], + feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], ), ) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index aac0f5dc8029..444c0e12ed2a 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -14,7 +14,7 @@ from rasa.nlu.constants import ( MESSAGE_TOKENS_NAMES, MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_FEATURE_NAMES, + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, ) logger = logging.getLogger(__name__) @@ -25,7 +25,7 @@ class RegexFeaturizer(Featurizer): - provides = [MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + provides = [MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] requires = [MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]] @@ -46,19 +46,25 @@ def train( for example in training_data.training_examples: updated = self._text_features_with_regex(example) - example.set(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], updated) + example.set( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], updated + ) def process(self, message: Message, **kwargs: Any) -> None: updated = self._text_features_with_regex(message) - message.set(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], updated) + message.set( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], updated + ) def _text_features_with_regex(self, message): if self.known_patterns: extras = self.features_for_patterns(message) return self._combine_with_existing_sparse_features(message, extras) else: - return message.get(MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + return message.get( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) def _add_lookup_table_regexes(self, lookup_tables): # appends the regex features from the lookup tables to diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py index 55f69c5bdca4..c94be79d5676 100644 --- a/rasa/nlu/selectors/embedding_response_selector.py +++ b/rasa/nlu/selectors/embedding_response_selector.py @@ -11,7 +11,7 @@ MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, OPEN_UTTERANCE_PREDICTION_KEY, OPEN_UTTERANCE_RANKING_KEY, MESSAGE_SELECTOR_PROPERTY_NAME, @@ -50,7 +50,7 @@ class 
ResponseSelector(EmbeddingIntentClassifier): provides = ["response", "response_ranking"] - requires = [MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] # default properties (DOC MARKER - don't remove) defaults = { @@ -152,7 +152,7 @@ def preprocess_train_data(self, training_data): training_data, label_id_dict, attribute=MESSAGE_RESPONSE_ATTRIBUTE, - attribute_feature_name=MESSAGE_VECTOR_FEATURE_NAMES[ + attribute_feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ MESSAGE_RESPONSE_ATTRIBUTE ], ) diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py index 30ec6e40bafb..ae59ee95737f 100644 --- a/rasa/nlu/utils/spacy_utils.py +++ b/rasa/nlu/utils/spacy_utils.py @@ -21,7 +21,7 @@ MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, ) From 6f59d4c7df565e131bf55abac267abb1d2ae1b07 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 13:47:13 +0200 Subject: [PATCH 024/239] Add cls token to whitespace tokenizer. --- rasa/nlu/constants.py | 2 ++ rasa/nlu/tokenizers/whitespace_tokenizer.py | 18 ++++++++++++++---- tests/nlu/base/test_tokenizers.py | 17 +++++++++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 9e3b5f7a2e78..ba1f8b9c9a09 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -10,6 +10,8 @@ MESSAGE_NER_FEATURES_ATTRIBUTE = "ner_features" +CLS_TOKEN = "__CLS__" + MESSAGE_ATTRIBUTES = [ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 94179ead2acb..591aca3c05ff 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -6,13 +6,12 @@ from rasa.nlu.tokenizers import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + MESSAGE_RESPONSE_ATTRIBUTE, + CLS_TOKEN, ) @@ -25,8 +24,10 @@ class WhitespaceTokenizer(Tokenizer, Component): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # text will be tokenized with case sensitive as default + # Text will be tokenized with case sensitive as default "case_sensitive": True, + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -40,6 +41,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] self.case_sensitive = self.component_config["case_sensitive"] + self.add_cls_token = self.component_config["add_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -88,9 +90,17 @@ def tokenize( running_offset = 0 tokens = [] + for word in words: word_offset = text.index(word, running_offset) word_len = len(word) running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) + + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and self.add_cls_token + ): + tokens.append(Token(CLS_TOKEN, 
len(text))) + return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index d6e0f78691e6..de008ef8e62f 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from unittest.mock import patch + +from rasa.nlu.constants import CLS_TOKEN from rasa.nlu.training_data import TrainingData, Message from tests.nlu import utilities from rasa.nlu import training_data @@ -77,6 +79,21 @@ def test_whitespace(): ] == [0, 83] +def test_whitespace_cls_token(): + from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer + + component_config = {"add_cls_token": True} + + tk = WhitespaceTokenizer(component_config) + + assert [t.text for t in tk.tokenize("Forecast for lunch")] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + + def test_whitespace_custom_intent_symbol(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer From c4068f268e31acc893bbcc864e8a515f5f5bc61f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:01:51 +0200 Subject: [PATCH 025/239] Add cls token to spacy tokenizer. --- rasa/nlu/tokenizers/spacy_tokenizer.py | 30 +++++++++++++-------- rasa/nlu/tokenizers/whitespace_tokenizer.py | 2 +- tests/nlu/base/test_tokenizers.py | 18 +++++++++++++ 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 87443d3375de..e289dcf31c83 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -1,5 +1,5 @@ import typing -from typing import Any +from typing import Any, Dict, Text, List, Optional from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig @@ -7,14 +7,11 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, - MESSAGE_ATTRIBUTES, MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, + CLS_TOKEN, ) if typing.TYPE_CHECKING: @@ -32,6 +29,16 @@ class SpacyTokenizer(Tokenizer, Component): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + defaults = { + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False + } + + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + """Construct a new tokenizer using the SpacyTokenizer framework.""" + super(SpacyTokenizer, self).__init__(component_config) + self.add_cls_token = self.component_config["add_cls_token"] + def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: @@ -47,17 +54,18 @@ def train( MESSAGE_TOKENS_NAMES[attribute], self.tokenize(attribute_doc) ) - def get_doc(self, message, attribute): - + def get_doc(self, message: Message, attribute: Text) -> "Doc": return message.get(MESSAGE_SPACY_FEATURES_NAMES[attribute]) def process(self, message: Message, **kwargs: Any) -> None: - message.set( MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], self.tokenize(self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE)), ) - def tokenize(self, doc: "Doc") -> typing.List[Token]: - - return [Token(t.text, t.idx) for t in doc] + def tokenize(self, doc: "Doc") -> List[Token]: + tokens = [Token(t.text, t.idx) for t in doc] + if self.add_cls_token: + idx = doc[-1].idx + len(doc[-1].text) + 1 + tokens = tokens + [Token(CLS_TOKEN, idx)] + return tokens diff --git 
a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 591aca3c05ff..acf24d7b5334 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -101,6 +101,6 @@ def tokenize( attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] and self.add_cls_token ): - tokens.append(Token(CLS_TOKEN, len(text))) + tokens.append(Token(CLS_TOKEN, len(text) + 1)) return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index de008ef8e62f..5005f8cfb9df 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -92,6 +92,7 @@ def test_whitespace_cls_token(): "lunch", CLS_TOKEN, ] + assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13, 19] def test_whitespace_custom_intent_symbol(): @@ -207,6 +208,23 @@ def test_spacy(spacy_nlp): assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23] +def test_spacy_add_cls_token(spacy_nlp): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"add_cls_token": True} + + tk = SpacyTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19] + + def test_spacy_intent_tokenizer(spacy_nlp_component): from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer From 64496de7e9513aa4d73519efec563815cbbf8873 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:07:42 +0200 Subject: [PATCH 026/239] Add cls token to mitie tokenizer. --- rasa/nlu/tokenizers/mitie_tokenizer.py | 31 ++++++++++++++++++++++---- tests/nlu/base/test_tokenizers.py | 17 ++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 559d9920fc23..d28673ec1311 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -1,4 +1,4 @@ -from typing import Any, List, Text +from typing import Any, List, Text, Optional, Dict from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig @@ -7,8 +7,10 @@ from rasa.nlu.constants import ( MESSAGE_TEXT_ATTRIBUTE, + MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, + CLS_TOKEN, ) from rasa.utils.io import DEFAULT_ENCODING @@ -17,6 +19,16 @@ class MitieTokenizer(Tokenizer, Component): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + defaults = { + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False + } + + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + """Construct a new tokenizer using the SpacyTokenizer framework.""" + super(MitieTokenizer, self).__init__(component_config) + self.add_cls_token = self.component_config["add_cls_token"] + @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] @@ -32,7 +44,7 @@ def train( if example.get(attribute) is not None: example.set( MESSAGE_TOKENS_NAMES[attribute], - self.tokenize(example.get(attribute)), + self.tokenize(example.get(attribute), attribute), ) def process(self, message: Message, **kwargs: Any) -> None: @@ -41,13 +53,17 @@ def process(self, message: Message, **kwargs: Any) -> None: MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], self.tokenize(message.text) ) - def 
_token_from_offset(self, text, offset, encoded_sentence): + def _token_from_offset( + self, text: Text, offset: int, encoded_sentence: bytes + ) -> Token: return Token( text.decode(DEFAULT_ENCODING), self._byte_to_char_offset(encoded_sentence, offset), ) - def tokenize(self, text: Text) -> List[Token]: + def tokenize( + self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE + ) -> List[Token]: import mitie encoded_sentence = text.encode(DEFAULT_ENCODING) @@ -56,6 +72,13 @@ def tokenize(self, text: Text) -> List[Token]: self._token_from_offset(token, offset, encoded_sentence) for token, offset in tokenized ] + + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and self.add_cls_token + ): + tokens.append(Token(CLS_TOKEN, len(text) + 1)) + return tokens @staticmethod diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index 5005f8cfb9df..79566099ea09 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -263,6 +263,23 @@ def test_mitie(): assert [t.offset for t in tk.tokenize(text)] == [0, 4, 13, 16, 20, 23] +def test_mitie_add_cls_token(): + from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer + + component_config = {"add_cls_token": True} + + tk = MitieTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(text)] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13, 19] + + def test_jieba(): from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer From 84787c8d891ff7aef58ede65bbd39862d050226b Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:13:38 +0200 Subject: [PATCH 027/239] Add cls token to jieba tokenizer. --- rasa/nlu/tokenizers/jieba_tokenizer.py | 18 +++++++++++++++--- rasa/nlu/tokenizers/spacy_tokenizer.py | 2 +- tests/nlu/base/test_tokenizers.py | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index ae9f1e927220..71c17ec0be4f 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -16,8 +16,7 @@ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + CLS_TOKEN, ) logger = logging.getLogger(__name__) @@ -39,6 +38,8 @@ class JiebaTokenizer(Tokenizer, Component): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -61,6 +62,8 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.dictionary_path is not None: self.load_custom_dictionary(self.dictionary_path) + self.add_cls_token = self.component_config["add_cls_token"] + @classmethod def required_packages(cls) -> List[Text]: return ["jieba"] @@ -108,12 +111,21 @@ def preprocess_text(self, text, attribute): else: return text - def tokenize(self, text: Text, attribute=MESSAGE_TEXT_ATTRIBUTE) -> List[Token]: + def tokenize( + self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE + ) -> List[Token]: import jieba text = self.preprocess_text(text, attribute) tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] + + if ( + attribute in 
[MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and self.add_cls_token + ): + tokens.append(Token(CLS_TOKEN, len(text) + 1)) + return tokens @classmethod diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index e289dcf31c83..fac2ba1c3a32 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -67,5 +67,5 @@ def tokenize(self, doc: "Doc") -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] if self.add_cls_token: idx = doc[-1].idx + len(doc[-1].text) + 1 - tokens = tokens + [Token(CLS_TOKEN, idx)] + tokens.append(Token(CLS_TOKEN, idx)) return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index 79566099ea09..07057449ca7d 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -308,3 +308,21 @@ def test_jieba_load_dictionary(tmpdir_factory): tk.tokenize("") mock_method.assert_called_once_with(dictionary_path) + + +def test_jieba_add_cls_token(): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + component_config = {"add_cls_token": True} + + tk = JiebaTokenizer(component_config) + + assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [ + "Micheal", + "你好", + "吗", + "?", + CLS_TOKEN, + ] + + assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12] From d496b433e4508f5dd2e853e3bbc5101e25058fba Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:14:57 +0200 Subject: [PATCH 028/239] Add changelog entry. --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index caefdba5aa4d..dba0c5069251 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,6 +12,7 @@ This project adheres to `Semantic Versioning`_ starting with version 1.0. Added ----- +- Added option ``add_cls_token`` to all tokenizers to add the token ``__CLS__`` to the end of the list of tokens. 
Changed ------- From 291a24ea7c40192c263b727986b2bc8d17ad7c6a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:47:28 +0200 Subject: [PATCH 029/239] move code from init to own file --- rasa/nlu/extractors/crf_entity_extractor.py | 2 +- rasa/nlu/featurizers/mitie_featurizer.py | 2 +- rasa/nlu/test.py | 2 +- rasa/nlu/tokenizers/__init__.py | 16 --------- rasa/nlu/tokenizers/jieba_tokenizer.py | 14 +++----- rasa/nlu/tokenizers/mitie_tokenizer.py | 14 +++----- rasa/nlu/tokenizers/spacy_tokenizer.py | 11 +++--- rasa/nlu/tokenizers/tokenizer.py | 39 +++++++++++++++++++++ rasa/nlu/tokenizers/whitespace_tokenizer.py | 14 +++----- tests/nlu/base/test_tokenizers.py | 8 ++--- 10 files changed, 62 insertions(+), 60 deletions(-) create mode 100644 rasa/nlu/tokenizers/tokenizer.py diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index dfad359e6e75..77548dac85c5 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -6,7 +6,7 @@ from rasa.nlu.config import InvalidConfigError, RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData from rasa.constants import DOCS_BASE_URL diff --git a/rasa/nlu/featurizers/mitie_featurizer.py b/rasa/nlu/featurizers/mitie_featurizer.py index 9d0dbb8f5a7c..1eda72fe2112 100644 --- a/rasa/nlu/featurizers/mitie_featurizer.py +++ b/rasa/nlu/featurizers/mitie_featurizer.py @@ -4,7 +4,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers import Featurizer -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index 9143638590c4..2990a4678548 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -32,7 +32,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Interpreter, Trainer, TrainingData from rasa.nlu.components import Component -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.core.constants import RESPOND_PREFIX logger = logging.getLogger(__name__) diff --git a/rasa/nlu/tokenizers/__init__.py b/rasa/nlu/tokenizers/__init__.py index 8cb8732bf097..e69de29bb2d1 100644 --- a/rasa/nlu/tokenizers/__init__.py +++ b/rasa/nlu/tokenizers/__init__.py @@ -1,16 +0,0 @@ -class Tokenizer(object): - pass - - -class Token(object): - def __init__(self, text, offset, data=None): - self.offset = offset - self.text = text - self.end = offset + len(text) - self.data = data if data else {} - - def set(self, prop, info): - self.data[prop] = info - - def get(self, prop, default=None): - return self.data.get(prop, default) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 71c17ec0be4f..3a7291d686ca 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -7,16 +7,14 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, 
MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - CLS_TOKEN, ) logger = logging.getLogger(__name__) @@ -39,7 +37,7 @@ class JiebaTokenizer(Tokenizer, Component): # Symbol on which intent should be split "intent_split_symbol": "_", # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False, + "use_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -62,7 +60,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.dictionary_path is not None: self.load_custom_dictionary(self.dictionary_path) - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] @classmethod def required_packages(cls) -> List[Text]: @@ -120,11 +118,7 @@ def tokenize( tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] - if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and self.add_cls_token - ): - tokens.append(Token(CLS_TOKEN, len(text) + 1)) + self.add_cls_token(tokens, self.use_cls_token, attribute) return tokens diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index d28673ec1311..4450b804a735 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -2,15 +2,13 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - CLS_TOKEN, ) from rasa.utils.io import DEFAULT_ENCODING @@ -21,13 +19,13 @@ class MitieTokenizer(Tokenizer, Component): defaults = { # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False + "use_cls_token": False } def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(MitieTokenizer, self).__init__(component_config) - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] @classmethod def required_packages(cls) -> List[Text]: @@ -73,11 +71,7 @@ def tokenize( for token, offset in tokenized ] - if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and self.add_cls_token - ): - tokens.append(Token(CLS_TOKEN, len(text) + 1)) + self.add_cls_token(tokens, self.use_cls_token, attribute) return tokens diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index fac2ba1c3a32..432eff0b7c06 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -3,7 +3,7 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( @@ -11,7 +11,6 @@ MESSAGE_TOKENS_NAMES, MESSAGE_SPACY_FEATURES_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, - CLS_TOKEN, ) if typing.TYPE_CHECKING: @@ -31,13 +30,13 @@ class SpacyTokenizer(Tokenizer, Component): defaults = { # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False + "use_cls_token": False 
} def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(SpacyTokenizer, self).__init__(component_config) - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -65,7 +64,5 @@ def process(self, message: Message, **kwargs: Any) -> None: def tokenize(self, doc: "Doc") -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] - if self.add_cls_token: - idx = doc[-1].idx + len(doc[-1].text) + 1 - tokens.append(Token(CLS_TOKEN, idx)) + self.add_cls_token(tokens, self.use_cls_token) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py new file mode 100644 index 000000000000..4d903822f6f1 --- /dev/null +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -0,0 +1,39 @@ +from typing import Text, List, Optional, Dict, Any + +from rasa.nlu.constants import ( + MESSAGE_RESPONSE_ATTRIBUTE, + MESSAGE_TEXT_ATTRIBUTE, + CLS_TOKEN, +) + + +class Token(object): + def __init__(self, text, offset, data=None): + self.offset = offset + self.text = text + self.end = offset + len(text) + self.data = data if data else {} + + def set(self, prop, info): + self.data[prop] = info + + def get(self, prop, default=None): + return self.data.get(prop, default) + + +class Tokenizer(object): + def add_cls_token( + self, + tokens: List[Token], + use_cls_token: bool, + attribute: Text = MESSAGE_TEXT_ATTRIBUTE, + ) -> List[Token]: + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and use_cls_token + ): + # +1 to have a space between the last token and the __cls__ token + idx = tokens[-1].offset + len(tokens[-1].text) + 1 + tokens.append(Token(CLS_TOKEN, idx)) + + return tokens diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index acf24d7b5334..20a30efe0409 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -3,15 +3,13 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_RESPONSE_ATTRIBUTE, - CLS_TOKEN, ) @@ -27,7 +25,7 @@ class WhitespaceTokenizer(Tokenizer, Component): # Text will be tokenized with case sensitive as default "case_sensitive": True, # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False, + "use_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -41,7 +39,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] self.case_sensitive = self.component_config["case_sensitive"] - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -97,10 +95,6 @@ def tokenize( running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) - if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and 
self.add_cls_token - ): - tokens.append(Token(CLS_TOKEN, len(text) + 1)) + self.add_cls_token(tokens, self.use_cls_token, attribute) return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index 07057449ca7d..267f24b81e6f 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -82,7 +82,7 @@ def test_whitespace(): def test_whitespace_cls_token(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = WhitespaceTokenizer(component_config) @@ -211,7 +211,7 @@ def test_spacy(spacy_nlp): def test_spacy_add_cls_token(spacy_nlp): from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = SpacyTokenizer(component_config) @@ -266,7 +266,7 @@ def test_mitie(): def test_mitie_add_cls_token(): from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = MitieTokenizer(component_config) @@ -313,7 +313,7 @@ def test_jieba_load_dictionary(tmpdir_factory): def test_jieba_add_cls_token(): from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = JiebaTokenizer(component_config) From 5986a0db929985a58ad7a26966f6dfdd0624ddb9 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:49:26 +0200 Subject: [PATCH 030/239] update changelog entry. --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index dba0c5069251..b59c1832e9ac 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,7 +12,7 @@ This project adheres to `Semantic Versioning`_ starting with version 1.0. Added ----- -- Added option ``add_cls_token`` to all tokenizers to add the token ``__CLS__`` to the end of the list of tokens. +- Added option ``use_cls_token`` to all tokenizers to add the token ``__CLS__`` to the end of the list of tokens. 
Changed ------- From 54b5f3a9188acfc10b24442c99b3553fef1945fa Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 15:10:49 +0200 Subject: [PATCH 031/239] make use_cls_token a class variable of tokenizer --- rasa/nlu/tokenizers/jieba_tokenizer.py | 4 ++-- rasa/nlu/tokenizers/mitie_tokenizer.py | 4 ++-- rasa/nlu/tokenizers/spacy_tokenizer.py | 4 ++-- rasa/nlu/tokenizers/tokenizer.py | 10 +++++----- rasa/nlu/tokenizers/whitespace_tokenizer.py | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 3a7291d686ca..bebe59a6f341 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -24,7 +24,7 @@ from rasa.nlu.model import Metadata -class JiebaTokenizer(Tokenizer, Component): +class JiebaTokenizer(Component, Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -118,7 +118,7 @@ def tokenize( tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] - self.add_cls_token(tokens, self.use_cls_token, attribute) + self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 4450b804a735..b4ee25ff7a5a 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -13,7 +13,7 @@ from rasa.utils.io import DEFAULT_ENCODING -class MitieTokenizer(Tokenizer, Component): +class MitieTokenizer(Component, Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -71,7 +71,7 @@ def tokenize( for token, offset in tokenized ] - self.add_cls_token(tokens, self.use_cls_token, attribute) + self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 432eff0b7c06..1784c1e633d3 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -17,7 +17,7 @@ from spacy.tokens.doc import Doc # pytype: disable=import-error -class SpacyTokenizer(Tokenizer, Component): +class SpacyTokenizer(Component, Tokenizer): provides = [ MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES @@ -64,5 +64,5 @@ def process(self, message: Message, **kwargs: Any) -> None: def tokenize(self, doc: "Doc") -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] - self.add_cls_token(tokens, self.use_cls_token) + self.add_cls_token(tokens) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 4d903822f6f1..71d914754fb4 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -22,15 +22,15 @@ def get(self, prop, default=None): class Tokenizer(object): + def __init__(self) -> None: + self.use_cls_token = False + def add_cls_token( - self, - tokens: List[Token], - use_cls_token: bool, - attribute: Text = MESSAGE_TEXT_ATTRIBUTE, + self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE ) -> List[Token]: if ( attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and use_cls_token + and self.use_cls_token ): # +1 to have a space between the last token and the __cls__ token idx = tokens[-1].offset + len(tokens[-1].text) + 1 diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 20a30efe0409..18333f41bd79 100644 --- 
a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -13,7 +13,7 @@ ) -class WhitespaceTokenizer(Tokenizer, Component): +class WhitespaceTokenizer(Component, Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -95,6 +95,6 @@ def tokenize( running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) - self.add_cls_token(tokens, self.use_cls_token, attribute) + self.add_cls_token(tokens, attribute) return tokens From c939387140ce4bfab55ce32fea48be5ccb93c71e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 15:30:46 +0200 Subject: [PATCH 032/239] tokenizer inherits from component --- rasa/nlu/tokenizers/jieba_tokenizer.py | 6 +----- rasa/nlu/tokenizers/mitie_tokenizer.py | 8 +------- rasa/nlu/tokenizers/spacy_tokenizer.py | 8 +------- rasa/nlu/tokenizers/tokenizer.py | 12 +++++++++--- rasa/nlu/tokenizers/whitespace_tokenizer.py | 5 +---- tests/nlu/base/test_evaluation.py | 2 +- tests/nlu/base/test_featurizers.py | 2 +- 7 files changed, 15 insertions(+), 28 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index bebe59a6f341..29ed5d999da2 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -24,7 +24,7 @@ from rasa.nlu.model import Metadata -class JiebaTokenizer(Component, Tokenizer): +class JiebaTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -36,8 +36,6 @@ class JiebaTokenizer(Component, Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -60,8 +58,6 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.dictionary_path is not None: self.load_custom_dictionary(self.dictionary_path) - self.use_cls_token = self.component_config["use_cls_token"] - @classmethod def required_packages(cls) -> List[Text]: return ["jieba"] diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index b4ee25ff7a5a..74f8577e2d3f 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -13,19 +13,13 @@ from rasa.utils.io import DEFAULT_ENCODING -class MitieTokenizer(Component, Tokenizer): +class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - defaults = { - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False - } - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(MitieTokenizer, self).__init__(component_config) - self.use_cls_token = self.component_config["use_cls_token"] @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 1784c1e633d3..3a982479c508 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -17,7 +17,7 @@ from spacy.tokens.doc import Doc # pytype: disable=import-error -class SpacyTokenizer(Component, Tokenizer): +class SpacyTokenizer(Tokenizer): provides = [ MESSAGE_TOKENS_NAMES[attribute] for attribute in
SPACY_FEATURIZABLE_ATTRIBUTES @@ -28,15 +28,9 @@ class SpacyTokenizer(Component, Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] - defaults = { - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False - } - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(SpacyTokenizer, self).__init__(component_config) - self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 71d914754fb4..41e04c844385 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -1,5 +1,6 @@ from typing import Text, List, Optional, Dict, Any +from rasa.nlu.components import Component from rasa.nlu.constants import ( MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, @@ -21,9 +22,14 @@ def get(self, prop, default=None): return self.data.get(prop, default) -class Tokenizer(object): - def __init__(self) -> None: - self.use_cls_token = False +class Tokenizer(Component): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + super(Tokenizer, self).__init__(component_config) + + if "use_cls_token" in self.component_config: + self.use_cls_token = self.component_config["use_cls_token"] + else: + self.use_cls_token = False def add_cls_token( self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 18333f41bd79..3641fb909689 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -13,7 +13,7 @@ ) -class WhitespaceTokenizer(Component, Tokenizer): +class WhitespaceTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -24,8 +24,6 @@ class WhitespaceTokenizer(Component, Tokenizer): "intent_split_symbol": "_", # Text will be tokenized with case sensitive as default "case_sensitive": True, - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -39,7 +37,6 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] self.case_sensitive = self.component_config["case_sensitive"] - self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py index e51567cc5c17..240090bbd535 100644 --- a/tests/nlu/base/test_evaluation.py +++ b/tests/nlu/base/test_evaluation.py @@ -39,7 +39,7 @@ from rasa.nlu.test import determine_intersection from rasa.nlu.test import determine_token_labels from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu import utils import json import os diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/base/test_featurizers.py index 0da0ae0f7b79..cd0c8ce3c13a 100644 --- a/tests/nlu/base/test_featurizers.py +++ b/tests/nlu/base/test_featurizers.py @@ -3,7 +3,7 @@ import pytest from rasa.nlu import training_data -from rasa.nlu.tokenizers import Token +from 
rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message From 944b716297bd400ecc57e94845402263326c4f3a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:00:01 +0200 Subject: [PATCH 033/239] remove not needed init methods --- rasa/nlu/tokenizers/mitie_tokenizer.py | 4 ---- rasa/nlu/tokenizers/spacy_tokenizer.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 74f8577e2d3f..5c19bb108bcd 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -17,10 +17,6 @@ class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: - """Construct a new tokenizer using the SpacyTokenizer framework.""" - super(MitieTokenizer, self).__init__(component_config) - @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 3a982479c508..ffbeff7c2efc 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -28,10 +28,6 @@ class SpacyTokenizer(Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: - """Construct a new tokenizer using the SpacyTokenizer framework.""" - super(SpacyTokenizer, self).__init__(component_config) - def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: From f1ed7d701618ed522c0aedf11ed44ae8474765d8 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:06:46 +0200 Subject: [PATCH 034/239] review comment --- rasa/nlu/tokenizers/spacy_tokenizer.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index ffbeff7c2efc..432f283af1ce 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -40,7 +40,8 @@ def train( if attribute_doc is not None: example.set( - MESSAGE_TOKENS_NAMES[attribute], self.tokenize(attribute_doc) + MESSAGE_TOKENS_NAMES[attribute], + self.tokenize(attribute_doc, attribute), ) def get_doc(self, message: Message, attribute: Text) -> "Doc": @@ -49,10 +50,12 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc": def process(self, message: Message, **kwargs: Any) -> None: message.set( MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], - self.tokenize(self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE)), + self.tokenize( + self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE), MESSAGE_TEXT_ATTRIBUTE + ), ) - def tokenize(self, doc: "Doc") -> List[Token]: + def tokenize(self, doc: "Doc", attribute: Text) -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] - self.add_cls_token(tokens) + self.add_cls_token(tokens, attribute) return tokens From 911202219597c341f8bceeddc7be1c149c13ae00 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:26:57 +0200 Subject: [PATCH 035/239] Add use_cls_token to default dict. 
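With the flag declared in each tokenizer's class-level defaults, a pipeline only needs to mention use_cls_token when it wants to deviate from the default; a rough sketch of the intended behaviour, assuming the usual merge of class defaults with the user-supplied component config:

    from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer

    WhitespaceTokenizer().use_cls_token                          # True, taken from the class defaults
    WhitespaceTokenizer({"use_cls_token": False}).use_cls_token  # False, the user override wins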
--- rasa/nlu/tokenizers/jieba_tokenizer.py | 2 ++ rasa/nlu/tokenizers/mitie_tokenizer.py | 5 +++++ rasa/nlu/tokenizers/spacy_tokenizer.py | 5 +++++ rasa/nlu/tokenizers/tokenizer.py | 2 +- rasa/nlu/tokenizers/whitespace_tokenizer.py | 2 ++ 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 29ed5d999da2..05d0afc259fb 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -36,6 +36,8 @@ class JiebaTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 5c19bb108bcd..68516bec258f 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -17,6 +17,11 @@ class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + defaults = { + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True + } + @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 432f283af1ce..9f061c2b29ec 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -28,6 +28,11 @@ class SpacyTokenizer(Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + defaults = { + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True + } + def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 41e04c844385..1b786590f010 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -29,7 +29,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: if "use_cls_token" in self.component_config: self.use_cls_token = self.component_config["use_cls_token"] else: - self.use_cls_token = False + self.use_cls_token = True diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 3641fb909689..9be597b49a9d 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -24,6 +24,8 @@ class WhitespaceTokenizer(Tokenizer): "intent_split_symbol": "_", # Text will be tokenized with case sensitive as default "case_sensitive": True, + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: From 31dd425018c2e0c6c034e89ebbece8f9c979c23c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:57:24 +0200 Subject: [PATCH 036/239] throw key error if use_cls_token is not set as default value.
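The contract this patch enforces, sketched with hypothetical subclasses (class names are made up for illustration; it is assumed, as the tokenizers above rely on, that Component fills component_config from the class-level defaults plus any user-supplied keys):

    from rasa.nlu.tokenizers.tokenizer import Tokenizer

    class BareTokenizer(Tokenizer):
        # hypothetical tokenizer that forgets to declare the option
        defaults = {}

    try:
        BareTokenizer()
    except KeyError as error:
        print(error)  # complains that no default value for 'use_cls_token' was set

    class SafeTokenizer(Tokenizer):
        # declaring the default restores the expected behaviour
        defaults = {"use_cls_token": True}

    assert SafeTokenizer().use_cls_token is True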
--- rasa/nlu/tokenizers/mitie_tokenizer.py | 3 +- rasa/nlu/tokenizers/spacy_tokenizer.py | 7 +- rasa/nlu/tokenizers/tokenizer.py | 13 +- rasa/nlu/tokenizers/whitespace_tokenizer.py | 1 - tests/nlu/tokenizers/__init__.py | 0 tests/nlu/tokenizers/test_jieba_tokenizer.py | 53 ++++++ tests/nlu/tokenizers/test_mitie_tokenizer.py | 41 +++++ tests/nlu/tokenizers/test_spacy_tokenizer.py | 65 +++++++ .../test_whitespace_tokenizer.py} | 168 ++---------------- 9 files changed, 186 insertions(+), 165 deletions(-) create mode 100644 tests/nlu/tokenizers/__init__.py create mode 100644 tests/nlu/tokenizers/test_jieba_tokenizer.py create mode 100644 tests/nlu/tokenizers/test_mitie_tokenizer.py create mode 100644 tests/nlu/tokenizers/test_spacy_tokenizer.py rename tests/nlu/{base/test_tokenizers.py => tokenizers/test_whitespace_tokenizer.py} (52%) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 68516bec258f..ff9dced7e413 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -1,6 +1,5 @@ -from typing import Any, List, Text, Optional, Dict +from typing import Any, List, Text -from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 9f061c2b29ec..0589af320787 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -1,7 +1,6 @@ import typing -from typing import Any, Dict, Text, List, Optional +from typing import Any, Text, List -from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData @@ -60,7 +59,9 @@ def process(self, message: Message, **kwargs: Any) -> None: ), ) - def tokenize(self, doc: "Doc", attribute: Text) -> List[Token]: + def tokenize( + self, doc: "Doc", attribute: Text = MESSAGE_TEXT_ATTRIBUTE + ) -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 1b786590f010..c1b41ad0bf33 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -1,3 +1,5 @@ +import logging + from typing import Text, List, Optional, Dict, Any from rasa.nlu.components import Component @@ -7,6 +9,8 @@ CLS_TOKEN, ) +logger = logging.getLogger(__name__) + class Token(object): def __init__(self, text, offset, data=None): @@ -26,10 +30,13 @@ class Tokenizer(Component): def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: super(Tokenizer, self).__init__(component_config) - if "use_cls_token" in self.component_config: + try: self.use_cls_token = self.component_config["use_cls_token"] - else: - self.use_cls_token = True + except KeyError: + raise KeyError( + "No default value for 'use_cls_token' was set. Please, " + "add it to the default dict of the tokenizer." 
+ ) def add_cls_token( self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 9be597b49a9d..c129e97c8fd9 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -1,7 +1,6 @@ import re from typing import Any, Dict, List, Text -from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData diff --git a/tests/nlu/tokenizers/__init__.py b/tests/nlu/tokenizers/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/tokenizers/test_jieba_tokenizer.py b/tests/nlu/tokenizers/test_jieba_tokenizer.py new file mode 100644 index 000000000000..7df57c5bfcd1 --- /dev/null +++ b/tests/nlu/tokenizers/test_jieba_tokenizer.py @@ -0,0 +1,53 @@ +from unittest.mock import patch + +from rasa.nlu.constants import CLS_TOKEN + + +def test_jieba(): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + component_config = {"use_cls_token": False} + + tk = JiebaTokenizer(component_config) + + assert [t.text for t in tk.tokenize("我想去吃兰州拉面")] == ["我", "想", "去", "吃", "兰州", "拉面"] + + assert [t.offset for t in tk.tokenize("我想去吃兰州拉面")] == [0, 1, 2, 3, 4, 6] + + assert [t.text for t in tk.tokenize("Micheal你好吗?")] == ["Micheal", "你好", "吗", "?"] + + assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10] + + +def test_jieba_load_dictionary(tmpdir_factory): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath + + component_config = {"dictionary_path": dictionary_path, "use_cls_token": False} + + with patch.object( + JiebaTokenizer, "load_custom_dictionary", return_value=None + ) as mock_method: + tk = JiebaTokenizer(component_config) + tk.tokenize("") + + mock_method.assert_called_once_with(dictionary_path) + + +def test_jieba_add_cls_token(): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + component_config = {"use_cls_token": True} + + tk = JiebaTokenizer(component_config) + + assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [ + "Micheal", + "你好", + "吗", + "?", + CLS_TOKEN, + ] + + assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12] diff --git a/tests/nlu/tokenizers/test_mitie_tokenizer.py b/tests/nlu/tokenizers/test_mitie_tokenizer.py new file mode 100644 index 000000000000..41774fb9a440 --- /dev/null +++ b/tests/nlu/tokenizers/test_mitie_tokenizer.py @@ -0,0 +1,41 @@ +from rasa.nlu.constants import CLS_TOKEN + + +def test_mitie(): + from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer + + component_config = {"use_cls_token": False} + + tk = MitieTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(text)] == ["Forecast", "for", "lunch"] + assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13] + + text = "hey ńöñàśçií how're you?" 
+ assert [t.text for t in tk.tokenize(text)] == [ + "hey", + "ńöñàśçií", + "how", + "'re", + "you", + "?", + ] + assert [t.offset for t in tk.tokenize(text)] == [0, 4, 13, 16, 20, 23] + + +def test_mitie_add_cls_token(): + from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer + + component_config = {"use_cls_token": True} + + tk = MitieTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(text)] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13, 19] diff --git a/tests/nlu/tokenizers/test_spacy_tokenizer.py b/tests/nlu/tokenizers/test_spacy_tokenizer.py new file mode 100644 index 000000000000..9748f4fd8fcc --- /dev/null +++ b/tests/nlu/tokenizers/test_spacy_tokenizer.py @@ -0,0 +1,65 @@ +from rasa.nlu.constants import CLS_TOKEN +from rasa.nlu import training_data + + +def test_spacy(spacy_nlp): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"use_cls_token": False} + + tk = SpacyTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "Forecast", + "for", + "lunch", + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13] + + text = "hey ńöñàśçií how're you?" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "hey", + "ńöñàśçií", + "how", + "'re", + "you", + "?", + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23] + + +def test_spacy_add_cls_token(spacy_nlp): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"use_cls_token": True} + + tk = SpacyTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19] + + +def test_spacy_intent_tokenizer(spacy_nlp_component): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"use_cls_token": False} + + td = training_data.load_data("data/examples/rasa/demo-rasa.json") + spacy_nlp_component.train(td, config=None) + spacy_tokenizer = SpacyTokenizer(component_config) + spacy_tokenizer.train(td, config=None) + + intent_tokens_exist = [ + True if example.get("intent_tokens") is not None else False + for example in td.intent_examples + ] + + # no intent tokens should have been set + assert not any(intent_tokens_exist) diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py similarity index 52% rename from tests/nlu/base/test_tokenizers.py rename to tests/nlu/tokenizers/test_whitespace_tokenizer.py index 267f24b81e6f..27c2c6b171f6 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -1,17 +1,14 @@ -# -*- coding: utf-8 -*- - -from unittest.mock import patch - from rasa.nlu.constants import CLS_TOKEN from rasa.nlu.training_data import TrainingData, Message from tests.nlu import utilities -from rasa.nlu import training_data def test_whitespace(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - tk = WhitespaceTokenizer() + component_config = {"use_cls_token": False} + + tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for lunch")] == [ "Forecast", @@ -98,7 +95,11 @@ def test_whitespace_cls_token(): def test_whitespace_custom_intent_symbol(): from 
rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"} + component_config = { + "intent_tokenization_flag": True, + "intent_split_symbol": "+", + "use_cls_token": False, + } tk = WhitespaceTokenizer(component_config) @@ -116,7 +117,7 @@ def test_whitespace_custom_intent_symbol(): def test_whitespace_with_case(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"case_sensitive": False} + component_config = {"case_sensitive": False, "use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ "forecast", @@ -124,7 +125,7 @@ def test_whitespace_with_case(): "lunch", ] - component_config = {"case_sensitive": True} + component_config = {"case_sensitive": True, "use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ "Forecast", @@ -132,7 +133,7 @@ def test_whitespace_with_case(): "LUNCH", ] - component_config = {} + component_config = {"use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ "Forecast", @@ -140,7 +141,7 @@ def test_whitespace_with_case(): "LUNCH", ] - component_config = {"case_sensitive": False} + component_config = {"case_sensitive": False, "use_cls_token": False} tk = WhitespaceTokenizer(component_config) message = Message("Forecast for LUNCH") tk.process(message) @@ -181,148 +182,3 @@ def test_whitespace_with_case(): assert examples[1].data.get("tokens")[0].text == "i" assert examples[1].data.get("tokens")[1].text == "want" assert examples[1].data.get("tokens")[2].text == "tacos" - - -def test_spacy(spacy_nlp): - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - - tk = SpacyTokenizer() - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ - "Forecast", - "for", - "lunch", - ] - assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13] - - text = "hey ńöñàśçií how're you?" 
- assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ - "hey", - "ńöñàśçií", - "how", - "'re", - "you", - "?", - ] - assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23] - - -def test_spacy_add_cls_token(spacy_nlp): - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - - component_config = {"use_cls_token": True} - - tk = SpacyTokenizer(component_config) - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ - "Forecast", - "for", - "lunch", - CLS_TOKEN, - ] - assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19] - - -def test_spacy_intent_tokenizer(spacy_nlp_component): - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - - td = training_data.load_data("data/examples/rasa/demo-rasa.json") - spacy_nlp_component.train(td, config=None) - spacy_tokenizer = SpacyTokenizer() - spacy_tokenizer.train(td, config=None) - - intent_tokens_exist = [ - True if example.get("intent_tokens") is not None else False - for example in td.intent_examples - ] - - # no intent tokens should have been set - assert not any(intent_tokens_exist) - - -def test_mitie(): - from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer - - tk = MitieTokenizer() - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(text)] == ["Forecast", "for", "lunch"] - assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13] - - text = "hey ńöñàśçií how're you?" - assert [t.text for t in tk.tokenize(text)] == [ - "hey", - "ńöñàśçií", - "how", - "'re", - "you", - "?", - ] - assert [t.offset for t in tk.tokenize(text)] == [0, 4, 13, 16, 20, 23] - - -def test_mitie_add_cls_token(): - from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer - - component_config = {"use_cls_token": True} - - tk = MitieTokenizer(component_config) - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(text)] == [ - "Forecast", - "for", - "lunch", - CLS_TOKEN, - ] - assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13, 19] - - -def test_jieba(): - from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - - tk = JiebaTokenizer() - - assert [t.text for t in tk.tokenize("我想去吃兰州拉面")] == ["我", "想", "去", "吃", "兰州", "拉面"] - - assert [t.offset for t in tk.tokenize("我想去吃兰州拉面")] == [0, 1, 2, 3, 4, 6] - - assert [t.text for t in tk.tokenize("Micheal你好吗?")] == ["Micheal", "你好", "吗", "?"] - - assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10] - - -def test_jieba_load_dictionary(tmpdir_factory): - from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - - dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath - - component_config = {"dictionary_path": dictionary_path} - - with patch.object( - JiebaTokenizer, "load_custom_dictionary", return_value=None - ) as mock_method: - tk = JiebaTokenizer(component_config) - tk.tokenize("") - - mock_method.assert_called_once_with(dictionary_path) - - -def test_jieba_add_cls_token(): - from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - - component_config = {"use_cls_token": True} - - tk = JiebaTokenizer(component_config) - - assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [ - "Micheal", - "你好", - "吗", - "?", - CLS_TOKEN, - ] - - assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12] From e652d84bac43dea6912ad59e2a12477080ab447f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Sun, 20 Oct 2019 14:54:37 +0200 Subject: [PATCH 037/239] Disable cls token use in default pipeline. 
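Since the registered templates are plain lists of component-config dicts, a project that wants the marker token back on top of one of the default pipelines only has to flip that single key; a rough sketch, with the template name and entry position taken from the registry hunk below:

    from copy import deepcopy

    from rasa.nlu.registry import registered_pipeline_templates

    pipeline = deepcopy(registered_pipeline_templates["supervised_embeddings"])
    # the first entry is the WhitespaceTokenizer config; re-enable the __CLS__ token there
    pipeline[0]["use_cls_token"] = True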
--- examples/formbot/config.yml | 1 + examples/restaurantbot/config.yml | 1 + rasa/nlu/registry.py | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/formbot/config.yml b/examples/formbot/config.yml index 3aa0e7577759..3cf3f4a14fe5 100644 --- a/examples/formbot/config.yml +++ b/examples/formbot/config.yml @@ -2,6 +2,7 @@ language: en pipeline: - name: WhitespaceTokenizer + use_cls_token: False - name: CRFEntityExtractor - name: EntitySynonymMapper - name: CountVectorsFeaturizer diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml index 58e9f0be7209..fcb2086a50e1 100644 --- a/examples/restaurantbot/config.yml +++ b/examples/restaurantbot/config.yml @@ -3,6 +3,7 @@ language: en pipeline: - name: "SpacyNLP" - name: "SpacyTokenizer" + use_cls_token: False - name: "SpacyFeaturizer" - name: "SklearnIntentClassifier" - name: "CRFEntityExtractor" diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 0d79360edd3e..2ec7aad0a0e1 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -105,7 +105,7 @@ registered_pipeline_templates = { "pretrained_embeddings_spacy": [ {"name": "SpacyNLP"}, - {"name": "SpacyTokenizer"}, + {"name": "SpacyTokenizer", "use_cls_token": False}, {"name": "SpacyFeaturizer"}, {"name": "RegexFeaturizer"}, {"name": "CRFEntityExtractor"}, @@ -114,7 +114,7 @@ ], "keyword": [{"name": "KeywordIntentClassifier"}], "supervised_embeddings": [ - {"name": "WhitespaceTokenizer"}, + {"name": "WhitespaceTokenizer", "use_cls_token": False}, {"name": "RegexFeaturizer"}, {"name": "CRFEntityExtractor"}, {"name": "EntitySynonymMapper"}, From 1d775545391dc2b21a23105aef50de49c4d8dda5 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Sun, 20 Oct 2019 14:57:11 +0200 Subject: [PATCH 038/239] correct type --- rasa/nlu/tokenizers/mitie_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index ff9dced7e413..e17d49c3bab7 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -46,7 +46,7 @@ def process(self, message: Message, **kwargs: Any) -> None: ) def _token_from_offset( - self, text: Text, offset: int, encoded_sentence: bytes + self, text: bytes, offset: int, encoded_sentence: bytes ) -> Token: return Token( text.decode(DEFAULT_ENCODING), From 3d8a2e482d3318492a2b7ca72cb5b3e055a0426c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 09:39:50 +0200 Subject: [PATCH 039/239] fix tests --- tests/nlu/base/test_config.py | 2 +- tests/nlu/base/test_featurizers.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/nlu/base/test_config.py b/tests/nlu/base/test_config.py index be729075adb3..f6453e49404e 100644 --- a/tests/nlu/base/test_config.py +++ b/tests/nlu/base/test_config.py @@ -65,7 +65,7 @@ def test_set_attr_on_component(): cfg = config.load("sample_configs/config_pretrained_embeddings_spacy.yml") cfg.set_component_attr(6, C=324) - assert cfg.for_component(1) == {"name": "SpacyTokenizer"} + assert cfg.for_component(1) == {"name": "SpacyTokenizer", "use_cls_token": False} assert cfg.for_component(6) == {"name": "SklearnIntentClassifier", "C": 324} diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/base/test_featurizers.py index cd0c8ce3c13a..d67c905deab6 100644 --- a/tests/nlu/base/test_featurizers.py +++ b/tests/nlu/base/test_featurizers.py @@ -117,7 +117,8 @@ def test_mitie_featurizer(mitie_feature_extractor, 
default_config): mitie_component_config = {"name": "MitieFeaturizer"} ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) sentence = "Hey how are you today" - tokens = MitieTokenizer().tokenize(sentence) + mitie_component_config = {"name": "MitieTokenizer", "use_cls_token": False} + tokens = MitieTokenizer(mitie_component_config).tokenize(sentence) vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor) expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) assert np.allclose(vecs[:5], expected, atol=1e-5) @@ -212,7 +213,8 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): ftr = RegexFeaturizer(lookup_tables=lookups) # adds tokens to the message - tokenizer = SpacyTokenizer() + component_config = {"name": "SpacyTokenizer", "use_cls_token": False} + tokenizer = SpacyTokenizer(component_config) message = Message(sentence) message.set("spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) From 50f68b2857b9f15fe02112d065262fcbd368a594 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 11:34:47 +0200 Subject: [PATCH 040/239] spacy featurizer returns sequence --- CHANGELOG.rst | 2 + .../dense_featurizer/spacy_featurizer.py | 2 +- .../nlu/featurizers/test_spacy_featurizer.py | 44 +++++-------------- 3 files changed, 15 insertions(+), 33 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index c0a3d221eea0..b59c1832e9ac 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -44,6 +44,8 @@ Added (``rasa.core.agent.handle_channels()``). The number of workers can be set using the environment variable ``SANIC_WORKERS`` (default: 1). A value of >1 is allowed only in combination with ``RedisLockStore`` as the lock store. +- Botframework channel can handle uploaded files in ``UserMessage`` metadata. 
+- Added data validator that checks there is no duplicated example data across multiples intents Changed ------- diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 3ac51234b484..14f488463e9d 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -25,7 +25,7 @@ def ndim(spacy_nlp: "Language") -> int: def features_for_doc(doc: "Doc") -> np.ndarray: """Feature vector for a single document / sentence.""" - return doc.vector + return np.array([t.vector for t in doc]) class SpacyFeaturizer(Featurizer): diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index 19fbe0034c1e..df9e08bf0b5d 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -7,22 +7,14 @@ from rasa.nlu.config import RasaNLUModelConfig -@pytest.mark.parametrize( - "sentence, expected", - [ - ( - "hey how are you today", - [-0.19649599, 0.32493639, -0.37408298, -0.10622784, 0.062756], - ) - ], -) -def test_spacy_featurizer(sentence, expected, spacy_nlp): +@pytest.mark.parametrize("sentence", ["hey how are you today"]) +def test_spacy_featurizer(sentence, spacy_nlp): from nlu.featurizers.dense_featurizer import spacy_featurizer doc = spacy_nlp(sentence) vecs = spacy_featurizer.features_for_doc(doc) - assert np.allclose(doc.vector[:5], expected, atol=1e-5) - assert np.allclose(vecs, doc.vector, atol=1e-5) + expected = [t.vector for t in doc] + assert np.allclose(vecs, expected, atol=1e-5) def test_spacy_training_sample_alignment(spacy_nlp_component): @@ -78,33 +70,21 @@ def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): doc = spacy_nlp(sentence) token_vectors = [t.vector for t in doc] - spacy_config = {"ner_feature_vectors": True} + + spacy_config = {} ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) + greet = {"intent": "greet", "text_features": [0.5]} + message = Message(sentence, greet) message.set("spacy_doc", doc) - ftr._set_spacy_features(message) - ftr._set_spacy_ner_features(message) - vecs = message.get("ner_features")[0][:5] - assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) - assert np.allclose(vecs, expected, atol=1e-4) + ftr._set_spacy_features(message) -def test_spacy_ner_featurizer_config(spacy_nlp): - from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + vecs = message.get("text_dense_features")[0][:5] - sentence = "hi there friend" - doc = spacy_nlp(sentence) - spacy_config = {"ner_feature_vectors": False} - ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) - greet = {"intent": "greet", "text_features": [0.5]} - message = Message(sentence, greet) - message.set("spacy_doc", doc) - ftr._set_spacy_features(message) - ftr._set_spacy_ner_features(message) - vecs = np.array(message.get("ner_features")) - assert vecs.shape[0] == len(doc) - assert vecs.shape[1] == 0 + assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) + assert np.allclose(vecs, expected, atol=1e-4) def test_spacy_featurizer_casing(spacy_nlp): From 603d0655ad3b996125f4cbda55bdcd0528aec985 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 11:57:11 +0200 Subject: [PATCH 041/239] fix tests for count vectors featurizer --- .../count_vectors_featurizer.py | 1 + .../test_count_vectors_featurizer.py | 99 ++++++++++++------- 2 files changed, 62 insertions(+), 38 deletions(-) diff 
--git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 409a1a89d707..619db36500ef 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -19,6 +19,7 @@ MESSAGE_SPACY_FEATURES_NAMES, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, + MESSAGE_INTENT_ATTRIBUTE, ) diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index b5f5e20b2ab6..e94bb6c4a081 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -1,5 +1,6 @@ import numpy as np import pytest +import scipy.sparse from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message @@ -9,10 +10,10 @@ @pytest.mark.parametrize( "sentence, expected", [ - ("hello hello hello hello hello ", [5]), - ("hello goodbye hello", [1, 2]), - ("a b c d e f", [1, 1, 1, 1, 1, 1]), - ("a 1 2", [2, 1]), + ("hello hello hello hello hello ", [[1]]), + ("hello goodbye hello", [[0, 1]]), + ("a b c d e f", [[1, 0, 0, 0, 0, 0]]), + ("a 1 2", [[0, 1]]), ], ) def test_count_vector_featurizer(sentence, expected): @@ -30,15 +31,19 @@ def test_count_vector_featurizer(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all(test_message.get("text_features") == expected) + assert isinstance(test_message.get("text_sparse_features"), scipy.sparse.csr_matrix) + + actual = test_message.get("text_sparse_features")[0].toarray() + + assert np.all(actual == expected) @pytest.mark.parametrize( "sentence, intent, response, intent_features, response_features", [ - ("hello hello hello hello hello ", "greet", None, [1], None), - ("hello goodbye hello", "greet", None, [1], None), - ("a 1 2", "char", "char char", [1], [2]), + ("hello hello hello hello hello ", "greet", None, [[1]], None), + ("hello goodbye hello", "greet", None, [[1]], None), + ("a 1 2", "char", "char char", [[1]], [[1]]), ], ) def test_count_vector_featurizer_attribute_featurization( @@ -58,21 +63,33 @@ def test_count_vector_featurizer_attribute_featurization( data = TrainingData([train_message]) ftr.train(data) - assert train_message.get("intent_features") == intent_features - assert train_message.get("response_features") == response_features + if intent_features: + assert ( + train_message.get("intent_sparse_features")[0].toarray() == intent_features + ) + else: + assert train_message.get("intent_sparse_features") == None + + if response_features: + assert ( + train_message.get("response_sparse_features")[0].toarray() + == response_features + ) + else: + assert train_message.get("response_sparse_features") == None @pytest.mark.parametrize( "sentence, intent, response, text_features, intent_features, response_features", [ - ("hello hello greet ", "greet", "hello", [1, 2], [1, 0], [0, 1]), + ("hello hello greet ", "greet", "hello", [[0, 1]], [[1, 0]], [[0, 1]]), ( "I am fine", "acknowledge", "good", - [0, 1, 1, 0, 1], - [1, 0, 0, 0, 0], - [0, 0, 0, 1, 0], + [[0, 0, 0, 0, 1]], + [[1, 0, 0, 0, 0]], + [[0, 0, 0, 1, 0]], ), ], ) @@ -95,18 +112,24 @@ def test_count_vector_featurizer_shared_vocab( data = TrainingData([train_message]) ftr.train(data) - assert np.all(train_message.get("text_features") == text_features) - assert np.all(train_message.get("intent_features") == intent_features) - assert 
np.all(train_message.get("response_features") == response_features) + assert np.all( + train_message.get("text_sparse_features")[0].toarray() == text_features + ) + assert np.all( + train_message.get("intent_sparse_features")[0].toarray() == intent_features + ) + assert np.all( + train_message.get("response_sparse_features")[0].toarray() == response_features + ) @pytest.mark.parametrize( "sentence, expected", [ - ("hello hello hello hello hello __OOV__", [1, 5]), - ("hello goodbye hello __oov__", [1, 1, 2]), - ("a b c d e f __oov__ __OOV__ __OOV__", [3, 1, 1, 1, 1, 1, 1]), - ("__OOV__ a 1 2 __oov__ __OOV__", [2, 3, 1]), + ("hello hello hello hello hello __OOV__", [[0, 1]]), + ("hello goodbye hello __oov__", [[0, 0, 1]]), + ("a b c d e f __oov__ __OOV__ __OOV__", [[0, 1, 0, 0, 0, 0, 0]]), + ("__OOV__ a 1 2 __oov__ __OOV__", [[0, 1, 0]]), ], ) def test_count_vector_featurizer_oov_token(sentence, expected): @@ -126,16 +149,16 @@ def test_count_vector_featurizer_oov_token(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all(test_message.get("text_features") == expected) + assert np.all(test_message.get("text_sparse_features")[0].toarray() == expected) @pytest.mark.parametrize( "sentence, expected", [ - ("hello hello hello hello hello oov_word0", [1, 5]), - ("hello goodbye hello oov_word0 OOV_word0", [2, 1, 2]), - ("a b c d e f __oov__ OOV_word0 oov_word1", [3, 1, 1, 1, 1, 1, 1]), - ("__OOV__ a 1 2 __oov__ OOV_word1", [2, 3, 1]), + ("hello hello hello hello hello oov_word0", [[0, 1]]), + ("hello goodbye hello oov_word0 OOV_word0", [[0, 0, 1]]), + ("a b c d e f __oov__ OOV_word0 oov_word1", [[0, 1, 0, 0, 0, 0, 0]]), + ("__OOV__ a 1 2 __oov__ OOV_word1", [[0, 1, 0]]), ], ) def test_count_vector_featurizer_oov_words(sentence, expected): @@ -159,19 +182,19 @@ def test_count_vector_featurizer_oov_words(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all(test_message.get("text_features") == expected) + assert np.all(test_message.get("text_sparse_features")[0].toarray() == expected) @pytest.mark.parametrize( "tokens, expected", [ - (["hello", "hello", "hello", "hello", "hello"], [5]), - (["你好", "你好", "你好", "你好", "你好"], [5]), # test for unicode chars - (["hello", "goodbye", "hello"], [1, 2]), + (["hello", "hello", "hello", "hello", "hello"], [[1]]), + (["你好", "你好", "你好", "你好", "你好"], [[1]]), # test for unicode chars + (["hello", "goodbye", "hello"], [[0, 1]]), # Note: order has changed in Chinese version of "hello" & "goodbye" - (["你好", "再见", "你好"], [2, 1]), # test for unicode chars - (["a", "b", "c", "d", "e", "f"], [1, 1, 1, 1, 1, 1]), - (["a", "1", "2"], [2, 1]), + (["你好", "再见", "你好"], [[1, 0]]), # test for unicode chars + (["a", "b", "c", "d", "e", "f"], [[1, 0, 0, 0, 0, 0]]), + (["a", "1", "2"], [[0, 1]]), ], ) def test_count_vector_featurizer_using_tokens(tokens, expected): @@ -200,15 +223,15 @@ def test_count_vector_featurizer_using_tokens(tokens, expected): ftr.process(test_message) - assert np.all(test_message.get("text_features") == expected) + assert np.all(test_message.get("text_sparse_features")[0].toarray() == expected) @pytest.mark.parametrize( "sentence, expected", [ - ("ababab", [3, 3, 3, 2]), - ("ab ab ab", [2, 2, 3, 3, 3, 2]), - ("abc", [1, 1, 1, 1, 1]), + ("ababab", [[3, 3, 3, 2]]), + ("ab ab ab", [[0, 0, 1, 1, 1, 0]]), + ("abc", [[1, 1, 1, 1, 1]]), ], ) def test_count_vector_featurizer_char(sentence, expected): @@ -226,7 +249,7 @@ def test_count_vector_featurizer_char(sentence, expected): 
test_message = Message(sentence) ftr.process(test_message) - assert np.all(test_message.get("text_features") == expected) + assert np.all(test_message.get("text_sparse_features")[0].toarray() == expected) def test_count_vector_featurizer_persist_load(tmpdir): From d1a19dcc1eff9614e07f4100dea41d22d98a9562 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 12:39:53 +0200 Subject: [PATCH 042/239] mitie featurizer returns sequence --- .../dense_featurizer/mitie_featurizer.py | 9 +- .../dense_featurizer/spacy_featurizer.py | 16 +- .../nlu/featurizers/test_mitie_featurizer.py | 285 +++++++++++++++++- .../nlu/featurizers/test_spacy_featurizer.py | 14 +- 4 files changed, 300 insertions(+), 24 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 9f94be854db5..5efb9cfe4a84 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -99,10 +99,7 @@ def features_for_tokens( feature_extractor: "mitie.total_word_feature_extractor", ) -> np.ndarray: - vec = np.zeros(self.ndim(feature_extractor)) + vec = [] for token in tokens: - vec += feature_extractor.get_feature_vector(token.text) - if tokens: - return vec / len(tokens) - else: - return vec + vec.append(feature_extractor.get_feature_vector(token.text)) + return np.array(vec) diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 14f488463e9d..bb35a66c521c 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -18,16 +18,6 @@ ) -def ndim(spacy_nlp: "Language") -> int: - """Number of features used to represent a document / sentence.""" - return spacy_nlp.vocab.vectors_length - - -def features_for_doc(doc: "Doc") -> np.ndarray: - """Feature vector for a single document / sentence.""" - return np.array([t.vector for t in doc]) - - class SpacyFeaturizer(Featurizer): provides = [ @@ -40,6 +30,10 @@ class SpacyFeaturizer(Featurizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + def _features_for_doc(self, doc: "Doc") -> np.ndarray: + """Feature vector for a single document / sentence.""" + return np.array([t.vector for t in doc]) + def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: @@ -61,7 +55,7 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): message_attribute_doc = self.get_doc(message, attribute) if message_attribute_doc is not None: - fs = features_for_doc(message_attribute_doc) + fs = self._features_for_doc(message_attribute_doc) features = self._combine_with_existing_dense_features( message, fs, MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] ) diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index c408d08680a2..dd312dc4c849 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -9,9 +9,290 @@ def test_mitie_featurizer(mitie_feature_extractor, default_config): mitie_component_config = {"name": "MitieFeaturizer"} ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) + sentence = "Hey how are you today" + mitie_component_config = {"name": "MitieTokenizer", "use_cls_token": False} tokens = MitieTokenizer(mitie_component_config).tokenize(sentence) + vecs = 
ftr.features_for_tokens(tokens, mitie_feature_extractor) - expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - assert np.allclose(vecs[:5], expected, atol=1e-5) + + print(vecs[0]) + + expected = np.array( + [ + 0.00000000e00, + -5.12735510e00, + 4.39929873e-01, + -5.60760403e00, + -8.26445103e00, + -2.82586724e-01, + -1.00573087e00, + 6.16612673e00, + 4.51831198e00, + -4.36631250e00, + -4.30110741e00, + 1.45059049e00, + -1.43656611e00, + -2.57342124e00, + 4.97706032e00, + 8.10247707e00, + -3.96101475e-01, + 7.00332284e-01, + 5.96706450e-01, + -2.48464966e00, + 4.15551476e-02, + -1.87830377e00, + 2.42095375e00, + -5.44092357e-01, + 2.67223120e00, + -2.00578022e00, + 1.22963965e00, + 2.41412973e00, + -1.68471694e00, + 1.87547529e00, + -6.68652773e-01, + 3.51532221e-01, + -2.20869064e-01, + -1.27275801e00, + -4.55570340e-01, + 1.30415881e00, + -7.04453290e-01, + 2.64976263e00, + 1.26680803e00, + 2.04671100e-01, + 2.10326767e00, + -9.23435837e-02, + -1.68740237e00, + -1.97914347e-01, + -1.41813803e00, + -1.66212067e-01, + -3.05680543e-01, + -9.69038725e-01, + 9.14714634e-01, + 7.34257996e-01, + -2.09392056e-01, + 1.55312046e-01, + 8.92380118e-01, + -9.96749043e-01, + 4.89007890e-01, + 1.47573900e00, + -1.83904159e00, + -3.72725785e-01, + 2.75361121e-01, + -5.25904536e-01, + 1.30387291e-01, + 7.00960279e-01, + 6.50017858e-01, + -4.50371534e-01, + -6.38834953e-01, + 6.73773468e-01, + 8.81396413e-01, + -6.75141394e-01, + 4.86862332e-01, + 5.01379609e-01, + 4.88356948e-01, + -4.53982174e-01, + -7.30833590e-01, + 8.64133000e-01, + -1.71588242e-01, + 6.58954322e-01, + 2.18044650e-02, + 7.73006380e-01, + 4.51285660e-01, + -8.16508114e-01, + 1.95529416e-01, + 4.02571887e-01, + 3.07720363e-01, + -1.18403830e-01, + -1.73695073e-01, + 1.27457175e-02, + 4.45014611e-02, + -2.24128217e-01, + -1.06355086e-01, + 6.02598965e-01, + -3.68196577e-01, + -5.17653847e00, + 5.86267173e-01, + -2.78645813e-01, + -2.67106779e-02, + 2.77676725e00, + 2.26144981e00, + -1.74828792e00, + 4.56249189e00, + 1.75182652e00, + -4.38783467e-01, + -1.09945142e00, + -2.11774683e00, + -2.76408410e00, + -1.55349958e00, + -3.79859638e00, + 3.96455169e00, + -2.52921015e-01, + -1.92639256e00, + -4.89389300e-01, + -4.02990580e00, + -8.72295380e00, + -1.46801007e00, + -2.31932306e00, + 1.67305171e00, + -3.19912529e00, + 1.86834311e00, + 2.06363127e-01, + 4.57791597e-01, + -2.40873742e00, + 1.95506680e00, + -3.92530274e00, + -2.98407483e00, + -1.78072822e00, + 1.29415095e00, + 1.00851856e-01, + -1.08310506e-01, + 1.16931573e-01, + 1.38969958e00, + -7.87991047e-01, + -1.70851195e00, + 3.38014960e00, + -2.66119748e-01, + 2.83784223e00, + 3.44787151e-01, + 1.87817657e00, + 7.69976914e-01, + 5.02131760e-01, + 1.00641572e00, + 2.05512595e00, + 8.01849067e-01, + -8.64741862e-01, + -2.41731501e00, + -6.90070271e-01, + 8.99859846e-01, + 2.59272814e-01, + 2.12083149e00, + 1.71254003e00, + 2.31126094e00, + 1.05681944e00, + -8.90498281e-01, + 5.30907393e-01, + 2.41127789e-01, + -3.24536473e-01, + -5.03312349e-01, + -3.45470524e00, + 7.23897219e-01, + 3.49540949e00, + -1.54396147e-01, + 1.96257377e00, + -8.16661939e-02, + -1.42608774e00, + -6.39269233e-01, + 7.82996774e-01, + 2.48106170e00, + 9.45179760e-01, + -8.31814110e-01, + -7.13138878e-01, + -1.56903923e00, + 1.44644022e00, + -1.24463284e00, + -5.90117991e-01, + -1.30865097e00, + 1.70658243e00, + 3.14512819e-01, + 5.01549184e-01, + -3.24578106e-01, + 2.81532764e-01, + 6.94498479e-01, + 1.65341794e00, + -1.78533092e-01, + -1.36791408e00, + -3.05325389e-01, + 
1.57340133e00, + -8.41358781e-01, + 5.52713513e-01, + -7.22983599e-01, + 4.10806626e-01, + -5.17388061e-02, + 1.05758071e00, + 2.37797365e-01, + -8.51634622e-01, + -1.79594696e-01, + -4.38443124e-01, + 9.10361111e-02, + 1.02692962e-01, + 6.27609611e-01, + 5.56623459e-01, + 5.40035427e-01, + 1.44254386e00, + 8.21452856e-01, + 7.96434343e-01, + -6.25197291e-01, + 3.09273601e-03, + -6.24552667e-02, + -1.03001225e00, + 3.47646058e-01, + -3.60108972e-01, + 7.73691535e-01, + -8.22658122e-01, + -6.71157479e-01, + 9.70521867e-01, + 2.73865640e-01, + 1.69602585e00, + 5.74894428e-01, + 5.25952458e-01, + -2.73797333e-01, + 2.24092394e-01, + -3.01282465e-01, + -8.98015559e-01, + -3.54814857e-01, + -5.35844207e-01, + -2.62837589e-01, + 7.90212154e-01, + 1.64234906e-01, + 1.01651788e00, + 1.22546232e00, + -3.33948851e-01, + -4.89927202e-01, + -1.12350926e-01, + -1.36075035e-01, + -9.49754834e-01, + -5.68806455e-02, + 2.42536068e-01, + -2.81865031e-01, + 5.53327501e-02, + 7.90774226e-01, + 2.38684490e-02, + -9.46886778e-01, + 5.67425728e-01, + 1.11705333e-01, + -8.51398855e-02, + -3.33825350e-01, + 2.33040452e-01, + -4.90594149e-01, + 6.75024092e-03, + 3.73918623e-01, + -3.34260643e-01, + -7.60734856e-01, + -5.63092679e-02, + 4.10971254e-01, + -2.63164580e-01, + 8.54819715e-02, + -4.20097411e-02, + -9.82390791e-02, + 3.80521566e-01, + 2.33330190e-01, + 4.47722435e-01, + -3.42616737e-02, + -5.51659703e-01, + 5.68716228e-01, + -2.82406271e-01, + 6.78738177e-01, + -8.50788295e-01, + 1.21547759e-01, + -3.42155367e-01, + -5.10491610e-01, + -6.96370900e-01, + 2.27460936e-02, + -3.87611985e-01, + -1.09960282e00, + ] + ) + + assert np.allclose(vecs[0], expected, atol=1e-5) diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index df9e08bf0b5d..8fe68f1ee28b 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -9,10 +9,12 @@ @pytest.mark.parametrize("sentence", ["hey how are you today"]) def test_spacy_featurizer(sentence, spacy_nlp): - from nlu.featurizers.dense_featurizer import spacy_featurizer + from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + + ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig()) doc = spacy_nlp(sentence) - vecs = spacy_featurizer.features_for_doc(doc) + vecs = ftr._features_for_doc(doc) expected = [t.vector for t in doc] assert np.allclose(vecs, expected, atol=1e-5) @@ -88,20 +90,22 @@ def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): def test_spacy_featurizer_casing(spacy_nlp): - from nlu.featurizers.dense_featurizer import spacy_featurizer + from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer # if this starts failing for the default model, we should think about # removing the lower casing the spacy nlp component does when it # retrieves vectors. For compressed spacy models (e.g. models # ending in _sm) this test will most likely fail. 
+ ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig()) + td = training_data.load_data("data/examples/rasa/demo-rasa.json") for e in td.intent_examples: doc = spacy_nlp(e.text) doc_capitalized = spacy_nlp(e.text.capitalize()) - vecs = spacy_featurizer.features_for_doc(doc) - vecs_capitalized = spacy_featurizer.features_for_doc(doc_capitalized) + vecs = ftr._features_for_doc(doc) + vecs_capitalized = ftr._features_for_doc(doc_capitalized) assert np.allclose( vecs, vecs_capitalized, atol=1e-5 From bd2ceb32a5c59110f1f16e61322ef0574a1309bc Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 13:21:50 +0200 Subject: [PATCH 043/239] regex featurizer returns sequence --- .../sparse_featurizer/regex_featurizer.py | 24 ++++---- .../nlu/featurizers/test_regex_featurizer.py | 59 +++++++++++++++---- 2 files changed, 61 insertions(+), 22 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index bedc87502596..45d54b8545b3 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -4,6 +4,7 @@ import os import re import typing +import scipy.sparse from typing import Any, Dict, Optional, Text from rasa.nlu import utils @@ -82,25 +83,26 @@ def features_for_patterns(self, message): message is tokenized, the function will mark all tokens with a dict relating the name of the regex to whether it was matched.""" - found_patterns = [] - for exp in self.known_patterns: - matches = re.finditer(exp["pattern"], message.text) + tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) + + vec = np.zeros([len(tokens), len(self.known_patterns)]) + + for pattern_index, pattern in enumerate(self.known_patterns): + matches = re.finditer(pattern["pattern"], message.text) matches = list(matches) - found_patterns.append(False) - for token_index, t in enumerate( - message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) - ): + + for token_index, t in enumerate(tokens): patterns = t.get("pattern", default={}) - patterns[exp["name"]] = False + patterns[pattern["name"]] = False for match in matches: if t.offset < match.end() and t.end > match.start(): - patterns[exp["name"]] = True - found_patterns[-1] = True + patterns[pattern["name"]] = True + vec[token_index][pattern_index] = 1.0 t.set("pattern", patterns) - return np.array(found_patterns).astype(float) + return scipy.sparse.csr_matrix(vec) def _generate_lookup_regex(self, lookup_table): """creates a regex out of the contents of a lookup table file""" diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 7a3455ca85ee..1eb65d8edb3b 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -13,10 +13,38 @@ @pytest.mark.parametrize( "sentence, expected, labeled_tokens", [ - ("hey how are you today", [0.0, 1.0, 0.0], [0]), - ("hey 456 how are you", [1.0, 1.0, 0.0], [1, 0]), - ("blah balh random eh", [0.0, 0.0, 0.0], []), - ("a 1 digit number", [1.0, 0.0, 1.0], [1, 1]), + ( + "hey how are you today", + [ + [0.0, 1.0, 0.0], + [0.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + ], + [0], + ), + ( + "hey 456 how are you", + [ + [0.0, 1.0, 0.0], + [1.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + [0.0, 0.0, 0.0], + ], + [1, 0], + ), + ( + "blah balh random eh", + [[0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], 
+ [], + ), + ( + "a 1 digit number", + [[0.0, 0.0, 0.0], [1.0, 0.0, 1.0], [0.0, 0.0, 0.0], [0.0, 0.0, 0.0]], + [1, 1], + ), ], ) def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): @@ -30,13 +58,13 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): ftr = RegexFeaturizer(known_patterns=patterns) # adds tokens to the message - tokenizer = SpacyTokenizer() + tokenizer = SpacyTokenizer({"use_cls_token": False}) message = Message(sentence) message.set("spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) result = ftr.features_for_patterns(message) - assert np.allclose(result, expected, atol=1e-10) + assert np.allclose(result.toarray(), expected, atol=1e-10) # the tokenizer should have added tokens assert len(message.get("tokens", [])) > 0 @@ -50,10 +78,18 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): @pytest.mark.parametrize( "sentence, expected, labeled_tokens", [ - ("lemonade and mapo tofu", [1, 1], [0.0, 2.0, 3.0]), - ("a cup of tea", [1, 0], [3.0]), - ("Is burrito my favorite food?", [0, 1], [1.0]), - ("I want club?mate", [1, 0], [2.0, 3.0]), + ( + "lemonade and mapo tofu", + [[1.0, 0.0], [0.0, 0.0], [0.0, 1.0], [0.0, 1.0]], + [0.0, 2.0, 3.0], + ), + ("a cup of tea", [[0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [1.0, 0.0]], [3.0]), + ( + "Is burrito my favorite food?", + [[0.0, 0.0], [0.0, 1.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0], [0.0, 0.0]], + [1.0], + ), + ("I want club?mate", [[0.0, 0.0], [0.0, 0.0], [1.0, 0.0]], [2.0, 3.0]), ], ) def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): @@ -76,7 +112,8 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): tokenizer.process(message) result = ftr.features_for_patterns(message) - assert np.allclose(result, expected, atol=1e-10) + print(result.toarray()) + assert np.allclose(result.toarray(), expected, atol=1e-10) # the tokenizer should have added tokens assert len(message.get("tokens", [])) > 0 From 7e46fe806726619e64c459a774b2b98873d35474 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 16:31:23 +0200 Subject: [PATCH 044/239] clean up --- rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py | 1 - .../sparse_featurizer/count_vectors_featurizer.py | 5 ++--- rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py | 2 +- tests/nlu/featurizers/test_count_vectors_featurizer.py | 4 ++-- tests/nlu/featurizers/test_regex_featurizer.py | 5 ----- 5 files changed, 5 insertions(+), 12 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index bb35a66c521c..88a657bcf693 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -7,7 +7,6 @@ from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: - from spacy.language import Language from spacy.tokens import Doc from rasa.nlu.constants import ( diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 619db36500ef..38521140c88a 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -9,9 +9,6 @@ from rasa.nlu.featurizers.featurzier import Featurizer from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData - -logger = logging.getLogger(__name__) 
- from rasa.nlu.constants import ( MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, @@ -22,6 +19,8 @@ MESSAGE_INTENT_ATTRIBUTE, ) +logger = logging.getLogger(__name__) + class CountVectorsFeaturizer(Featurizer): """Bag of words featurizer. diff --git a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py index e97c5b01fdf6..e2e4a434135f 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py @@ -1,6 +1,6 @@ import logging -from typing import Any, Dict, List, Optional, Text +from typing import Any, Dict, Optional, Text from rasa.nlu.featurizers.featurzier import Featurizer diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index e94bb6c4a081..9fdcafb889c8 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -68,7 +68,7 @@ def test_count_vector_featurizer_attribute_featurization( train_message.get("intent_sparse_features")[0].toarray() == intent_features ) else: - assert train_message.get("intent_sparse_features") == None + assert train_message.get("intent_sparse_features") is None if response_features: assert ( @@ -76,7 +76,7 @@ def test_count_vector_featurizer_attribute_featurization( == response_features ) else: - assert train_message.get("response_sparse_features") == None + assert train_message.get("response_sparse_features") is None @pytest.mark.parametrize( diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 1eb65d8edb3b..d2a8301dedcb 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -1,13 +1,8 @@ import numpy as np import pytest -from rasa.nlu import training_data -from rasa.nlu.tokenizers.tokenizer import Token -from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message -from rasa.nlu.training_data import TrainingData -from rasa.nlu.config import RasaNLUModelConfig @pytest.mark.parametrize( From a4b8b0eec945fef63b7f59ca8eea069c13431f72 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 16:35:00 +0200 Subject: [PATCH 045/239] Add changelog entry --- CHANGELOG.rst | 3 +++ 1 file changed, 3 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index b59c1832e9ac..0b4484d662b9 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -16,9 +16,12 @@ Added Changed ------- +- Divided featurizers in ``rasa.nlu`` into sparse and dense featurizers +- All featurizers in ``rasa.nlu`` return a sequence Removed ------- +- Deprecated ``NGramFeaturizer`` in ``rasa.nlu.featurizers`` (removed functionality and print deprecation warning instead) Fixed ----- From f02b9c246183d2db6b3d3a16347c05baf962b8a6 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 17:27:12 +0200 Subject: [PATCH 046/239] helper method to convert seq features back --- rasa/nlu/classifiers/__init__.py | 20 + .../embedding_intent_classifier.py | 14 +- .../classifiers/sklearn_intent_classifier.py | 18 +- .../dense_featurizer/spacy_featurizer.py | 7 +- tests/nlu/base/test_components.py | 4 +- tests/nlu/base/test_evaluation.py | 2 +- tests/nlu/base/test_featurizers.py | 529 ------------------ 7 files changed, 50 insertions(+), 544 deletions(-) delete mode 100644 
tests/nlu/base/test_featurizers.py diff --git a/rasa/nlu/classifiers/__init__.py b/rasa/nlu/classifiers/__init__.py index ae7b52d8840a..f1613a979c42 100644 --- a/rasa/nlu/classifiers/__init__.py +++ b/rasa/nlu/classifiers/__init__.py @@ -1,3 +1,23 @@ # How many labels are at max put into the output # ranking, everything else will be cut off LABEL_RANKING_LENGTH = 10 + +import scipy.sparse + + +# TODO should be removed in next PR +def convert_sparse_back(sparse_features: scipy.sparse.csr_matrix): + import numpy as np + + if sparse_features is not None: + return np.sum(sparse_features.toarray(), axis=0) + return None + + +# TODO should be removed in next PR +def convert_dense_back(dense_features: scipy.sparse.csr_matrix): + import numpy as np + + if dense_features is not None: + return np.sum(dense_features, axis=0) + return None diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 10847fb0c3c6..dfb68ab32d22 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Text, Tuple import warnings -from rasa.nlu.classifiers import LABEL_RANKING_LENGTH +from rasa.nlu.classifiers import LABEL_RANKING_LENGTH, convert_sparse_back from rasa.nlu.components import Component from rasa.utils import train_utils from rasa.nlu.constants import ( @@ -272,7 +272,7 @@ def _extract_labels_precomputed_features( # Collect precomputed encodings encoded_id_labels = [ - (label_idx, label_example.get(attribute_feature_name)) + (label_idx, convert_sparse_back(label_example.get(attribute_feature_name))) for (label_idx, label_example) in label_examples ] @@ -336,7 +336,11 @@ def _create_session_data( for e in training_data.intent_examples: if e.get(attribute): X.append( - e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + convert_sparse_back( + e.get( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) + ) ) label_ids.append(label_id_dict[e.get(attribute)]) @@ -608,8 +612,8 @@ def predict_label(self, message): else: # get features (bag of words) for a message # noinspection PyPep8Naming - X = message.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + X = convert_sparse_back( + message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) ).reshape(1, -1) # load tf graph and session diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 1e35101099d7..dce7ffeda802 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -5,13 +5,13 @@ from typing import Any, Dict, List, Optional, Text, Tuple from rasa.nlu import utils -from rasa.nlu.classifiers import LABEL_RANKING_LENGTH +from rasa.nlu.classifiers import LABEL_RANKING_LENGTH, convert_dense_back from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, MESSAGE_TEXT_ATTRIBUTE, ) @@ -26,7 +26,7 @@ class SklearnIntentClassifier(Component): provides = ["intent", "intent_ranking"] - requires = [MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] defaults = { # C parameter of the svm 
- cross validation will select the best value @@ -98,13 +98,19 @@ def train( y = self.transform_labels_str2num(labels) X = np.stack( [ - example.get("text_features") + convert_dense_back( + example.get( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) + ) for example in training_data.intent_examples ] ) self.clf = self._create_classifier(num_threads, y) + print(X) + self.clf.fit(X, y) def _num_cv_splits(self, y): @@ -146,7 +152,9 @@ def process(self, message: Message, **kwargs: Any) -> None: intent = None intent_ranking = [] else: - X = message.get("text_features").reshape(1, -1) + X = convert_dense_back( + message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + ).reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) # `predict` returns a matrix as it is supposed diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 88a657bcf693..79891ff85403 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -1,6 +1,6 @@ import numpy as np import typing -from typing import Any +from typing import Any, Optional from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.featurzier import Featurizer @@ -34,7 +34,10 @@ def _features_for_doc(self, doc: "Doc") -> np.ndarray: return np.array([t.vector for t in doc]) def train( - self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any + self, + training_data: TrainingData, + config: Optional[RasaNLUModelConfig], + **kwargs: Any, ) -> None: for example in training_data.intent_examples: diff --git a/tests/nlu/base/test_components.py b/tests/nlu/base/test_components.py index ecbe453bed39..dc3f734795f6 100644 --- a/tests/nlu/base/test_components.py +++ b/tests/nlu/base/test_components.py @@ -55,9 +55,9 @@ def test_find_unavailable_packages(): def test_builder_create_by_module_path(component_builder, default_config): - from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer + from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer - path = "rasa.nlu.featurizers.regex_featurizer.RegexFeaturizer" + path = "rasa.nlu.featurizers.sparse_featurizer.regex_featurizer.RegexFeaturizer" component_config = {"name": path} component = component_builder.create_component(component_config, default_config) assert type(component) == RegexFeaturizer diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py index 240090bbd535..14b27a28d3da 100644 --- a/tests/nlu/base/test_evaluation.py +++ b/tests/nlu/base/test_evaluation.py @@ -257,7 +257,7 @@ def test_run_evaluation(unpacked_trained_moodbot_path): data = DEFAULT_DATA_PATH result = run_evaluation( - data, os.path.join(unpacked_trained_moodbot_path, "nlu"), errors=None + data, os.path.join(unpacked_trained_moodbot_path, "nlu"), errors=False ) assert result.get("intent_evaluation") assert result.get("entity_evaluation").get("CRFEntityExtractor") diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/base/test_featurizers.py deleted file mode 100644 index d67c905deab6..000000000000 --- a/tests/nlu/base/test_featurizers.py +++ /dev/null @@ -1,529 +0,0 @@ -# -*- coding: utf-8 - -import numpy as np -import pytest - -from rasa.nlu import training_data -from rasa.nlu.tokenizers.tokenizer import Token -from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer 
-from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer -from rasa.nlu.training_data import Message -from rasa.nlu.training_data import TrainingData -from rasa.nlu.config import RasaNLUModelConfig - - -@pytest.mark.parametrize( - "sentence, expected", - [ - ( - "hey how are you today", - [-0.19649599, 0.32493639, -0.37408298, -0.10622784, 0.062756], - ) - ], -) -def test_spacy_featurizer(sentence, expected, spacy_nlp): - from rasa.nlu.featurizers import spacy_featurizer - - doc = spacy_nlp(sentence) - vecs = spacy_featurizer.features_for_doc(doc) - assert np.allclose(doc.vector[:5], expected, atol=1e-5) - assert np.allclose(vecs, doc.vector, atol=1e-5) - - -def test_spacy_training_sample_alignment(spacy_nlp_component): - from spacy.tokens import Doc - - m1 = Message.build(text="I have a feeling", intent="feeling") - m2 = Message.build(text="", intent="feeling") - m3 = Message.build(text="I am the last message", intent="feeling") - td = TrainingData(training_examples=[m1, m2, m3]) - - attribute_docs = spacy_nlp_component.docs_for_training_data(td) - - assert isinstance(attribute_docs["text"][0], Doc) - assert isinstance(attribute_docs["text"][1], Doc) - assert isinstance(attribute_docs["text"][2], Doc) - - assert [t.text for t in attribute_docs["text"][0]] == ["i", "have", "a", "feeling"] - assert [t.text for t in attribute_docs["text"][1]] == [] - assert [t.text for t in attribute_docs["text"][2]] == [ - "i", - "am", - "the", - "last", - "message", - ] - - -def test_spacy_intent_featurizer(spacy_nlp_component): - from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer - - td = training_data.load_data("data/examples/rasa/demo-rasa.json") - spacy_nlp_component.train(td, config=None) - spacy_featurizer = SpacyFeaturizer() - spacy_featurizer.train(td, config=None) - - intent_features_exist = np.array( - [ - True if example.get("intent_features") is not None else False - for example in td.intent_examples - ] - ) - - # no intent features should have been set - assert not any(intent_features_exist) - - -@pytest.mark.parametrize( - "sentence, expected", - [("hey how are you today", [-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])], -) -def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): - from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer - - doc = spacy_nlp(sentence) - token_vectors = [t.vector for t in doc] - spacy_config = {"ner_feature_vectors": True} - ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) - greet = {"intent": "greet", "text_features": [0.5]} - message = Message(sentence, greet) - message.set("spacy_doc", doc) - ftr._set_spacy_features(message) - ftr._set_spacy_ner_features(message) - vecs = message.get("ner_features")[0][:5] - assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) - assert np.allclose(vecs, expected, atol=1e-4) - - -def test_spacy_ner_featurizer_config(spacy_nlp): - from rasa.nlu.featurizers.spacy_featurizer import SpacyFeaturizer - - sentence = "hi there friend" - doc = spacy_nlp(sentence) - spacy_config = {"ner_feature_vectors": False} - ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) - greet = {"intent": "greet", "text_features": [0.5]} - message = Message(sentence, greet) - message.set("spacy_doc", doc) - ftr._set_spacy_features(message) - ftr._set_spacy_ner_features(message) - vecs = np.array(message.get("ner_features")) - assert vecs.shape[0] == len(doc) - assert vecs.shape[1] == 0 - - -def test_mitie_featurizer(mitie_feature_extractor, default_config): - from 
rasa.nlu.featurizers.mitie_featurizer import MitieFeaturizer - - mitie_component_config = {"name": "MitieFeaturizer"} - ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) - sentence = "Hey how are you today" - mitie_component_config = {"name": "MitieTokenizer", "use_cls_token": False} - tokens = MitieTokenizer(mitie_component_config).tokenize(sentence) - vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor) - expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) - assert np.allclose(vecs[:5], expected, atol=1e-5) - - -def test_ngram_featurizer(spacy_nlp): - from rasa.nlu.featurizers.ngram_featurizer import NGramFeaturizer - - ftr = NGramFeaturizer({"max_number_of_ngrams": 10}) - - # ensures that during random sampling of the ngram CV we don't end up - # with a one-class-split - repetition_factor = 5 - - greet = {"intent": "greet", "text_features": [0.5]} - goodbye = {"intent": "goodbye", "text_features": [0.5]} - labeled_sentences = [ - Message("heyheyheyhey", greet), - Message("howdyheyhowdy", greet), - Message("heyhey howdyheyhowdy", greet), - Message("howdyheyhowdy heyhey", greet), - Message("astalavistasista", goodbye), - Message("astalavistasista sistala", goodbye), - Message("sistala astalavistasista", goodbye), - ] * repetition_factor - - for m in labeled_sentences: - m.set("spacy_doc", spacy_nlp(m.text)) - - ftr.min_intent_examples_for_ngram_classification = 2 - ftr.train_on_sentences(labeled_sentences) - assert len(ftr.all_ngrams) > 0 - assert ftr.best_num_ngrams > 0 - - -@pytest.mark.parametrize( - "sentence, expected, labeled_tokens", - [ - ("hey how are you today", [0.0, 1.0, 0.0], [0]), - ("hey 456 how are you", [1.0, 1.0, 0.0], [1, 0]), - ("blah balh random eh", [0.0, 0.0, 0.0], []), - ("a 1 digit number", [1.0, 0.0, 1.0], [1, 1]), - ], -) -def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): - from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer - - patterns = [ - {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, - {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, - {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, - ] - ftr = RegexFeaturizer(known_patterns=patterns) - - # adds tokens to the message - tokenizer = SpacyTokenizer() - message = Message(sentence) - message.set("spacy_doc", spacy_nlp(sentence)) - tokenizer.process(message) - - result = ftr.features_for_patterns(message) - assert np.allclose(result, expected, atol=1e-10) - - # the tokenizer should have added tokens - assert len(message.get("tokens", [])) > 0 - # the number of regex matches on each token should match - for i, token in enumerate(message.get("tokens")): - token_matches = token.get("pattern").values() - num_matches = sum(token_matches) - assert num_matches == labeled_tokens.count(i) - - -@pytest.mark.parametrize( - "sentence, expected, labeled_tokens", - [ - ("lemonade and mapo tofu", [1, 1], [0.0, 2.0, 3.0]), - ("a cup of tea", [1, 0], [3.0]), - ("Is burrito my favorite food?", [0, 1], [1.0]), - ("I want club?mate", [1, 0], [2.0, 3.0]), - ], -) -def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): - from rasa.nlu.featurizers.regex_featurizer import RegexFeaturizer - - lookups = [ - { - "name": "drinks", - "elements": ["mojito", "lemonade", "sweet berry wine", "tea", "club?mate"], - }, - {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}, - ] - ftr = RegexFeaturizer(lookup_tables=lookups) - - # adds tokens to the message - component_config = 
{"name": "SpacyTokenizer", "use_cls_token": False} - tokenizer = SpacyTokenizer(component_config) - message = Message(sentence) - message.set("spacy_doc", spacy_nlp(sentence)) - tokenizer.process(message) - - result = ftr.features_for_patterns(message) - assert np.allclose(result, expected, atol=1e-10) - - # the tokenizer should have added tokens - assert len(message.get("tokens", [])) > 0 - # the number of regex matches on each token should match - for i, token in enumerate(message.get("tokens")): - token_matches = token.get("pattern").values() - num_matches = sum(token_matches) - assert num_matches == labeled_tokens.count(i) - - -def test_spacy_featurizer_casing(spacy_nlp): - from rasa.nlu.featurizers import spacy_featurizer - - # if this starts failing for the default model, we should think about - # removing the lower casing the spacy nlp component does when it - # retrieves vectors. For compressed spacy models (e.g. models - # ending in _sm) this test will most likely fail. - - td = training_data.load_data("data/examples/rasa/demo-rasa.json") - for e in td.intent_examples: - doc = spacy_nlp(e.text) - doc_capitalized = spacy_nlp(e.text.capitalize()) - - vecs = spacy_featurizer.features_for_doc(doc) - vecs_capitalized = spacy_featurizer.features_for_doc(doc_capitalized) - - assert np.allclose( - vecs, vecs_capitalized, atol=1e-5 - ), "Vectors are unequal for texts '{}' and '{}'".format( - e.text, e.text.capitalize() - ) - - -@pytest.mark.parametrize( - "sentence, expected", - [ - ("hello hello hello hello hello ", [5]), - ("hello goodbye hello", [1, 2]), - ("a b c d e f", [1, 1, 1, 1, 1, 1]), - ("a 1 2", [2, 1]), - ], -) -def test_count_vector_featurizer(sentence, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer - - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) - train_message = Message(sentence) - # this is needed for a valid training example - train_message.set("intent", "bla") - data = TrainingData([train_message]) - ftr.train(data) - - test_message = Message(sentence) - ftr.process(test_message) - - assert np.all(test_message.get("text_features") == expected) - - -@pytest.mark.parametrize( - "sentence, intent, response, intent_features, response_features", - [ - ("hello hello hello hello hello ", "greet", None, [1], None), - ("hello goodbye hello", "greet", None, [1], None), - ("a 1 2", "char", "char char", [1], [2]), - ], -) -def test_count_vector_featurizer_attribute_featurization( - sentence, intent, response, intent_features, response_features -): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer - - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) - train_message = Message(sentence) - - # this is needed for a valid training example - train_message.set("intent", intent) - train_message.set("response", response) - - data = TrainingData([train_message]) - ftr.train(data) - - assert train_message.get("intent_features") == intent_features - assert train_message.get("response_features") == response_features - - -@pytest.mark.parametrize( - "sentence, intent, response, text_features, intent_features, response_features", - [ - ("hello hello greet ", "greet", "hello", [1, 2], [1, 0], [0, 1]), - ( - "I am fine", - "acknowledge", - "good", - [0, 1, 1, 0, 1], - [1, 0, 0, 0, 0], - [0, 0, 0, 1, 0], - ), - ], -) -def test_count_vector_featurizer_shared_vocab( - sentence, intent, response, text_features, intent_features, response_features -): - from 
rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer - - ftr = CountVectorsFeaturizer( - {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True} - ) - train_message = Message(sentence) - - # this is needed for a valid training example - train_message.set("intent", intent) - train_message.set("response", response) - - data = TrainingData([train_message]) - ftr.train(data) - - assert np.all(train_message.get("text_features") == text_features) - assert np.all(train_message.get("intent_features") == intent_features) - assert np.all(train_message.get("response_features") == response_features) - - -@pytest.mark.parametrize( - "sentence, expected", - [ - ("hello hello hello hello hello __OOV__", [1, 5]), - ("hello goodbye hello __oov__", [1, 1, 2]), - ("a b c d e f __oov__ __OOV__ __OOV__", [3, 1, 1, 1, 1, 1, 1]), - ("__OOV__ a 1 2 __oov__ __OOV__", [2, 3, 1]), - ], -) -def test_count_vector_featurizer_oov_token(sentence, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer - - ftr = CountVectorsFeaturizer( - {"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"} - ) - train_message = Message(sentence) - # this is needed for a valid training example - train_message.set("intent", "bla") - data = TrainingData([train_message]) - ftr.train(data) - - test_message = Message(sentence) - ftr.process(test_message) - - assert np.all(test_message.get("text_features") == expected) - - -@pytest.mark.parametrize( - "sentence, expected", - [ - ("hello hello hello hello hello oov_word0", [1, 5]), - ("hello goodbye hello oov_word0 OOV_word0", [2, 1, 2]), - ("a b c d e f __oov__ OOV_word0 oov_word1", [3, 1, 1, 1, 1, 1, 1]), - ("__OOV__ a 1 2 __oov__ OOV_word1", [2, 3, 1]), - ], -) -def test_count_vector_featurizer_oov_words(sentence, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer - - ftr = CountVectorsFeaturizer( - { - "token_pattern": r"(?u)\b\w+\b", - "OOV_token": "__oov__", - "OOV_words": ["oov_word0", "OOV_word1"], - } - ) - train_message = Message(sentence) - # this is needed for a valid training example - train_message.set("intent", "bla") - data = TrainingData([train_message]) - ftr.train(data) - - test_message = Message(sentence) - ftr.process(test_message) - - assert np.all(test_message.get("text_features") == expected) - - -@pytest.mark.parametrize( - "tokens, expected", - [ - (["hello", "hello", "hello", "hello", "hello"], [5]), - (["你好", "你好", "你好", "你好", "你好"], [5]), # test for unicode chars - (["hello", "goodbye", "hello"], [1, 2]), - # Note: order has changed in Chinese version of "hello" & "goodbye" - (["你好", "再见", "你好"], [2, 1]), # test for unicode chars - (["a", "b", "c", "d", "e", "f"], [1, 1, 1, 1, 1, 1]), - (["a", "1", "2"], [2, 1]), - ], -) -def test_count_vector_featurizer_using_tokens(tokens, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer - - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) - - # using empty string instead of real text string to make sure - # count vector only can come from `tokens` feature. 
- # using `message.text` can not get correct result - - tokens_feature = [Token(i, 0) for i in tokens] - - train_message = Message("") - train_message.set("tokens", tokens_feature) - # this is needed for a valid training example - train_message.set("intent", "bla") - data = TrainingData([train_message]) - - ftr.train(data) - - test_message = Message("") - test_message.set("tokens", tokens_feature) - - ftr.process(test_message) - - assert np.all(test_message.get("text_features") == expected) - - -@pytest.mark.parametrize( - "sentence, expected", - [ - ("ababab", [3, 3, 3, 2]), - ("ab ab ab", [2, 2, 3, 3, 3, 2]), - ("abc", [1, 1, 1, 1, 1]), - ], -) -def test_count_vector_featurizer_char(sentence, expected): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer - - ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}) - train_message = Message(sentence) - # this is needed for a valid training example - train_message.set("intent", "bla") - data = TrainingData([train_message]) - ftr.train(data) - - test_message = Message(sentence) - ftr.process(test_message) - - assert np.all(test_message.get("text_features") == expected) - - -def test_count_vector_featurizer_persist_load(tmpdir): - from rasa.nlu.featurizers.count_vectors_featurizer import CountVectorsFeaturizer - - # set non default values to config - config = { - "analyzer": "char", - "token_pattern": r"(?u)\b\w+\b", - "strip_accents": "ascii", - "stop_words": "stop", - "min_df": 2, - "max_df": 3, - "min_ngram": 2, - "max_ngram": 3, - "max_features": 10, - "lowercase": False, - } - train_ftr = CountVectorsFeaturizer(config) - - sentence1 = "ababab 123 13xc лаомтгцу sfjv oö aà" - sentence2 = "abababalidcn 123123 13xcdc лаомтгцу sfjv oö aà" - train_message1 = Message(sentence1) - train_message2 = Message(sentence2) - - # this is needed for a valid training example - train_message1.set("intent", "bla") - train_message2.set("intent", "bla") - data = TrainingData([train_message1, train_message2]) - train_ftr.train(data) - # persist featurizer - file_dict = train_ftr.persist("ftr", tmpdir.strpath) - train_vect_params = { - attribute: vectorizer.get_params() - for attribute, vectorizer in train_ftr.vectorizers.items() - } - # add trained vocabulary to vectorizer params - for attribute, attribute_vect_params in train_vect_params.items(): - if hasattr(train_ftr.vectorizers[attribute], "vocabulary_"): - train_vect_params[attribute].update( - {"vocabulary": train_ftr.vectorizers[attribute].vocabulary_} - ) - - # load featurizer - meta = train_ftr.component_config.copy() - meta.update(file_dict) - test_ftr = CountVectorsFeaturizer.load(meta, tmpdir.strpath) - test_vect_params = { - attribute: vectorizer.get_params() - for attribute, vectorizer in test_ftr.vectorizers.items() - } - - assert train_vect_params == test_vect_params - - test_message1 = Message(sentence1) - test_ftr.process(test_message1) - test_message2 = Message(sentence2) - test_ftr.process(test_message2) - - # check that train features and test features after loading are the same - assert np.all( - [ - train_message1.get("text_features") == test_message1.get("text_features"), - train_message2.get("text_features") == test_message2.get("text_features"), - ] - ) From d3a5dd5654499aff9aa30daf5c2ec93521017a35 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 17:30:25 +0200 Subject: [PATCH 047/239] remove print statement --- rasa/nlu/classifiers/sklearn_intent_classifier.py | 2 -- 1 file changed, 2 deletions(-) diff 
--git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index dce7ffeda802..15175699955a 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -109,8 +109,6 @@ def train( self.clf = self._create_classifier(num_threads, y) - print(X) - self.clf.fit(X, y) def _num_cv_splits(self, y): From 46ab485384a942c8a7f0e3448e49823892227b1f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 22 Oct 2019 08:08:42 +0200 Subject: [PATCH 048/239] fix imports --- rasa/nlu/registry.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 8e43d85b75cb..1891e0b7f914 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -18,13 +18,13 @@ from rasa.nlu.extractors.entity_synonyms import EntitySynonymMapper from rasa.nlu.extractors.mitie_entity_extractor import MitieEntityExtractor from rasa.nlu.extractors.spacy_entity_extractor import SpacyEntityExtractor -from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( +from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) -from nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer -from nlu.featurizers.sparse_featurizer.ngram_featurizer import NGramFeaturizer -from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer -from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer +from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer +from rasa.nlu.featurizers.sparse_featurizer.ngram_featurizer import NGramFeaturizer +from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer +from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer from rasa.nlu.model import Metadata from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer From 076f33d408d01100cb4dbec44bba8839cb807920 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 22 Oct 2019 09:03:48 +0200 Subject: [PATCH 049/239] remove ner_features from restaurantbot --- examples/restaurantbot/config.yml | 17 ----------------- rasa/nlu/extractors/crf_entity_extractor.py | 1 + 2 files changed, 1 insertion(+), 17 deletions(-) diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml index fcb2086a50e1..f55888e312b3 100644 --- a/examples/restaurantbot/config.yml +++ b/examples/restaurantbot/config.yml @@ -7,23 +7,6 @@ pipeline: - name: "SpacyFeaturizer" - name: "SklearnIntentClassifier" - name: "CRFEntityExtractor" - features: [ ["low", "title", "upper"], - [ - "bias", - "low", - "prefix5", - "prefix2", - "suffix5", - "suffix3", - "suffix2", - "upper", - "title", - "digit", - "pattern", - "ner_features", - ], - ["low", "title", "upper"]] - - name: "EntitySynonymMapper" policies: diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 77548dac85c5..354e2c4a8250 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -596,6 +596,7 @@ def __tag_of_token(token): @staticmethod def __additional_ner_features(message: Message) -> List[Any]: + # TODO use sparse text features features = message.get("ner_features", []) tokens = message.get("tokens", []) if len(tokens) != len(features): From 905f2d69a3720796e8ff7752b85d1bb5d6dab556 Mon 
Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 22 Oct 2019 09:10:33 +0200 Subject: [PATCH 050/239] change default value --- rasa/nlu/featurizers/featurzier.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index 47073e1be7ac..78a19636f4f9 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -5,6 +5,7 @@ from rasa.nlu.components import Component from rasa.nlu.constants import ( MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, MESSAGE_TEXT_ATTRIBUTE, ) @@ -14,9 +15,7 @@ class Featurizer(Component): def _combine_with_existing_dense_features( message: Message, additional_features: Any, - feature_name: Text = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ - MESSAGE_TEXT_ATTRIBUTE - ], + feature_name: Text = MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], ) -> Any: if message.get(feature_name) is not None: return np.concatenate( From 1941a25d1a8c0dffb55cc7dfd0987f9b8b7373e3 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 22 Oct 2019 10:06:32 +0200 Subject: [PATCH 051/239] fix imports --- .../featurizers/test_count_vectors_featurizer.py | 16 ++++++++-------- tests/nlu/featurizers/test_mitie_featurizer.py | 2 +- tests/nlu/featurizers/test_regex_featurizer.py | 4 ++-- tests/nlu/featurizers/test_spacy_featurizer.py | 8 ++++---- 4 files changed, 15 insertions(+), 15 deletions(-) diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index 9fdcafb889c8..2f8a9c374af7 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -17,7 +17,7 @@ ], ) def test_count_vector_featurizer(sentence, expected): - from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) @@ -49,7 +49,7 @@ def test_count_vector_featurizer(sentence, expected): def test_count_vector_featurizer_attribute_featurization( sentence, intent, response, intent_features, response_features ): - from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) @@ -96,7 +96,7 @@ def test_count_vector_featurizer_attribute_featurization( def test_count_vector_featurizer_shared_vocab( sentence, intent, response, text_features, intent_features, response_features ): - from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) @@ -133,7 +133,7 @@ def test_count_vector_featurizer_shared_vocab( ], ) def test_count_vector_featurizer_oov_token(sentence, expected): - from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) @@ -162,7 +162,7 @@ def test_count_vector_featurizer_oov_token(sentence, expected): ], ) def test_count_vector_featurizer_oov_words(sentence, expected): - from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) @@ -198,7 +198,7 @@ def test_count_vector_featurizer_oov_words(sentence, expected): ], ) def 
test_count_vector_featurizer_using_tokens(tokens, expected): - from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) @@ -235,7 +235,7 @@ def test_count_vector_featurizer_using_tokens(tokens, expected): ], ) def test_count_vector_featurizer_char(sentence, expected): - from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) @@ -253,7 +253,7 @@ def test_count_vector_featurizer_char(sentence, expected): def test_count_vector_featurizer_persist_load(tmpdir): - from nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( CountVectorsFeaturizer, ) diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index dd312dc4c849..be5df1209b9a 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -5,7 +5,7 @@ def test_mitie_featurizer(mitie_feature_extractor, default_config): - from nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer + from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer mitie_component_config = {"name": "MitieFeaturizer"} ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index d2a8301dedcb..5e8ea40ca8f9 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -43,7 +43,7 @@ ], ) def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): - from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer + from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer patterns = [ {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, @@ -88,7 +88,7 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): ], ) def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): - from nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer + from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer lookups = [ { diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index 8fe68f1ee28b..077286fd1150 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -9,7 +9,7 @@ @pytest.mark.parametrize("sentence", ["hey how are you today"]) def test_spacy_featurizer(sentence, spacy_nlp): - from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig()) @@ -45,7 +45,7 @@ def test_spacy_training_sample_alignment(spacy_nlp_component): def test_spacy_intent_featurizer(spacy_nlp_component): - from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer td = training_data.load_data("data/examples/rasa/demo-rasa.json") spacy_nlp_component.train(td, config=None) @@ -68,7 +68,7 @@ def test_spacy_intent_featurizer(spacy_nlp_component): [("hey 
how are you today", [-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])], ) def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): - from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer doc = spacy_nlp(sentence) token_vectors = [t.vector for t in doc] @@ -90,7 +90,7 @@ def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): def test_spacy_featurizer_casing(spacy_nlp): - from nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer # if this starts failing for the default model, we should think about # removing the lower casing the spacy nlp component does when it From 6483379e61c41cd8deb3d04cc32e69988b89d3a0 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 22 Oct 2019 12:41:02 +0200 Subject: [PATCH 052/239] handle cls token in featurizers --- .../dense_featurizer/spacy_featurizer.py | 13 ++++- .../count_vectors_featurizer.py | 6 +++ tests/nlu/featurizers/test_featurizer.py | 49 +++++++++++++++++++ tests/nlu/training/test_train.py | 1 - 4 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 tests/nlu/featurizers/test_featurizer.py diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 79891ff85403..75b8f486a6c9 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -14,6 +14,8 @@ MESSAGE_SPACY_FEATURES_NAMES, MESSAGE_VECTOR_DENSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, + MESSAGE_TOKENS_NAMES, + CLS_TOKEN, ) @@ -27,7 +29,7 @@ class SpacyFeaturizer(Featurizer): requires = [ MESSAGE_SPACY_FEATURES_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES - ] + ] + [MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES] def _features_for_doc(self, doc: "Doc") -> np.ndarray: """Feature vector for a single document / sentence.""" @@ -56,8 +58,17 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): """Adds the spacy word vectors to the messages features.""" message_attribute_doc = self.get_doc(message, attribute) + tokens = message.get(MESSAGE_TOKENS_NAMES[attribute]) + cls_token_used = tokens[-1].text == CLS_TOKEN if tokens else False + if message_attribute_doc is not None: fs = self._features_for_doc(message_attribute_doc) + + if cls_token_used: + # cls token is used, need to append a vector + cls_token_vec = np.zeros([1, fs.shape[-1]]) + fs = np.concatenate([fs, cls_token_vec]) + features = self._combine_with_existing_dense_features( message, fs, MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] ) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 38521140c88a..47d04fcc2eb9 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -17,6 +17,7 @@ MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, MESSAGE_INTENT_ATTRIBUTE, + CLS_TOKEN, ) logger = logging.getLogger(__name__) @@ -279,12 +280,17 @@ def _get_message_tokens_by_attribute( ) -> List[Text]: """Get text tokens of an attribute of a message""" + tokens = message.get(MESSAGE_TOKENS_NAMES[attribute]) + cls_token_used = tokens[-1].text == CLS_TOKEN if tokens else False 
+ if attribute in SPACY_FEATURIZABLE_ATTRIBUTES and message.get( MESSAGE_SPACY_FEATURES_NAMES[attribute] ): # if lemmatize is possible tokens = [ t.lemma_ for t in message.get(MESSAGE_SPACY_FEATURES_NAMES[attribute]) ] + if cls_token_used: + tokens.append(CLS_TOKEN) elif message.get( MESSAGE_TOKENS_NAMES[attribute] ): # if directly tokens is provided diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py new file mode 100644 index 000000000000..25e17b0a7f1e --- /dev/null +++ b/tests/nlu/featurizers/test_featurizer.py @@ -0,0 +1,49 @@ +import numpy as np +import scipy.sparse + +from rasa.nlu.featurizers.featurzier import Featurizer +from rasa.nlu.constants import ( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + MESSAGE_TEXT_ATTRIBUTE, +) +from rasa.nlu.training_data import Message + + +def test_combine_with_existing_dense_features(): + + featurizer = Featurizer() + attribute = MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + + existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] + new_features = [[1, 0], [0, 1]] + expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]] + + message = Message("This is a text.") + message.set(attribute, existing_features) + + actual_features = featurizer._combine_with_existing_dense_features( + message, new_features, attribute + ) + + assert np.all(expected_features == actual_features) + + +def test_combine_with_existing_sparse_features(): + + featurizer = Featurizer() + attribute = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + + existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) + new_features = scipy.sparse.csr_matrix([[1, 0], [0, 1]]) + expected_features = [[1, 0, 2, 3, 1, 0], [2, 0, 0, 1, 0, 1]] + + message = Message("This is a text.") + message.set(attribute, existing_features) + + actual_features = featurizer._combine_with_existing_sparse_features( + message, new_features, attribute + ) + actual_features = actual_features.toarray() + + assert np.all(expected_features == actual_features) diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py index 5bad045cb2bb..f396f3d995e1 100644 --- a/tests/nlu/training/test_train.py +++ b/tests/nlu/training/test_train.py @@ -34,7 +34,6 @@ def pipelines_for_tests(): "SpacyTokenizer", "MitieFeaturizer", "SpacyFeaturizer", - "NGramFeaturizer", "RegexFeaturizer", "CountVectorsFeaturizer", "MitieEntityExtractor", From 952e95ab6e039aead41b30ec0df3a2e9de5d10a1 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 22 Oct 2019 13:29:19 +0200 Subject: [PATCH 053/239] Remove ngram featurizer from registry --- rasa/nlu/registry.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 1891e0b7f914..6310584c7921 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -62,7 +62,6 @@ # featurizers SpacyFeaturizer, MitieFeaturizer, - NGramFeaturizer, RegexFeaturizer, CountVectorsFeaturizer, # classifiers From 6faa44b531e647ce6b6a7fdf91a3583eea847416 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 09:32:35 +0200 Subject: [PATCH 054/239] review comments --- rasa/nlu/constants.py | 3 - .../count_vectors_featurizer.py | 20 +++--- .../sparse_featurizer/ngram_featurizer.py | 10 ++- .../sparse_featurizer/regex_featurizer.py | 70 +++++++++++-------- rasa/nlu/utils/spacy_utils.py | 5 -- 5 files changed, 58 insertions(+), 50 deletions(-) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 
2ac562ac1003..08cc2925a827 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -8,9 +8,6 @@ MESSAGE_ENTITIES_ATTRIBUTE = "entities" -CLS_TOKEN = "__CLS__" - - CLS_TOKEN = "__CLS__" MESSAGE_ATTRIBUTES = [ diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 47d04fcc2eb9..6dac9ab8a8d4 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -1,8 +1,8 @@ import logging import os import re +import scipy.sparse from typing import Any, Dict, List, Optional, Text - from sklearn.feature_extraction.text import CountVectorizer from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig @@ -24,9 +24,8 @@ class CountVectorsFeaturizer(Featurizer): - """Bag of words featurizer. - - Creates bag-of-words representation of features + """ + Creates a sequence of features using sklearn's `CountVectorizer`. All tokens which consist only of digits (e.g. 123 and 99 but not ab12d) will be represented by a single feature. @@ -34,8 +33,6 @@ class CountVectorsFeaturizer(Featurizer): Set `analyzer` to 'char_wb' to use the idea of Subword Semantic Hashing from https://arxiv.org/abs/1810.07150. - - The featurizer returns a sequence. """ provides = [ @@ -481,7 +478,7 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) def _get_featurized_attribute( self, attribute: Text, attribute_texts: List[Text] - ) -> Optional[List]: + ) -> Optional[List[scipy.sparse.csr_matrix]]: """Return features of a particular attribute for complete data""" if self._check_attribute_vocabulary(attribute): @@ -491,10 +488,12 @@ def _get_featurized_attribute( return None @staticmethod - def _get_text_sequence(text): + def _get_text_sequence(text: Text) -> List[Text]: return text.split() - def _create_sequence(self, attribute: Text, attribute_texts: List[Text]) -> List: + def _create_sequence( + self, attribute: Text, attribute_texts: List[Text] + ) -> List[scipy.sparse.csr_matrix]: texts = [self._get_text_sequence(text) for text in attribute_texts] X = [] @@ -558,13 +557,14 @@ def process(self, message: Message, **kwargs: Any) -> None: message_text = self._get_message_text_by_attribute(message, attribute=attribute) if self._check_attribute_vocabulary(attribute): + # features shape (1, seq, dim) features = self._create_sequence(attribute, [message_text]) message.set( MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], self._combine_with_existing_sparse_features( message, - features[0], + features[0], # 0 -> batch dimension feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], ), ) diff --git a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py index e2e4a434135f..e33d3ba1bb0c 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py @@ -13,5 +13,13 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None): logger.warning( "DEPRECATION warning: Using `NGramFeaturizer` is deprecated. " - "Please use `CountVectorsFeaturizer`." + "Please use `CountVectorsFeaturizer` instead. 
The following settings" + "should match the previous `NGramFeaturizer`:" + "" + "- name: 'CountVectorsFeaturizer'" + " analyzer: 'char_wb'" + " min_ngram: 3" + " max_ngram: 17" + " max_features: 10" + " min_df: 5" ) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 45d54b8545b3..591018b17505 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -5,7 +5,7 @@ import re import typing import scipy.sparse -from typing import Any, Dict, Optional, Text +from typing import Any, Dict, Optional, Text, List, Union from rasa.nlu import utils from rasa.nlu.config import RasaNLUModelConfig @@ -15,6 +15,7 @@ from rasa.nlu.constants import ( MESSAGE_TOKENS_NAMES, MESSAGE_TEXT_ATTRIBUTE, + MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, ) @@ -30,7 +31,12 @@ class RegexFeaturizer(Featurizer): requires = [MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]] - def __init__(self, component_config=None, known_patterns=None, lookup_tables=None): + def __init__( + self, + component_config: Dict[Text, Any] = None, + known_patterns: List[Dict[Text, Text]] = None, + lookup_tables: List[Dict[Text, Union[Text, List]]] = None, + ): super(RegexFeaturizer, self).__init__(component_config) @@ -46,28 +52,27 @@ def train( self._add_lookup_table_regexes(training_data.lookup_tables) for example in training_data.training_examples: - updated = self._text_features_with_regex(example) - example.set( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], updated - ) + self._text_features_with_regex(example) def process(self, message: Message, **kwargs: Any) -> None: + self._text_features_with_regex(message) - updated = self._text_features_with_regex(message) - message.set( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], updated - ) - - def _text_features_with_regex(self, message): + def _text_features_with_regex(self, message: Message) -> None: if self.known_patterns: extras = self.features_for_patterns(message) - return self._combine_with_existing_sparse_features(message, extras) + features = self._combine_with_existing_sparse_features(message, extras) else: - return message.get( + features = message.get( MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) - def _add_lookup_table_regexes(self, lookup_tables): + message.set( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], features + ) + + def _add_lookup_table_regexes( + self, lookup_tables: List[Dict[Text, Union[Text, List]]] + ) -> None: # appends the regex features from the lookup tables to # self.known_patterns for table in lookup_tables: @@ -75,7 +80,7 @@ def _add_lookup_table_regexes(self, lookup_tables): lookup_regex = {"name": table["name"], "pattern": regex_pattern} self.known_patterns.append(lookup_regex) - def features_for_patterns(self, message): + def features_for_patterns(self, message: Message) -> scipy.sparse.csr_matrix: """Checks which known patterns match the message. 
Given a sentence, returns a vector of {1,0} values indicating which @@ -83,28 +88,31 @@ def features_for_patterns(self, message): message is tokenized, the function will mark all tokens with a dict relating the name of the regex to whether it was matched.""" - tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) + for attribute in [MESSAGE_TEXT_ATTRIBUTE, MESSAGE_RESPONSE_ATTRIBUTE]: + tokens = message.get(MESSAGE_TOKENS_NAMES[attribute], []) - vec = np.zeros([len(tokens), len(self.known_patterns)]) + vec = np.zeros([len(tokens), len(self.known_patterns)]) - for pattern_index, pattern in enumerate(self.known_patterns): - matches = re.finditer(pattern["pattern"], message.text) - matches = list(matches) + for pattern_index, pattern in enumerate(self.known_patterns): + matches = re.finditer(pattern["pattern"], message.text) + matches = list(matches) - for token_index, t in enumerate(tokens): - patterns = t.get("pattern", default={}) - patterns[pattern["name"]] = False + for token_index, t in enumerate(tokens): + patterns = t.get("pattern", default={}) + patterns[pattern["name"]] = False - for match in matches: - if t.offset < match.end() and t.end > match.start(): - patterns[pattern["name"]] = True - vec[token_index][pattern_index] = 1.0 + for match in matches: + if t.offset < match.end() and t.end > match.start(): + patterns[pattern["name"]] = True + vec[token_index][pattern_index] = 1.0 - t.set("pattern", patterns) + t.set("pattern", patterns) - return scipy.sparse.csr_matrix(vec) + return scipy.sparse.csr_matrix(vec) - def _generate_lookup_regex(self, lookup_table): + def _generate_lookup_regex( + self, lookup_table: Dict[Text, Union[Text, List]] + ) -> Text: """creates a regex out of the contents of a lookup table file""" lookup_elements = lookup_table["elements"] elements_to_regex = [] diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py index 08a3dfa54271..571f59a8d70b 100644 --- a/rasa/nlu/utils/spacy_utils.py +++ b/rasa/nlu/utils/spacy_utils.py @@ -15,13 +15,8 @@ from rasa.nlu.model import Metadata from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, - MESSAGE_ATTRIBUTES, MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, ) From 20a92ca9865b187f4702e16646a5ce5d5e5f1745 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 09:48:34 +0200 Subject: [PATCH 055/239] count vectors featurizer requires tokens --- .../count_vectors_featurizer.py | 22 +++++-------------- rasa/nlu/tokenizers/spacy_tokenizer.py | 2 +- rasa/nlu/tokenizers/tokenizer.py | 13 ++++++++--- tests/nlu/tokenizers/test_spacy_tokenizer.py | 5 +++++ .../tokenizers/test_whitespace_tokenizer.py | 5 +++++ 5 files changed, 26 insertions(+), 21 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 6dac9ab8a8d4..86e0f649511e 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -40,7 +40,7 @@ class CountVectorsFeaturizer(Featurizer): for attribute in MESSAGE_ATTRIBUTES ] - requires = [] + requires = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] defaults = { # whether to use a shared vocab @@ -277,24 +277,12 @@ def _get_message_tokens_by_attribute( ) -> List[Text]: """Get text tokens of an attribute of a 
message""" - tokens = message.get(MESSAGE_TOKENS_NAMES[attribute]) - cls_token_used = tokens[-1].text == CLS_TOKEN if tokens else False - - if attribute in SPACY_FEATURIZABLE_ATTRIBUTES and message.get( - MESSAGE_SPACY_FEATURES_NAMES[attribute] - ): # if lemmatize is possible - tokens = [ - t.lemma_ for t in message.get(MESSAGE_SPACY_FEATURES_NAMES[attribute]) - ] - if cls_token_used: - tokens.append(CLS_TOKEN) - elif message.get( + if message.get( MESSAGE_TOKENS_NAMES[attribute] ): # if directly tokens is provided - tokens = [t.text for t in message.get(MESSAGE_TOKENS_NAMES[attribute])] - else: - tokens = message.get(attribute).split() - return tokens + return [t.lemma for t in message.get(MESSAGE_TOKENS_NAMES[attribute])] + + return message.get(attribute).split() # noinspection PyPep8Naming def _check_OOV_present(self, examples): diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 0589af320787..f4ca504ad653 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -62,6 +62,6 @@ def process(self, message: Message, **kwargs: Any) -> None: def tokenize( self, doc: "Doc", attribute: Text = MESSAGE_TEXT_ATTRIBUTE ) -> List[Token]: - tokens = [Token(t.text, t.idx) for t in doc] + tokens = [Token(t.text, t.idx, lemma=t.lemma_) for t in doc] self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index c1b41ad0bf33..bd9ec25d1475 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -13,16 +13,23 @@ class Token(object): - def __init__(self, text, offset, data=None): + def __init__( + self, + text: Text, + offset: int, + data: Optional[Dict[Text, Any]] = None, + lemma: Optional[Text] = None, + ): self.offset = offset self.text = text self.end = offset + len(text) self.data = data if data else {} + self.lemma = lemma or text - def set(self, prop, info): + def set(self, prop: Text, info: Any): self.data[prop] = info - def get(self, prop, default=None): + def get(self, prop: Text, default: Optional[Any] = None): return self.data.get(prop, default) diff --git a/tests/nlu/tokenizers/test_spacy_tokenizer.py b/tests/nlu/tokenizers/test_spacy_tokenizer.py index 9748f4fd8fcc..4eb1a6d6f08c 100644 --- a/tests/nlu/tokenizers/test_spacy_tokenizer.py +++ b/tests/nlu/tokenizers/test_spacy_tokenizer.py @@ -15,6 +15,11 @@ def test_spacy(spacy_nlp): "for", "lunch", ] + assert [t.lemma for t in tk.tokenize(spacy_nlp(text))] == [ + "forecast", + "for", + "lunch", + ] assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13] text = "hey ńöñàśçií how're you?" 
diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py index 27c2c6b171f6..424171cc8e4e 100644 --- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -15,6 +15,11 @@ def test_whitespace(): "for", "lunch", ] + assert [t.lemma for t in tk.tokenize("Forecast for lunch")] == [ + "Forecast", + "for", + "lunch", + ] assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13] From 810cae538d822aef14abca8c8bf5b36bb35348f9 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 11:22:40 +0200 Subject: [PATCH 056/239] remove not needed vocab check --- .../count_vectors_featurizer.py | 26 ++++++++----------- 1 file changed, 11 insertions(+), 15 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 86e0f649511e..3e4abf6eb1a8 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -544,18 +544,17 @@ def process(self, message: Message, **kwargs: Any) -> None: attribute = MESSAGE_TEXT_ATTRIBUTE message_text = self._get_message_text_by_attribute(message, attribute=attribute) - if self._check_attribute_vocabulary(attribute): - # features shape (1, seq, dim) - features = self._create_sequence(attribute, [message_text]) - - message.set( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], - self._combine_with_existing_sparse_features( - message, - features[0], # 0 -> batch dimension - feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], - ), - ) + # features shape (1, seq, dim) + features = self._create_sequence(attribute, [message_text]) + + message.set( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + self._combine_with_existing_sparse_features( + message, + features[0], # 0 -> batch dimension + feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + ), + ) @staticmethod def _is_any_model_trained(attribute_vocabularies) -> bool: @@ -633,9 +632,6 @@ def load( vocabulary=vocabulary, ) - for v in vectorizers.values(): - v.vocabulary_ = v.vocabulary - return cls(meta, vectorizers) else: return cls(meta) From b6ad85ca836c55a60bb075c2b65db99f57bb716c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 13:47:13 +0200 Subject: [PATCH 057/239] Add cls token to whitespace tokenizer. 
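The marker is only appended when tokenizing the text (and response) attribute, never during intent tokenization. A condensed, illustrative sketch of that guard — not the actual component: plain tuples stand in for Token objects, the literals "text"/"response" stand in for the MESSAGE_TEXT_ATTRIBUTE/MESSAGE_RESPONSE_ATTRIBUTE constants, and the marker offset is len(text) as in this commit (a later commit shifts it by one).

    CLS_TOKEN = "__CLS__"

    def tokenize(text, attribute="text", add_cls_token=True):
        # plain whitespace split with running offsets, as the component computes them
        tokens, running_offset = [], 0
        for word in text.split():
            word_offset = text.index(word, running_offset)
            tokens.append((word, word_offset))
            running_offset = word_offset + len(word)
        if add_cls_token and attribute in ("text", "response"):
            tokens.append((CLS_TOKEN, len(text)))
        return tokens

    assert tokenize("Forecast for lunch") == [
        ("Forecast", 0), ("for", 9), ("lunch", 13), ("__CLS__", 18)]
    assert tokenize("greet", attribute="intent") == [("greet", 0)]  # intents never get the marker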
--- rasa/nlu/constants.py | 2 ++ rasa/nlu/tokenizers/whitespace_tokenizer.py | 18 ++++++++++++++---- tests/nlu/base/test_tokenizers.py | 17 +++++++++++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 9e3b5f7a2e78..ba1f8b9c9a09 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -10,6 +10,8 @@ MESSAGE_NER_FEATURES_ATTRIBUTE = "ner_features" +CLS_TOKEN = "__CLS__" + MESSAGE_ATTRIBUTES = [ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 94179ead2acb..591aca3c05ff 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -6,13 +6,12 @@ from rasa.nlu.tokenizers import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + MESSAGE_RESPONSE_ATTRIBUTE, + CLS_TOKEN, ) @@ -25,8 +24,10 @@ class WhitespaceTokenizer(Tokenizer, Component): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # text will be tokenized with case sensitive as default + # Text will be tokenized with case sensitive as default "case_sensitive": True, + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -40,6 +41,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] self.case_sensitive = self.component_config["case_sensitive"] + self.add_cls_token = self.component_config["add_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -88,9 +90,17 @@ def tokenize( running_offset = 0 tokens = [] + for word in words: word_offset = text.index(word, running_offset) word_len = len(word) running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) + + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and self.add_cls_token + ): + tokens.append(Token(CLS_TOKEN, len(text))) + return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index d6e0f78691e6..de008ef8e62f 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- from unittest.mock import patch + +from rasa.nlu.constants import CLS_TOKEN from rasa.nlu.training_data import TrainingData, Message from tests.nlu import utilities from rasa.nlu import training_data @@ -77,6 +79,21 @@ def test_whitespace(): ] == [0, 83] +def test_whitespace_cls_token(): + from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer + + component_config = {"add_cls_token": True} + + tk = WhitespaceTokenizer(component_config) + + assert [t.text for t in tk.tokenize("Forecast for lunch")] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + + def test_whitespace_custom_intent_symbol(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer From fb24e35f87a8081d06772f88ff874f87a4d254ca Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:01:51 +0200 Subject: [PATCH 058/239] Add cls token to spacy tokenizer. 
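The __CLS__ token is positioned one character past the end of the text, as if it were appended after a separating space. Worked through on the test sentence (a standalone sketch, no spaCy required; the literal values are taken from the unit test in this commit):

    text = "Forecast for lunch"
    last_idx, last_text = 13, "lunch"         # doc[-1].idx and doc[-1].text for this sentence
    cls_offset = last_idx + len(last_text) + 1
    assert cls_offset == 19 == len(text) + 1  # same position the whitespace tokenizer now uses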
--- rasa/nlu/tokenizers/spacy_tokenizer.py | 30 +++++++++++++-------- rasa/nlu/tokenizers/whitespace_tokenizer.py | 2 +- tests/nlu/base/test_tokenizers.py | 18 +++++++++++++ 3 files changed, 38 insertions(+), 12 deletions(-) diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 87443d3375de..e289dcf31c83 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -1,5 +1,5 @@ import typing -from typing import Any +from typing import Any, Dict, Text, List, Optional from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig @@ -7,14 +7,11 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, - MESSAGE_ATTRIBUTES, MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, + CLS_TOKEN, ) if typing.TYPE_CHECKING: @@ -32,6 +29,16 @@ class SpacyTokenizer(Tokenizer, Component): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + defaults = { + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False + } + + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + """Construct a new tokenizer using the SpacyTokenizer framework.""" + super(SpacyTokenizer, self).__init__(component_config) + self.add_cls_token = self.component_config["add_cls_token"] + def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: @@ -47,17 +54,18 @@ def train( MESSAGE_TOKENS_NAMES[attribute], self.tokenize(attribute_doc) ) - def get_doc(self, message, attribute): - + def get_doc(self, message: Message, attribute: Text) -> "Doc": return message.get(MESSAGE_SPACY_FEATURES_NAMES[attribute]) def process(self, message: Message, **kwargs: Any) -> None: - message.set( MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], self.tokenize(self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE)), ) - def tokenize(self, doc: "Doc") -> typing.List[Token]: - - return [Token(t.text, t.idx) for t in doc] + def tokenize(self, doc: "Doc") -> List[Token]: + tokens = [Token(t.text, t.idx) for t in doc] + if self.add_cls_token: + idx = doc[-1].idx + len(doc[-1].text) + 1 + tokens = tokens + [Token(CLS_TOKEN, idx)] + return tokens diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 591aca3c05ff..acf24d7b5334 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -101,6 +101,6 @@ def tokenize( attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] and self.add_cls_token ): - tokens.append(Token(CLS_TOKEN, len(text))) + tokens.append(Token(CLS_TOKEN, len(text) + 1)) return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index de008ef8e62f..5005f8cfb9df 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -92,6 +92,7 @@ def test_whitespace_cls_token(): "lunch", CLS_TOKEN, ] + assert [t.offset for t in tk.tokenize("Forecast for lunch")] == [0, 9, 13, 19] def test_whitespace_custom_intent_symbol(): @@ -207,6 +208,23 @@ def test_spacy(spacy_nlp): assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23] +def test_spacy_add_cls_token(spacy_nlp): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"add_cls_token": True} + + tk = 
SpacyTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19] + + def test_spacy_intent_tokenizer(spacy_nlp_component): from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer From ad64e5014092a0224257f1e9bb98962cc114c866 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:07:42 +0200 Subject: [PATCH 059/239] Add cls token to mitie tokenizer. --- rasa/nlu/tokenizers/mitie_tokenizer.py | 31 ++++++++++++++++++++++---- tests/nlu/base/test_tokenizers.py | 17 ++++++++++++++ 2 files changed, 44 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 559d9920fc23..d28673ec1311 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -1,4 +1,4 @@ -from typing import Any, List, Text +from typing import Any, List, Text, Optional, Dict from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig @@ -7,8 +7,10 @@ from rasa.nlu.constants import ( MESSAGE_TEXT_ATTRIBUTE, + MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, + CLS_TOKEN, ) from rasa.utils.io import DEFAULT_ENCODING @@ -17,6 +19,16 @@ class MitieTokenizer(Tokenizer, Component): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + defaults = { + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False + } + + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + """Construct a new tokenizer using the SpacyTokenizer framework.""" + super(MitieTokenizer, self).__init__(component_config) + self.add_cls_token = self.component_config["add_cls_token"] + @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] @@ -32,7 +44,7 @@ def train( if example.get(attribute) is not None: example.set( MESSAGE_TOKENS_NAMES[attribute], - self.tokenize(example.get(attribute)), + self.tokenize(example.get(attribute), attribute), ) def process(self, message: Message, **kwargs: Any) -> None: @@ -41,13 +53,17 @@ def process(self, message: Message, **kwargs: Any) -> None: MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], self.tokenize(message.text) ) - def _token_from_offset(self, text, offset, encoded_sentence): + def _token_from_offset( + self, text: Text, offset: int, encoded_sentence: bytes + ) -> Token: return Token( text.decode(DEFAULT_ENCODING), self._byte_to_char_offset(encoded_sentence, offset), ) - def tokenize(self, text: Text) -> List[Token]: + def tokenize( + self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE + ) -> List[Token]: import mitie encoded_sentence = text.encode(DEFAULT_ENCODING) @@ -56,6 +72,13 @@ def tokenize(self, text: Text) -> List[Token]: self._token_from_offset(token, offset, encoded_sentence) for token, offset in tokenized ] + + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and self.add_cls_token + ): + tokens.append(Token(CLS_TOKEN, len(text) + 1)) + return tokens @staticmethod diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index 5005f8cfb9df..79566099ea09 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -263,6 +263,23 @@ def test_mitie(): assert [t.offset for t in tk.tokenize(text)] == [0, 4, 13, 16, 20, 23] +def test_mitie_add_cls_token(): + from 
rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer + + component_config = {"add_cls_token": True} + + tk = MitieTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(text)] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13, 19] + + def test_jieba(): from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer From 2ce36d9a05fa9d8a258ec81c816811a28ce21b22 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:13:38 +0200 Subject: [PATCH 060/239] Add cls token to jieba tokenizer. --- rasa/nlu/tokenizers/jieba_tokenizer.py | 18 +++++++++++++++--- rasa/nlu/tokenizers/spacy_tokenizer.py | 2 +- tests/nlu/base/test_tokenizers.py | 18 ++++++++++++++++++ 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index ae9f1e927220..71c17ec0be4f 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -16,8 +16,7 @@ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_VECTOR_FEATURE_NAMES, + CLS_TOKEN, ) logger = logging.getLogger(__name__) @@ -39,6 +38,8 @@ class JiebaTokenizer(Tokenizer, Component): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Add a __cls__ token to the end of the list of tokens + "add_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -61,6 +62,8 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.dictionary_path is not None: self.load_custom_dictionary(self.dictionary_path) + self.add_cls_token = self.component_config["add_cls_token"] + @classmethod def required_packages(cls) -> List[Text]: return ["jieba"] @@ -108,12 +111,21 @@ def preprocess_text(self, text, attribute): else: return text - def tokenize(self, text: Text, attribute=MESSAGE_TEXT_ATTRIBUTE) -> List[Token]: + def tokenize( + self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE + ) -> List[Token]: import jieba text = self.preprocess_text(text, attribute) tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] + + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and self.add_cls_token + ): + tokens.append(Token(CLS_TOKEN, len(text) + 1)) + return tokens @classmethod diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index e289dcf31c83..fac2ba1c3a32 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -67,5 +67,5 @@ def tokenize(self, doc: "Doc") -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] if self.add_cls_token: idx = doc[-1].idx + len(doc[-1].text) + 1 - tokens = tokens + [Token(CLS_TOKEN, idx)] + tokens.append(Token(CLS_TOKEN, idx)) return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index 79566099ea09..07057449ca7d 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -308,3 +308,21 @@ def test_jieba_load_dictionary(tmpdir_factory): tk.tokenize("") mock_method.assert_called_once_with(dictionary_path) + + +def test_jieba_add_cls_token(): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + component_config = {"add_cls_token": True} + + tk = 
JiebaTokenizer(component_config) + + assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [ + "Micheal", + "你好", + "吗", + "?", + CLS_TOKEN, + ] + + assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12] From 3f851995d799c87e9fe0b259fa60b90166f942d2 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:14:57 +0200 Subject: [PATCH 061/239] Add changelog entry. --- CHANGELOG.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index f91d2836966b..2bf099232f05 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,6 +12,7 @@ This project adheres to `Semantic Versioning`_ starting with version 1.0. Added ----- +- Added option ``add_cls_token`` to all tokenizers to add the token ``__CLS__`` to the end of the list of tokens. Changed ------- From 88964a03e7a39b11184d774b26c44b2741d7cf99 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:47:28 +0200 Subject: [PATCH 062/239] move code from init to own file --- rasa/nlu/extractors/crf_entity_extractor.py | 2 +- rasa/nlu/featurizers/mitie_featurizer.py | 2 +- rasa/nlu/test.py | 2 +- rasa/nlu/tokenizers/__init__.py | 16 --------- rasa/nlu/tokenizers/jieba_tokenizer.py | 14 +++----- rasa/nlu/tokenizers/mitie_tokenizer.py | 14 +++----- rasa/nlu/tokenizers/spacy_tokenizer.py | 11 +++--- rasa/nlu/tokenizers/tokenizer.py | 39 +++++++++++++++++++++ rasa/nlu/tokenizers/whitespace_tokenizer.py | 14 +++----- tests/nlu/base/test_tokenizers.py | 8 ++--- 10 files changed, 62 insertions(+), 60 deletions(-) create mode 100644 rasa/nlu/tokenizers/tokenizer.py diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index dfad359e6e75..77548dac85c5 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -6,7 +6,7 @@ from rasa.nlu.config import InvalidConfigError, RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData from rasa.constants import DOCS_BASE_URL diff --git a/rasa/nlu/featurizers/mitie_featurizer.py b/rasa/nlu/featurizers/mitie_featurizer.py index 9d0dbb8f5a7c..1eda72fe2112 100644 --- a/rasa/nlu/featurizers/mitie_featurizer.py +++ b/rasa/nlu/featurizers/mitie_featurizer.py @@ -4,7 +4,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers import Featurizer -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index 9143638590c4..2990a4678548 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -32,7 +32,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Interpreter, Trainer, TrainingData from rasa.nlu.components import Component -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.core.constants import RESPOND_PREFIX logger = logging.getLogger(__name__) diff --git a/rasa/nlu/tokenizers/__init__.py b/rasa/nlu/tokenizers/__init__.py index 8cb8732bf097..e69de29bb2d1 100644 --- a/rasa/nlu/tokenizers/__init__.py +++ b/rasa/nlu/tokenizers/__init__.py @@ -1,16 +0,0 @@ -class Tokenizer(object): - pass - - -class Token(object): - def __init__(self, text, offset, data=None): - self.offset = offset - self.text = text - self.end = 
offset + len(text) - self.data = data if data else {} - - def set(self, prop, info): - self.data[prop] = info - - def get(self, prop, default=None): - return self.data.get(prop, default) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 71c17ec0be4f..3a7291d686ca 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -7,16 +7,14 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - CLS_TOKEN, ) logger = logging.getLogger(__name__) @@ -39,7 +37,7 @@ class JiebaTokenizer(Tokenizer, Component): # Symbol on which intent should be split "intent_split_symbol": "_", # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False, + "use_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -62,7 +60,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.dictionary_path is not None: self.load_custom_dictionary(self.dictionary_path) - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] @classmethod def required_packages(cls) -> List[Text]: @@ -120,11 +118,7 @@ def tokenize( tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] - if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and self.add_cls_token - ): - tokens.append(Token(CLS_TOKEN, len(text) + 1)) + self.add_cls_token(tokens, self.use_cls_token, attribute) return tokens diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index d28673ec1311..4450b804a735 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -2,15 +2,13 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - CLS_TOKEN, ) from rasa.utils.io import DEFAULT_ENCODING @@ -21,13 +19,13 @@ class MitieTokenizer(Tokenizer, Component): defaults = { # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False + "use_cls_token": False } def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(MitieTokenizer, self).__init__(component_config) - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] @classmethod def required_packages(cls) -> List[Text]: @@ -73,11 +71,7 @@ def tokenize( for token, offset in tokenized ] - if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and self.add_cls_token - ): - tokens.append(Token(CLS_TOKEN, len(text) + 1)) + self.add_cls_token(tokens, self.use_cls_token, attribute) return tokens diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py 
b/rasa/nlu/tokenizers/spacy_tokenizer.py index fac2ba1c3a32..432eff0b7c06 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -3,7 +3,7 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( @@ -11,7 +11,6 @@ MESSAGE_TOKENS_NAMES, MESSAGE_SPACY_FEATURES_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, - CLS_TOKEN, ) if typing.TYPE_CHECKING: @@ -31,13 +30,13 @@ class SpacyTokenizer(Tokenizer, Component): defaults = { # Add a __cls__ token to the end of the list of tokens - "add_cls_token": False + "use_cls_token": False } def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(SpacyTokenizer, self).__init__(component_config) - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -65,7 +64,5 @@ def process(self, message: Message, **kwargs: Any) -> None: def tokenize(self, doc: "Doc") -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] - if self.add_cls_token: - idx = doc[-1].idx + len(doc[-1].text) + 1 - tokens.append(Token(CLS_TOKEN, idx)) + self.add_cls_token(tokens, self.use_cls_token) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py new file mode 100644 index 000000000000..4d903822f6f1 --- /dev/null +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -0,0 +1,39 @@ +from typing import Text, List, Optional, Dict, Any + +from rasa.nlu.constants import ( + MESSAGE_RESPONSE_ATTRIBUTE, + MESSAGE_TEXT_ATTRIBUTE, + CLS_TOKEN, +) + + +class Token(object): + def __init__(self, text, offset, data=None): + self.offset = offset + self.text = text + self.end = offset + len(text) + self.data = data if data else {} + + def set(self, prop, info): + self.data[prop] = info + + def get(self, prop, default=None): + return self.data.get(prop, default) + + +class Tokenizer(object): + def add_cls_token( + self, + tokens: List[Token], + use_cls_token: bool, + attribute: Text = MESSAGE_TEXT_ATTRIBUTE, + ) -> List[Token]: + if ( + attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + and use_cls_token + ): + # +1 to have a space between the last token and the __cls__ token + idx = tokens[-1].offset + len(tokens[-1].text) + 1 + tokens.append(Token(CLS_TOKEN, idx)) + + return tokens diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index acf24d7b5334..20a30efe0409 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -3,15 +3,13 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token, Tokenizer +from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_RESPONSE_ATTRIBUTE, - CLS_TOKEN, ) @@ -27,7 +25,7 @@ class WhitespaceTokenizer(Tokenizer, Component): # Text will be tokenized with case sensitive as default "case_sensitive": True, # Add a __cls__ token to the end of 
the list of tokens - "add_cls_token": False, + "use_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -41,7 +39,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] self.case_sensitive = self.component_config["case_sensitive"] - self.add_cls_token = self.component_config["add_cls_token"] + self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -97,10 +95,6 @@ def tokenize( running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) - if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and self.add_cls_token - ): - tokens.append(Token(CLS_TOKEN, len(text) + 1)) + self.add_cls_token(tokens, self.use_cls_token, attribute) return tokens diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/base/test_tokenizers.py index 07057449ca7d..267f24b81e6f 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/base/test_tokenizers.py @@ -82,7 +82,7 @@ def test_whitespace(): def test_whitespace_cls_token(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = WhitespaceTokenizer(component_config) @@ -211,7 +211,7 @@ def test_spacy(spacy_nlp): def test_spacy_add_cls_token(spacy_nlp): from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = SpacyTokenizer(component_config) @@ -266,7 +266,7 @@ def test_mitie(): def test_mitie_add_cls_token(): from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = MitieTokenizer(component_config) @@ -313,7 +313,7 @@ def test_jieba_load_dictionary(tmpdir_factory): def test_jieba_add_cls_token(): from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - component_config = {"add_cls_token": True} + component_config = {"use_cls_token": True} tk = JiebaTokenizer(component_config) From acb7503a2853e123d5020b3c02705c3326ee5689 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 14:49:26 +0200 Subject: [PATCH 063/239] update changelog entry. --- CHANGELOG.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 2bf099232f05..8eb624f51dc4 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -12,7 +12,7 @@ This project adheres to `Semantic Versioning`_ starting with version 1.0. Added ----- -- Added option ``add_cls_token`` to all tokenizers to add the token ``__CLS__`` to the end of the list of tokens. +- Added option ``use_cls_token`` to all tokenizers to add the token ``__CLS__`` to the end of the list of tokens. 
Changed ------- From 3d89a66e52c4c5ed099fc922aaf5aa16ee662b61 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 15:10:49 +0200 Subject: [PATCH 064/239] make use_cls_token a class variable of tokenizer --- rasa/nlu/tokenizers/jieba_tokenizer.py | 4 ++-- rasa/nlu/tokenizers/mitie_tokenizer.py | 4 ++-- rasa/nlu/tokenizers/spacy_tokenizer.py | 4 ++-- rasa/nlu/tokenizers/tokenizer.py | 10 +++++----- rasa/nlu/tokenizers/whitespace_tokenizer.py | 4 ++-- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 3a7291d686ca..bebe59a6f341 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -24,7 +24,7 @@ from rasa.nlu.model import Metadata -class JiebaTokenizer(Tokenizer, Component): +class JiebaTokenizer(Component, Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -118,7 +118,7 @@ def tokenize( tokenized = jieba.tokenize(text) tokens = [Token(word, start) for (word, start, end) in tokenized] - self.add_cls_token(tokens, self.use_cls_token, attribute) + self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 4450b804a735..b4ee25ff7a5a 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -13,7 +13,7 @@ from rasa.utils.io import DEFAULT_ENCODING -class MitieTokenizer(Tokenizer, Component): +class MitieTokenizer(Component, Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -71,7 +71,7 @@ def tokenize( for token, offset in tokenized ] - self.add_cls_token(tokens, self.use_cls_token, attribute) + self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 432eff0b7c06..1784c1e633d3 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -17,7 +17,7 @@ from spacy.tokens.doc import Doc # pytype: disable=import-error -class SpacyTokenizer(Tokenizer, Component): +class SpacyTokenizer(Component, Tokenizer): provides = [ MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES @@ -64,5 +64,5 @@ def process(self, message: Message, **kwargs: Any) -> None: def tokenize(self, doc: "Doc") -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] - self.add_cls_token(tokens, self.use_cls_token) + self.add_cls_token(tokens) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 4d903822f6f1..71d914754fb4 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -22,15 +22,15 @@ def get(self, prop, default=None): class Tokenizer(object): + def __init__(self) -> None: + self.use_cls_token = False + def add_cls_token( - self, - tokens: List[Token], - use_cls_token: bool, - attribute: Text = MESSAGE_TEXT_ATTRIBUTE, + self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE ) -> List[Token]: if ( attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] - and use_cls_token + and self.use_cls_token ): # +1 to have a space between the last token and the __cls__ token idx = tokens[-1].offset + len(tokens[-1].text) + 1 diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 20a30efe0409..18333f41bd79 100644 --- 
a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -13,7 +13,7 @@ ) -class WhitespaceTokenizer(Tokenizer, Component): +class WhitespaceTokenizer(Component, Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -95,6 +95,6 @@ def tokenize( running_offset = word_offset + word_len tokens.append(Token(word, word_offset)) - self.add_cls_token(tokens, self.use_cls_token, attribute) + self.add_cls_token(tokens, attribute) return tokens From 7ed1f27b58c1e0e4182ace744ce4c7450e26d7c7 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 15:30:46 +0200 Subject: [PATCH 065/239] tokenizer inherits from compoenent --- rasa/nlu/tokenizers/jieba_tokenizer.py | 6 +----- rasa/nlu/tokenizers/mitie_tokenizer.py | 8 +------- rasa/nlu/tokenizers/spacy_tokenizer.py | 8 +------- rasa/nlu/tokenizers/tokenizer.py | 12 +++++++++--- rasa/nlu/tokenizers/whitespace_tokenizer.py | 5 +---- tests/nlu/base/test_evaluation.py | 2 +- tests/nlu/base/test_featurizers.py | 2 +- 7 files changed, 15 insertions(+), 28 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index bebe59a6f341..29ed5d999da2 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -24,7 +24,7 @@ from rasa.nlu.model import Metadata -class JiebaTokenizer(Component, Tokenizer): +class JiebaTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -36,8 +36,6 @@ class JiebaTokenizer(Component, Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -60,8 +58,6 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.dictionary_path is not None: self.load_custom_dictionary(self.dictionary_path) - self.use_cls_token = self.component_config["use_cls_token"] - @classmethod def required_packages(cls) -> List[Text]: return ["jieba"] diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index b4ee25ff7a5a..74f8577e2d3f 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -13,19 +13,13 @@ from rasa.utils.io import DEFAULT_ENCODING -class MitieTokenizer(Component, Tokenizer): +class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - defaults = { - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False - } - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(MitieTokenizer, self).__init__(component_config) - self.use_cls_token = self.component_config["use_cls_token"] @classmethod def required_packages(cls) -> List[Text]: diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 1784c1e633d3..3a982479c508 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -17,7 +17,7 @@ from spacy.tokens.doc import Doc # pytype: disable=import-error -class SpacyTokenizer(Component, Tokenizer): +class SpacyTokenizer(Tokenizer): provides = [ MESSAGE_TOKENS_NAMES[attribute] for attribute in 
SPACY_FEATURIZABLE_ATTRIBUTES @@ -28,15 +28,9 @@ class SpacyTokenizer(Component, Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] - defaults = { - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False - } - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: """Construct a new tokenizer using the SpacyTokenizer framework.""" super(SpacyTokenizer, self).__init__(component_config) - self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 71d914754fb4..41e04c844385 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -1,5 +1,6 @@ from typing import Text, List, Optional, Dict, Any +from rasa.nlu.components import Component from rasa.nlu.constants import ( MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, @@ -21,9 +22,14 @@ def get(self, prop, default=None): return self.data.get(prop, default) -class Tokenizer(object): - def __init__(self) -> None: - self.use_cls_token = False +class Tokenizer(Component): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + super(Tokenizer, self).__init__(component_config) + + if "use_cls_token" in self.component_config: + self.use_cls_token = self.component_config["use_cls_token"] + else: + self.use_cls_token = False def add_cls_token( self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 18333f41bd79..3641fb909689 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -13,7 +13,7 @@ ) -class WhitespaceTokenizer(Component, Tokenizer): +class WhitespaceTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] @@ -24,8 +24,6 @@ class WhitespaceTokenizer(Component, Tokenizer): "intent_split_symbol": "_", # Text will be tokenized with case sensitive as default "case_sensitive": True, - # Add a __cls__ token to the end of the list of tokens - "use_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: @@ -39,7 +37,6 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] self.case_sensitive = self.component_config["case_sensitive"] - self.use_cls_token = self.component_config["use_cls_token"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any diff --git a/tests/nlu/base/test_evaluation.py b/tests/nlu/base/test_evaluation.py index e51567cc5c17..240090bbd535 100644 --- a/tests/nlu/base/test_evaluation.py +++ b/tests/nlu/base/test_evaluation.py @@ -39,7 +39,7 @@ from rasa.nlu.test import determine_intersection from rasa.nlu.test import determine_token_labels from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu import utils import json import os diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/base/test_featurizers.py index 0da0ae0f7b79..cd0c8ce3c13a 100644 --- a/tests/nlu/base/test_featurizers.py +++ b/tests/nlu/base/test_featurizers.py @@ -3,7 +3,7 @@ import pytest from rasa.nlu import training_data -from rasa.nlu.tokenizers import Token +from 
rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message From b9e3188b1ace83408dee89b25c09ba31db969ca9 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:00:01 +0200 Subject: [PATCH 066/239] remove not needed init methods --- rasa/nlu/tokenizers/mitie_tokenizer.py | 4 ---- rasa/nlu/tokenizers/spacy_tokenizer.py | 4 ---- 2 files changed, 8 deletions(-) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 74f8577e2d3f..5c19bb108bcd 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -17,10 +17,6 @@ class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: - """Construct a new tokenizer using the SpacyTokenizer framework.""" - super(MitieTokenizer, self).__init__(component_config) - @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 3a982479c508..ffbeff7c2efc 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -28,10 +28,6 @@ class SpacyTokenizer(Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] - def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: - """Construct a new tokenizer using the SpacyTokenizer framework.""" - super(SpacyTokenizer, self).__init__(component_config) - def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: From 787e0471bed6d27c98c4a78f41fb8c6dfefa622b Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:06:46 +0200 Subject: [PATCH 067/239] review comment --- rasa/nlu/tokenizers/spacy_tokenizer.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index ffbeff7c2efc..432f283af1ce 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -40,7 +40,8 @@ def train( if attribute_doc is not None: example.set( - MESSAGE_TOKENS_NAMES[attribute], self.tokenize(attribute_doc) + MESSAGE_TOKENS_NAMES[attribute], + self.tokenize(attribute_doc, attribute), ) def get_doc(self, message: Message, attribute: Text) -> "Doc": @@ -49,10 +50,12 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc": def process(self, message: Message, **kwargs: Any) -> None: message.set( MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], - self.tokenize(self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE)), + self.tokenize( + self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE), MESSAGE_TEXT_ATTRIBUTE + ), ) - def tokenize(self, doc: "Doc") -> List[Token]: + def tokenize(self, doc: "Doc", attribute: Text) -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] - self.add_cls_token(tokens) + self.add_cls_token(tokens, attribute) return tokens From 6fe28f0c1f4bc38f82d3f852b48e6e7ca3c03106 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:26:57 +0200 Subject: [PATCH 068/239] Add use_cls_token to default dict. 
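With the flag in each tokenizer's ``defaults`` dict it can be overridden from the pipeline configuration like any other component setting. Roughly, assuming the usual merge of class-level defaults with the user-supplied component config (a simplified sketch, not the actual ``Component`` code):

    defaults = {"use_cls_token": True}         # as now declared on each tokenizer

    def resolve(component_config=None):
        config = dict(defaults)                # start from the class-level defaults
        config.update(component_config or {})  # pipeline-supplied values win
        return config

    assert resolve()["use_cls_token"] is True
    assert resolve({"use_cls_token": False})["use_cls_token"] is False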
--- rasa/nlu/tokenizers/jieba_tokenizer.py | 2 ++ rasa/nlu/tokenizers/mitie_tokenizer.py | 5 +++++ rasa/nlu/tokenizers/spacy_tokenizer.py | 5 +++++ rasa/nlu/tokenizers/tokenizer.py | 2 +- rasa/nlu/tokenizers/whitespace_tokenizer.py | 2 ++ 5 files changed, 15 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 29ed5d999da2..05d0afc259fb 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -36,6 +36,8 @@ class JiebaTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 5c19bb108bcd..68516bec258f 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -17,6 +17,11 @@ class MitieTokenizer(Tokenizer): provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + defaults = { + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True + } + @classmethod def required_packages(cls) -> List[Text]: return ["mitie"] diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 432f283af1ce..9f061c2b29ec 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -28,6 +28,11 @@ class SpacyTokenizer(Tokenizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + defaults = { + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True + } + def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 41e04c844385..1b786590f010 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -29,7 +29,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: if "use_cls_token" in self.component_config: self.use_cls_token = self.component_config["use_cls_token"] else: - self.use_cls_token = False + self.use_cls_token = True def add_cls_token( self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 3641fb909689..9be597b49a9d 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -24,6 +24,8 @@ class WhitespaceTokenizer(Tokenizer): "intent_split_symbol": "_", # Text will be tokenized with case sensitive as default "case_sensitive": True, + # add __CLS__ token to the end of the list of tokens + "use_cls_token": True, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: From 172c0e5f42fca0648cf6ed7aa5c3227e83c57474 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 18 Oct 2019 16:57:24 +0200 Subject: [PATCH 069/239] thorw key error if use_cls_token is not set as default value. 
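After this change a tokenizer subclass has to declare ``use_cls_token`` in its own ``defaults``; constructing one that does not raises the ``KeyError`` added below. Minimal requirement for a custom tokenizer (``MyTokenizer`` is a hypothetical example; the import path is the ``tokenizer`` module introduced earlier in this series):

    from rasa.nlu.tokenizers.tokenizer import Tokenizer

    class MyTokenizer(Tokenizer):     # hypothetical custom tokenizer
        defaults = {
            # leaving this key out makes Tokenizer.__init__ raise
            # "No default value for 'use_cls_token' was set. ..." on construction
            "use_cls_token": True,
        }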
--- rasa/nlu/tokenizers/mitie_tokenizer.py | 3 +- rasa/nlu/tokenizers/spacy_tokenizer.py | 7 +- rasa/nlu/tokenizers/tokenizer.py | 13 +- rasa/nlu/tokenizers/whitespace_tokenizer.py | 1 - tests/nlu/tokenizers/__init__.py | 0 tests/nlu/tokenizers/test_jieba_tokenizer.py | 53 ++++++ tests/nlu/tokenizers/test_mitie_tokenizer.py | 41 +++++ tests/nlu/tokenizers/test_spacy_tokenizer.py | 65 +++++++ .../test_whitespace_tokenizer.py} | 168 ++---------------- 9 files changed, 186 insertions(+), 165 deletions(-) create mode 100644 tests/nlu/tokenizers/__init__.py create mode 100644 tests/nlu/tokenizers/test_jieba_tokenizer.py create mode 100644 tests/nlu/tokenizers/test_mitie_tokenizer.py create mode 100644 tests/nlu/tokenizers/test_spacy_tokenizer.py rename tests/nlu/{base/test_tokenizers.py => tokenizers/test_whitespace_tokenizer.py} (52%) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 68516bec258f..ff9dced7e413 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -1,6 +1,5 @@ -from typing import Any, List, Text, Optional, Dict +from typing import Any, List, Text -from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 9f061c2b29ec..0589af320787 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -1,7 +1,6 @@ import typing -from typing import Any, Dict, Text, List, Optional +from typing import Any, Text, List -from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData @@ -60,7 +59,9 @@ def process(self, message: Message, **kwargs: Any) -> None: ), ) - def tokenize(self, doc: "Doc", attribute: Text) -> List[Token]: + def tokenize( + self, doc: "Doc", attribute: Text = MESSAGE_TEXT_ATTRIBUTE + ) -> List[Token]: tokens = [Token(t.text, t.idx) for t in doc] self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 1b786590f010..c1b41ad0bf33 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -1,3 +1,5 @@ +import logging + from typing import Text, List, Optional, Dict, Any from rasa.nlu.components import Component @@ -7,6 +9,8 @@ CLS_TOKEN, ) +logger = logging.getLogger(__name__) + class Token(object): def __init__(self, text, offset, data=None): @@ -26,10 +30,13 @@ class Tokenizer(Component): def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: super(Tokenizer, self).__init__(component_config) - if "use_cls_token" in self.component_config: + try: self.use_cls_token = self.component_config["use_cls_token"] - else: - self.use_cls_token = True + except KeyError: + raise KeyError( + "No default value for 'use_cls_token' was set. Please, " + "add it to the default dict of the tokenizer." 
+ ) def add_cls_token( self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 9be597b49a9d..c129e97c8fd9 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -1,7 +1,6 @@ import re from typing import Any, Dict, List, Text -from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData diff --git a/tests/nlu/tokenizers/__init__.py b/tests/nlu/tokenizers/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/tokenizers/test_jieba_tokenizer.py b/tests/nlu/tokenizers/test_jieba_tokenizer.py new file mode 100644 index 000000000000..7df57c5bfcd1 --- /dev/null +++ b/tests/nlu/tokenizers/test_jieba_tokenizer.py @@ -0,0 +1,53 @@ +from unittest.mock import patch + +from rasa.nlu.constants import CLS_TOKEN + + +def test_jieba(): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + component_config = {"use_cls_token": False} + + tk = JiebaTokenizer(component_config) + + assert [t.text for t in tk.tokenize("我想去吃兰州拉面")] == ["我", "想", "去", "吃", "兰州", "拉面"] + + assert [t.offset for t in tk.tokenize("我想去吃兰州拉面")] == [0, 1, 2, 3, 4, 6] + + assert [t.text for t in tk.tokenize("Micheal你好吗?")] == ["Micheal", "你好", "吗", "?"] + + assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10] + + +def test_jieba_load_dictionary(tmpdir_factory): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath + + component_config = {"dictionary_path": dictionary_path, "use_cls_token": False} + + with patch.object( + JiebaTokenizer, "load_custom_dictionary", return_value=None + ) as mock_method: + tk = JiebaTokenizer(component_config) + tk.tokenize("") + + mock_method.assert_called_once_with(dictionary_path) + + +def test_jieba_add_cls_token(): + from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer + + component_config = {"use_cls_token": True} + + tk = JiebaTokenizer(component_config) + + assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [ + "Micheal", + "你好", + "吗", + "?", + CLS_TOKEN, + ] + + assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12] diff --git a/tests/nlu/tokenizers/test_mitie_tokenizer.py b/tests/nlu/tokenizers/test_mitie_tokenizer.py new file mode 100644 index 000000000000..41774fb9a440 --- /dev/null +++ b/tests/nlu/tokenizers/test_mitie_tokenizer.py @@ -0,0 +1,41 @@ +from rasa.nlu.constants import CLS_TOKEN + + +def test_mitie(): + from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer + + component_config = {"use_cls_token": False} + + tk = MitieTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(text)] == ["Forecast", "for", "lunch"] + assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13] + + text = "hey ńöñàśçií how're you?" 
+ assert [t.text for t in tk.tokenize(text)] == [ + "hey", + "ńöñàśçií", + "how", + "'re", + "you", + "?", + ] + assert [t.offset for t in tk.tokenize(text)] == [0, 4, 13, 16, 20, 23] + + +def test_mitie_add_cls_token(): + from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer + + component_config = {"use_cls_token": True} + + tk = MitieTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(text)] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13, 19] diff --git a/tests/nlu/tokenizers/test_spacy_tokenizer.py b/tests/nlu/tokenizers/test_spacy_tokenizer.py new file mode 100644 index 000000000000..9748f4fd8fcc --- /dev/null +++ b/tests/nlu/tokenizers/test_spacy_tokenizer.py @@ -0,0 +1,65 @@ +from rasa.nlu.constants import CLS_TOKEN +from rasa.nlu import training_data + + +def test_spacy(spacy_nlp): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"use_cls_token": False} + + tk = SpacyTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "Forecast", + "for", + "lunch", + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13] + + text = "hey ńöñàśçií how're you?" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "hey", + "ńöñàśçií", + "how", + "'re", + "you", + "?", + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23] + + +def test_spacy_add_cls_token(spacy_nlp): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"use_cls_token": True} + + tk = SpacyTokenizer(component_config) + + text = "Forecast for lunch" + assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ + "Forecast", + "for", + "lunch", + CLS_TOKEN, + ] + assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19] + + +def test_spacy_intent_tokenizer(spacy_nlp_component): + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + + component_config = {"use_cls_token": False} + + td = training_data.load_data("data/examples/rasa/demo-rasa.json") + spacy_nlp_component.train(td, config=None) + spacy_tokenizer = SpacyTokenizer(component_config) + spacy_tokenizer.train(td, config=None) + + intent_tokens_exist = [ + True if example.get("intent_tokens") is not None else False + for example in td.intent_examples + ] + + # no intent tokens should have been set + assert not any(intent_tokens_exist) diff --git a/tests/nlu/base/test_tokenizers.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py similarity index 52% rename from tests/nlu/base/test_tokenizers.py rename to tests/nlu/tokenizers/test_whitespace_tokenizer.py index 267f24b81e6f..27c2c6b171f6 100644 --- a/tests/nlu/base/test_tokenizers.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -1,17 +1,14 @@ -# -*- coding: utf-8 -*- - -from unittest.mock import patch - from rasa.nlu.constants import CLS_TOKEN from rasa.nlu.training_data import TrainingData, Message from tests.nlu import utilities -from rasa.nlu import training_data def test_whitespace(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - tk = WhitespaceTokenizer() + component_config = {"use_cls_token": False} + + tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for lunch")] == [ "Forecast", @@ -98,7 +95,11 @@ def test_whitespace_cls_token(): def test_whitespace_custom_intent_symbol(): from 
rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"intent_tokenization_flag": True, "intent_split_symbol": "+"} + component_config = { + "intent_tokenization_flag": True, + "intent_split_symbol": "+", + "use_cls_token": False, + } tk = WhitespaceTokenizer(component_config) @@ -116,7 +117,7 @@ def test_whitespace_custom_intent_symbol(): def test_whitespace_with_case(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"case_sensitive": False} + component_config = {"case_sensitive": False, "use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ "forecast", @@ -124,7 +125,7 @@ def test_whitespace_with_case(): "lunch", ] - component_config = {"case_sensitive": True} + component_config = {"case_sensitive": True, "use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ "Forecast", @@ -132,7 +133,7 @@ def test_whitespace_with_case(): "LUNCH", ] - component_config = {} + component_config = {"use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ "Forecast", @@ -140,7 +141,7 @@ def test_whitespace_with_case(): "LUNCH", ] - component_config = {"case_sensitive": False} + component_config = {"case_sensitive": False, "use_cls_token": False} tk = WhitespaceTokenizer(component_config) message = Message("Forecast for LUNCH") tk.process(message) @@ -181,148 +182,3 @@ def test_whitespace_with_case(): assert examples[1].data.get("tokens")[0].text == "i" assert examples[1].data.get("tokens")[1].text == "want" assert examples[1].data.get("tokens")[2].text == "tacos" - - -def test_spacy(spacy_nlp): - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - - tk = SpacyTokenizer() - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ - "Forecast", - "for", - "lunch", - ] - assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13] - - text = "hey ńöñàśçií how're you?" 
- assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ - "hey", - "ńöñàśçií", - "how", - "'re", - "you", - "?", - ] - assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 4, 13, 16, 20, 23] - - -def test_spacy_add_cls_token(spacy_nlp): - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - - component_config = {"use_cls_token": True} - - tk = SpacyTokenizer(component_config) - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(spacy_nlp(text))] == [ - "Forecast", - "for", - "lunch", - CLS_TOKEN, - ] - assert [t.offset for t in tk.tokenize(spacy_nlp(text))] == [0, 9, 13, 19] - - -def test_spacy_intent_tokenizer(spacy_nlp_component): - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - - td = training_data.load_data("data/examples/rasa/demo-rasa.json") - spacy_nlp_component.train(td, config=None) - spacy_tokenizer = SpacyTokenizer() - spacy_tokenizer.train(td, config=None) - - intent_tokens_exist = [ - True if example.get("intent_tokens") is not None else False - for example in td.intent_examples - ] - - # no intent tokens should have been set - assert not any(intent_tokens_exist) - - -def test_mitie(): - from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer - - tk = MitieTokenizer() - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(text)] == ["Forecast", "for", "lunch"] - assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13] - - text = "hey ńöñàśçií how're you?" - assert [t.text for t in tk.tokenize(text)] == [ - "hey", - "ńöñàśçií", - "how", - "'re", - "you", - "?", - ] - assert [t.offset for t in tk.tokenize(text)] == [0, 4, 13, 16, 20, 23] - - -def test_mitie_add_cls_token(): - from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer - - component_config = {"use_cls_token": True} - - tk = MitieTokenizer(component_config) - - text = "Forecast for lunch" - assert [t.text for t in tk.tokenize(text)] == [ - "Forecast", - "for", - "lunch", - CLS_TOKEN, - ] - assert [t.offset for t in tk.tokenize(text)] == [0, 9, 13, 19] - - -def test_jieba(): - from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - - tk = JiebaTokenizer() - - assert [t.text for t in tk.tokenize("我想去吃兰州拉面")] == ["我", "想", "去", "吃", "兰州", "拉面"] - - assert [t.offset for t in tk.tokenize("我想去吃兰州拉面")] == [0, 1, 2, 3, 4, 6] - - assert [t.text for t in tk.tokenize("Micheal你好吗?")] == ["Micheal", "你好", "吗", "?"] - - assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10] - - -def test_jieba_load_dictionary(tmpdir_factory): - from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - - dictionary_path = tmpdir_factory.mktemp("jieba_custom_dictionary").strpath - - component_config = {"dictionary_path": dictionary_path} - - with patch.object( - JiebaTokenizer, "load_custom_dictionary", return_value=None - ) as mock_method: - tk = JiebaTokenizer(component_config) - tk.tokenize("") - - mock_method.assert_called_once_with(dictionary_path) - - -def test_jieba_add_cls_token(): - from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer - - component_config = {"use_cls_token": True} - - tk = JiebaTokenizer(component_config) - - assert [t.text for t in tk.tokenize("Micheal你好吗?")] == [ - "Micheal", - "你好", - "吗", - "?", - CLS_TOKEN, - ] - - assert [t.offset for t in tk.tokenize("Micheal你好吗?")] == [0, 7, 9, 10, 12] From 45a5868048a44fb580673c7e1e015e1e9ef92e60 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Sun, 20 Oct 2019 14:54:37 +0200 Subject: [PATCH 070/239] Disable cls token use in default pipeline. 
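Note: the components that follow the tokenizers in these default pipelines do not consume the extra __CLS__ token yet, which is why the diff below switches the flag off explicitly in the example bot configs and in the registered pipeline templates. As a sketch only (mirroring the registry entries changed below; a hypothetical user override, not part of this patch), a pipeline that wanted to keep the token would set the same flag back on:

    pipeline = [
        {"name": "WhitespaceTokenizer", "use_cls_token": True},
        # ... remaining components unchanged ...
    ]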
--- examples/formbot/config.yml | 1 + examples/restaurantbot/config.yml | 1 + rasa/nlu/registry.py | 4 ++-- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/examples/formbot/config.yml b/examples/formbot/config.yml index 3aa0e7577759..3cf3f4a14fe5 100644 --- a/examples/formbot/config.yml +++ b/examples/formbot/config.yml @@ -2,6 +2,7 @@ language: en pipeline: - name: WhitespaceTokenizer + use_cls_token: False - name: CRFEntityExtractor - name: EntitySynonymMapper - name: CountVectorsFeaturizer diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml index 58e9f0be7209..fcb2086a50e1 100644 --- a/examples/restaurantbot/config.yml +++ b/examples/restaurantbot/config.yml @@ -3,6 +3,7 @@ language: en pipeline: - name: "SpacyNLP" - name: "SpacyTokenizer" + use_cls_token: False - name: "SpacyFeaturizer" - name: "SklearnIntentClassifier" - name: "CRFEntityExtractor" diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index 0d79360edd3e..2ec7aad0a0e1 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -105,7 +105,7 @@ registered_pipeline_templates = { "pretrained_embeddings_spacy": [ {"name": "SpacyNLP"}, - {"name": "SpacyTokenizer"}, + {"name": "SpacyTokenizer", "use_cls_token": False}, {"name": "SpacyFeaturizer"}, {"name": "RegexFeaturizer"}, {"name": "CRFEntityExtractor"}, @@ -114,7 +114,7 @@ ], "keyword": [{"name": "KeywordIntentClassifier"}], "supervised_embeddings": [ - {"name": "WhitespaceTokenizer"}, + {"name": "WhitespaceTokenizer", "use_cls_token": False}, {"name": "RegexFeaturizer"}, {"name": "CRFEntityExtractor"}, {"name": "EntitySynonymMapper"}, From 7c9c679afe77f9a256384868af303ba766778fb3 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Sun, 20 Oct 2019 14:57:11 +0200 Subject: [PATCH 071/239] correct type --- rasa/nlu/tokenizers/mitie_tokenizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index ff9dced7e413..e17d49c3bab7 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -46,7 +46,7 @@ def process(self, message: Message, **kwargs: Any) -> None: ) def _token_from_offset( - self, text: Text, offset: int, encoded_sentence: bytes + self, text: bytes, offset: int, encoded_sentence: bytes ) -> Token: return Token( text.decode(DEFAULT_ENCODING), From dfeca3ed0060e841ea289dcde30ee84e499a4fac Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 21 Oct 2019 09:39:50 +0200 Subject: [PATCH 072/239] fix tests --- tests/nlu/base/test_config.py | 2 +- tests/nlu/base/test_featurizers.py | 6 ++++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/nlu/base/test_config.py b/tests/nlu/base/test_config.py index be729075adb3..f6453e49404e 100644 --- a/tests/nlu/base/test_config.py +++ b/tests/nlu/base/test_config.py @@ -65,7 +65,7 @@ def test_set_attr_on_component(): cfg = config.load("sample_configs/config_pretrained_embeddings_spacy.yml") cfg.set_component_attr(6, C=324) - assert cfg.for_component(1) == {"name": "SpacyTokenizer"} + assert cfg.for_component(1) == {"name": "SpacyTokenizer", "use_cls_token": False} assert cfg.for_component(6) == {"name": "SklearnIntentClassifier", "C": 324} diff --git a/tests/nlu/base/test_featurizers.py b/tests/nlu/base/test_featurizers.py index cd0c8ce3c13a..d67c905deab6 100644 --- a/tests/nlu/base/test_featurizers.py +++ b/tests/nlu/base/test_featurizers.py @@ -117,7 +117,8 @@ def test_mitie_featurizer(mitie_feature_extractor, 
default_config): mitie_component_config = {"name": "MitieFeaturizer"} ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) sentence = "Hey how are you today" - tokens = MitieTokenizer().tokenize(sentence) + mitie_component_config = {"name": "MitieTokenizer", "use_cls_token": False} + tokens = MitieTokenizer(mitie_component_config).tokenize(sentence) vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor) expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) assert np.allclose(vecs[:5], expected, atol=1e-5) @@ -212,7 +213,8 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): ftr = RegexFeaturizer(lookup_tables=lookups) # adds tokens to the message - tokenizer = SpacyTokenizer() + component_config = {"name": "SpacyTokenizer", "use_cls_token": False} + tokenizer = SpacyTokenizer(component_config) message = Message(sentence) message.set("spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) From ce915975980d6c963276a70dcd7019ef8bc0cd9b Mon Sep 17 00:00:00 2001 From: Tanja Date: Wed, 23 Oct 2019 15:12:46 +0200 Subject: [PATCH 073/239] Update rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py Co-Authored-By: Vladimir Vlasov --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 3e4abf6eb1a8..a7a667a9f590 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -26,7 +26,7 @@ class CountVectorsFeaturizer(Featurizer): """ Creates a sequence of features - using sklearn's `CountVectorizer`. + Based on sklearn's `CountVectorizer`. All tokens which consist only of digits (e.g. 123 and 99 but not ab12d) will be represented by a single feature. From f69673a88226df2a282850d96056c75ee0d105a9 Mon Sep 17 00:00:00 2001 From: Tanja Date: Wed, 23 Oct 2019 15:17:40 +0200 Subject: [PATCH 074/239] Update rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py Co-Authored-By: Vladimir Vlasov --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index a7a667a9f590..dbc4fae14f26 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -25,7 +25,7 @@ class CountVectorsFeaturizer(Featurizer): """ - Creates a sequence of features + Creates a sequence of token counts features Based on sklearn's `CountVectorizer`. All tokens which consist only of digits (e.g. 123 and 99 but not ab12d) will be represented by a single feature. 
From 78d4d5113d888941c5052e05da94da048be49a5c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 15:22:54 +0200 Subject: [PATCH 075/239] review comments --- .../sparse_featurizer/regex_featurizer.py | 51 ++++++++----------- .../nlu/featurizers/test_regex_featurizer.py | 5 +- 2 files changed, 25 insertions(+), 31 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 591018b17505..47e814d705be 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -59,56 +59,49 @@ def process(self, message: Message, **kwargs: Any) -> None: def _text_features_with_regex(self, message: Message) -> None: if self.known_patterns: - extras = self.features_for_patterns(message) - features = self._combine_with_existing_sparse_features(message, extras) - else: - features = message.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) - - message.set( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], features - ) + for attribute in [MESSAGE_TEXT_ATTRIBUTE, MESSAGE_RESPONSE_ATTRIBUTE]: + extras = self._features_for_patterns(message, attribute) + features = self._combine_with_existing_sparse_features(message, extras) + message.set(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], features) def _add_lookup_table_regexes( self, lookup_tables: List[Dict[Text, Union[Text, List]]] ) -> None: - # appends the regex features from the lookup tables to - # self.known_patterns + """appends the regex features from the lookup tables to self.known_patterns""" for table in lookup_tables: regex_pattern = self._generate_lookup_regex(table) lookup_regex = {"name": table["name"], "pattern": regex_pattern} self.known_patterns.append(lookup_regex) - def features_for_patterns(self, message: Message) -> scipy.sparse.csr_matrix: + def _features_for_patterns( + self, message: Message, attribute: Text + ) -> scipy.sparse.csr_matrix: """Checks which known patterns match the message. Given a sentence, returns a vector of {1,0} values indicating which regexes did match. 
Furthermore, if the message is tokenized, the function will mark all tokens with a dict relating the name of the regex to whether it was matched.""" + tokens = message.get(MESSAGE_TOKENS_NAMES[attribute], []) - for attribute in [MESSAGE_TEXT_ATTRIBUTE, MESSAGE_RESPONSE_ATTRIBUTE]: - tokens = message.get(MESSAGE_TOKENS_NAMES[attribute], []) - - vec = np.zeros([len(tokens), len(self.known_patterns)]) + vec = np.zeros([len(tokens), len(self.known_patterns)]) - for pattern_index, pattern in enumerate(self.known_patterns): - matches = re.finditer(pattern["pattern"], message.text) - matches = list(matches) + for pattern_index, pattern in enumerate(self.known_patterns): + matches = re.finditer(pattern["pattern"], message.text) + matches = list(matches) - for token_index, t in enumerate(tokens): - patterns = t.get("pattern", default={}) - patterns[pattern["name"]] = False + for token_index, t in enumerate(tokens): + patterns = t.get("pattern", default={}) + patterns[pattern["name"]] = False - for match in matches: - if t.offset < match.end() and t.end > match.start(): - patterns[pattern["name"]] = True - vec[token_index][pattern_index] = 1.0 + for match in matches: + if t.offset < match.end() and t.end > match.start(): + patterns[pattern["name"]] = True + vec[token_index][pattern_index] = 1.0 - t.set("pattern", patterns) + t.set("pattern", patterns) - return scipy.sparse.csr_matrix(vec) + return scipy.sparse.csr_matrix(vec) def _generate_lookup_regex( self, lookup_table: Dict[Text, Union[Text, List]] diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 5e8ea40ca8f9..fc495cd122b6 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -1,6 +1,7 @@ import numpy as np import pytest +from rasa.nlu.constants import MESSAGE_TEXT_ATTRIBUTE from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message @@ -58,7 +59,7 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): message.set("spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) - result = ftr.features_for_patterns(message) + result = ftr._features_for_patterns(message, MESSAGE_TEXT_ATTRIBUTE) assert np.allclose(result.toarray(), expected, atol=1e-10) # the tokenizer should have added tokens @@ -106,7 +107,7 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): message.set("spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) - result = ftr.features_for_patterns(message) + result = ftr._features_for_patterns(message, MESSAGE_TEXT_ATTRIBUTE) print(result.toarray()) assert np.allclose(result.toarray(), expected, atol=1e-10) From 411d328ab621e6035b2b6170caba2726698103c7 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 15:33:44 +0200 Subject: [PATCH 076/239] test regex featurizer on response --- rasa/nlu/tokenizers/spacy_tokenizer.py | 11 +++++------ tests/nlu/featurizers/test_regex_featurizer.py | 18 +++++++++++++++--- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index f4ca504ad653..f226861c0bfa 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -52,12 +52,11 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc": return message.get(MESSAGE_SPACY_FEATURES_NAMES[attribute]) def process(self, message: Message, **kwargs: Any) -> None: - message.set( - 
MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], - self.tokenize( - self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE), MESSAGE_TEXT_ATTRIBUTE - ), - ) + for attribute in SPACY_FEATURIZABLE_ATTRIBUTES: + message.set( + MESSAGE_TOKENS_NAMES[attribute], + self.tokenize(self.get_doc(message, attribute), attribute), + ) def tokenize( self, doc: "Doc", attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index fc495cd122b6..53cfdd5c164f 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -1,7 +1,11 @@ import numpy as np import pytest -from rasa.nlu.constants import MESSAGE_TEXT_ATTRIBUTE +from rasa.nlu.constants import ( + MESSAGE_TEXT_ATTRIBUTE, + MESSAGE_RESPONSE_ATTRIBUTE, + MESSAGE_SPACY_FEATURES_NAMES, +) from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message @@ -55,13 +59,21 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): # adds tokens to the message tokenizer = SpacyTokenizer({"use_cls_token": False}) - message = Message(sentence) - message.set("spacy_doc", spacy_nlp(sentence)) + message = Message(sentence, data={MESSAGE_RESPONSE_ATTRIBUTE: sentence}) + message.set( + MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE], spacy_nlp(sentence) + ) + message.set( + MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_RESPONSE_ATTRIBUTE], spacy_nlp(sentence) + ) tokenizer.process(message) result = ftr._features_for_patterns(message, MESSAGE_TEXT_ATTRIBUTE) assert np.allclose(result.toarray(), expected, atol=1e-10) + result = ftr._features_for_patterns(message, MESSAGE_RESPONSE_ATTRIBUTE) + assert np.allclose(result.toarray(), expected, atol=1e-10) + # the tokenizer should have added tokens assert len(message.get("tokens", [])) > 0 # the number of regex matches on each token should match From 884e2b37ac284a2d34491bafc1a0e34e5934710f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 16:02:54 +0200 Subject: [PATCH 077/239] review comments --- .../sparse_featurizer/count_vectors_featurizer.py | 5 ++--- .../sparse_featurizer/regex_featurizer.py | 14 +++++++------- rasa/nlu/tokenizers/spacy_tokenizer.py | 11 ++++++----- tests/nlu/featurizers/test_regex_featurizer.py | 6 ------ 4 files changed, 15 insertions(+), 21 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index dbc4fae14f26..f714f63e2eb2 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -24,9 +24,8 @@ class CountVectorsFeaturizer(Featurizer): - """ - Creates a sequence of token counts features - Based on sklearn's `CountVectorizer`. + """Creates a sequence of token counts features + based on sklearn's `CountVectorizer`. All tokens which consist only of digits (e.g. 123 and 99 but not ab12d) will be represented by a single feature. 
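An illustrative aside on the digit handling mentioned in the docstring above: collapsing purely numeric tokens before counting is what makes "123" and "99" share a single feature while "ab12d" keeps its own. A minimal sketch of such a normalisation step (the placeholder name and the regex are illustrative, not taken from the featurizer code):

    import re

    tokens = ["table", "for", "2", "people", "at", "8"]
    # Tokens made up only of digits map onto one shared placeholder, so "2" and
    # "8" (or "123" and "99") are counted under the same feature, while mixed
    # tokens such as "ab12d" keep their own feature.
    normalised = ["__NUMBER__" if re.fullmatch(r"[0-9]+", t) else t for t in tokens]
    # -> ['table', 'for', '__NUMBER__', 'people', 'at', '__NUMBER__']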
diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 47e814d705be..77d59f61b2a4 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -52,17 +52,17 @@ def train( self._add_lookup_table_regexes(training_data.lookup_tables) for example in training_data.training_examples: - self._text_features_with_regex(example) + for attribute in [MESSAGE_TEXT_ATTRIBUTE, MESSAGE_RESPONSE_ATTRIBUTE]: + self._text_features_with_regex(example, attribute) def process(self, message: Message, **kwargs: Any) -> None: - self._text_features_with_regex(message) + self._text_features_with_regex(message, MESSAGE_TEXT_ATTRIBUTE) - def _text_features_with_regex(self, message: Message) -> None: + def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if self.known_patterns: - for attribute in [MESSAGE_TEXT_ATTRIBUTE, MESSAGE_RESPONSE_ATTRIBUTE]: - extras = self._features_for_patterns(message, attribute) - features = self._combine_with_existing_sparse_features(message, extras) - message.set(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], features) + extras = self._features_for_patterns(message, attribute) + features = self._combine_with_existing_sparse_features(message, extras) + message.set(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], features) def _add_lookup_table_regexes( self, lookup_tables: List[Dict[Text, Union[Text, List]]] diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index f226861c0bfa..f4ca504ad653 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -52,11 +52,12 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc": return message.get(MESSAGE_SPACY_FEATURES_NAMES[attribute]) def process(self, message: Message, **kwargs: Any) -> None: - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES: - message.set( - MESSAGE_TOKENS_NAMES[attribute], - self.tokenize(self.get_doc(message, attribute), attribute), - ) + message.set( + MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], + self.tokenize( + self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE), MESSAGE_TEXT_ATTRIBUTE + ), + ) def tokenize( self, doc: "Doc", attribute: Text = MESSAGE_TEXT_ATTRIBUTE diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 53cfdd5c164f..40a9f70b4f37 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -63,17 +63,11 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): message.set( MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE], spacy_nlp(sentence) ) - message.set( - MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_RESPONSE_ATTRIBUTE], spacy_nlp(sentence) - ) tokenizer.process(message) result = ftr._features_for_patterns(message, MESSAGE_TEXT_ATTRIBUTE) assert np.allclose(result.toarray(), expected, atol=1e-10) - result = ftr._features_for_patterns(message, MESSAGE_RESPONSE_ATTRIBUTE) - assert np.allclose(result.toarray(), expected, atol=1e-10) - # the tokenizer should have added tokens assert len(message.get("tokens", [])) > 0 # the number of regex matches on each token should match From e857c35c02209568c416328f0c8a8fa3c798de7a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 08:56:57 +0200 Subject: [PATCH 078/239] switch from ner to sparse features --- rasa/nlu/extractors/crf_entity_extractor.py 
| 33 ++++++++----------- .../count_vectors_featurizer.py | 3 +- 2 files changed, 15 insertions(+), 21 deletions(-) diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 354e2c4a8250..a3a2dcd2fef2 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -79,7 +79,7 @@ class CRFEntityExtractor(EntityExtractor): "upper": lambda doc: doc[0].isupper(), # pytype: disable=attribute-error "digit": lambda doc: doc[0].isdigit(), # pytype: disable=attribute-error "pattern": lambda doc: doc[3], - "ner_features": lambda doc: doc[4], + "sparse_features": lambda doc: doc[4], } def __init__( @@ -95,8 +95,6 @@ def __init__( self._validate_configuration() self._check_pos_features_and_spacy() - # possibly add a check here to ensure ner_features iff custom_extractor - self._check_ner_features() def _check_pos_features_and_spacy(self): import itertools @@ -117,13 +115,6 @@ def _check_spacy(): "instructions." ) - def _check_ner_features(self): - import itertools - - features = self.component_config.get("features", []) - used_features = set(itertools.chain.from_iterable(features)) - self.use_ner_features = "ner_features" in used_features - def _validate_configuration(self): if len(self.component_config.get("features", [])) % 2 != 1: raise ValueError( @@ -596,14 +587,14 @@ def __tag_of_token(token): @staticmethod def __additional_ner_features(message: Message) -> List[Any]: - # TODO use sparse text features - features = message.get("ner_features", []) + features = message.get("text_sparse_features", []) tokens = message.get("tokens", []) if len(tokens) != len(features): - warn_string = "Number of custom NER features ({}) does not match number of tokens ({})".format( + warn_string = "Number of sparse features ({}) does not match number of tokens ({})".format( len(features), len(tokens) ) raise Exception(warn_string) + # convert to python-crfsuite feature format features_out = [] for feature in features: @@ -611,7 +602,7 @@ def __additional_ner_features(message: Message) -> List[Any]: str(index): token_features for index, token_features in enumerate(feature) } - converted = {"custom_ner_features": feature_dict} + converted = {"sparse_features": feature_dict} features_out.append(converted) return features_out @@ -633,15 +624,19 @@ def _from_text_to_crf( tokens = message.get("spacy_doc") else: tokens = message.get("tokens") - ner_features = ( - self.__additional_ner_features(message) if self.use_ner_features else None - ) + + sparse_features = message.get("text_sparse_features") + for i, token in enumerate(tokens): pattern = self.__pattern_of_token(message, i) entity = entities[i] if entities else "N/A" tag = self.__tag_of_token(token) if self.pos_features else None - custom_ner_features = ner_features[i] if self.use_ner_features else None - crf_format.append((token.text, tag, entity, pattern, custom_ner_features)) + token_sparse_features = ( + sparse_features[i] if sparse_features is not None else [] + ) + + crf_format.append((token.text, tag, entity, pattern, token_sparse_features)) + return crf_format def _train_model( diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index f714f63e2eb2..63207956e03f 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -275,10 +275,9 @@ def _get_message_tokens_by_attribute( 
message: "Message", attribute: Text ) -> List[Text]: """Get text tokens of an attribute of a message""" - if message.get( MESSAGE_TOKENS_NAMES[attribute] - ): # if directly tokens is provided + ): return [t.lemma for t in message.get(MESSAGE_TOKENS_NAMES[attribute])] return message.get(attribute).split() From 01b4de6aa4b29448850f4eadfca08454df1c7957 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 10:50:56 +0200 Subject: [PATCH 079/239] add seq to senntence embedding method --- rasa/nlu/classifiers/__init__.py | 20 ------------------- .../embedding_intent_classifier.py | 14 +++++++++---- .../classifiers/sklearn_intent_classifier.py | 7 ++++--- rasa/nlu/extractors/crf_entity_extractor.py | 4 ++-- rasa/nlu/featurizers/featurzier.py | 19 ++++++++++++++++-- .../count_vectors_featurizer.py | 7 +------ tests/nlu/featurizers/test_featurizer.py | 18 ++++++++++++++++- 7 files changed, 51 insertions(+), 38 deletions(-) diff --git a/rasa/nlu/classifiers/__init__.py b/rasa/nlu/classifiers/__init__.py index f1613a979c42..ae7b52d8840a 100644 --- a/rasa/nlu/classifiers/__init__.py +++ b/rasa/nlu/classifiers/__init__.py @@ -1,23 +1,3 @@ # How many labels are at max put into the output # ranking, everything else will be cut off LABEL_RANKING_LENGTH = 10 - -import scipy.sparse - - -# TODO should be removed in next PR -def convert_sparse_back(sparse_features: scipy.sparse.csr_matrix): - import numpy as np - - if sparse_features is not None: - return np.sum(sparse_features.toarray(), axis=0) - return None - - -# TODO should be removed in next PR -def convert_dense_back(dense_features: scipy.sparse.csr_matrix): - import numpy as np - - if dense_features is not None: - return np.sum(dense_features, axis=0) - return None diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index dfb68ab32d22..13170662eb7b 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -6,7 +6,8 @@ from typing import Any, Dict, List, Optional, Text, Tuple import warnings -from rasa.nlu.classifiers import LABEL_RANKING_LENGTH, convert_sparse_back +from nlu.featurizers.featurzier import sequence_to_sentence_embedding +from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component from rasa.utils import train_utils from rasa.nlu.constants import ( @@ -272,7 +273,12 @@ def _extract_labels_precomputed_features( # Collect precomputed encodings encoded_id_labels = [ - (label_idx, convert_sparse_back(label_example.get(attribute_feature_name))) + ( + label_idx, + sequence_to_sentence_embedding( + label_example.get(attribute_feature_name) + ), + ) for (label_idx, label_example) in label_examples ] @@ -336,7 +342,7 @@ def _create_session_data( for e in training_data.intent_examples: if e.get(attribute): X.append( - convert_sparse_back( + sequence_to_sentence_embedding( e.get( MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) @@ -612,7 +618,7 @@ def predict_label(self, message): else: # get features (bag of words) for a message # noinspection PyPep8Naming - X = convert_sparse_back( + X = sequence_to_sentence_embedding( message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) ).reshape(1, -1) diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 15175699955a..86e3860f282d 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ 
b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -4,8 +4,9 @@ import typing from typing import Any, Dict, List, Optional, Text, Tuple +from nlu.featurizers.featurzier import sequence_to_sentence_embedding from rasa.nlu import utils -from rasa.nlu.classifiers import LABEL_RANKING_LENGTH, convert_dense_back +from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Metadata @@ -98,7 +99,7 @@ def train( y = self.transform_labels_str2num(labels) X = np.stack( [ - convert_dense_back( + sequence_to_sentence_embedding( example.get( MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) @@ -150,7 +151,7 @@ def process(self, message: Message, **kwargs: Any) -> None: intent = None intent_ranking = [] else: - X = convert_dense_back( + X = sequence_to_sentence_embedding( message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) ).reshape(1, -1) intent_ids, probabilities = self.predict(X) diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index a3a2dcd2fef2..5013cfaf6c88 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -625,14 +625,14 @@ def _from_text_to_crf( else: tokens = message.get("tokens") - sparse_features = message.get("text_sparse_features") + dense_features = message.get("text_dense_features") for i, token in enumerate(tokens): pattern = self.__pattern_of_token(message, i) entity = entities[i] if entities else "N/A" tag = self.__tag_of_token(token) if self.pos_features else None token_sparse_features = ( - sparse_features[i] if sparse_features is not None else [] + dense_features[i] if dense_features is not None else [] ) crf_format.append((token.text, tag, entity, pattern, token_sparse_features)) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index 78a19636f4f9..c6a3e1f5c2bf 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -1,6 +1,6 @@ import numpy as np - -from typing import Any, Text +import scipy.sparse +from typing import Any, Text, List, Union, Optional from rasa.nlu.training_data import Message from rasa.nlu.components import Component from rasa.nlu.constants import ( @@ -10,6 +10,21 @@ ) +def sequence_to_sentence_embedding( + features: Union[List[List[float]], scipy.sparse.spmatrix], method: Text = "mean" +) -> Optional[np.ndarray]: + if features is None: + return None + + if isinstance(features, scipy.sparse.spmatrix): + features = features.toarray() + + if method == "mean" or method == "avg": + return np.mean(features, axis=0) + if method == "sum": + return np.sum(features, axis=0) + + class Featurizer(Component): @staticmethod def _combine_with_existing_dense_features( diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 63207956e03f..1c7080ae4038 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -13,11 +13,8 @@ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - SPACY_FEATURIZABLE_ATTRIBUTES, MESSAGE_INTENT_ATTRIBUTE, - CLS_TOKEN, ) logger = logging.getLogger(__name__) @@ -275,9 +272,7 @@ def _get_message_tokens_by_attribute( message: "Message", attribute: Text ) -> 
List[Text]: """Get text tokens of an attribute of a message""" - if message.get( - MESSAGE_TOKENS_NAMES[attribute] - ): + if message.get(MESSAGE_TOKENS_NAMES[attribute]): return [t.lemma for t in message.get(MESSAGE_TOKENS_NAMES[attribute])] return message.get(attribute).split() diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 25e17b0a7f1e..18981a4760a7 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -1,7 +1,8 @@ import numpy as np +import pytest import scipy.sparse -from rasa.nlu.featurizers.featurzier import Featurizer +from rasa.nlu.featurizers.featurzier import Featurizer, sequence_to_sentence_embedding from rasa.nlu.constants import ( MESSAGE_VECTOR_DENSE_FEATURE_NAMES, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, @@ -47,3 +48,18 @@ def test_combine_with_existing_sparse_features(): actual_features = actual_features.toarray() assert np.all(expected_features == actual_features) + + +@pytest.mark.parametrize( + "features, expected, method", + [ + ([[1, 0, 2, 3], [2, 0, 0, 1]], [3, 0, 2, 4], "sum"), + ([[1, 0, 2, 3], [2, 0, 0, 1]], [1.5, 0, 1, 2], "avg"), + (scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), [1.5, 0, 1, 2], "avg"), + (None, None, "avg"), + ], +) +def test_sequence_to_sentence_embedding(features, expected, method): + actual = sequence_to_sentence_embedding(features, method=method) + + assert np.all(expected == actual) From f21bbd718e8d7bc24c5103640930230cb39e0722 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 13:01:31 +0200 Subject: [PATCH 080/239] update crf entity extractor --- docs/nlu/components.rst | 9 +- docs/nlu/entity-extraction.rst | 9 - rasa/nlu/extractors/crf_entity_extractor.py | 125 ++-- rasa/utils/train_utils.py | 14 +- tests/nlu/base/test_extractors.py | 532 ------------------ tests/nlu/extractors/__init__.py | 0 .../test_duckling_http_extractor.py | 227 ++++++++ tests/nlu/extractors/test_entity_synonyms.py | 30 + .../test_mitie_entity_extractors.py | 0 .../test_spacy_entity_extractors.py | 53 ++ .../extractors/text_crf_entity_extractor.py | 254 +++++++++ 11 files changed, 627 insertions(+), 626 deletions(-) delete mode 100644 tests/nlu/base/test_extractors.py create mode 100644 tests/nlu/extractors/__init__.py create mode 100644 tests/nlu/extractors/test_duckling_http_extractor.py create mode 100644 tests/nlu/extractors/test_entity_synonyms.py create mode 100644 tests/nlu/extractors/test_mitie_entity_extractors.py create mode 100644 tests/nlu/extractors/test_spacy_entity_extractors.py create mode 100644 tests/nlu/extractors/text_crf_entity_extractor.py diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 988503c1d133..82baf3ce0652 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -116,15 +116,12 @@ SpacyFeaturizer :Requires: :ref:`SpacyNLP` :Description: Creates feature for intent classification using the spacy featurizer. - Optionally adds word vectors for each ``token`` to ``ner_features``, which can be referenced in ``CRFEntityExtractor`` :Configuration: .. code-block:: yaml pipeline: - name: "SpacyFeaturizer" - # Whether to add word vectors to ``ner_features`` (default: False) - ner_feature_vectors: True NGramFeaturizer ~~~~~~~~~~~~~~~ @@ -762,8 +759,8 @@ CRFEntityExtractor and the states are entity classes. Features of the words (capitalisation, POS tagging, etc.) 
give probabilities to certain entity classes, as are transitions between neighbouring entity tags: the most likely set of tags is then calculated and returned. - If POS features are used (pos or pos2), spaCy has to be installed. To use custom features - made available by Featurizers, use ``"ner_features"``. + If POS features are used (pos or pos2), spaCy has to be installed. If you want to use + word embeddings from any provided featurizer, use ``"word_embedding"``. :Configuration: .. code-block:: yaml @@ -777,7 +774,7 @@ CRFEntityExtractor # Available features are: # ``low``, ``title``, ``suffix5``, ``suffix3``, ``suffix2``, # ``suffix1``, ``pos``, ``pos2``, ``prefix5``, ``prefix2``, - # ``bias``, ``upper``, ``digit``, ``pattern``, and ``ner_features`` + # ``bias``, ``upper``, ``digit``, ``pattern``, and ``word_embedding`` features: [["low", "title"], ["bias", "suffix3"], ["upper", "pos", "pos2"]] # The flag determines whether to use BILOU tagging or not. BILOU diff --git a/docs/nlu/entity-extraction.rst b/docs/nlu/entity-extraction.rst index 35edf433e665..58671a316b4c 100644 --- a/docs/nlu/entity-extraction.rst +++ b/docs/nlu/entity-extraction.rst @@ -149,12 +149,3 @@ associate that with a location entity. If you just want to match regular expressions exactly, you can do this in your code, as a postprocessing step after receiving the response from Rasa NLU. - - -Passing Custom Features to ``CRFEntityExtractor`` -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ - -If you want to pass custom features to ``CRFEntityExtractor``, you can create a ``Featurizer`` that provides ``ner_features``. -If you do, ``ner_features`` should be an iterable of ``len(tokens)``, where each entry is a vector. -If ``CRFEntityExtractor`` finds ``"ner_features"`` in one of the arrays in ``features`` in the config, it will pass the ``ner_features`` vectors to ``sklearn_crfsuite``. -The simplest example of this is to pass word vectors as features, which you can do using :ref:``SpacyFeaturizer``. diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 5013cfaf6c88..c66b7f8d18a7 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -1,13 +1,20 @@ import logging import os import typing -from typing import Any, Dict, List, Optional, Text, Tuple, Union +import numpy as np +from typing import Any, Dict, List, Optional, Text, Tuple, Union, NamedTuple from rasa.nlu.config import InvalidConfigError, RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData +from rasa.nlu.constants import ( + MESSAGE_TOKENS_NAMES, + MESSAGE_TEXT_ATTRIBUTE, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + MESSAGE_SPACY_FEATURES_NAMES, +) from rasa.constants import DOCS_BASE_URL try: @@ -22,11 +29,19 @@ from spacy.tokens import Doc +class CRFToken(NamedTuple): + text: Text + tag: Text + entity: Text + pattern: Dict[Text, Any] + word_embedding: np.ndarray + + class CRFEntityExtractor(EntityExtractor): provides = ["entities"] - requires = ["tokens"] + requires = [MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]] defaults = { # BILOU_flag determines whether to use BILOU tagging or not. 
@@ -65,21 +80,21 @@ class CRFEntityExtractor(EntityExtractor): } function_dict = { - "low": lambda doc: doc[0].lower(), # pytype: disable=attribute-error - "title": lambda doc: doc[0].istitle(), # pytype: disable=attribute-error - "prefix5": lambda doc: doc[0][:5], - "prefix2": lambda doc: doc[0][:2], - "suffix5": lambda doc: doc[0][-5:], - "suffix3": lambda doc: doc[0][-3:], - "suffix2": lambda doc: doc[0][-2:], - "suffix1": lambda doc: doc[0][-1:], - "pos": lambda doc: doc[1], - "pos2": lambda doc: doc[1][:2], - "bias": lambda doc: "bias", - "upper": lambda doc: doc[0].isupper(), # pytype: disable=attribute-error - "digit": lambda doc: doc[0].isdigit(), # pytype: disable=attribute-error - "pattern": lambda doc: doc[3], - "sparse_features": lambda doc: doc[4], + "low": lambda crf_token: crf_token.text.lower(), # pytype: disable=attribute-error + "title": lambda crf_token: crf_token.text.istitle(), # pytype: disable=attribute-error + "prefix5": lambda crf_token: crf_token.text[:5], + "prefix2": lambda crf_token: crf_token.text[:2], + "suffix5": lambda crf_token: crf_token.text[-5:], + "suffix3": lambda crf_token: crf_token.text[-3:], + "suffix2": lambda crf_token: crf_token.text[-2:], + "suffix1": lambda crf_token: crf_token.text[-1:], + "pos": lambda crf_token: crf_token.tag, + "pos2": lambda crf_token: crf_token.tag[:2], + "bias": lambda crf_token: "bias", + "upper": lambda crf_token: crf_token.text.isupper(), # pytype: disable=attribute-error + "digit": lambda crf_token: crf_token.text.isdigit(), # pytype: disable=attribute-error + "pattern": lambda crf_token: crf_token.pattern, + "word_embedding": lambda crf_token: crf_token.word_embedding, } def __init__( @@ -146,23 +161,13 @@ def train( self._train_model(dataset) - def _create_dataset( - self, examples: List[Message] - ) -> List[ - List[ - Tuple[ - Optional[Text], - Optional[Text], - Text, - Dict[Text, Any], - Optional[Dict[Text, Any]], - ] - ] - ]: + def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]: dataset = [] + for example in examples: entity_offsets = self._convert_example(example) dataset.append(self._from_json_to_crf(example, entity_offsets)) + return dataset def _check_spacy_doc(self, message): @@ -421,18 +426,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]] return {"file": file_name} - def _sentence_to_features( - self, - sentence: List[ - Tuple[ - Optional[Text], - Optional[Text], - Text, - Dict[Text, Any], - Optional[Dict[Text, Any]], - ] - ], - ) -> List[Dict[Text, Any]]: + def _sentence_to_features(self, sentence: List[CRFToken]) -> List[Dict[Text, Any]]: """Convert a word into discrete features in self.crf_features, including word before and word after.""" @@ -491,15 +485,7 @@ def _sentence_to_labels( def _from_json_to_crf( self, message: Message, entity_offsets: List[Tuple[int, int, Text]] - ) -> List[ - Tuple[ - Optional[Text], - Optional[Text], - Text, - Dict[Text, Any], - Optional[Dict[Text, Any]], - ] - ]: + ) -> List[CRFToken]: """Convert json examples to format of underlying crfsuite.""" if self.pos_features: @@ -608,24 +594,18 @@ def __additional_ner_features(message: Message) -> List[Any]: def _from_text_to_crf( self, message: Message, entities: List[Text] = None - ) -> List[ - Tuple[ - Optional[Text], - Optional[Text], - Text, - Dict[Text, Any], - Optional[Dict[Text, Any]], - ] - ]: + ) -> List[CRFToken]: """Takes a sentence and switches it to crfsuite format.""" crf_format = [] if self.pos_features: - tokens = message.get("spacy_doc") + tokens = 
message.get(MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE]) else: - tokens = message.get("tokens") + tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - dense_features = message.get("text_dense_features") + dense_features = message.get( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) for i, token in enumerate(tokens): pattern = self.__pattern_of_token(message, i) @@ -635,24 +615,13 @@ def _from_text_to_crf( dense_features[i] if dense_features is not None else [] ) - crf_format.append((token.text, tag, entity, pattern, token_sparse_features)) + crf_format.append( + CRFToken(token.text, tag, entity, pattern, token_sparse_features) + ) return crf_format - def _train_model( - self, - df_train: List[ - List[ - Tuple[ - Optional[Text], - Optional[Text], - Text, - Dict[Text, Any], - Optional[Dict[Text, Any]], - ] - ] - ], - ) -> None: + def _train_model(self, df_train: List[List[CRFToken]]) -> None: """Train the crf tagger based on the training data.""" import sklearn_crfsuite diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 8076e28b1ae8..d6fa7a15a0e0 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -1,7 +1,18 @@ from collections import namedtuple import logging import typing -from typing import List, Optional, Text, Dict, Tuple, Union, Generator, Callable, Any +from typing import ( + List, + Optional, + Text, + Dict, + Tuple, + Union, + Generator, + Callable, + Any, + NamedTuple, +) import numpy as np from tqdm import tqdm from sklearn.model_selection import train_test_split @@ -22,6 +33,7 @@ tf.contrib._warning = None logger = logging.getLogger(__name__) + # namedtuple for all tf session related data SessionData = namedtuple("SessionData", ("X", "Y", "label_ids")) diff --git a/tests/nlu/base/test_extractors.py b/tests/nlu/base/test_extractors.py deleted file mode 100644 index 4ce6cb30b9f8..000000000000 --- a/tests/nlu/base/test_extractors.py +++ /dev/null @@ -1,532 +0,0 @@ -# coding=utf-8 -import responses - -from rasa.nlu.config import RasaNLUModelConfig -from rasa.nlu.training_data import TrainingData, Message -from tests.nlu import utilities - - -def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config): - from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor - - ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) - examples = [ - Message( - "anywhere in the west", - { - "intent": "restaurant_search", - "entities": [ - {"start": 16, "end": 20, "value": "west", "entity": "location"} - ], - "spacy_doc": spacy_nlp("anywhere in the west"), - }, - ), - Message( - "central indian restaurant", - { - "intent": "restaurant_search", - "entities": [ - { - "start": 0, - "end": 7, - "value": "central", - "entity": "location", - "extractor": "random_extractor", - }, - { - "start": 8, - "end": 14, - "value": "indian", - "entity": "cuisine", - "extractor": "CRFEntityExtractor", - }, - ], - "spacy_doc": spacy_nlp("central indian restaurant"), - }, - ), - ] - - # uses BILOU and the default features - ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig()) - sentence = "anywhere in the west" - doc = {"spacy_doc": spacy_nlp(sentence)} - crf_format = ext._from_text_to_crf(Message(sentence, doc)) - assert [word[0] for word in crf_format] == ["anywhere", "in", "the", "west"] - feats = ext._sentence_to_features(crf_format) - assert "BOS" in feats[0] - assert "EOS" in feats[-1] - assert feats[1]["0:low"] == "in" - sentence = "anywhere in the west" - 
ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)})) - filtered = ext.filter_trainable_entities(examples) - assert filtered[0].get("entities") == [ - {"start": 16, "end": 20, "value": "west", "entity": "location"} - ], "Entity without extractor remains" - assert filtered[1].get("entities") == [ - { - "start": 8, - "end": 14, - "value": "indian", - "entity": "cuisine", - "extractor": "CRFEntityExtractor", - } - ], "Only CRFEntityExtractor entity annotation remains" - assert examples[1].get("entities")[0] == { - "start": 0, - "end": 7, - "value": "central", - "entity": "location", - "extractor": "random_extractor", - }, "Original examples are not mutated" - - -def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config): - from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor - - ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) - sentence = "I need a home cleaning close-by" - doc = {"spacy_doc": spacy_nlp(sentence)} - r = ext._from_crf_to_json( - Message(sentence, doc), - [ - {"O": 1.0}, - {"O": 1.0}, - {"O": 1.0}, - {"B-what": 1.0}, - {"L-what": 1.0}, - {"B-where": 1.0}, - {"I-where": 1.0}, - {"L-where": 1.0}, - ], - ) - assert len(r) == 2, "There should be two entities" - - assert r[0]["confidence"] # confidence should exist - del r[0]["confidence"] - assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"} - - assert r[1]["confidence"] # confidence should exist - del r[1]["confidence"] - assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"} - - -def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config): - from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor - - ner_crf_pos_feature_config.update({"BILOU_flag": False}) - ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) - sentence = "I need a home cleaning close-by" - doc = {"spacy_doc": spacy_nlp(sentence)} - rs = ext._from_crf_to_json( - Message(sentence, doc), - [ - {"O": 1.0}, - {"O": 1.0}, - {"O": 1.0}, - {"what": 1.0}, - {"what": 1.0}, - {"where": 1.0}, - {"where": 1.0}, - {"where": 1.0}, - ], - ) - - # non BILOU will split multi-word entities - hence 5 - assert len(rs) == 5, "There should be five entities" - - for r in rs: - assert r["confidence"] # confidence should exist - del r["confidence"] - - assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"} - assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"} - assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"} - assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"} - assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"} - - -def test_crf_create_entity_dict(spacy_nlp): - from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor - from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer - from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - - crf_extractor = CRFEntityExtractor() - spacy_tokenizer = SpacyTokenizer() - white_space_tokenizer = WhitespaceTokenizer() - - examples = [ - { - "message": Message( - "where is St. Michael's Hospital?", - { - "intent": "search_location", - "entities": [ - { - "start": 9, - "end": 31, - "value": "St. 
Michael's Hospital", - "entity": "hospital", - "SpacyTokenizer": { - "entity_start_token_idx": 2, - "entity_end_token_idx": 5, - }, - "WhitespaceTokenizer": { - "entity_start_token_idx": 2, - "entity_end_token_idx": 5, - }, - } - ], - }, - ) - }, - { - "message": Message( - "where is Children's Hospital?", - { - "intent": "search_location", - "entities": [ - { - "start": 9, - "end": 28, - "value": "Children's Hospital", - "entity": "hospital", - "SpacyTokenizer": { - "entity_start_token_idx": 2, - "entity_end_token_idx": 4, - }, - "WhitespaceTokenizer": { - "entity_start_token_idx": 2, - "entity_end_token_idx": 4, - }, - } - ], - }, - ) - }, - ] - for ex in examples: - # spacy tokenizers receives a Doc as input and whitespace tokenizer receives a text - spacy_tokens = spacy_tokenizer.tokenize(spacy_nlp(ex["message"].text)) - white_space_tokens = white_space_tokenizer.tokenize(ex["message"].text) - for tokenizer, tokens in [ - ("SpacyTokenizer", spacy_tokens), - ("WhitespaceTokenizer", white_space_tokens), - ]: - for entity in ex["message"].get("entities"): - parsed_entities = crf_extractor._create_entity_dict( - ex["message"], - tokens, - entity[tokenizer]["entity_start_token_idx"], - entity[tokenizer]["entity_end_token_idx"], - entity["entity"], - 0.8, - ) - assert parsed_entities == { - "start": entity["start"], - "end": entity["end"], - "value": entity["value"], - "entity": entity["entity"], - "confidence": 0.8, - } - - -def test_duckling_entity_extractor(component_builder): - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - "http://localhost:8000/parse", - json=[ - { - "body": "Today", - "start": 0, - "value": { - "values": [ - { - "value": "2018-11-13T00:00:00.000-08:00", - "grain": "day", - "type": "value", - } - ], - "value": "2018-11-13T00:00:00.000-08:00", - "grain": "day", - "type": "value", - }, - "end": 5, - "dim": "time", - "latent": False, - }, - { - "body": "the 5th", - "start": 9, - "value": { - "values": [ - { - "value": "2018-12-05T00:00:00.000-08:00", - "grain": "day", - "type": "value", - }, - { - "value": "2019-01-05T00:00:00.000-08:00", - "grain": "day", - "type": "value", - }, - { - "value": "2019-02-05T00:00:00.000-08:00", - "grain": "day", - "type": "value", - }, - ], - "value": "2018-12-05T00:00:00.000-08:00", - "grain": "day", - "type": "value", - }, - "end": 16, - "dim": "time", - "latent": False, - }, - { - "body": "5th of May", - "start": 13, - "value": { - "values": [ - { - "value": "2019-05-05T00:00:00.000-07:00", - "grain": "day", - "type": "value", - }, - { - "value": "2020-05-05T00:00:00.000-07:00", - "grain": "day", - "type": "value", - }, - { - "value": "2021-05-05T00:00:00.000-07:00", - "grain": "day", - "type": "value", - }, - ], - "value": "2019-05-05T00:00:00.000-07:00", - "grain": "day", - "type": "value", - }, - "end": 23, - "dim": "time", - "latent": False, - }, - { - "body": "tomorrow", - "start": 37, - "value": { - "values": [ - { - "value": "2018-11-14T00:00:00.000-08:00", - "grain": "day", - "type": "value", - } - ], - "value": "2018-11-14T00:00:00.000-08:00", - "grain": "day", - "type": "value", - }, - "end": 45, - "dim": "time", - "latent": False, - }, - ], - ) - - _config = RasaNLUModelConfig({"pipeline": [{"name": "DucklingHTTPExtractor"}]}) - _config.set_component_attr( - 0, dimensions=["time"], timezone="UTC", url="http://localhost:8000" - ) - duckling = component_builder.create_component(_config.for_component(0), _config) - message = Message("Today is the 5th of May. 
Let us meet tomorrow.") - duckling.process(message) - entities = message.get("entities") - assert len(entities) == 4 - - # Test duckling with a defined date - - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - "http://localhost:8000/parse", - json=[ - { - "body": "tomorrow", - "start": 12, - "value": { - "values": [ - { - "value": "2013-10-13T00:00:00.000Z", - "grain": "day", - "type": "value", - } - ], - "value": "2013-10-13T00:00:00.000Z", - "grain": "day", - "type": "value", - }, - "end": 20, - "dim": "time", - "latent": False, - } - ], - ) - - # 1381536182 == 2013/10/12 02:03:02 - message = Message("Let us meet tomorrow.", time="1381536182") - duckling.process(message) - entities = message.get("entities") - assert len(entities) == 1 - assert entities[0]["text"] == "tomorrow" - assert entities[0]["value"] == "2013-10-13T00:00:00.000Z" - - # Test dimension filtering includes only specified dimensions - _config = RasaNLUModelConfig({"pipeline": [{"name": "DucklingHTTPExtractor"}]}) - _config.set_component_attr( - 0, dimensions=["number"], url="http://localhost:8000" - ) - duckling_number = component_builder.create_component( - _config.for_component(0), _config - ) - - with responses.RequestsMock() as rsps: - rsps.add( - responses.POST, - "http://localhost:8000/parse", - json=[ - { - "body": "Yesterday", - "start": 0, - "value": { - "values": [ - { - "value": "2019-02-28T00:00:00.000+01:00", - "grain": "day", - "type": "value", - } - ], - "value": "2019-02-28T00:00:00.000+01:00", - "grain": "day", - "type": "value", - }, - "end": 9, - "dim": "time", - }, - { - "body": "5", - "start": 21, - "value": {"value": 5, "type": "value"}, - "end": 22, - "dim": "number", - }, - ], - ) - - message = Message("Yesterday there were 5 people in a room") - duckling_number.process(message) - entities = message.get("entities") - - assert len(entities) == 1 - assert entities[0]["text"] == "5" - assert entities[0]["value"] == 5 - - -def test_duckling_entity_extractor_and_synonyms(component_builder): - _config = RasaNLUModelConfig( - { - "pipeline": [ - {"name": "DucklingHTTPExtractor"}, - {"name": "EntitySynonymMapper"}, - ] - } - ) - _config.set_component_attr(0, dimensions=["number"]) - duckling = component_builder.create_component(_config.for_component(0), _config) - synonyms = component_builder.create_component(_config.for_component(1), _config) - message = Message("He was 6 feet away") - duckling.process(message) - # checks that the synonym processor - # can handle entities that have int values - synonyms.process(message) - assert message is not None - - -def test_unintentional_synonyms_capitalized(component_builder): - _config = utilities.base_test_conf("pretrained_embeddings_spacy") - ner_syn = component_builder.create_component(_config.for_component(5), _config) - examples = [ - Message( - "Any Mexican restaurant will do", - { - "intent": "restaurant_search", - "entities": [ - {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"} - ], - }, - ), - Message( - "I want Tacos!", - { - "intent": "restaurant_search", - "entities": [ - {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"} - ], - }, - ), - ] - ner_syn.train(TrainingData(training_examples=examples), _config) - assert ner_syn.synonyms.get("mexican") is None - assert ner_syn.synonyms.get("tacos") == "Mexican" - - -def test_spacy_ner_extractor(component_builder, spacy_nlp): - _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]}) - ext = 
component_builder.create_component(_config.for_component(0), _config) - example = Message( - "anywhere in the U.K.", - { - "intent": "restaurant_search", - "entities": [], - "spacy_doc": spacy_nlp("anywhere in the west"), - }, - ) - - ext.process(example, spacy_nlp=spacy_nlp) - - assert len(example.get("entities", [])) == 1 - assert example.get("entities")[0] == { - "start": 16, - "extractor": "SpacyEntityExtractor", - "end": 20, - "value": "U.K.", - "entity": "GPE", - "confidence": None, - } - - # Test dimension filtering includes only specified dimensions - - example = Message( - "anywhere in the West with Sebastian Thrun", - { - "intent": "example_intent", - "entities": [], - "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun"), - }, - ) - _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]}) - - _config.set_component_attr(0, dimensions=["PERSON"]) - ext = component_builder.create_component(_config.for_component(0), _config) - ext.process(example, spacy_nlp=spacy_nlp) - - assert len(example.get("entities", [])) == 1 - assert example.get("entities")[0] == { - "start": 26, - "extractor": "SpacyEntityExtractor", - "end": 41, - "value": "Sebastian Thrun", - "entity": "PERSON", - "confidence": None, - } diff --git a/tests/nlu/extractors/__init__.py b/tests/nlu/extractors/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/extractors/test_duckling_http_extractor.py b/tests/nlu/extractors/test_duckling_http_extractor.py new file mode 100644 index 000000000000..a665e2108d3e --- /dev/null +++ b/tests/nlu/extractors/test_duckling_http_extractor.py @@ -0,0 +1,227 @@ +import responses + +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.training_data import Message + + +def test_duckling_entity_extractor(component_builder): + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + "http://localhost:8000/parse", + json=[ + { + "body": "Today", + "start": 0, + "value": { + "values": [ + { + "value": "2018-11-13T00:00:00.000-08:00", + "grain": "day", + "type": "value", + } + ], + "value": "2018-11-13T00:00:00.000-08:00", + "grain": "day", + "type": "value", + }, + "end": 5, + "dim": "time", + "latent": False, + }, + { + "body": "the 5th", + "start": 9, + "value": { + "values": [ + { + "value": "2018-12-05T00:00:00.000-08:00", + "grain": "day", + "type": "value", + }, + { + "value": "2019-01-05T00:00:00.000-08:00", + "grain": "day", + "type": "value", + }, + { + "value": "2019-02-05T00:00:00.000-08:00", + "grain": "day", + "type": "value", + }, + ], + "value": "2018-12-05T00:00:00.000-08:00", + "grain": "day", + "type": "value", + }, + "end": 16, + "dim": "time", + "latent": False, + }, + { + "body": "5th of May", + "start": 13, + "value": { + "values": [ + { + "value": "2019-05-05T00:00:00.000-07:00", + "grain": "day", + "type": "value", + }, + { + "value": "2020-05-05T00:00:00.000-07:00", + "grain": "day", + "type": "value", + }, + { + "value": "2021-05-05T00:00:00.000-07:00", + "grain": "day", + "type": "value", + }, + ], + "value": "2019-05-05T00:00:00.000-07:00", + "grain": "day", + "type": "value", + }, + "end": 23, + "dim": "time", + "latent": False, + }, + { + "body": "tomorrow", + "start": 37, + "value": { + "values": [ + { + "value": "2018-11-14T00:00:00.000-08:00", + "grain": "day", + "type": "value", + } + ], + "value": "2018-11-14T00:00:00.000-08:00", + "grain": "day", + "type": "value", + }, + "end": 45, + "dim": "time", + "latent": False, + }, + ], + ) + + _config = 
RasaNLUModelConfig({"pipeline": [{"name": "DucklingHTTPExtractor"}]}) + _config.set_component_attr( + 0, dimensions=["time"], timezone="UTC", url="http://localhost:8000" + ) + duckling = component_builder.create_component(_config.for_component(0), _config) + message = Message("Today is the 5th of May. Let us meet tomorrow.") + duckling.process(message) + entities = message.get("entities") + assert len(entities) == 4 + + # Test duckling with a defined date + + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + "http://localhost:8000/parse", + json=[ + { + "body": "tomorrow", + "start": 12, + "value": { + "values": [ + { + "value": "2013-10-13T00:00:00.000Z", + "grain": "day", + "type": "value", + } + ], + "value": "2013-10-13T00:00:00.000Z", + "grain": "day", + "type": "value", + }, + "end": 20, + "dim": "time", + "latent": False, + } + ], + ) + + # 1381536182 == 2013/10/12 02:03:02 + message = Message("Let us meet tomorrow.", time="1381536182") + duckling.process(message) + entities = message.get("entities") + assert len(entities) == 1 + assert entities[0]["text"] == "tomorrow" + assert entities[0]["value"] == "2013-10-13T00:00:00.000Z" + + # Test dimension filtering includes only specified dimensions + _config = RasaNLUModelConfig({"pipeline": [{"name": "DucklingHTTPExtractor"}]}) + _config.set_component_attr( + 0, dimensions=["number"], url="http://localhost:8000" + ) + duckling_number = component_builder.create_component( + _config.for_component(0), _config + ) + + with responses.RequestsMock() as rsps: + rsps.add( + responses.POST, + "http://localhost:8000/parse", + json=[ + { + "body": "Yesterday", + "start": 0, + "value": { + "values": [ + { + "value": "2019-02-28T00:00:00.000+01:00", + "grain": "day", + "type": "value", + } + ], + "value": "2019-02-28T00:00:00.000+01:00", + "grain": "day", + "type": "value", + }, + "end": 9, + "dim": "time", + }, + { + "body": "5", + "start": 21, + "value": {"value": 5, "type": "value"}, + "end": 22, + "dim": "number", + }, + ], + ) + + message = Message("Yesterday there were 5 people in a room") + duckling_number.process(message) + entities = message.get("entities") + + assert len(entities) == 1 + assert entities[0]["text"] == "5" + assert entities[0]["value"] == 5 + + +def test_duckling_entity_extractor_and_synonyms(component_builder): + _config = RasaNLUModelConfig( + { + "pipeline": [ + {"name": "DucklingHTTPExtractor"}, + {"name": "EntitySynonymMapper"}, + ] + } + ) + _config.set_component_attr(0, dimensions=["number"]) + duckling = component_builder.create_component(_config.for_component(0), _config) + synonyms = component_builder.create_component(_config.for_component(1), _config) + message = Message("He was 6 feet away") + duckling.process(message) + # checks that the synonym processor + # can handle entities that have int values + synonyms.process(message) + assert message is not None diff --git a/tests/nlu/extractors/test_entity_synonyms.py b/tests/nlu/extractors/test_entity_synonyms.py new file mode 100644 index 000000000000..a6479b8f8b43 --- /dev/null +++ b/tests/nlu/extractors/test_entity_synonyms.py @@ -0,0 +1,30 @@ +from rasa.nlu.training_data import TrainingData, Message +from tests.nlu import utilities + + +def test_unintentional_synonyms_capitalized(component_builder): + _config = utilities.base_test_conf("pretrained_embeddings_spacy") + ner_syn = component_builder.create_component(_config.for_component(5), _config) + examples = [ + Message( + "Any Mexican restaurant will do", + { + "intent": 
"restaurant_search", + "entities": [ + {"start": 4, "end": 11, "value": "Mexican", "entity": "cuisine"} + ], + }, + ), + Message( + "I want Tacos!", + { + "intent": "restaurant_search", + "entities": [ + {"start": 7, "end": 12, "value": "Mexican", "entity": "cuisine"} + ], + }, + ), + ] + ner_syn.train(TrainingData(training_examples=examples), _config) + assert ner_syn.synonyms.get("mexican") is None + assert ner_syn.synonyms.get("tacos") == "Mexican" diff --git a/tests/nlu/extractors/test_mitie_entity_extractors.py b/tests/nlu/extractors/test_mitie_entity_extractors.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/extractors/test_spacy_entity_extractors.py b/tests/nlu/extractors/test_spacy_entity_extractors.py new file mode 100644 index 000000000000..0c5e59ae5b7a --- /dev/null +++ b/tests/nlu/extractors/test_spacy_entity_extractors.py @@ -0,0 +1,53 @@ +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.training_data import Message + + +def test_spacy_ner_extractor(component_builder, spacy_nlp): + _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]}) + ext = component_builder.create_component(_config.for_component(0), _config) + example = Message( + "anywhere in the U.K.", + { + "intent": "restaurant_search", + "entities": [], + "spacy_doc": spacy_nlp("anywhere in the west"), + }, + ) + + ext.process(example, spacy_nlp=spacy_nlp) + + assert len(example.get("entities", [])) == 1 + assert example.get("entities")[0] == { + "start": 16, + "extractor": "SpacyEntityExtractor", + "end": 20, + "value": "U.K.", + "entity": "GPE", + "confidence": None, + } + + # Test dimension filtering includes only specified dimensions + + example = Message( + "anywhere in the West with Sebastian Thrun", + { + "intent": "example_intent", + "entities": [], + "spacy_doc": spacy_nlp("anywhere in the West with Sebastian Thrun"), + }, + ) + _config = RasaNLUModelConfig({"pipeline": [{"name": "SpacyEntityExtractor"}]}) + + _config.set_component_attr(0, dimensions=["PERSON"]) + ext = component_builder.create_component(_config.for_component(0), _config) + ext.process(example, spacy_nlp=spacy_nlp) + + assert len(example.get("entities", [])) == 1 + assert example.get("entities")[0] == { + "start": 26, + "extractor": "SpacyEntityExtractor", + "end": 41, + "value": "Sebastian Thrun", + "entity": "PERSON", + "confidence": None, + } diff --git a/tests/nlu/extractors/text_crf_entity_extractor.py b/tests/nlu/extractors/text_crf_entity_extractor.py new file mode 100644 index 000000000000..b58218ca37be --- /dev/null +++ b/tests/nlu/extractors/text_crf_entity_extractor.py @@ -0,0 +1,254 @@ +from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.training_data import TrainingData, Message + + +def test_crf_extractor(spacy_nlp, ner_crf_pos_feature_config): + from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor + + ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) + examples = [ + Message( + "anywhere in the west", + { + "intent": "restaurant_search", + "entities": [ + {"start": 16, "end": 20, "value": "west", "entity": "location"} + ], + "spacy_doc": spacy_nlp("anywhere in the west"), + }, + ), + Message( + "central indian restaurant", + { + "intent": "restaurant_search", + "entities": [ + { + "start": 0, + "end": 7, + "value": "central", + "entity": "location", + "extractor": "random_extractor", + }, + { + "start": 8, + "end": 14, + "value": "indian", + "entity": "cuisine", + "extractor": "CRFEntityExtractor", + }, + ], + 
"spacy_doc": spacy_nlp("central indian restaurant"), + }, + ), + ] + + # uses BILOU and the default features + ext.train(TrainingData(training_examples=examples), RasaNLUModelConfig()) + sentence = "anywhere in the west" + doc = {"spacy_doc": spacy_nlp(sentence)} + crf_format = ext._from_text_to_crf(Message(sentence, doc)) + assert [word[0] for word in crf_format] == ["anywhere", "in", "the", "west"] + feats = ext._sentence_to_features(crf_format) + assert "BOS" in feats[0] + assert "EOS" in feats[-1] + assert feats[1]["0:low"] == "in" + sentence = "anywhere in the west" + ext.extract_entities(Message(sentence, {"spacy_doc": spacy_nlp(sentence)})) + filtered = ext.filter_trainable_entities(examples) + assert filtered[0].get("entities") == [ + {"start": 16, "end": 20, "value": "west", "entity": "location"} + ], "Entity without extractor remains" + assert filtered[1].get("entities") == [ + { + "start": 8, + "end": 14, + "value": "indian", + "entity": "cuisine", + "extractor": "CRFEntityExtractor", + } + ], "Only CRFEntityExtractor entity annotation remains" + assert examples[1].get("entities")[0] == { + "start": 0, + "end": 7, + "value": "central", + "entity": "location", + "extractor": "random_extractor", + }, "Original examples are not mutated" + + +def test_crf_json_from_BILOU(spacy_nlp, ner_crf_pos_feature_config): + from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor + + ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) + sentence = "I need a home cleaning close-by" + doc = {"spacy_doc": spacy_nlp(sentence)} + r = ext._from_crf_to_json( + Message(sentence, doc), + [ + {"O": 1.0}, + {"O": 1.0}, + {"O": 1.0}, + {"B-what": 1.0}, + {"L-what": 1.0}, + {"B-where": 1.0}, + {"I-where": 1.0}, + {"L-where": 1.0}, + ], + ) + assert len(r) == 2, "There should be two entities" + + assert r[0]["confidence"] # confidence should exist + del r[0]["confidence"] + assert r[0] == {"start": 9, "end": 22, "value": "home cleaning", "entity": "what"} + + assert r[1]["confidence"] # confidence should exist + del r[1]["confidence"] + assert r[1] == {"start": 23, "end": 31, "value": "close-by", "entity": "where"} + + +def test_crf_json_from_non_BILOU(spacy_nlp, ner_crf_pos_feature_config): + from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor + + ner_crf_pos_feature_config.update({"BILOU_flag": False}) + ext = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) + sentence = "I need a home cleaning close-by" + doc = {"spacy_doc": spacy_nlp(sentence)} + rs = ext._from_crf_to_json( + Message(sentence, doc), + [ + {"O": 1.0}, + {"O": 1.0}, + {"O": 1.0}, + {"what": 1.0}, + {"what": 1.0}, + {"where": 1.0}, + {"where": 1.0}, + {"where": 1.0}, + ], + ) + + # non BILOU will split multi-word entities - hence 5 + assert len(rs) == 5, "There should be five entities" + + for r in rs: + assert r["confidence"] # confidence should exist + del r["confidence"] + + assert rs[0] == {"start": 9, "end": 13, "value": "home", "entity": "what"} + assert rs[1] == {"start": 14, "end": 22, "value": "cleaning", "entity": "what"} + assert rs[2] == {"start": 23, "end": 28, "value": "close", "entity": "where"} + assert rs[3] == {"start": 28, "end": 29, "value": "-", "entity": "where"} + assert rs[4] == {"start": 29, "end": 31, "value": "by", "entity": "where"} + + +def test_crf_create_entity_dict(spacy_nlp): + from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor + from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer + from 
rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer + + crf_extractor = CRFEntityExtractor() + spacy_tokenizer = SpacyTokenizer() + white_space_tokenizer = WhitespaceTokenizer() + + examples = [ + { + "message": Message( + "where is St. Michael's Hospital?", + { + "intent": "search_location", + "entities": [ + { + "start": 9, + "end": 31, + "value": "St. Michael's Hospital", + "entity": "hospital", + "SpacyTokenizer": { + "entity_start_token_idx": 2, + "entity_end_token_idx": 5, + }, + "WhitespaceTokenizer": { + "entity_start_token_idx": 2, + "entity_end_token_idx": 5, + }, + } + ], + }, + ) + }, + { + "message": Message( + "where is Children's Hospital?", + { + "intent": "search_location", + "entities": [ + { + "start": 9, + "end": 28, + "value": "Children's Hospital", + "entity": "hospital", + "SpacyTokenizer": { + "entity_start_token_idx": 2, + "entity_end_token_idx": 4, + }, + "WhitespaceTokenizer": { + "entity_start_token_idx": 2, + "entity_end_token_idx": 4, + }, + } + ], + }, + ) + }, + ] + for ex in examples: + # spacy tokenizers receives a Doc as input and whitespace tokenizer receives a text + spacy_tokens = spacy_tokenizer.tokenize(spacy_nlp(ex["message"].text)) + white_space_tokens = white_space_tokenizer.tokenize(ex["message"].text) + for tokenizer, tokens in [ + ("SpacyTokenizer", spacy_tokens), + ("WhitespaceTokenizer", white_space_tokens), + ]: + for entity in ex["message"].get("entities"): + parsed_entities = crf_extractor._create_entity_dict( + ex["message"], + tokens, + entity[tokenizer]["entity_start_token_idx"], + entity[tokenizer]["entity_end_token_idx"], + entity["entity"], + 0.8, + ) + assert parsed_entities == { + "start": entity["start"], + "end": entity["end"], + "value": entity["value"], + "entity": entity["entity"], + "confidence": 0.8, + } + + +def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp): + import numpy as np + from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor + from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer + from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + + ner_crf_pos_feature_config["features"][1].append("word_embedding") + crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) + + spacy_featurizer = SpacyFeaturizer() + white_space_tokenizer = WhitespaceTokenizer() + + text = "Rasa is a company in Berlin." 
+ message = Message(text) + message.set("spacy_doc", spacy_nlp(text)) + + white_space_tokenizer.process(message) + spacy_featurizer.process(message) + + text_data = crf_extractor._from_text_to_crf(message) + features = crf_extractor._sentence_to_features(text_data) + + assert "0:word_embedding" in features[0] + assert np.all( + features[0]["0:word_embedding"] == message.data.get("text_dense_features")[0] + ) From d08b2d76971191741eaec1d10a74dad274232d02 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 13:38:00 +0200 Subject: [PATCH 081/239] use constants --- .../classifiers/mitie_intent_classifier.py | 12 +++- .../classifiers/sklearn_intent_classifier.py | 31 +++++++--- rasa/nlu/extractors/crf_entity_extractor.py | 62 ++++++++----------- .../nlu/extractors/duckling_http_extractor.py | 7 ++- rasa/nlu/extractors/entity_synonyms.py | 10 +-- rasa/nlu/extractors/mitie_entity_extractor.py | 25 ++++++-- rasa/nlu/extractors/spacy_entity_extractor.py | 7 ++- 7 files changed, 92 insertions(+), 62 deletions(-) diff --git a/rasa/nlu/classifiers/mitie_intent_classifier.py b/rasa/nlu/classifiers/mitie_intent_classifier.py index c6f0cfb097b9..5f00bd837733 100644 --- a/rasa/nlu/classifiers/mitie_intent_classifier.py +++ b/rasa/nlu/classifiers/mitie_intent_classifier.py @@ -5,6 +5,7 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Metadata +from rasa.nlu.constants import MESSAGE_TOKENS_NAMES, MESSAGE_TEXT_ATTRIBUTE from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: @@ -15,7 +16,11 @@ class MitieIntentClassifier(Component): provides = ["intent"] - requires = ["tokens", "mitie_feature_extractor", "mitie_file"] + requires = [ + MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], + "mitie_feature_extractor", + "mitie_file", + ] def __init__( self, component_config: Optional[Dict[Text, Any]] = None, clf=None @@ -78,7 +83,10 @@ def process(self, message: Message, **kwargs: Any) -> None: @staticmethod def _tokens_of_message(message): - return [token.text for token in message.get("tokens", [])] + return [ + token.text + for token in message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) + ] @classmethod def load( diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 86e3860f282d..09c2bfbdaa01 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -13,6 +13,7 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, MESSAGE_TEXT_ATTRIBUTE, ) @@ -27,7 +28,9 @@ class SklearnIntentClassifier(Component): provides = ["intent", "intent_ranking"] - requires = [MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + [ + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ] defaults = { # C parameter of the svm - cross validation will select the best value @@ -80,6 +83,22 @@ def transform_labels_num2str(self, y: np.ndarray) -> np.ndarray: return self.le.inverse_transform(y) + def combine_features(self, message: Message) -> np.ndarray: + features_1 = sequence_to_sentence_embedding( + message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + ) + features_2 = sequence_to_sentence_embedding( + message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) 
+ ) + + if features_1 is not None and features_2 is not None: + return np.concatenate((features_1, features_2), axis=-1) + + if features_1 is not None and features_2 is None: + return features_1 + + return features_2 + def train( self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any ) -> None: @@ -99,11 +118,7 @@ def train( y = self.transform_labels_str2num(labels) X = np.stack( [ - sequence_to_sentence_embedding( - example.get( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) - ) + self.combine_features(example) for example in training_data.intent_examples ] ) @@ -151,9 +166,7 @@ def process(self, message: Message, **kwargs: Any) -> None: intent = None intent_ranking = [] else: - X = sequence_to_sentence_embedding( - message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - ).reshape(1, -1) + X = self.combine_features(message).reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) # `predict` returns a matrix as it is supposed diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index c66b7f8d18a7..5ef4ed1a32ac 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -14,6 +14,7 @@ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_VECTOR_DENSE_FEATURE_NAMES, MESSAGE_SPACY_FEATURES_NAMES, + MESSAGE_ENTITIES_ATTRIBUTE, ) from rasa.constants import DOCS_BASE_URL @@ -39,7 +40,7 @@ class CRFToken(NamedTuple): class CRFEntityExtractor(EntityExtractor): - provides = ["entities"] + provides = [MESSAGE_ENTITIES_ATTRIBUTE] requires = [MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]] @@ -171,7 +172,11 @@ def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]: return dataset def _check_spacy_doc(self, message): - if self.pos_features and message.get("spacy_doc") is None: + if ( + self.pos_features + and message.get(MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + is None + ): raise InvalidConfigError( "Could not find `spacy_doc` attribute for " "message {}\n" @@ -187,7 +192,9 @@ def process(self, message: Message, **kwargs: Any) -> None: extracted = self.add_extractor_name(self.extract_entities(message)) message.set( - "entities", message.get("entities", []) + extracted, add_to_output=True + MESSAGE_ENTITIES_ATTRIBUTE, + message.get(MESSAGE_ENTITIES_ATTRIBUTE, []) + extracted, + add_to_output=True, ) @staticmethod @@ -195,7 +202,9 @@ def _convert_example(example: Message) -> List[Tuple[int, int, Text]]: def convert_entity(entity): return entity["start"], entity["end"], entity["entity"] - return [convert_entity(ent) for ent in example.get("entities", [])] + return [ + convert_entity(ent) for ent in example.get(MESSAGE_ENTITIES_ATTRIBUTE, []) + ] def extract_entities(self, message: Message) -> List[Dict[Text, Any]]: """Take a sentence and return entities in json format""" @@ -330,9 +339,9 @@ def _from_crf_to_json( ) -> List[Dict[Text, Any]]: if self.pos_features: - tokens = message.get("spacy_doc") + tokens = message.get(MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE]) else: - tokens = message.get("tokens") + tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) if len(tokens) != len(entities): raise Exception( @@ -491,11 +500,13 @@ def _from_json_to_crf( if self.pos_features: from spacy.gold import GoldParse # pytype: disable=import-error - doc_or_tokens = message.get("spacy_doc") + doc_or_tokens = message.get( + 
MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) gold = GoldParse(doc_or_tokens, entities=entity_offsets) ents = [l[5] for l in gold.orig_annot] else: - doc_or_tokens = message.get("tokens") + doc_or_tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets) # collect badly annotated examples @@ -559,8 +570,10 @@ def _bilou_tags_from_offsets(tokens, entities, missing="O"): @staticmethod def __pattern_of_token(message, i): - if message.get("tokens") is not None: - return message.get("tokens")[i].get("pattern", {}) + if message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) is not None: + return message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE])[i].get( + "pattern", {} + ) else: return {} @@ -571,27 +584,6 @@ def __tag_of_token(token): else: return token.tag_ - @staticmethod - def __additional_ner_features(message: Message) -> List[Any]: - features = message.get("text_sparse_features", []) - tokens = message.get("tokens", []) - if len(tokens) != len(features): - warn_string = "Number of sparse features ({}) does not match number of tokens ({})".format( - len(features), len(tokens) - ) - raise Exception(warn_string) - - # convert to python-crfsuite feature format - features_out = [] - for feature in features: - feature_dict = { - str(index): token_features - for index, token_features in enumerate(feature) - } - converted = {"sparse_features": feature_dict} - features_out.append(converted) - return features_out - def _from_text_to_crf( self, message: Message, entities: List[Text] = None ) -> List[CRFToken]: @@ -603,7 +595,7 @@ def _from_text_to_crf( else: tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - dense_features = message.get( + word_embeddings = message.get( MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) @@ -611,12 +603,10 @@ def _from_text_to_crf( pattern = self.__pattern_of_token(message, i) entity = entities[i] if entities else "N/A" tag = self.__tag_of_token(token) if self.pos_features else None - token_sparse_features = ( - dense_features[i] if dense_features is not None else [] - ) + word_embedding = word_embeddings[i] if word_embeddings is not None else [] crf_format.append( - CRFToken(token.text, tag, entity, pattern, token_sparse_features) + CRFToken(token.text, tag, entity, pattern, word_embedding) ) return crf_format diff --git a/rasa/nlu/extractors/duckling_http_extractor.py b/rasa/nlu/extractors/duckling_http_extractor.py index 06db49727597..93fe07c883d4 100644 --- a/rasa/nlu/extractors/duckling_http_extractor.py +++ b/rasa/nlu/extractors/duckling_http_extractor.py @@ -5,6 +5,7 @@ import requests from typing import Any, List, Optional, Text, Dict +from rasa.nlu.constants import MESSAGE_ENTITIES_ATTRIBUTE from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata @@ -48,7 +49,7 @@ def convert_duckling_format_to_rasa(matches): class DucklingHTTPExtractor(EntityExtractor): """Searches for structured entites, e.g. 
dates, using a duckling server.""" - provides = ["entities"] + provides = [MESSAGE_ENTITIES_ATTRIBUTE] defaults = { # by default all dimensions recognized by duckling are returned @@ -183,7 +184,9 @@ def process(self, message: Message, **kwargs: Any) -> None: extracted = self.add_extractor_name(extracted) message.set( - "entities", message.get("entities", []) + extracted, add_to_output=True + MESSAGE_ENTITIES_ATTRIBUTE, + message.get(MESSAGE_ENTITIES_ATTRIBUTE, []) + extracted, + add_to_output=True, ) @classmethod diff --git a/rasa/nlu/extractors/entity_synonyms.py b/rasa/nlu/extractors/entity_synonyms.py index 9814ccb0f44d..3b9b8b12d481 100644 --- a/rasa/nlu/extractors/entity_synonyms.py +++ b/rasa/nlu/extractors/entity_synonyms.py @@ -2,7 +2,7 @@ import warnings from typing import Any, Dict, Optional, Text -from rasa.nlu import utils +from rasa.nlu.constants import MESSAGE_ENTITIES_ATTRIBUTE from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata @@ -13,7 +13,7 @@ class EntitySynonymMapper(EntityExtractor): - provides = ["entities"] + provides = [MESSAGE_ENTITIES_ATTRIBUTE] def __init__( self, @@ -33,15 +33,15 @@ def train( self.add_entities_if_synonyms(key, value) for example in training_data.entity_examples: - for entity in example.get("entities", []): + for entity in example.get(MESSAGE_ENTITIES_ATTRIBUTE, []): entity_val = example.text[entity["start"] : entity["end"]] self.add_entities_if_synonyms(entity_val, str(entity.get("value"))) def process(self, message: Message, **kwargs: Any) -> None: - updated_entities = message.get("entities", [])[:] + updated_entities = message.get(MESSAGE_ENTITIES_ATTRIBUTE, [])[:] self.replace_synonyms(updated_entities) - message.set("entities", updated_entities, add_to_output=True) + message.set(MESSAGE_ENTITIES_ATTRIBUTE, updated_entities, add_to_output=True) def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py index dbd0c8104d84..74f790d38e18 100644 --- a/rasa/nlu/extractors/mitie_entity_extractor.py +++ b/rasa/nlu/extractors/mitie_entity_extractor.py @@ -3,6 +3,11 @@ import typing from typing import Any, Dict, List, Optional, Text +from rasa.nlu.constants import ( + MESSAGE_ENTITIES_ATTRIBUTE, + MESSAGE_TOKENS_NAMES, + MESSAGE_TEXT_ATTRIBUTE, +) from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata @@ -16,9 +21,13 @@ class MitieEntityExtractor(EntityExtractor): - provides = ["entities"] + provides = [MESSAGE_ENTITIES_ATTRIBUTE] - requires = ["tokens", "mitie_feature_extractor", "mitie_file"] + requires = [ + MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], + "mitie_feature_extractor", + "mitie_file", + ] def __init__(self, component_config: Dict[Text, Any] = None, ner=None): """Construct a new intent classifier using the sklearn framework.""" @@ -88,9 +97,9 @@ def _prepare_mitie_sample(self, training_example): import mitie text = training_example.text - tokens = training_example.get("tokens") + tokens = training_example.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) sample = mitie.ner_training_instance([t.text for t in tokens]) - for ent in training_example.get("entities", []): + for ent in training_example.get(MESSAGE_ENTITIES_ATTRIBUTE, []): try: # if the token is not aligned an exception will be raised start, end = MitieEntityExtractor.find_entity(ent, text, 
tokens) @@ -120,11 +129,15 @@ def process(self, message: Message, **kwargs: Any) -> None: ) ents = self.extract_entities( - message.text, message.get("tokens"), mitie_feature_extractor + message.text, + message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]), + mitie_feature_extractor, ) extracted = self.add_extractor_name(ents) message.set( - "entities", message.get("entities", []) + extracted, add_to_output=True + MESSAGE_ENTITIES_ATTRIBUTE, + message.get(MESSAGE_ENTITIES_ATTRIBUTE, []) + extracted, + add_to_output=True, ) @classmethod diff --git a/rasa/nlu/extractors/spacy_entity_extractor.py b/rasa/nlu/extractors/spacy_entity_extractor.py index 04b508cc3182..3a4e217320ec 100644 --- a/rasa/nlu/extractors/spacy_entity_extractor.py +++ b/rasa/nlu/extractors/spacy_entity_extractor.py @@ -1,6 +1,7 @@ import typing from typing import Any, Dict, List, Text +from rasa.nlu.constants import MESSAGE_ENTITIES_ATTRIBUTE from rasa.nlu.extractors import EntityExtractor from rasa.nlu.training_data import Message @@ -10,7 +11,7 @@ class SpacyEntityExtractor(EntityExtractor): - provides = ["entities"] + provides = [MESSAGE_ENTITIES_ATTRIBUTE] requires = ["spacy_nlp"] @@ -35,7 +36,9 @@ def process(self, message: Message, **kwargs: Any) -> None: all_extracted, dimensions ) message.set( - "entities", message.get("entities", []) + extracted, add_to_output=True + MESSAGE_ENTITIES_ATTRIBUTE, + message.get(MESSAGE_ENTITIES_ATTRIBUTE, []) + extracted, + add_to_output=True, ) @staticmethod From a5e3382a8747f512aacbc63d89c4523f031e2565 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 13:40:47 +0200 Subject: [PATCH 082/239] fix imports --- rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +- rasa/nlu/classifiers/sklearn_intent_classifier.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 13170662eb7b..7cb8ff8a99dd 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Text, Tuple import warnings -from nlu.featurizers.featurzier import sequence_to_sentence_embedding +from rasa.nlu.featurizers.featurzier import sequence_to_sentence_embedding from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component from rasa.utils import train_utils diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 09c2bfbdaa01..cf3eed7df935 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -4,7 +4,7 @@ import typing from typing import Any, Dict, List, Optional, Text, Tuple -from nlu.featurizers.featurzier import sequence_to_sentence_embedding +from rasa.nlu.featurizers.featurzier import sequence_to_sentence_embedding from rasa.nlu import utils from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component From b469bd6935b3df9b78fda765e72b3549ff92fb6c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 14:11:05 +0200 Subject: [PATCH 083/239] Fix crf entity extractor. 
--- examples/restaurantbot/config.yml | 18 ++++++++++++ .../classifiers/sklearn_intent_classifier.py | 4 +-- rasa/nlu/extractors/crf_entity_extractor.py | 29 +++++++++++++++++-- rasa/utils/train_utils.py | 13 +-------- .../extractors/text_crf_entity_extractor.py | 12 ++++---- 5 files changed, 53 insertions(+), 23 deletions(-) diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml index f55888e312b3..dabbc693a68a 100644 --- a/examples/restaurantbot/config.yml +++ b/examples/restaurantbot/config.yml @@ -7,6 +7,24 @@ pipeline: - name: "SpacyFeaturizer" - name: "SklearnIntentClassifier" - name: "CRFEntityExtractor" + features: [ + ["low", "title", "upper"], + [ + "bias", + "low", + "prefix5", + "prefix2", + "suffix5", + "suffix3", + "suffix2", + "upper", + "title", + "digit", + "pattern", + "word_embedding" + ], + ["low", "title", "upper"], + ] - name: "EntitySynonymMapper" policies: diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index cf3eed7df935..a66135a080a3 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -28,9 +28,7 @@ class SklearnIntentClassifier(Component): provides = ["intent", "intent_ranking"] - requires = [MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + [ - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ] + requires = [] defaults = { # C parameter of the svm - cross validation will select the best value diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 5ef4ed1a32ac..0f8162ed5ea3 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -584,6 +584,31 @@ def __tag_of_token(token): else: return token.tag_ + @staticmethod + def __get_word_embeddings(message: Message) -> Optional[List[Any]]: + features = message.get( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) + + if features is None: + return features + + tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) + if len(tokens) != len(features): + warn_string = f"Number of word embeddings ({len(features)}) does not match number of tokens ({len(tokens)})" + raise Exception(warn_string) + + # convert to python-crfsuite feature format + features_out = [] + for feature in features: + feature_dict = { + str(index): token_features + for index, token_features in enumerate(feature) + } + converted = {"word_embeddings": feature_dict} + features_out.append(converted) + return features_out + def _from_text_to_crf( self, message: Message, entities: List[Text] = None ) -> List[CRFToken]: @@ -595,9 +620,7 @@ def _from_text_to_crf( else: tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - word_embeddings = message.get( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) + word_embeddings = self.__get_word_embeddings(message) for i, token in enumerate(tokens): pattern = self.__pattern_of_token(message, i) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index d6fa7a15a0e0..e3e21c22085d 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -1,18 +1,7 @@ from collections import namedtuple import logging import typing -from typing import ( - List, - Optional, - Text, - Dict, - Tuple, - Union, - Generator, - Callable, - Any, - NamedTuple, -) +from typing import List, Optional, Text, Dict, Tuple, Union, Generator, Callable, Any import numpy as np 
from tqdm import tqdm from sklearn.model_selection import train_test_split diff --git a/tests/nlu/extractors/text_crf_entity_extractor.py b/tests/nlu/extractors/text_crf_entity_extractor.py index b58218ca37be..be70ab2e8b32 100644 --- a/tests/nlu/extractors/text_crf_entity_extractor.py +++ b/tests/nlu/extractors/text_crf_entity_extractor.py @@ -236,9 +236,9 @@ def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp): crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) spacy_featurizer = SpacyFeaturizer() - white_space_tokenizer = WhitespaceTokenizer() + white_space_tokenizer = WhitespaceTokenizer({"use_cls_token": False}) - text = "Rasa is a company in Berlin." + text = "Rasa is a company in Berlin" message = Message(text) message.set("spacy_doc", spacy_nlp(text)) @@ -249,6 +249,8 @@ def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp): features = crf_extractor._sentence_to_features(text_data) assert "0:word_embedding" in features[0] - assert np.all( - features[0]["0:word_embedding"] == message.data.get("text_dense_features")[0] - ) + for i in range(0, len(message.data.get("text_dense_features")[0])): + assert ( + features[0]["0:word_embedding"]["word_embeddings"][str(i)] + == message.data.get("text_dense_features")[0][i] + ) From d97ce14c410476ecb3066797e2d8790021e1fb27 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 14:22:09 +0200 Subject: [PATCH 084/239] Remove empty file. --- tests/nlu/extractors/test_mitie_entity_extractors.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) delete mode 100644 tests/nlu/extractors/test_mitie_entity_extractors.py diff --git a/tests/nlu/extractors/test_mitie_entity_extractors.py b/tests/nlu/extractors/test_mitie_entity_extractors.py deleted file mode 100644 index e69de29bb2d1..000000000000 From 3275f5ab8adee9d8ae87c17c3aa0edccc0b0c305 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 16:18:21 +0200 Subject: [PATCH 085/239] add changelog entry. --- CHANGELOG.rst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 476aefaee896..410f9a16386f 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -18,6 +18,8 @@ Changed ------- - Divided featurizers in ``rasa.nlu`` into sparse and dense featurizers - All featurizers in ``rasa.nlu`` return a sequence +- Renamed the feature name ``ner_features`` to ``word_embedding`` in ``CRFEntityExtractor``. The ``word_embedding`` are + extracted form the dense features created by any featurizer. 
Removed ------- From f85fbe2159e341dc4bee27894ce00ea1b7bf7ec8 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 16:32:33 +0200 Subject: [PATCH 086/239] Remove case_sensitive option from WhitespaceTokenizer --- CHANGELOG.rst | 1 + rasa/nlu/tokenizers/whitespace_tokenizer.py | 5 --- .../tokenizers/test_whitespace_tokenizer.py | 34 +++---------------- 3 files changed, 6 insertions(+), 34 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 410f9a16386f..077eb796b07b 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -24,6 +24,7 @@ Changed Removed ------- - Deprecated ``NGramFeaturizer`` in ``rasa.nlu.featurizers`` (removed functionality and print deprecation warning instead) +- Removed ``case_sensitive`` option from ``WhitespaceTokenizer`` as it is covered by ``CountVectorsFeaturizer`` Fixed ----- diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index c129e97c8fd9..10b4e5c9ab64 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -21,8 +21,6 @@ class WhitespaceTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", - # Text will be tokenized with case sensitive as default - "case_sensitive": True, # add __CLS__ token to the end of the list of tokens "use_cls_token": True, } @@ -37,7 +35,6 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: ) # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] - self.case_sensitive = self.component_config["case_sensitive"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -60,8 +57,6 @@ def tokenize( self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE ) -> List[Token]: - if not self.case_sensitive: - text = text.lower() # remove 'not a word character' if if attribute != MESSAGE_INTENT_ATTRIBUTE: words = re.sub( diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py index 424171cc8e4e..e40077e9b948 100644 --- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -122,22 +122,6 @@ def test_whitespace_custom_intent_symbol(): def test_whitespace_with_case(): from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer - component_config = {"case_sensitive": False, "use_cls_token": False} - tk = WhitespaceTokenizer(component_config) - assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ - "forecast", - "for", - "lunch", - ] - - component_config = {"case_sensitive": True, "use_cls_token": False} - tk = WhitespaceTokenizer(component_config) - assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ - "Forecast", - "for", - "LUNCH", - ] - component_config = {"use_cls_token": False} tk = WhitespaceTokenizer(component_config) assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ @@ -146,14 +130,6 @@ def test_whitespace_with_case(): "LUNCH", ] - component_config = {"case_sensitive": False, "use_cls_token": False} - tk = WhitespaceTokenizer(component_config) - message = Message("Forecast for LUNCH") - tk.process(message) - assert message.data.get("tokens")[0].text == "forecast" - assert message.data.get("tokens")[1].text == "for" - assert message.data.get("tokens")[2].text == "lunch" - _config = utilities.base_test_conf("supervised_embeddings") examples = [ Message( @@ -176,14 +152,14 @@ def 
test_whitespace_with_case(): ), ] - component_config = {"case_sensitive": False} + component_config = {} tk = WhitespaceTokenizer(component_config) tk.train(TrainingData(training_examples=examples), _config) - assert examples[0].data.get("tokens")[0].text == "any" - assert examples[0].data.get("tokens")[1].text == "mexican" + assert examples[0].data.get("tokens")[0].text == "Any" + assert examples[0].data.get("tokens")[1].text == "Mexican" assert examples[0].data.get("tokens")[2].text == "restaurant" assert examples[0].data.get("tokens")[3].text == "will" assert examples[0].data.get("tokens")[4].text == "do" - assert examples[1].data.get("tokens")[0].text == "i" + assert examples[1].data.get("tokens")[0].text == "I" assert examples[1].data.get("tokens")[1].text == "want" - assert examples[1].data.get("tokens")[2].text == "tacos" + assert examples[1].data.get("tokens")[2].text == "Tacos" From a4f5e8ed015a1b7b34d9ef45a040608d21b87af4 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 16:39:58 +0200 Subject: [PATCH 087/239] Update docstring. --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 1c7080ae4038..3a2b2477b964 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -21,8 +21,8 @@ class CountVectorsFeaturizer(Featurizer): - """Creates a sequence of token counts features - based on sklearn's `CountVectorizer`. + """Creates a sequence of token counts features based on sklearn's `CountVectorizer`. + All tokens which consist only of digits (e.g. 123 and 99 but not ab12d) will be represented by a single feature. 
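The updated docstring above only states the digit-collapsing behaviour in prose. A rough, self-contained sketch of the same idea, written directly against sklearn's CountVectorizer rather than the component's actual code (the `_collapse_digit_tokens` helper and the `__NUMBER__` placeholder are illustrative assumptions, not part of Rasa):

    from sklearn.feature_extraction.text import CountVectorizer

    def _collapse_digit_tokens(text):
        # Map tokens made up only of digits (e.g. "123", "99") to one shared
        # placeholder; mixed tokens such as "ab12d" are left untouched.
        return " ".join(
            "__NUMBER__" if token.isdigit() else token for token in text.split()
        )

    vectorizer = CountVectorizer(preprocessor=_collapse_digit_tokens)
    matrix = vectorizer.fit_transform(["table for 2 at 19 pm", "ab12d stays as is"])

    # "2" and "19" end up in the same "__NUMBER__" column; "ab12d" keeps its own.
    print(vectorizer.vocabulary_)
    print(matrix.toarray())

Collapsing before counting keeps the vocabulary from growing with every distinct number while still signalling that a number was present.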
From a6d93fb6b27db221dd14dbb200b230a804581dc8 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 17:50:10 +0200 Subject: [PATCH 088/239] review comments --- .../embedding_intent_classifier.py | 8 ++--- .../classifiers/sklearn_intent_classifier.py | 30 +++++++------------ rasa/nlu/featurizers/featurzier.py | 10 +++---- tests/nlu/featurizers/test_featurizer.py | 4 +-- 4 files changed, 21 insertions(+), 31 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 7cb8ff8a99dd..d294451766f5 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -6,7 +6,7 @@ from typing import Any, Dict, List, Optional, Text, Tuple import warnings -from rasa.nlu.featurizers.featurzier import sequence_to_sentence_embedding +from rasa.nlu.featurizers.featurzier import sequence_to_sentence_features from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component from rasa.utils import train_utils @@ -275,7 +275,7 @@ def _extract_labels_precomputed_features( encoded_id_labels = [ ( label_idx, - sequence_to_sentence_embedding( + sequence_to_sentence_features( label_example.get(attribute_feature_name) ), ) @@ -342,7 +342,7 @@ def _create_session_data( for e in training_data.intent_examples: if e.get(attribute): X.append( - sequence_to_sentence_embedding( + sequence_to_sentence_features( e.get( MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) @@ -618,7 +618,7 @@ def predict_label(self, message): else: # get features (bag of words) for a message # noinspection PyPep8Naming - X = sequence_to_sentence_embedding( + X = sequence_to_sentence_features( message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) ).reshape(1, -1) diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index a66135a080a3..978961fbde15 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -4,7 +4,7 @@ import typing from typing import Any, Dict, List, Optional, Text, Tuple -from rasa.nlu.featurizers.featurzier import sequence_to_sentence_embedding +from rasa.nlu.featurizers.featurzier import sequence_to_sentence_features from rasa.nlu import utils from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component @@ -28,7 +28,7 @@ class SklearnIntentClassifier(Component): provides = ["intent", "intent_ranking"] - requires = [] + requires = [MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] defaults = { # C parameter of the svm - cross validation will select the best value @@ -81,22 +81,6 @@ def transform_labels_num2str(self, y: np.ndarray) -> np.ndarray: return self.le.inverse_transform(y) - def combine_features(self, message: Message) -> np.ndarray: - features_1 = sequence_to_sentence_embedding( - message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - ) - features_2 = sequence_to_sentence_embedding( - message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - ) - - if features_1 is not None and features_2 is not None: - return np.concatenate((features_1, features_2), axis=-1) - - if features_1 is not None and features_2 is None: - return features_1 - - return features_2 - def train( self, training_data: TrainingData, cfg: RasaNLUModelConfig, **kwargs: Any ) -> None: @@ -116,7 +100,11 @@ def train( y = 
self.transform_labels_str2num(labels) X = np.stack( [ - self.combine_features(example) + sequence_to_sentence_features( + example.get( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) + ) for example in training_data.intent_examples ] ) @@ -164,7 +152,9 @@ def process(self, message: Message, **kwargs: Any) -> None: intent = None intent_ranking = [] else: - X = self.combine_features(message).reshape(1, -1) + X = sequence_to_sentence_features( + message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + ).reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) # `predict` returns a matrix as it is supposed diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index c6a3e1f5c2bf..550bbfe3c94f 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -10,8 +10,8 @@ ) -def sequence_to_sentence_embedding( - features: Union[List[List[float]], scipy.sparse.spmatrix], method: Text = "mean" +def sequence_to_sentence_features( + features: Union[np.ndarray, scipy.sparse.spmatrix], method: Text = "mean" ) -> Optional[np.ndarray]: if features is None: return None @@ -19,10 +19,10 @@ def sequence_to_sentence_embedding( if isinstance(features, scipy.sparse.spmatrix): features = features.toarray() - if method == "mean" or method == "avg": + if method == "mean": return np.mean(features, axis=0) - if method == "sum": - return np.sum(features, axis=0) + + raise ValueError(f"Provided method '{method}' is not supported.") class Featurizer(Component): diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 18981a4760a7..5a8f8898e743 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -2,7 +2,7 @@ import pytest import scipy.sparse -from rasa.nlu.featurizers.featurzier import Featurizer, sequence_to_sentence_embedding +from rasa.nlu.featurizers.featurzier import Featurizer, sequence_to_sentence_features from rasa.nlu.constants import ( MESSAGE_VECTOR_DENSE_FEATURE_NAMES, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, @@ -60,6 +60,6 @@ def test_combine_with_existing_sparse_features(): ], ) def test_sequence_to_sentence_embedding(features, expected, method): - actual = sequence_to_sentence_embedding(features, method=method) + actual = sequence_to_sentence_features(features, method=method) assert np.all(expected == actual) From ae8faf6b4638d5018fd0fbbd2898c7771ce04423 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 24 Oct 2019 09:03:08 +0200 Subject: [PATCH 089/239] undo removing case sensitive from whitespace tokenizer --- CHANGELOG.rst | 3 +-- rasa/nlu/tokenizers/whitespace_tokenizer.py | 6 +++++ .../tokenizers/test_whitespace_tokenizer.py | 26 +++++++++++++++---- 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 077eb796b07b..09615edeb539 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -24,8 +24,7 @@ Changed Removed ------- - Deprecated ``NGramFeaturizer`` in ``rasa.nlu.featurizers`` (removed functionality and print deprecation warning instead) -- Removed ``case_sensitive`` option from ``WhitespaceTokenizer`` as it is covered by ``CountVectorsFeaturizer`` - +w Fixed ----- - ``MultiProjectImporter`` now imports files in the order of the import statements diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 10b4e5c9ab64..98cd182a4567 100644 --- 
a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -21,6 +21,8 @@ class WhitespaceTokenizer(Tokenizer): "intent_tokenization_flag": False, # Symbol on which intent should be split "intent_split_symbol": "_", + # Text will be tokenized with case sensitive as default + "case_sensitive": True, # add __CLS__ token to the end of the list of tokens "use_cls_token": True, } @@ -35,6 +37,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: ) # split symbol for intents self.intent_split_symbol = self.component_config["intent_split_symbol"] + self.case_sensitive = self.component_config["case_sensitive"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -57,6 +60,9 @@ def tokenize( self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE ) -> List[Token]: + if not self.case_sensitive: + text = text.lower() + # remove 'not a word character' if if attribute != MESSAGE_INTENT_ATTRIBUTE: words = re.sub( diff --git a/tests/nlu/tokenizers/test_whitespace_tokenizer.py b/tests/nlu/tokenizers/test_whitespace_tokenizer.py index e40077e9b948..a8e3b7bdabb4 100644 --- a/tests/nlu/tokenizers/test_whitespace_tokenizer.py +++ b/tests/nlu/tokenizers/test_whitespace_tokenizer.py @@ -130,6 +130,22 @@ def test_whitespace_with_case(): "LUNCH", ] + component_config = {"case_sensitive": False, "use_cls_token": False} + tk = WhitespaceTokenizer(component_config) + assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ + "forecast", + "for", + "lunch", + ] + + component_config = {"case_sensitive": True, "use_cls_token": False} + tk = WhitespaceTokenizer(component_config) + assert [t.text for t in tk.tokenize("Forecast for LUNCH")] == [ + "Forecast", + "for", + "LUNCH", + ] + _config = utilities.base_test_conf("supervised_embeddings") examples = [ Message( @@ -152,14 +168,14 @@ def test_whitespace_with_case(): ), ] - component_config = {} + component_config = {"case_sensitive": False} tk = WhitespaceTokenizer(component_config) tk.train(TrainingData(training_examples=examples), _config) - assert examples[0].data.get("tokens")[0].text == "Any" - assert examples[0].data.get("tokens")[1].text == "Mexican" + assert examples[0].data.get("tokens")[0].text == "any" + assert examples[0].data.get("tokens")[1].text == "mexican" assert examples[0].data.get("tokens")[2].text == "restaurant" assert examples[0].data.get("tokens")[3].text == "will" assert examples[0].data.get("tokens")[4].text == "do" - assert examples[1].data.get("tokens")[0].text == "I" + assert examples[1].data.get("tokens")[0].text == "i" assert examples[1].data.get("tokens")[1].text == "want" - assert examples[1].data.get("tokens")[2].text == "Tacos" + assert examples[1].data.get("tokens")[2].text == "tacos" From 8791d40b26026883f923d184564b6797798e6086 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 24 Oct 2019 09:06:36 +0200 Subject: [PATCH 090/239] Adapt tests. 
--- tests/nlu/featurizers/test_featurizer.py | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 5a8f8898e743..a118dc696621 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -53,13 +53,18 @@ def test_combine_with_existing_sparse_features(): @pytest.mark.parametrize( "features, expected, method", [ - ([[1, 0, 2, 3], [2, 0, 0, 1]], [3, 0, 2, 4], "sum"), - ([[1, 0, 2, 3], [2, 0, 0, 1]], [1.5, 0, 1, 2], "avg"), - (scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), [1.5, 0, 1, 2], "avg"), - (None, None, "avg"), + ([[1, 0, 2, 3], [2, 0, 0, 1]], [1.5, 0, 1, 2], "mean"), + (scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), [1.5, 0, 1, 2], "mean"), + (None, None, "mean"), ], ) -def test_sequence_to_sentence_embedding(features, expected, method): +def test_sequence_to_sentence_features(features, expected, method): actual = sequence_to_sentence_features(features, method=method) assert np.all(expected == actual) + + +def test_sequence_to_sentence_features_raise_value_error(): + featuers = np.array([[1, 0, 2, 3], [2, 0, 0, 1]]) + with pytest.raises(ValueError): + sequence_to_sentence_features(featuers, method="sum") From 8d8696821432b9cca46f89185b63b17a17c0d7ab Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 24 Oct 2019 09:18:11 +0200 Subject: [PATCH 091/239] rename word_embeddings to text_dense_features --- CHANGELOG.rst | 4 ++-- docs/nlu/components.rst | 5 +++-- examples/restaurantbot/config.yml | 2 +- rasa/nlu/extractors/crf_entity_extractor.py | 16 +++++++++------- .../nlu/extractors/text_crf_entity_extractor.py | 7 +++---- 5 files changed, 18 insertions(+), 16 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 09615edeb539..fb385e15a48d 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -18,8 +18,8 @@ Changed ------- - Divided featurizers in ``rasa.nlu`` into sparse and dense featurizers - All featurizers in ``rasa.nlu`` return a sequence -- Renamed the feature name ``ner_features`` to ``word_embedding`` in ``CRFEntityExtractor``. The ``word_embedding`` are - extracted form the dense features created by any featurizer. +- Renamed the feature name ``ner_features`` to ``text_dense_features`` in ``CRFEntityExtractor``. + The ``text_dense_features`` are created by any dense featurizer. Removed ------- diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 82baf3ce0652..380cc3cf06f0 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -760,7 +760,8 @@ CRFEntityExtractor etc.) give probabilities to certain entity classes, as are transitions between neighbouring entity tags: the most likely set of tags is then calculated and returned. If POS features are used (pos or pos2), spaCy has to be installed. If you want to use - word embeddings from any provided featurizer, use ``"word_embedding"``. + additional features, such as pre-trained word embeddings, from any provided dense + featurizer, use ``"text_dense_features"``. :Configuration: .. 
code-block:: yaml @@ -774,7 +775,7 @@ CRFEntityExtractor # Available features are: # ``low``, ``title``, ``suffix5``, ``suffix3``, ``suffix2``, # ``suffix1``, ``pos``, ``pos2``, ``prefix5``, ``prefix2``, - # ``bias``, ``upper``, ``digit``, ``pattern``, and ``word_embedding`` + # ``bias``, ``upper``, ``digit``, ``pattern``, and ``text_dense_features`` features: [["low", "title"], ["bias", "suffix3"], ["upper", "pos", "pos2"]] # The flag determines whether to use BILOU tagging or not. BILOU diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml index dabbc693a68a..52eb0709829e 100644 --- a/examples/restaurantbot/config.yml +++ b/examples/restaurantbot/config.yml @@ -21,7 +21,7 @@ pipeline: "title", "digit", "pattern", - "word_embedding" + "text_dense_features" ], ["low", "title", "upper"], ] diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 0f8162ed5ea3..99ca9c032666 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -35,7 +35,7 @@ class CRFToken(NamedTuple): tag: Text entity: Text pattern: Dict[Text, Any] - word_embedding: np.ndarray + dense_features: np.ndarray class CRFEntityExtractor(EntityExtractor): @@ -95,7 +95,7 @@ class CRFEntityExtractor(EntityExtractor): "upper": lambda crf_token: crf_token.text.isupper(), # pytype: disable=attribute-error "digit": lambda crf_token: crf_token.text.isdigit(), # pytype: disable=attribute-error "pattern": lambda crf_token: crf_token.pattern, - "word_embedding": lambda crf_token: crf_token.word_embedding, + "text_dense_features": lambda crf_token: crf_token.dense_features, } def __init__( @@ -585,7 +585,7 @@ def __tag_of_token(token): return token.tag_ @staticmethod - def __get_word_embeddings(message: Message) -> Optional[List[Any]]: + def __get_dense_features(message: Message) -> Optional[List[Any]]: features = message.get( MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) @@ -605,7 +605,7 @@ def __get_word_embeddings(message: Message) -> Optional[List[Any]]: str(index): token_features for index, token_features in enumerate(feature) } - converted = {"word_embeddings": feature_dict} + converted = {"text_dense_features": feature_dict} features_out.append(converted) return features_out @@ -620,16 +620,18 @@ def _from_text_to_crf( else: tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - word_embeddings = self.__get_word_embeddings(message) + text_dense_features = self.__get_dense_features(message) for i, token in enumerate(tokens): pattern = self.__pattern_of_token(message, i) entity = entities[i] if entities else "N/A" tag = self.__tag_of_token(token) if self.pos_features else None - word_embedding = word_embeddings[i] if word_embeddings is not None else [] + dense_features = ( + text_dense_features[i] if text_dense_features is not None else [] + ) crf_format.append( - CRFToken(token.text, tag, entity, pattern, word_embedding) + CRFToken(token.text, tag, entity, pattern, dense_features) ) return crf_format diff --git a/tests/nlu/extractors/text_crf_entity_extractor.py b/tests/nlu/extractors/text_crf_entity_extractor.py index be70ab2e8b32..1ff19ba338de 100644 --- a/tests/nlu/extractors/text_crf_entity_extractor.py +++ b/tests/nlu/extractors/text_crf_entity_extractor.py @@ -227,12 +227,11 @@ def test_crf_create_entity_dict(spacy_nlp): def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp): - import numpy as np from rasa.nlu.extractors.crf_entity_extractor import 
CRFEntityExtractor from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer - ner_crf_pos_feature_config["features"][1].append("word_embedding") + ner_crf_pos_feature_config["features"][1].append("text_dense_features") crf_extractor = CRFEntityExtractor(component_config=ner_crf_pos_feature_config) spacy_featurizer = SpacyFeaturizer() @@ -248,9 +247,9 @@ def test_crf_use_dense_features(ner_crf_pos_feature_config, spacy_nlp): text_data = crf_extractor._from_text_to_crf(message) features = crf_extractor._sentence_to_features(text_data) - assert "0:word_embedding" in features[0] + assert "0:text_dense_features" in features[0] for i in range(0, len(message.data.get("text_dense_features")[0])): assert ( - features[0]["0:word_embedding"]["word_embeddings"][str(i)] + features[0]["0:text_dense_features"]["text_dense_features"][str(i)] == message.data.get("text_dense_features")[0][i] ) From a8a5abf8f070a13e4f2743010d0b5ed34b0d5db6 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 24 Oct 2019 10:24:11 +0200 Subject: [PATCH 092/239] combine correct features in regex featurizer --- rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 77d59f61b2a4..b90d748debaf 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -61,7 +61,11 @@ def process(self, message: Message, **kwargs: Any) -> None: def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if self.known_patterns: extras = self._features_for_patterns(message, attribute) - features = self._combine_with_existing_sparse_features(message, extras) + features = self._combine_with_existing_sparse_features( + message, + extras, + feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + ) message.set(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], features) def _add_lookup_table_regexes( From b1d371b02a61b0899e429cbfd383e2c7d2804462 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 25 Oct 2019 09:44:21 +0200 Subject: [PATCH 093/239] keep sparse sparse --- rasa/nlu/classifiers/embedding_intent_classifier.py | 5 +++-- rasa/nlu/featurizers/featurzier.py | 9 +++++++-- tests/nlu/featurizers/test_featurizer.py | 11 +++++++++-- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index d294451766f5..0d11469bdf93 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -14,6 +14,7 @@ MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, ) import tensorflow as tf @@ -344,7 +345,7 @@ def _create_session_data( X.append( sequence_to_sentence_features( e.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) ) ) @@ -619,7 +620,7 @@ def predict_label(self, message): # get features (bag of words) for a message # noinspection PyPep8Naming X = sequence_to_sentence_features( - message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) 
).reshape(1, -1) # load tf graph and session diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index 550bbfe3c94f..0b85154416ff 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -12,15 +12,20 @@ def sequence_to_sentence_features( features: Union[np.ndarray, scipy.sparse.spmatrix], method: Text = "mean" -) -> Optional[np.ndarray]: +) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]: if features is None: return None + sparse_features = False if isinstance(features, scipy.sparse.spmatrix): features = features.toarray() + sparse_features = True if method == "mean": - return np.mean(features, axis=0) + sentence_features = np.mean(features, axis=0) + if sparse_features: + return scipy.sparse.csr_matrix(sentence_features) + return sentence_features raise ValueError(f"Provided method '{method}' is not supported.") diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index a118dc696621..3f3145280453 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -54,14 +54,21 @@ def test_combine_with_existing_sparse_features(): "features, expected, method", [ ([[1, 0, 2, 3], [2, 0, 0, 1]], [1.5, 0, 1, 2], "mean"), - (scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), [1.5, 0, 1, 2], "mean"), + ( + scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), + scipy.sparse.csr_matrix([1.5, 0, 1, 2]), + "mean", + ), (None, None, "mean"), ], ) def test_sequence_to_sentence_features(features, expected, method): actual = sequence_to_sentence_features(features, method=method) - assert np.all(expected == actual) + if isinstance(expected, scipy.sparse.spmatrix): + assert np.all(expected.toarray() == actual.toarray()) + else: + assert np.all(expected == actual) def test_sequence_to_sentence_features_raise_value_error(): From 5ded8c9144cc93cb6154b712be3ed2c06853918f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 25 Oct 2019 09:52:18 +0200 Subject: [PATCH 094/239] fix changelog --- CHANGELOG.rst | 2 +- .../embedding_intent_classifier.py | 19 ++++++++++++------- rasa/nlu/featurizers/featurzier.py | 14 +++++++------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index fb385e15a48d..73ee94573668 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -24,7 +24,7 @@ Changed Removed ------- - Deprecated ``NGramFeaturizer`` in ``rasa.nlu.featurizers`` (removed functionality and print deprecation warning instead) -w + Fixed ----- - ``MultiProjectImporter`` now imports files in the order of the import statements diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 0d11469bdf93..2eb18ac89b68 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -14,7 +14,6 @@ MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, ) import tensorflow as tf @@ -278,7 +277,7 @@ def _extract_labels_precomputed_features( label_idx, sequence_to_sentence_features( label_example.get(attribute_feature_name) - ), + ).toarray(), ) for (label_idx, label_example) in label_examples ] @@ -345,9 +344,9 @@ def _create_session_data( X.append( sequence_to_sentence_features( e.get( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) - ) + ).toarray() ) 
label_ids.append(label_id_dict[e.get(attribute)]) @@ -619,9 +618,15 @@ def predict_label(self, message): else: # get features (bag of words) for a message # noinspection PyPep8Naming - X = sequence_to_sentence_features( - message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - ).reshape(1, -1) + X = ( + sequence_to_sentence_features( + message.get( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) + ) + .toarray() + .reshape(1, -1) + ) # load tf graph and session label_ids, message_sim = self._calculate_message_sim(X) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index 0b85154416ff..54e6f8bc9166 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -16,18 +16,18 @@ def sequence_to_sentence_features( if features is None: return None + if method != "mean": + raise ValueError(f"Provided method '{method}' is not supported.") + sparse_features = False if isinstance(features, scipy.sparse.spmatrix): features = features.toarray() sparse_features = True - if method == "mean": - sentence_features = np.mean(features, axis=0) - if sparse_features: - return scipy.sparse.csr_matrix(sentence_features) - return sentence_features - - raise ValueError(f"Provided method '{method}' is not supported.") + sentence_features = np.mean(features, axis=0) + if sparse_features: + return scipy.sparse.coo_matrix(sentence_features) + return sentence_features class Featurizer(Component): From bbf4d43dd95edf18b682ff941a75222da9968b15 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 25 Oct 2019 11:05:16 +0200 Subject: [PATCH 095/239] update sequence to sentence --- rasa/nlu/classifiers/embedding_intent_classifier.py | 11 +++++++---- rasa/nlu/featurizers/featurzier.py | 9 ++------- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 2eb18ac89b68..fb73664d9cda 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -275,9 +275,9 @@ def _extract_labels_precomputed_features( encoded_id_labels = [ ( label_idx, - sequence_to_sentence_features( - label_example.get(attribute_feature_name) - ).toarray(), + sequence_to_sentence_features(label_example.get(attribute_feature_name)) + .toarray() + .squeeze(), ) for (label_idx, label_example) in label_examples ] @@ -346,7 +346,9 @@ def _create_session_data( e.get( MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) - ).toarray() + ) + .toarray() + .squeeze() ) label_ids.append(label_id_dict[e.get(attribute)]) @@ -625,6 +627,7 @@ def predict_label(self, message): ) ) .toarray() + .squeeze() .reshape(1, -1) ) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index 54e6f8bc9166..2783e73f466c 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -19,15 +19,10 @@ def sequence_to_sentence_features( if method != "mean": raise ValueError(f"Provided method '{method}' is not supported.") - sparse_features = False if isinstance(features, scipy.sparse.spmatrix): - features = features.toarray() - sparse_features = True + return scipy.sparse.csr_matrix(features.mean(axis=0)) - sentence_features = np.mean(features, axis=0) - if sparse_features: - return scipy.sparse.coo_matrix(sentence_features) - return sentence_features + return np.mean(features, axis=0) class Featurizer(Component): From 
ecbf1575567538080a92e07a72242f06630d0c6f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 25 Oct 2019 11:22:47 +0200 Subject: [PATCH 096/239] update sequence to sentence --- rasa/nlu/featurizers/featurzier.py | 7 ++----- tests/nlu/featurizers/test_featurizer.py | 19 ++++++------------- 2 files changed, 8 insertions(+), 18 deletions(-) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index 2783e73f466c..a35cace93ff3 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -11,16 +11,13 @@ def sequence_to_sentence_features( - features: Union[np.ndarray, scipy.sparse.spmatrix], method: Text = "mean" + features: Union[np.ndarray, scipy.sparse.spmatrix] ) -> Optional[Union[np.ndarray, scipy.sparse.spmatrix]]: if features is None: return None - if method != "mean": - raise ValueError(f"Provided method '{method}' is not supported.") - if isinstance(features, scipy.sparse.spmatrix): - return scipy.sparse.csr_matrix(features.mean(axis=0)) + return scipy.sparse.csr_matrix(features.sum(axis=0)) return np.mean(features, axis=0) diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 3f3145280453..803c559d7433 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -51,27 +51,20 @@ def test_combine_with_existing_sparse_features(): @pytest.mark.parametrize( - "features, expected, method", + "features, expected", [ - ([[1, 0, 2, 3], [2, 0, 0, 1]], [1.5, 0, 1, 2], "mean"), + ([[1, 0, 2, 3], [2, 0, 0, 1]], [1.5, 0, 1, 2]), ( scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]), - scipy.sparse.csr_matrix([1.5, 0, 1, 2]), - "mean", + scipy.sparse.csr_matrix([3, 0, 2, 4]), ), - (None, None, "mean"), + (None, None), ], ) -def test_sequence_to_sentence_features(features, expected, method): - actual = sequence_to_sentence_features(features, method=method) +def test_sequence_to_sentence_features(features, expected): + actual = sequence_to_sentence_features(features) if isinstance(expected, scipy.sparse.spmatrix): assert np.all(expected.toarray() == actual.toarray()) else: assert np.all(expected == actual) - - -def test_sequence_to_sentence_features_raise_value_error(): - featuers = np.array([[1, 0, 2, 3], [2, 0, 0, 1]]) - with pytest.raises(ValueError): - sequence_to_sentence_features(featuers, method="sum") From 3cb90b71382feadd60ab3ef00886e1f8e1c2014c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 23 Oct 2019 15:16:13 +0200 Subject: [PATCH 097/239] update session data --- rasa/core/policies/embedding_policy.py | 6 +- rasa/core/test.py | 4 - .../embedding_intent_classifier.py | 81 +++++-- .../selectors/embedding_response_selector.py | 14 +- rasa/nlu/test.py | 2 +- rasa/utils/train_utils.py | 197 +++++++++++++----- 6 files changed, 224 insertions(+), 80 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index f81071a2c66a..603452027f0a 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -270,7 +270,7 @@ def _create_session_data( label_ids = None Y = None - return train_utils.SessionData(X=data_X, Y=Y, label_ids=label_ids) + return train_utils.SessionData(X_dense=data_X, Y=Y, labels=label_ids) def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": """Create embedding bot vector.""" @@ -370,7 +370,7 @@ def _create_tf_placeholders(self, session_data: "train_utils.SessionData") -> No dialogue_len = None # use dynamic time 
self.a_in = tf.placeholder( dtype=tf.float32, - shape=(None, dialogue_len, session_data.X.shape[-1]), + shape=(None, dialogue_len, session_data.X_dense.shape[-1]), name="a", ) self.b_in = tf.placeholder( @@ -535,7 +535,7 @@ def tf_feed_dict_for_prediction( data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) - return {self.a_in: session_data.X} + return {self.a_in: session_data.X_dense} def predict_action_probabilities( self, tracker: "DialogueStateTracker", domain: "Domain" diff --git a/rasa/core/test.py b/rasa/core/test.py index ef57b3645744..6bacf7684a84 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -16,10 +16,6 @@ if typing.TYPE_CHECKING: from rasa.core.agent import Agent -import matplotlib - -matplotlib.use("TkAgg") - logger = logging.getLogger(__name__) StoryEvalution = namedtuple( diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index ae0dd6745922..55c9f9204b0b 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -3,10 +3,12 @@ import os import pickle import typing +import scipy.sparse from typing import Any, Dict, List, Optional, Text, Tuple import warnings from rasa.nlu.featurizers.featurzier import sequence_to_sentence_features +from rasa.nlu.test import determine_token_labels from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component from rasa.utils import train_utils @@ -14,6 +16,9 @@ MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + MESSAGE_ENTITIES_ATTRIBUTE, + MESSAGE_TOKENS_NAMES, ) import tensorflow as tf @@ -326,41 +331,70 @@ def _create_encoded_label_ids( return encoded_id_labels + # training data helpers: + @staticmethod + def _create_tag_id_dict( + training_data: "TrainingData", attribute: Text + ) -> Dict[Text, int]: + """Create label_id dictionary""" + distinct_tag_ids = set( + [ + e["entity"] + for example in training_data.entity_examples + for e in example.get(attribute) + ] + ) - {None} + tag_id_dict = { + tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1) + } + tag_id_dict["O"] = 0 + return tag_id_dict + # noinspection PyPep8Naming def _create_session_data( self, training_data: "TrainingData", label_id_dict: Dict[Text, int], + tag_id_dict: Dict[Text, int], attribute: Text, ) -> "train_utils.SessionData": """Prepare data for training and create a SessionData object""" - - X = [] - label_ids = [] + X_sparse = [] + X_dense = [] Y = [] + labels = [] + tags = [] - for e in training_data.intent_examples: + for e in training_data.training_examples: if e.get(attribute): - X.append( - sequence_to_sentence_features( - e.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) - ) - .toarray() - .squeeze() + X_sparse.append( + e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + ) + X_dense.append( + e.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) ) - label_ids.append(label_id_dict[e.get(attribute)]) + # every example should have an intent + labels.append(label_id_dict[e.get(MESSAGE_INTENT_ATTRIBUTE)]) + + for e in training_data.training_examples: + _tags = [] + for t in e.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]): + _tag = determine_token_labels( + t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None + ) + _tags.append(tag_id_dict[_tag]) + tags.append(scipy.sparse.csr_matrix(np.array([_tags]).T)) 
- X = np.array(X) - label_ids = np.array(label_ids) + X_sparse = np.array(X_sparse) + X_dense = np.array(X_dense) + labels = np.array(labels) + tags = np.array(tags) - for label_id_idx in label_ids: + for label_id_idx in labels: Y.append(self._encoded_all_label_ids[label_id_idx]) - Y = np.array(Y) - return train_utils.SessionData(X=X, Y=Y, label_ids=label_ids) + return train_utils.SessionData(X_dense, X_sparse, Y, tags, labels) # tf helpers: def _create_tf_embed_fnn( @@ -483,6 +517,10 @@ def preprocess_train_data(self, training_data): label_id_dict = self._create_label_id_dict( training_data, attribute=MESSAGE_INTENT_ATTRIBUTE ) + tag_id_dict = self._create_tag_id_dict( + training_data, attribute=MESSAGE_ENTITIES_ATTRIBUTE + ) + self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()} self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} self._encoded_all_label_ids = self._create_encoded_label_ids( @@ -505,7 +543,10 @@ def preprocess_train_data(self, training_data): self.num_neg = min(self.num_neg, self._encoded_all_label_ids.shape[0] - 1) session_data = self._create_session_data( - training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE + training_data, + label_id_dict, + tag_id_dict, + attribute=MESSAGE_INTENT_ATTRIBUTE, ) self.check_input_dimension_consistency(session_data) @@ -514,7 +555,7 @@ def preprocess_train_data(self, training_data): def _check_enough_labels(self, session_data) -> bool: - return len(np.unique(session_data.label_ids)) >= 2 + return len(np.unique(session_data.labels)) >= 2 def train( self, diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py index c94be79d5676..3283249b1ce3 100644 --- a/rasa/nlu/selectors/embedding_response_selector.py +++ b/rasa/nlu/selectors/embedding_response_selector.py @@ -16,6 +16,7 @@ OPEN_UTTERANCE_RANKING_KEY, MESSAGE_SELECTOR_PROPERTY_NAME, DEFAULT_OPEN_UTTERANCE_TYPE, + MESSAGE_ENTITIES_ATTRIBUTE, ) logger = logging.getLogger(__name__) @@ -138,14 +139,18 @@ def _set_message_property( ) def preprocess_train_data(self, training_data): - """Performs sanity checks on training data, extracts encodings for labels and prepares data for training""" - + """Performs sanity checks on training data, extracts encodings for labels + and prepares data for training""" if self.retrieval_intent: training_data = training_data.filter_by_intent(self.retrieval_intent) label_id_dict = self._create_label_id_dict( training_data, attribute=MESSAGE_RESPONSE_ATTRIBUTE ) + tag_id_dict = self._create_tag_id_dict( + training_data, attribute=MESSAGE_ENTITIES_ATTRIBUTE + ) + self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()} self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} self._encoded_all_label_ids = self._create_encoded_label_ids( @@ -168,7 +173,10 @@ def preprocess_train_data(self, training_data): self.num_neg = min(self.num_neg, self._encoded_all_label_ids.shape[0] - 1) session_data = self._create_session_data( - training_data, label_id_dict, attribute=MESSAGE_RESPONSE_ATTRIBUTE + training_data, + label_id_dict, + tag_id_dict, + attribute=MESSAGE_RESPONSE_ATTRIBUTE, ) self.check_input_dimension_consistency(session_data) diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index 2990a4678548..82cf5a77167f 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -786,7 +786,7 @@ def determine_token_labels( entity type """ - if len(entities) == 0: + if entities is None or len(entities) == 0: return "O" if not do_extractors_support_overlap(extractors) 
and do_entities_overlap(entities): raise ValueError("The possible entities should not overlap") diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e3e21c22085d..8abe12e270c2 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -1,7 +1,19 @@ from collections import namedtuple import logging +import scipy.sparse import typing -from typing import List, Optional, Text, Dict, Tuple, Union, Generator, Callable, Any +from typing import ( + List, + Optional, + Text, + Dict, + Tuple, + Union, + Generator, + Callable, + Any, + NamedTuple, +) import numpy as np from tqdm import tqdm from sklearn.model_selection import train_test_split @@ -24,7 +36,12 @@ # namedtuple for all tf session related data -SessionData = namedtuple("SessionData", ("X", "Y", "label_ids")) +class SessionData(NamedTuple): + X_dense: Optional[np.ndarray] = None + X_sparse: Optional[np.ndarray] = None + Y: Optional[np.ndarray] = None + tags: Optional[np.ndarray] = None + labels: Optional[np.ndarray] = None def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]: @@ -42,10 +59,10 @@ def train_val_split( """Create random hold out validation set using stratified split.""" label_counts = dict( - zip(*np.unique(session_data.label_ids, return_counts=True, axis=0)) + zip(*np.unique(session_data.labels, return_counts=True, axis=0)) ) - if evaluate_on_num_examples >= len(session_data.X) - len(label_counts): + if evaluate_on_num_examples >= len(session_data.X_dense) - len(label_counts): raise ValueError( "Validation set of {} is too large. Remaining train set " "should be at least equal to number of classes {}." @@ -58,42 +75,75 @@ def train_val_split( "".format(evaluate_on_num_examples, len(label_counts)) ) - counts = np.array([label_counts[label] for label in session_data.label_ids]) + counts = np.array([label_counts[label] for label in session_data.labels]) - multi_X = session_data.X[counts > 1] + multi_X_dense = session_data.X_dense[counts > 1] + multi_X_sparse = session_data.X_sparse[counts > 1] multi_Y = session_data.Y[counts > 1] - multi_label_ids = session_data.label_ids[counts > 1] + multi_labels = session_data.labels[counts > 1] + multi_tags = session_data.tags[counts > 1] - solo_X = session_data.X[counts == 1] + solo_X_sparse = session_data.X_sparse[counts == 1] + solo_X_dense = session_data.X_dense[counts == 1] solo_Y = session_data.Y[counts == 1] - solo_label_ids = session_data.label_ids[counts == 1] + solo_labels = session_data.labels[counts == 1] + solo_tags = session_data.tags[counts == 1] - (X_train, X_val, Y_train, Y_val, label_ids_train, label_ids_val) = train_test_split( - multi_X, + ( + X_dense_train, + X_dense_val, + X_sparse_train, + X_sparse_val, + Y_train, + Y_val, + labels_train, + labels_val, + tags_train, + tags_val, + ) = train_test_split( + multi_X_dense, + multi_X_sparse, multi_Y, - multi_label_ids, + multi_labels, + multi_tags, test_size=evaluate_on_num_examples, random_state=random_seed, - stratify=multi_label_ids, + stratify=multi_labels, ) - X_train = np.concatenate([X_train, solo_X]) + X_dense_train = np.concatenate([X_dense_train, solo_X_dense]) + X_sparse_train = np.concatenate([X_sparse_train, solo_X_sparse]) Y_train = np.concatenate([Y_train, solo_Y]) - label_ids_train = np.concatenate([label_ids_train, solo_label_ids]) + labels_train = np.concatenate([labels_train, solo_labels]) + tags_train = np.concatenate([tags_train, solo_tags]) return ( - SessionData(X=X_train, Y=Y_train, label_ids=label_ids_train), - SessionData(X=X_val, 
Y=Y_val, label_ids=label_ids_val), + SessionData( + X_sparse=X_sparse_train, + X_dense=X_dense_train, + Y=Y_train, + labels=labels_train, + tags=tags_train, + ), + SessionData( + X_sparse=X_sparse_val, + X_dense=X_dense_val, + Y=Y_val, + labels=labels_val, + tags=tags_val, + ), ) def shuffle_session_data(session_data: "SessionData") -> "SessionData": """Shuffle session data.""" - ids = np.random.permutation(len(session_data.X)) + ids = np.random.permutation(len(session_data.X_dense)) return SessionData( - X=session_data.X[ids], + X_dense=session_data.X_dense[ids], + X_sparse=session_data.X_sparse[ids], Y=session_data.Y[ids], - label_ids=session_data.label_ids[ids], + labels=session_data.labels[ids], + tags=session_data.tags[ids], ) @@ -106,9 +156,11 @@ def split_session_data_by_label( for label_id in unique_label_ids: label_data.append( SessionData( - X=session_data.X[session_data.label_ids == label_id], - Y=session_data.Y[session_data.label_ids == label_id], - label_ids=session_data.label_ids[session_data.label_ids == label_id], + X_sparse=session_data.X_sparse[session_data.labels == label_id], + X_dense=session_data.X_dense[session_data.labels == label_id], + Y=session_data.Y[session_data.labels == label_id], + labels=session_data.labels[session_data.labels == label_id], + tags=session_data.tags[session_data.tags == label_id], ) ) return label_data @@ -125,9 +177,9 @@ def balance_session_data( that more populated classes should appear more often. """ - num_examples = len(session_data.X) + num_examples = len(session_data.X_dense) unique_label_ids, counts_label_ids = np.unique( - session_data.label_ids, return_counts=True, axis=0 + session_data.labels, return_counts=True, axis=0 ) num_label_ids = len(unique_label_ids) @@ -137,9 +189,13 @@ def balance_session_data( data_idx = [0] * num_label_ids num_data_cycles = [0] * num_label_ids skipped = [False] * num_label_ids - new_X = [] + + new_X_sparse = [] + new_X_dense = [] new_Y = [] - new_label_ids = [] + new_labels = [] + new_tags = [] + while min(num_data_cycles) == 0: if shuffle: indices_of_labels = np.random.permutation(num_label_ids) @@ -157,8 +213,13 @@ def balance_session_data( int(counts_label_ids[index] / num_examples * batch_size) + 1 ) - new_X.append( - label_data[index].X[ + new_X_dense.append( + label_data[index].X_dense[ + data_idx[index] : data_idx[index] + index_batch_size + ] + ) + new_X_sparse.append( + label_data[index].X_sparse[ data_idx[index] : data_idx[index] + index_batch_size ] ) @@ -167,8 +228,13 @@ def balance_session_data( data_idx[index] : data_idx[index] + index_batch_size ] ) - new_label_ids.append( - label_data[index].label_ids[ + new_tags.append( + label_data[index].tags[ + data_idx[index] : data_idx[index] + index_batch_size + ] + ) + new_labels.append( + label_data[index].labels[ data_idx[index] : data_idx[index] + index_batch_size ] ) @@ -182,9 +248,11 @@ def balance_session_data( break return SessionData( - X=np.concatenate(new_X), + X_dense=np.concatenate(new_X_dense), + X_sparse=np.concatenate(new_X_sparse), Y=np.concatenate(new_Y), - label_ids=np.concatenate(new_label_ids), + tags=np.concatenate(new_tags), + labels=np.concatenate(new_labels), ) @@ -202,15 +270,41 @@ def gen_batch( if batch_strategy == "balanced": session_data = balance_session_data(session_data, batch_size, shuffle) - num_batches = session_data.X.shape[0] // batch_size + int( - session_data.X.shape[0] % batch_size > 0 + num_batches = session_data.X_sparse.shape[0] // batch_size + int( + session_data.X_sparse.shape[0] % batch_size > 0 ) 
for batch_num in range(num_batches): - batch_x = session_data.X[batch_num * batch_size : (batch_num + 1) * batch_size] - batch_y = session_data.Y[batch_num * batch_size : (batch_num + 1) * batch_size] + start = batch_num * batch_size + end = (batch_num + 1) * batch_size + + batch_x_sparse = convert_sparse_to_dense(session_data.X_sparse[start:end]) + batch_x_dense = convert_sparse_to_dense( + session_data.X_dense[start:end], init_with_zero=True + ) + batch_y = convert_sparse_to_dense(session_data.Y[start:end]) + batch_tags = convert_sparse_to_dense(session_data.tags[start:end]) - yield batch_x, batch_y + yield batch_x_sparse, batch_x_dense, batch_y, batch_tags + + +def convert_sparse_to_dense( + data_sparse: Union[np.ndarray, List[scipy.sparse.csr_matrix]], + init_with_zero: bool = False, +): + data_size = len(data_sparse) + max_seq_len = max([x.shape[0] for x in data_sparse]) + feature_len = max([x.shape[-1] for x in data_sparse]) + + if init_with_zero: + data_dense = np.zeros([data_size, max_seq_len, feature_len], dtype=np.float) + else: + data_dense = np.ones([data_size, max_seq_len, feature_len], dtype=np.float) * -1 + + for i in range(data_size): + data_dense[i, : data_sparse[i].shape[0], :] = data_sparse[i].toarray() + + return data_dense # noinspection PyPep8Naming @@ -223,26 +317,31 @@ def create_tf_dataset( """Create tf dataset.""" # set batch and sequence length to None - if session_data.X[0].ndim == 1: - shape_X = (None, session_data.X[0].shape[-1]) - else: - shape_X = (None, None, session_data.X[0].shape[-1]) - - if session_data.Y[0].ndim == 1: - shape_Y = (None, session_data.Y[0].shape[-1]) - else: - shape_Y = (None, None, session_data.Y[0].shape[-1]) + shape_X_dense = _get_shape(session_data.X_dense) + shape_X_sparse = _get_shape(session_data.X_sparse) + shape_Y = _get_shape(session_data.Y) + shape_tags = _get_shape(session_data.tags) return tf.data.Dataset.from_generator( lambda batch_size_: gen_batch( session_data, batch_size_, batch_strategy, shuffle ), - output_types=(tf.float32, tf.float32), - output_shapes=(shape_X, shape_Y), + output_types=(tf.float32, tf.float32, tf.float32, tf.float32), + output_shapes=(shape_X_sparse, shape_X_dense, shape_Y, shape_tags), args=([batch_size]), ) +def _get_shape(data: Union[np.ndarray, List[scipy.sparse.csr_matrix]]) -> Tuple: + if data is None: + return () + + if data[0].ndim == 1: + return None, data[0].shape[-1] + + return None, None, data[0].shape[-1] + + def create_iterator_init_datasets( session_data: "SessionData", eval_session_data: "SessionData", From 3faf171f7352337cfca57216919be845c7cdd4db Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 28 Oct 2019 10:04:06 +0100 Subject: [PATCH 098/239] use dict for session data. 
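Note: this commit replaces the fixed ``X_dense``/``X_sparse``/``Y``/``tags``/``labels`` fields of ``SessionData`` with dictionaries, so components can register any number of named feature arrays. A compact illustration of the resulting layout (the keys mirror the ones used in the diff below; the arrays are made-up example data):

    import numpy as np
    from typing import Dict, NamedTuple, Text

    class SessionData(NamedTuple):
        X: Dict[Text, np.ndarray]
        Y: Dict[Text, np.ndarray]
        labels: Dict[Text, np.ndarray]

    # e.g. the EmbeddingPolicy stores a single dense array under "X",
    # while the intent classifier can keep "sparse" and "dense" features side by side.
    session_data = SessionData(
        X={"X": np.random.rand(5, 10)},
        Y={"Y": np.random.randint(2, size=(5, 3))},
        labels={"labels": np.random.randint(3, size=(5,))},
    )

    # Helpers such as shuffling or filtering by ids then only need to apply the
    # same index array to every dict value instead of naming each field explicitly.
    ids = np.random.permutation(5)
    shuffled = SessionData(
        X={k: v[ids] for k, v in session_data.X.items()},
        Y={k: v[ids] for k, v in session_data.Y.items()},
        labels={k: v[ids] for k, v in session_data.labels.items()},
    )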
--- rasa/core/policies/embedding_policy.py | 4 +- .../embedding_intent_classifier.py | 6 +- rasa/utils/train_utils.py | 296 +++++++++--------- tests/utils/test_train_utils.py | 77 +++++ 4 files changed, 237 insertions(+), 146 deletions(-) create mode 100644 tests/utils/test_train_utils.py diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 603452027f0a..6e1f6b2c4823 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -270,7 +270,9 @@ def _create_session_data( label_ids = None Y = None - return train_utils.SessionData(X_dense=data_X, Y=Y, labels=label_ids) + return train_utils.SessionData( + X={"X": data_X}, Y={"Y": Y}, labels={"labels": label_ids} + ) def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": """Create embedding bot vector.""" diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 55c9f9204b0b..f32af40d64ff 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -394,7 +394,11 @@ def _create_session_data( Y.append(self._encoded_all_label_ids[label_id_idx]) Y = np.array(Y) - return train_utils.SessionData(X_dense, X_sparse, Y, tags, labels) + return train_utils.SessionData( + {"dense": X_dense, "sparse": X_sparse}, + {"Y": Y}, + {"tags": tags, "labels": labels}, + ) # tf helpers: def _create_tf_embed_fnn( diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 8abe12e270c2..a10ec5f571e8 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -1,4 +1,4 @@ -from collections import namedtuple +from collections import defaultdict import logging import scipy.sparse import typing @@ -37,11 +37,9 @@ # namedtuple for all tf session related data class SessionData(NamedTuple): - X_dense: Optional[np.ndarray] = None - X_sparse: Optional[np.ndarray] = None - Y: Optional[np.ndarray] = None - tags: Optional[np.ndarray] = None - labels: Optional[np.ndarray] = None + X: Dict[Text, np.ndarray] + Y: Dict[Text, np.ndarray] + labels: Dict[Text, np.ndarray] def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]: @@ -54,121 +52,126 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto # noinspection PyPep8Naming def train_val_split( - session_data: "SessionData", evaluate_on_num_examples: int, random_seed: int + session_data: "SessionData", + evaluate_on_num_examples: int, + random_seed: int, + label_key: Text = "labels", ) -> Tuple["SessionData", "SessionData"]: """Create random hold out validation set using stratified split.""" label_counts = dict( - zip(*np.unique(session_data.labels, return_counts=True, axis=0)) + zip(*np.unique(session_data.labels[label_key], return_counts=True, axis=0)) ) - if evaluate_on_num_examples >= len(session_data.X_dense) - len(label_counts): + num_examples = get_number_of_examples(session_data) + if evaluate_on_num_examples >= num_examples - len(label_counts): raise ValueError( - "Validation set of {} is too large. Remaining train set " - "should be at least equal to number of classes {}." - "".format(evaluate_on_num_examples, len(label_counts)) + f"Validation set of {evaluate_on_num_examples} is too large. Remaining train set " + "should be at least equal to number of classes {len(label_counts)}." ) elif evaluate_on_num_examples < len(label_counts): raise ValueError( - "Validation set of {} is too small. 
It should be " - "at least equal to number of classes {}." - "".format(evaluate_on_num_examples, len(label_counts)) + f"Validation set of {evaluate_on_num_examples} is too small. It should be " + "at least equal to number of classes {label_counts}." ) - counts = np.array([label_counts[label] for label in session_data.labels]) + counts = np.array([label_counts[label] for label in session_data.labels[label_key]]) - multi_X_dense = session_data.X_dense[counts > 1] - multi_X_sparse = session_data.X_sparse[counts > 1] - multi_Y = session_data.Y[counts > 1] - multi_labels = session_data.labels[counts > 1] - multi_tags = session_data.tags[counts > 1] + multi_values = [] + [multi_values.append(v[counts > 1]) for k, v in session_data.X.items()] + [multi_values.append(v[counts > 1]) for k, v in session_data.Y.items()] + [multi_values.append(v[counts > 1]) for k, v in session_data.labels.items()] - solo_X_sparse = session_data.X_sparse[counts == 1] - solo_X_dense = session_data.X_dense[counts == 1] - solo_Y = session_data.Y[counts == 1] - solo_labels = session_data.labels[counts == 1] - solo_tags = session_data.tags[counts == 1] + solo_values = [] + [solo_values.append(v[counts == 1]) for k, v in session_data.X.items()] + [solo_values.append(v[counts == 1]) for k, v in session_data.Y.items()] + [solo_values.append(v[counts == 1]) for k, v in session_data.labels.items()] - ( - X_dense_train, - X_dense_val, - X_sparse_train, - X_sparse_val, - Y_train, - Y_val, - labels_train, - labels_val, - tags_train, - tags_val, - ) = train_test_split( - multi_X_dense, - multi_X_sparse, - multi_Y, - multi_labels, - multi_tags, + keys = [k for d in session_data for k, v in d.items()] + + output_values = train_test_split( + *multi_values, test_size=evaluate_on_num_examples, random_state=random_seed, - stratify=multi_labels, + stratify=session_data.labels[label_key][counts > 1], ) - X_dense_train = np.concatenate([X_dense_train, solo_X_dense]) - X_sparse_train = np.concatenate([X_sparse_train, solo_X_sparse]) - Y_train = np.concatenate([Y_train, solo_Y]) - labels_train = np.concatenate([labels_train, solo_labels]) - tags_train = np.concatenate([tags_train, solo_tags]) + + X_train = {} + Y_train = {} + labels_train = {} + X_val = {} + Y_val = {} + labels_val = {} + + # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc. 
+ # order is kept, so first session_data.X values, then session_data.Y values, and + # finally session_data.labels values + for i in range(len(session_data.X)): + X_train[keys[i]] = np.concatenate([output_values[i * 2], solo_values[i]]) + + for i in range(len(session_data.X), len(session_data.X) + len(session_data.Y)): + Y_train[keys[i]] = np.concatenate([output_values[i * 2], solo_values[i]]) + + for i in range( + len(session_data.X) + len(session_data.Y), + len(session_data.X) + len(session_data.Y) + len(session_data.labels), + ): + labels_train[keys[i]] = np.concatenate([output_values[i * 2], solo_values[i]]) + + for i in range(len(session_data.X)): + X_val[keys[i]] = np.concatenate([output_values[(i * 2) + 1], solo_values[i]]) + + for i in range(len(session_data.X), len(session_data.X) + len(session_data.Y)): + Y_val[keys[i]] = np.concatenate([output_values[(i * 2) + 1], solo_values[i]]) + + for i in range( + len(session_data.X) + len(session_data.Y), + len(session_data.X) + len(session_data.Y) + len(session_data.labels), + ): + labels_val[keys[i]] = np.concatenate( + [output_values[(i * 2) + 1], solo_values[i]] + ) return ( - SessionData( - X_sparse=X_sparse_train, - X_dense=X_dense_train, - Y=Y_train, - labels=labels_train, - tags=tags_train, - ), - SessionData( - X_sparse=X_sparse_val, - X_dense=X_dense_val, - Y=Y_val, - labels=labels_val, - tags=tags_val, - ), + SessionData(X_train, Y_train, labels_train), + SessionData(X_val, Y_val, labels_val), ) def shuffle_session_data(session_data: "SessionData") -> "SessionData": """Shuffle session data.""" + data_points = get_number_of_examples(session_data) + ids = np.random.permutation(data_points) + return session_data_for_ids(session_data, ids) - ids = np.random.permutation(len(session_data.X_dense)) - return SessionData( - X_dense=session_data.X_dense[ids], - X_sparse=session_data.X_sparse[ids], - Y=session_data.Y[ids], - labels=session_data.labels[ids], - tags=session_data.tags[ids], - ) + +def session_data_for_ids(session_data: SessionData, ids: np.ndarray): + """Filter session data by ids.""" + X = {k: v[ids] for k, v in session_data.X.items()} + Y = {k: v[ids] for k, v in session_data.Y.items()} + labels = {k: v[ids] for k, v in session_data.labels.items()} + + return SessionData(X, Y, labels) def split_session_data_by_label( - session_data: "SessionData", unique_label_ids: "np.ndarray" + session_data: "SessionData", label_key: Text, unique_label_ids: "np.ndarray" ) -> List["SessionData"]: """Reorganize session data into a list of session data with the same labels.""" label_data = [] for label_id in unique_label_ids: - label_data.append( - SessionData( - X_sparse=session_data.X_sparse[session_data.labels == label_id], - X_dense=session_data.X_dense[session_data.labels == label_id], - Y=session_data.Y[session_data.labels == label_id], - labels=session_data.labels[session_data.labels == label_id], - tags=session_data.tags[session_data.tags == label_id], - ) - ) + ids = session_data.labels[label_key] == label_id + label_data.append(session_data_for_ids(session_data, ids)) return label_data # noinspection PyPep8Naming def balance_session_data( - session_data: "SessionData", batch_size: int, shuffle: bool + session_data: "SessionData", + batch_size: int, + shuffle: bool, + label_key: Text = "labels", ) -> "SessionData": """Mix session data to account for class imbalance. @@ -176,25 +179,30 @@ def balance_session_data( by repeating them. 
Mimics stratified batching, but also takes into account that more populated classes should appear more often. """ + example_lengths = [len(x) for x in session_data.X.values()] + + if not all(l == example_lengths[0] for l in example_lengths): + raise ValueError("Number of examples in X differ.") + + if label_key not in session_data.labels: + raise ValueError(f"{label_key} not in SessionData.labels.") - num_examples = len(session_data.X_dense) + num_examples = example_lengths[0] unique_label_ids, counts_label_ids = np.unique( - session_data.labels, return_counts=True, axis=0 + session_data.labels[label_key], return_counts=True, axis=0 ) num_label_ids = len(unique_label_ids) # need to call every time, so that the data is shuffled inside each class - label_data = split_session_data_by_label(session_data, unique_label_ids) + label_data = split_session_data_by_label(session_data, label_key, unique_label_ids) data_idx = [0] * num_label_ids num_data_cycles = [0] * num_label_ids skipped = [False] * num_label_ids - new_X_sparse = [] - new_X_dense = [] - new_Y = [] - new_labels = [] - new_tags = [] + new_X = defaultdict(list) + new_Y = defaultdict(list) + new_labels = defaultdict(list) while min(num_data_cycles) == 0: if shuffle: @@ -213,31 +221,14 @@ def balance_session_data( int(counts_label_ids[index] / num_examples * batch_size) + 1 ) - new_X_dense.append( - label_data[index].X_dense[ - data_idx[index] : data_idx[index] + index_batch_size - ] - ) - new_X_sparse.append( - label_data[index].X_sparse[ - data_idx[index] : data_idx[index] + index_batch_size - ] - ) - new_Y.append( - label_data[index].Y[ - data_idx[index] : data_idx[index] + index_batch_size - ] - ) - new_tags.append( - label_data[index].tags[ - data_idx[index] : data_idx[index] + index_batch_size - ] - ) - new_labels.append( - label_data[index].labels[ - data_idx[index] : data_idx[index] + index_batch_size - ] - ) + for k, v in label_data[index].X.items(): + new_X[k].append(v[data_idx[index] : data_idx[index] + index_batch_size]) + for k, v in label_data[index].Y.items(): + new_Y[k].append(v[data_idx[index] : data_idx[index] + index_batch_size]) + for k, v in label_data[index].labels.items(): + new_labels[k].append( + v[data_idx[index] : data_idx[index] + index_batch_size] + ) data_idx[index] += index_batch_size if data_idx[index] >= counts_label_ids[index]: @@ -247,13 +238,24 @@ def balance_session_data( if min(num_data_cycles) > 0: break - return SessionData( - X_dense=np.concatenate(new_X_dense), - X_sparse=np.concatenate(new_X_sparse), - Y=np.concatenate(new_Y), - tags=np.concatenate(new_tags), - labels=np.concatenate(new_labels), - ) + new_X = {k: np.concatenate(v) for k, v in new_X.items()} + new_Y = {k: np.concatenate(v) for k, v in new_Y.items()} + new_labels = {k: np.concatenate(v) for k, v in new_labels.items()} + + return SessionData(X=new_X, Y=new_Y, labels=new_labels) + + +def get_number_of_examples(session_data: SessionData): + example_lengths = [len(v) for v in session_data.X.values()] + + # check if number of examples is the same for all X + if len(set(example_lengths)) != 1: + raise ValueError( + f"Number of examples differs for X ({session_data.X.keys()}). There should " + f"be the same." 
+ ) + + return example_lengths[0] def gen_batch( @@ -270,22 +272,22 @@ def gen_batch( if batch_strategy == "balanced": session_data = balance_session_data(session_data, batch_size, shuffle) - num_batches = session_data.X_sparse.shape[0] // batch_size + int( - session_data.X_sparse.shape[0] % batch_size > 0 - ) + num_examples = get_number_of_examples(session_data) + num_batches = num_examples // batch_size + int(num_examples % batch_size > 0) for batch_num in range(num_batches): start = batch_num * batch_size end = (batch_num + 1) * batch_size - batch_x_sparse = convert_sparse_to_dense(session_data.X_sparse[start:end]) - batch_x_dense = convert_sparse_to_dense( - session_data.X_dense[start:end], init_with_zero=True - ) - batch_y = convert_sparse_to_dense(session_data.Y[start:end]) - batch_tags = convert_sparse_to_dense(session_data.tags[start:end]) + batch_data = [] + for v in session_data.X.values(): + batch_data.append(convert_sparse_to_dense(v[start:end])) + for v in session_data.Y.values(): + batch_data.append(convert_sparse_to_dense(v[start:end])) + for v in session_data.labels.values(): + batch_data.append(convert_sparse_to_dense(v[start:end])) - yield batch_x_sparse, batch_x_dense, batch_y, batch_tags + yield tuple(batch_data) def convert_sparse_to_dense( @@ -317,29 +319,35 @@ def create_tf_dataset( """Create tf dataset.""" # set batch and sequence length to None - shape_X_dense = _get_shape(session_data.X_dense) - shape_X_sparse = _get_shape(session_data.X_sparse) - shape_Y = _get_shape(session_data.Y) - shape_tags = _get_shape(session_data.tags) + shapes = _get_shape(session_data) + types = tuple([np.float32] * len(shapes)) return tf.data.Dataset.from_generator( lambda batch_size_: gen_batch( session_data, batch_size_, batch_strategy, shuffle ), - output_types=(tf.float32, tf.float32, tf.float32, tf.float32), - output_shapes=(shape_X_sparse, shape_X_dense, shape_Y, shape_tags), + output_types=types, + output_shapes=shapes, args=([batch_size]), ) -def _get_shape(data: Union[np.ndarray, List[scipy.sparse.csr_matrix]]) -> Tuple: - if data is None: - return () +def _get_shape(session_data: SessionData) -> Tuple: + shapes = [] + + def append_shape(v: Union[np.ndarray, scipy.sparse.spmatrix]): + if v[0].ndim == 1: + shapes.append((None, v[0].shape[-1])) + shapes.append((None, None, v[0].shape[-1])) - if data[0].ndim == 1: - return None, data[0].shape[-1] + for v in session_data.X.values(): + append_shape(v) + for v in session_data.Y.values(): + append_shape(v) + for v in session_data.labels.values(): + append_shape(v) - return None, None, data[0].shape[-1] + return tuple(shapes) def create_iterator_init_datasets( diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py new file mode 100644 index 000000000000..4a6c3336fea5 --- /dev/null +++ b/tests/utils/test_train_utils.py @@ -0,0 +1,77 @@ +import pytest +import numpy as np + +from rasa.utils.train_utils import ( + SessionData, + shuffle_session_data, + split_session_data_by_label, + train_val_split, + session_data_for_ids, +) + + +@pytest.fixture +async def session_data() -> SessionData: + return SessionData( + X={ + "sparse": np.random.rand(5, 10), + "dense": np.random.randint(5, size=(5, 10)), + }, + Y={"Y": np.random.randint(2, size=(5, 10))}, + labels={ + "tags": np.random.randint(2, size=(5, 10)), + "labels": np.random.randint(2, size=(5)), + }, + ) + + +def test_shuffle_session_data(session_data: SessionData): + shuffeled_session_data = shuffle_session_data(session_data) + + assert 
np.array(shuffeled_session_data.X.values()) != np.array( + session_data.X.values() + ) + assert np.array(shuffeled_session_data.Y.values()) != np.array( + session_data.Y.values() + ) + assert np.array(shuffeled_session_data.labels.values()) != np.array( + session_data.labels.values() + ) + + +def test_split_session_data_by_label(session_data: SessionData): + split_session_data = split_session_data_by_label( + session_data, "labels", np.array([1, 2, 3, 4, 5]) + ) + + assert len(split_session_data) == 5 + for s in split_session_data: + assert len(set(s.labels["labels"])) <= 1 + + +def test_train_val_split(session_data: SessionData): + train_session_data, val_session_data = train_val_split( + session_data, 2, 42, "labels" + ) + + for v in train_session_data.X.values(): + assert len(v) == 3 + + for v in val_session_data.X.values(): + assert len(v) == 2 + + +def test_session_data_for_ids(session_data: SessionData): + filtered_session_data = session_data_for_ids(session_data, np.array([0, 1])) + + for v in filtered_session_data.X.values(): + assert len(v) == 2 + + k = list(session_data.X.keys())[0] + + assert np.all( + np.array(filtered_session_data.X[k][0]) == np.array(session_data.X[k][0]) + ) + assert np.all( + np.array(filtered_session_data.X[k][1]) == np.array(session_data.X[k][1]) + ) From 834d265a2343ebe53a117c6762cb87b2cf135d9c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 28 Oct 2019 14:24:11 +0100 Subject: [PATCH 099/239] adapt classifiers --- rasa/core/policies/embedding_policy.py | 6 +-- rasa/core/test.py | 5 ++ .../embedding_intent_classifier.py | 53 +++---------------- .../selectors/embedding_response_selector.py | 19 +------ 4 files changed, 16 insertions(+), 67 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 6e1f6b2c4823..5cf5215cbf90 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -372,12 +372,12 @@ def _create_tf_placeholders(self, session_data: "train_utils.SessionData") -> No dialogue_len = None # use dynamic time self.a_in = tf.placeholder( dtype=tf.float32, - shape=(None, dialogue_len, session_data.X_dense.shape[-1]), + shape=(None, dialogue_len, session_data.X["X"].shape[-1]), name="a", ) self.b_in = tf.placeholder( dtype=tf.float32, - shape=(None, dialogue_len, None, session_data.Y.shape[-1]), + shape=(None, dialogue_len, None, session_data.Y["Y"].shape[-1]), name="b", ) @@ -537,7 +537,7 @@ def tf_feed_dict_for_prediction( data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) - return {self.a_in: session_data.X_dense} + return {self.a_in: session_data.X["X"]} def predict_action_probabilities( self, tracker: "DialogueStateTracker", domain: "Domain" diff --git a/rasa/core/test.py b/rasa/core/test.py index 6bacf7684a84..e1ce0ec71ea2 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -16,6 +16,11 @@ if typing.TYPE_CHECKING: from rasa.core.agent import Agent +import matplotlib + +matplotlib.use("TkAgg") + + logger = logging.getLogger(__name__) StoryEvalution = namedtuple( diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index f32af40d64ff..f36927d78b23 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -331,31 +331,11 @@ def _create_encoded_label_ids( return encoded_id_labels - # training data helpers: - @staticmethod - def _create_tag_id_dict( - 
training_data: "TrainingData", attribute: Text - ) -> Dict[Text, int]: - """Create label_id dictionary""" - distinct_tag_ids = set( - [ - e["entity"] - for example in training_data.entity_examples - for e in example.get(attribute) - ] - ) - {None} - tag_id_dict = { - tag_id: idx for idx, tag_id in enumerate(sorted(distinct_tag_ids), 1) - } - tag_id_dict["O"] = 0 - return tag_id_dict - # noinspection PyPep8Naming def _create_session_data( self, training_data: "TrainingData", label_id_dict: Dict[Text, int], - tag_id_dict: Dict[Text, int], attribute: Text, ) -> "train_utils.SessionData": """Prepare data for training and create a SessionData object""" @@ -363,7 +343,6 @@ def _create_session_data( X_dense = [] Y = [] labels = [] - tags = [] for e in training_data.training_examples: if e.get(attribute): @@ -376,28 +355,16 @@ def _create_session_data( # every example should have an intent labels.append(label_id_dict[e.get(MESSAGE_INTENT_ATTRIBUTE)]) - for e in training_data.training_examples: - _tags = [] - for t in e.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]): - _tag = determine_token_labels( - t, e.get(MESSAGE_ENTITIES_ATTRIBUTE), None - ) - _tags.append(tag_id_dict[_tag]) - tags.append(scipy.sparse.csr_matrix(np.array([_tags]).T)) - X_sparse = np.array(X_sparse) X_dense = np.array(X_dense) labels = np.array(labels) - tags = np.array(tags) for label_id_idx in labels: Y.append(self._encoded_all_label_ids[label_id_idx]) Y = np.array(Y) return train_utils.SessionData( - {"dense": X_dense, "sparse": X_sparse}, - {"Y": Y}, - {"tags": tags, "labels": labels}, + {"dense": X_dense, "sparse": X_sparse}, {"Y": Y}, {"labels": labels} ) # tf helpers: @@ -473,10 +440,10 @@ def _build_tf_pred_graph( self, session_data: "train_utils.SessionData" ) -> "tf.Tensor": self.a_in = tf.placeholder( - tf.float32, (None, session_data.X.shape[-1]), name="a" + tf.float32, (None, session_data.X["sparse"].shape[-1]), name="a" ) self.b_in = tf.placeholder( - tf.float32, (None, None, session_data.Y.shape[-1]), name="b" + tf.float32, (None, None, session_data.Y["Y"].shape[-1]), name="b" ) self.message_embed = self._create_tf_embed_fnn( @@ -508,7 +475,7 @@ def _build_tf_pred_graph( def check_input_dimension_consistency(self, session_data): if self.share_hidden_layers: - if session_data.X[0].shape[-1] != session_data.Y[0].shape[-1]: + if session_data.X["sparse"].shape[-1] != session_data.Y["Y"].shape[-1]: raise ValueError( "If embeddings are shared " "text features and label features " @@ -521,10 +488,6 @@ def preprocess_train_data(self, training_data): label_id_dict = self._create_label_id_dict( training_data, attribute=MESSAGE_INTENT_ATTRIBUTE ) - tag_id_dict = self._create_tag_id_dict( - training_data, attribute=MESSAGE_ENTITIES_ATTRIBUTE - ) - self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()} self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} self._encoded_all_label_ids = self._create_encoded_label_ids( @@ -547,10 +510,7 @@ def preprocess_train_data(self, training_data): self.num_neg = min(self.num_neg, self._encoded_all_label_ids.shape[0] - 1) session_data = self._create_session_data( - training_data, - label_id_dict, - tag_id_dict, - attribute=MESSAGE_INTENT_ATTRIBUTE, + training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE ) self.check_input_dimension_consistency(session_data) @@ -558,8 +518,7 @@ def preprocess_train_data(self, training_data): return session_data def _check_enough_labels(self, session_data) -> bool: - - return len(np.unique(session_data.labels)) >= 2 + 
return len(np.unique(session_data.labels["labels"])) >= 2 def train( self, diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py index 3283249b1ce3..271c32c89b80 100644 --- a/rasa/nlu/selectors/embedding_response_selector.py +++ b/rasa/nlu/selectors/embedding_response_selector.py @@ -1,22 +1,14 @@ import logging import typing -from typing import Any, Dict, Optional, Text +from typing import Any, Dict, Text from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier -from rasa.core.actions.action import RESPOND_PREFIX from rasa.nlu.constants import ( MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, - MESSAGE_ATTRIBUTES, - MESSAGE_SPACY_FEATURES_NAMES, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - OPEN_UTTERANCE_PREDICTION_KEY, - OPEN_UTTERANCE_RANKING_KEY, MESSAGE_SELECTOR_PROPERTY_NAME, DEFAULT_OPEN_UTTERANCE_TYPE, - MESSAGE_ENTITIES_ATTRIBUTE, ) logger = logging.getLogger(__name__) @@ -147,10 +139,6 @@ def preprocess_train_data(self, training_data): label_id_dict = self._create_label_id_dict( training_data, attribute=MESSAGE_RESPONSE_ATTRIBUTE ) - tag_id_dict = self._create_tag_id_dict( - training_data, attribute=MESSAGE_ENTITIES_ATTRIBUTE - ) - self.inverted_tag_dict = {v: k for k, v in tag_id_dict.items()} self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} self._encoded_all_label_ids = self._create_encoded_label_ids( @@ -173,10 +161,7 @@ def preprocess_train_data(self, training_data): self.num_neg = min(self.num_neg, self._encoded_all_label_ids.shape[0] - 1) session_data = self._create_session_data( - training_data, - label_id_dict, - tag_id_dict, - attribute=MESSAGE_RESPONSE_ATTRIBUTE, + training_data, label_id_dict, attribute=MESSAGE_RESPONSE_ATTRIBUTE ) self.check_input_dimension_consistency(session_data) From 3f0750b4d0495d85932168339773f9fb9a8662ae Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 28 Oct 2019 15:21:09 +0100 Subject: [PATCH 100/239] fix classifier --- CHANGELOG.rst | 1 + .../embedding_intent_classifier.py | 29 ++++++++------- rasa/utils/train_utils.py | 36 ++++++++----------- 3 files changed, 29 insertions(+), 37 deletions(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 046a45e1863d..91feb1b406ea 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -22,6 +22,7 @@ Changed - All featurizers in ``rasa.nlu`` return a sequence - Renamed the feature name ``ner_features`` to ``text_dense_features`` in ``CRFEntityExtractor``. The ``text_dense_features`` are created by any dense featurizer. 
+- Values of ``SessionData`` are dictionaries instead of ``np.ndarray`` Removed ------- diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index f36927d78b23..745ea8900159 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -339,33 +339,32 @@ def _create_session_data( attribute: Text, ) -> "train_utils.SessionData": """Prepare data for training and create a SessionData object""" - X_sparse = [] - X_dense = [] + X = [] Y = [] labels = [] for e in training_data.training_examples: if e.get(attribute): - X_sparse.append( - e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - ) - X_dense.append( - e.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + X.append( + sequence_to_sentence_features( + e.get( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) + ) + .toarray() + .squeeze() ) # every example should have an intent labels.append(label_id_dict[e.get(MESSAGE_INTENT_ATTRIBUTE)]) - X_sparse = np.array(X_sparse) - X_dense = np.array(X_dense) + X = np.array(X) labels = np.array(labels) for label_id_idx in labels: Y.append(self._encoded_all_label_ids[label_id_idx]) Y = np.array(Y) - return train_utils.SessionData( - {"dense": X_dense, "sparse": X_sparse}, {"Y": Y}, {"labels": labels} - ) + return train_utils.SessionData({"X": X}, {"Y": Y}, {"labels": labels}) # tf helpers: def _create_tf_embed_fnn( @@ -394,7 +393,7 @@ def _create_tf_embed_fnn( ) def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: - self.a_in, self.b_in = self._iterator.get_next() + self.a_in, self.b_in, _ = self._iterator.get_next() all_label_ids = tf.constant( self._encoded_all_label_ids, dtype=tf.float32, name="all_label_ids" @@ -440,7 +439,7 @@ def _build_tf_pred_graph( self, session_data: "train_utils.SessionData" ) -> "tf.Tensor": self.a_in = tf.placeholder( - tf.float32, (None, session_data.X["sparse"].shape[-1]), name="a" + tf.float32, (None, session_data.X["X"].shape[-1]), name="a" ) self.b_in = tf.placeholder( tf.float32, (None, None, session_data.Y["Y"].shape[-1]), name="b" @@ -475,7 +474,7 @@ def _build_tf_pred_graph( def check_input_dimension_consistency(self, session_data): if self.share_hidden_layers: - if session_data.X["sparse"].shape[-1] != session_data.Y["Y"].shape[-1]: + if session_data.X["X"].shape[-1] != session_data.Y["Y"].shape[-1]: raise ValueError( "If embeddings are shared " "text features and label features " diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index a10ec5f571e8..ca580f9c570b 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -281,32 +281,21 @@ def gen_batch( batch_data = [] for v in session_data.X.values(): - batch_data.append(convert_sparse_to_dense(v[start:end])) + batch_data.append(convert_to_batch(v[start:end])) for v in session_data.Y.values(): - batch_data.append(convert_sparse_to_dense(v[start:end])) + batch_data.append(convert_to_batch(v[start:end])) for v in session_data.labels.values(): - batch_data.append(convert_sparse_to_dense(v[start:end])) + batch_data.append(convert_to_batch(v[start:end])) yield tuple(batch_data) -def convert_sparse_to_dense( - data_sparse: Union[np.ndarray, List[scipy.sparse.csr_matrix]], - init_with_zero: bool = False, -): - data_size = len(data_sparse) - max_seq_len = max([x.shape[0] for x in data_sparse]) - feature_len = max([x.shape[-1] for x in data_sparse]) +def convert_to_batch(data_points: Union[np.ndarray, 
scipy.sparse.csr_matrix],): + is_sparse = isinstance(data_points[0], scipy.sparse.spmatrix) - if init_with_zero: - data_dense = np.zeros([data_size, max_seq_len, feature_len], dtype=np.float) - else: - data_dense = np.ones([data_size, max_seq_len, feature_len], dtype=np.float) * -1 - - for i in range(data_size): - data_dense[i, : data_sparse[i].shape[0], :] = data_sparse[i].toarray() - - return data_dense + if is_sparse: + return data_points.toarray() + return data_points # noinspection PyPep8Naming @@ -336,9 +325,12 @@ def _get_shape(session_data: SessionData) -> Tuple: shapes = [] def append_shape(v: Union[np.ndarray, scipy.sparse.spmatrix]): - if v[0].ndim == 1: - shapes.append((None, v[0].shape[-1])) - shapes.append((None, None, v[0].shape[-1])) + if v[0].ndim == 0: + shapes.append((v.shape[-1])) + elif v[0].ndim == 1: + shapes.append((None, v.shape[-1])) + else: + shapes.append((None, None, v.shape[-1])) for v in session_data.X.values(): append_shape(v) From 4c8f811305d8244e1756643e20205043a09b789d Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 28 Oct 2019 16:54:00 +0100 Subject: [PATCH 101/239] add more tests --- .../embedding_intent_classifier.py | 9 ++---- rasa/utils/train_utils.py | 16 +++++----- tests/utils/test_train_utils.py | 31 ++++++++++++++----- 3 files changed, 34 insertions(+), 22 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 745ea8900159..2347865206af 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -3,12 +3,10 @@ import os import pickle import typing -import scipy.sparse from typing import Any, Dict, List, Optional, Text, Tuple import warnings from rasa.nlu.featurizers.featurzier import sequence_to_sentence_features -from rasa.nlu.test import determine_token_labels from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component from rasa.utils import train_utils @@ -16,9 +14,6 @@ MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, - MESSAGE_ENTITIES_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, ) import tensorflow as tf @@ -343,7 +338,7 @@ def _create_session_data( Y = [] labels = [] - for e in training_data.training_examples: + for e in training_data.intent_examples: if e.get(attribute): X.append( sequence_to_sentence_features( @@ -474,7 +469,7 @@ def _build_tf_pred_graph( def check_input_dimension_consistency(self, session_data): if self.share_hidden_layers: - if session_data.X["X"].shape[-1] != session_data.Y["Y"].shape[-1]: + if session_data.X["X"][0].shape[-1] != session_data.Y["Y"][0].shape[-1]: raise ValueError( "If embeddings are shared " "text features and label features " diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index ca580f9c570b..898293d46cba 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -58,6 +58,8 @@ def train_val_split( label_key: Text = "labels", ) -> Tuple["SessionData", "SessionData"]: """Create random hold out validation set using stratified split.""" + if label_key not in session_data.labels: + raise ValueError(f"Key '{label_key}' not in SessionData.labels.") label_counts = dict( zip(*np.unique(session_data.labels[label_key], return_counts=True, axis=0)) @@ -158,6 +160,8 @@ def split_session_data_by_label( session_data: "SessionData", label_key: Text, unique_label_ids: "np.ndarray" ) -> List["SessionData"]: """Reorganize session 
data into a list of session data with the same labels.""" + if label_key not in session_data.labels: + raise ValueError(f"Key '{label_key}' not in SessionData.labels.") label_data = [] for label_id in unique_label_ids: @@ -179,15 +183,11 @@ def balance_session_data( by repeating them. Mimics stratified batching, but also takes into account that more populated classes should appear more often. """ - example_lengths = [len(x) for x in session_data.X.values()] - - if not all(l == example_lengths[0] for l in example_lengths): - raise ValueError("Number of examples in X differ.") - if label_key not in session_data.labels: - raise ValueError(f"{label_key} not in SessionData.labels.") + raise ValueError(f"Key '{label_key}' not in SessionData.labels.") + + num_examples = get_number_of_examples(session_data) - num_examples = example_lengths[0] unique_label_ids, counts_label_ids = np.unique( session_data.labels[label_key], return_counts=True, axis=0 ) @@ -249,7 +249,7 @@ def get_number_of_examples(session_data: SessionData): example_lengths = [len(v) for v in session_data.X.values()] # check if number of examples is the same for all X - if len(set(example_lengths)) != 1: + if not all(l == example_lengths[0] for l in example_lengths): raise ValueError( f"Number of examples differs for X ({session_data.X.keys()}). There should " f"be the same." diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 4a6c3336fea5..35c3f1c69b2e 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -7,6 +7,7 @@ split_session_data_by_label, train_val_split, session_data_for_ids, + get_number_of_examples, ) @@ -18,10 +19,7 @@ async def session_data() -> SessionData: "dense": np.random.randint(5, size=(5, 10)), }, Y={"Y": np.random.randint(2, size=(5, 10))}, - labels={ - "tags": np.random.randint(2, size=(5, 10)), - "labels": np.random.randint(2, size=(5)), - }, + labels={"labels": np.random.randint(2, size=(5))}, ) @@ -41,12 +39,19 @@ def test_shuffle_session_data(session_data: SessionData): def test_split_session_data_by_label(session_data: SessionData): split_session_data = split_session_data_by_label( - session_data, "labels", np.array([1, 2, 3, 4, 5]) + session_data, "labels", np.array([0, 1]) ) - assert len(split_session_data) == 5 + assert len(split_session_data) == 2 for s in split_session_data: - assert len(set(s.labels["labels"])) <= 1 + assert len(set(s.labels["labels"])) == 1 + + +def test_split_session_data_by_incorrect_label(session_data: SessionData): + with pytest.raises(ValueError): + split_session_data_by_label( + session_data, "not-existing", np.array([1, 2, 3, 4, 5]) + ) def test_train_val_split(session_data: SessionData): @@ -75,3 +80,15 @@ def test_session_data_for_ids(session_data: SessionData): assert np.all( np.array(filtered_session_data.X[k][1]) == np.array(session_data.X[k][1]) ) + + +def test_get_number_of_examples(session_data: SessionData): + num = get_number_of_examples(session_data) + + assert num == 5 + + +def test_get_number_of_examples_raises_value_error(session_data: SessionData): + session_data.X["dense"] = np.random.randint(5, size=(2, 10)) + with pytest.raises(ValueError): + get_number_of_examples(session_data) From e05a9d67b20194360a219a54fead1b446125a9d3 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 28 Oct 2019 17:39:35 +0100 Subject: [PATCH 102/239] use sparse in tests --- rasa/utils/train_utils.py | 56 ++++++++++++++++++++------------- tests/utils/test_train_utils.py | 13 ++++---- 2 files changed, 41 
insertions(+), 28 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 898293d46cba..41b8bb8bec8a 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -109,30 +109,28 @@ def train_val_split( # order is kept, so first session_data.X values, then session_data.Y values, and # finally session_data.labels values for i in range(len(session_data.X)): - X_train[keys[i]] = np.concatenate([output_values[i * 2], solo_values[i]]) + X_train[keys[i]] = combine_features(output_values[i * 2], solo_values[i]) for i in range(len(session_data.X), len(session_data.X) + len(session_data.Y)): - Y_train[keys[i]] = np.concatenate([output_values[i * 2], solo_values[i]]) + Y_train[keys[i]] = combine_features(output_values[i * 2], solo_values[i]) for i in range( len(session_data.X) + len(session_data.Y), len(session_data.X) + len(session_data.Y) + len(session_data.labels), ): - labels_train[keys[i]] = np.concatenate([output_values[i * 2], solo_values[i]]) + labels_train[keys[i]] = combine_features(output_values[i * 2], solo_values[i]) for i in range(len(session_data.X)): - X_val[keys[i]] = np.concatenate([output_values[(i * 2) + 1], solo_values[i]]) + X_val[keys[i]] = output_values[(i * 2) + 1] for i in range(len(session_data.X), len(session_data.X) + len(session_data.Y)): - Y_val[keys[i]] = np.concatenate([output_values[(i * 2) + 1], solo_values[i]]) + Y_val[keys[i]] = output_values[(i * 2) + 1] for i in range( len(session_data.X) + len(session_data.Y), len(session_data.X) + len(session_data.Y) + len(session_data.labels), ): - labels_val[keys[i]] = np.concatenate( - [output_values[(i * 2) + 1], solo_values[i]] - ) + labels_val[keys[i]] = output_values[(i * 2) + 1] return ( SessionData(X_train, Y_train, labels_train), @@ -140,6 +138,22 @@ def train_val_split( ) +def combine_features( + feature_1: Union[np.ndarray, scipy.sparse.spmatrix], + feature_2: Union[np.ndarray, scipy.sparse.spmatrix], +) -> Union[np.ndarray, scipy.sparse.spmatrix]: + if isinstance(feature_1, scipy.sparse.spmatrix) and isinstance( + feature_2, scipy.sparse.spmatrix + ): + if feature_2.shape[0] == 0: + return feature_1 + if feature_1.shape[0] == 0: + return feature_2 + return scipy.sparse.vstack([feature_1, feature_2]) + + return np.concatenate([feature_1, feature_2]) + + def shuffle_session_data(session_data: "SessionData") -> "SessionData": """Shuffle session data.""" data_points = get_number_of_examples(session_data) @@ -246,7 +260,7 @@ def balance_session_data( def get_number_of_examples(session_data: SessionData): - example_lengths = [len(v) for v in session_data.X.values()] + example_lengths = [v.shape[0] for v in session_data.X.values()] # check if number of examples is the same for all X if not all(l == example_lengths[0] for l in example_lengths): @@ -279,23 +293,21 @@ def gen_batch( start = batch_num * batch_size end = (batch_num + 1) * batch_size - batch_data = [] - for v in session_data.X.values(): - batch_data.append(convert_to_batch(v[start:end])) - for v in session_data.Y.values(): - batch_data.append(convert_to_batch(v[start:end])) - for v in session_data.labels.values(): - batch_data.append(convert_to_batch(v[start:end])) + batch_data = [sparse_to_dense(v[start:end]) for v in session_data.X.values()] + batch_data = batch_data + [ + sparse_to_dense(v[start:end]) for v in session_data.Y.values() + ] + batch_data = batch_data + [ + sparse_to_dense(v[start:end]) for v in session_data.labels.values() + ] yield tuple(batch_data) -def convert_to_batch(data_points: Union[np.ndarray, 
scipy.sparse.csr_matrix],): - is_sparse = isinstance(data_points[0], scipy.sparse.spmatrix) - - if is_sparse: - return data_points.toarray() - return data_points +def sparse_to_dense(examples: Union[np.ndarray, scipy.sparse.csr_matrix]): + if isinstance(examples[0], scipy.sparse.spmatrix): + return examples.toarray() + return examples # noinspection PyPep8Naming diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 35c3f1c69b2e..7025abd62bba 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -1,4 +1,5 @@ import pytest +import scipy.sparse import numpy as np from rasa.utils.train_utils import ( @@ -15,11 +16,11 @@ async def session_data() -> SessionData: return SessionData( X={ - "sparse": np.random.rand(5, 10), - "dense": np.random.randint(5, size=(5, 10)), + "dense": np.random.rand(5, 10), + "sparse": scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), }, Y={"Y": np.random.randint(2, size=(5, 10))}, - labels={"labels": np.random.randint(2, size=(5))}, + labels={"labels": np.array([0, 1, 0, 0, 0])}, ) @@ -60,17 +61,17 @@ def test_train_val_split(session_data: SessionData): ) for v in train_session_data.X.values(): - assert len(v) == 3 + assert v.shape[0] == 3 for v in val_session_data.X.values(): - assert len(v) == 2 + assert v.shape[0] == 2 def test_session_data_for_ids(session_data: SessionData): filtered_session_data = session_data_for_ids(session_data, np.array([0, 1])) for v in filtered_session_data.X.values(): - assert len(v) == 2 + assert v.shape[0] == 2 k = list(session_data.X.keys())[0] From b1107205c695ff34c5e6cf240d921ccfc8d4b6d7 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 29 Oct 2019 09:43:54 +0100 Subject: [PATCH 103/239] fix shapes --- rasa/utils/train_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 41b8bb8bec8a..5f7570b5a650 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -263,7 +263,7 @@ def get_number_of_examples(session_data: SessionData): example_lengths = [v.shape[0] for v in session_data.X.values()] # check if number of examples is the same for all X - if not all(l == example_lengths[0] for l in example_lengths): + if not all(length == example_lengths[0] for length in example_lengths): raise ValueError( f"Number of examples differs for X ({session_data.X.keys()}). There should " f"be the same." @@ -338,7 +338,7 @@ def _get_shape(session_data: SessionData) -> Tuple: def append_shape(v: Union[np.ndarray, scipy.sparse.spmatrix]): if v[0].ndim == 0: - shapes.append((v.shape[-1])) + shapes.append((None)) elif v[0].ndim == 1: shapes.append((None, v.shape[-1])) else: From 219d9ddf440ff1e52ab3af7470aeb4586bd11b84 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 29 Oct 2019 10:54:15 +0100 Subject: [PATCH 104/239] fix tests. 
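
Since SessionData now holds dicts of features, the dataset iterator yields one
tensor per key: all X values first, then all Y values, then all label values.
For the policy's session data that is a 3-tuple, so the train graph has to
unpack three elements and can simply ignore the label ids. The rough sketch of
the batch layout below uses toy arrays and made-up shapes rather than real
featurizer output, and assumes the train_utils helpers from this branch are
importable:

    import numpy as np
    from rasa.utils.train_utils import SessionData, gen_batch

    # toy stand-ins for featurized trackers (shapes chosen only for illustration)
    session_data = SessionData(
        X={"X": np.random.rand(4, 7)},
        Y={"Y": np.eye(4)},
        labels={"labels": np.array([0, 1, 2, 3])},
    )

    # one array per key: X values, then Y values, then label values
    batch_x, batch_y, batch_label_ids = next(gen_batch(session_data, 2))
    assert batch_x.shape == (2, 7) and batch_label_ids.shape == (2,)

This is why _build_tf_train_graph now unpacks a_in, b_in, _ from the iterator.
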
--- rasa/core/policies/embedding_policy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 5cf5215cbf90..6cb106b23fd6 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -335,7 +335,7 @@ def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: """Bulid train graph using iterator.""" # session data are int counts but we need a float tensors - self.a_in, self.b_in = self._iterator.get_next() + self.a_in, self.b_in, _ = self._iterator.get_next() if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): # add time dimension if max history featurizer is used self.b_in = self.b_in[:, tf.newaxis, :] From bca0b8507497e6d6e7d0232a99e674b84f24d8cc Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 29 Oct 2019 11:40:55 +0100 Subject: [PATCH 105/239] review comments --- rasa/core/policies/embedding_policy.py | 15 +++-- .../embedding_intent_classifier.py | 30 ++++++--- rasa/utils/train_utils.py | 65 ++++++++++++------- 3 files changed, 72 insertions(+), 38 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 6cb106b23fd6..102df4a35a3b 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -271,7 +271,9 @@ def _create_session_data( Y = None return train_utils.SessionData( - X={"X": data_X}, Y={"Y": Y}, labels={"labels": label_ids} + X={"dialogue_features": data_X}, + Y={"bot_features": Y}, + labels={"action_ids": label_ids}, ) def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": @@ -372,12 +374,12 @@ def _create_tf_placeholders(self, session_data: "train_utils.SessionData") -> No dialogue_len = None # use dynamic time self.a_in = tf.placeholder( dtype=tf.float32, - shape=(None, dialogue_len, session_data.X["X"].shape[-1]), + shape=(None, dialogue_len, session_data.X["dialogue_features"].shape[-1]), name="a", ) self.b_in = tf.placeholder( dtype=tf.float32, - shape=(None, dialogue_len, None, session_data.Y["Y"].shape[-1]), + shape=(None, dialogue_len, None, session_data.Y["bot_features"].shape[-1]), name="b", ) @@ -442,7 +444,10 @@ def train( if self.evaluate_on_num_examples: session_data, eval_session_data = train_utils.train_val_split( - session_data, self.evaluate_on_num_examples, self.random_seed + session_data, + self.evaluate_on_num_examples, + self.random_seed, + label_key="action_ids", ) else: eval_session_data = None @@ -537,7 +542,7 @@ def tf_feed_dict_for_prediction( data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) - return {self.a_in: session_data.X["X"]} + return {self.a_in: session_data.X["dialogue_features"]} def predict_action_probabilities( self, tracker: "DialogueStateTracker", domain: "Domain" diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 2347865206af..74eda324d251 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -336,7 +336,7 @@ def _create_session_data( """Prepare data for training and create a SessionData object""" X = [] Y = [] - labels = [] + label_ids = [] for e in training_data.intent_examples: if e.get(attribute): @@ -350,16 +350,18 @@ def _create_session_data( .squeeze() ) # every example should have an intent - labels.append(label_id_dict[e.get(MESSAGE_INTENT_ATTRIBUTE)]) + 
label_ids.append(label_id_dict[e.get(MESSAGE_INTENT_ATTRIBUTE)]) X = np.array(X) - labels = np.array(labels) + label_ids = np.array(label_ids) - for label_id_idx in labels: + for label_id_idx in label_ids: Y.append(self._encoded_all_label_ids[label_id_idx]) Y = np.array(Y) - return train_utils.SessionData({"X": X}, {"Y": Y}, {"labels": labels}) + return train_utils.SessionData( + {"text_features": X}, {"intent_features": Y}, {"intent_ids": label_ids} + ) # tf helpers: def _create_tf_embed_fnn( @@ -434,10 +436,12 @@ def _build_tf_pred_graph( self, session_data: "train_utils.SessionData" ) -> "tf.Tensor": self.a_in = tf.placeholder( - tf.float32, (None, session_data.X["X"].shape[-1]), name="a" + tf.float32, (None, session_data.X["text_features"].shape[-1]), name="a" ) self.b_in = tf.placeholder( - tf.float32, (None, None, session_data.Y["Y"].shape[-1]), name="b" + tf.float32, + (None, None, session_data.Y["intent_features"].shape[-1]), + name="b", ) self.message_embed = self._create_tf_embed_fnn( @@ -469,7 +473,10 @@ def _build_tf_pred_graph( def check_input_dimension_consistency(self, session_data): if self.share_hidden_layers: - if session_data.X["X"][0].shape[-1] != session_data.Y["Y"][0].shape[-1]: + if ( + session_data.X["text_features"][0].shape[-1] + != session_data.Y["intent_features"][0].shape[-1] + ): raise ValueError( "If embeddings are shared " "text features and label features " @@ -512,7 +519,7 @@ def preprocess_train_data(self, training_data): return session_data def _check_enough_labels(self, session_data) -> bool: - return len(np.unique(session_data.labels["labels"])) >= 2 + return len(np.unique(session_data.labels["intent_ids"])) >= 2 def train( self, @@ -541,7 +548,10 @@ def train( if self.evaluate_on_num_examples: session_data, eval_session_data = train_utils.train_val_split( - session_data, self.evaluate_on_num_examples, self.random_seed + session_data, + self.evaluate_on_num_examples, + self.random_seed, + label_key="intent_ids", ) else: eval_session_data = None diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 5f7570b5a650..f07faeca0843 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -65,7 +65,44 @@ def train_val_split( zip(*np.unique(session_data.labels[label_key], return_counts=True, axis=0)) ) + check_train_test_sizes(evaluate_on_num_examples, label_counts, session_data) + + counts = np.array([label_counts[label] for label in session_data.labels[label_key]]) + + multi_values = [] + [multi_values.append(v[counts > 1]) for v in session_data.X.values()] + [multi_values.append(v[counts > 1]) for v in session_data.Y.values()] + [multi_values.append(v[counts > 1]) for v in session_data.labels.values()] + + solo_values = [] + [solo_values.append(v[counts == 1]) for v in session_data.X.values()] + [solo_values.append(v[counts == 1]) for v in session_data.Y.values()] + [solo_values.append(v[counts == 1]) for v in session_data.labels.values()] + + output_values = train_test_split( + *multi_values, + test_size=evaluate_on_num_examples, + random_state=random_seed, + stratify=session_data.labels[label_key][counts > 1], + ) + + X_train, X_val, Y_train, Y_val, labels_train, labels_val = convert_train_test_split( + output_values, session_data, solo_values + ) + + return ( + SessionData(X_train, Y_train, labels_train), + SessionData(X_val, Y_val, labels_val), + ) + + +def check_train_test_sizes( + evaluate_on_num_examples: int, + label_counts: Dict[Any, int], + session_data: SessionData, +): num_examples = 
get_number_of_examples(session_data) + if evaluate_on_num_examples >= num_examples - len(label_counts): raise ValueError( f"Validation set of {evaluate_on_num_examples} is too large. Remaining train set " @@ -77,27 +114,12 @@ def train_val_split( "at least equal to number of classes {label_counts}." ) - counts = np.array([label_counts[label] for label in session_data.labels[label_key]]) - - multi_values = [] - [multi_values.append(v[counts > 1]) for k, v in session_data.X.items()] - [multi_values.append(v[counts > 1]) for k, v in session_data.Y.items()] - [multi_values.append(v[counts > 1]) for k, v in session_data.labels.items()] - - solo_values = [] - [solo_values.append(v[counts == 1]) for k, v in session_data.X.items()] - [solo_values.append(v[counts == 1]) for k, v in session_data.Y.items()] - [solo_values.append(v[counts == 1]) for k, v in session_data.labels.items()] +def convert_train_test_split( + output_values: List[Any], session_data: SessionData, solo_values: List[Any] +): keys = [k for d in session_data for k, v in d.items()] - output_values = train_test_split( - *multi_values, - test_size=evaluate_on_num_examples, - random_state=random_seed, - stratify=session_data.labels[label_key][counts > 1], - ) - X_train = {} Y_train = {} labels_train = {} @@ -132,10 +154,7 @@ def train_val_split( ): labels_val[keys[i]] = output_values[(i * 2) + 1] - return ( - SessionData(X_train, Y_train, labels_train), - SessionData(X_val, Y_val, labels_val), - ) + return X_train, X_val, Y_train, Y_val, labels_train, labels_val def combine_features( @@ -291,7 +310,7 @@ def gen_batch( for batch_num in range(num_batches): start = batch_num * batch_size - end = (batch_num + 1) * batch_size + end = start + batch_size batch_data = [sparse_to_dense(v[start:end]) for v in session_data.X.values()] batch_data = batch_data + [ From adc84fe5a5c0efa5cfbcd46f9355ebd68dcce00b Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 29 Oct 2019 12:34:53 +0100 Subject: [PATCH 106/239] use label_key --- rasa/core/policies/embedding_policy.py | 10 ++++- .../embedding_intent_classifier.py | 6 ++- rasa/utils/train_utils.py | 37 ++++++++++++------- tests/core/test_policies.py | 17 +++++---- 4 files changed, 46 insertions(+), 24 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 102df4a35a3b..5d815a7c587c 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -465,7 +465,11 @@ def train( train_init_op, eval_init_op, ) = train_utils.create_iterator_init_datasets( - session_data, eval_session_data, batch_size_in, self.batch_strategy + session_data, + eval_session_data, + batch_size_in, + self.batch_strategy, + label_key="action_ids", ) self._is_training = tf.placeholder_with_default(False, shape=()) @@ -519,7 +523,9 @@ def continue_training( session_data = self._create_session_data( training_data.X, training_data.y ) - train_dataset = train_utils.create_tf_dataset(session_data, batch_size) + train_dataset = train_utils.create_tf_dataset( + session_data, batch_size, label_key="action_ids" + ) train_init_op = self._iterator.make_initializer(train_dataset) self.session.run(train_init_op) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 74eda324d251..ba65c0161865 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -569,7 +569,11 @@ def train( train_init_op, eval_init_op, ) = 
train_utils.create_iterator_init_datasets( - session_data, eval_session_data, batch_size_in, self.batch_strategy + session_data, + eval_session_data, + batch_size_in, + self.batch_strategy, + label_key="intent_ids", ) self._is_training = tf.placeholder_with_default(False, shape=()) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index f07faeca0843..e3c1923a502c 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -55,7 +55,7 @@ def train_val_split( session_data: "SessionData", evaluate_on_num_examples: int, random_seed: int, - label_key: Text = "labels", + label_key: Text, ) -> Tuple["SessionData", "SessionData"]: """Create random hold out validation set using stratified split.""" if label_key not in session_data.labels: @@ -205,10 +205,7 @@ def split_session_data_by_label( # noinspection PyPep8Naming def balance_session_data( - session_data: "SessionData", - batch_size: int, - shuffle: bool, - label_key: Text = "labels", + session_data: "SessionData", batch_size: int, shuffle: bool, label_key: Text ) -> "SessionData": """Mix session data to account for class imbalance. @@ -294,16 +291,19 @@ def get_number_of_examples(session_data: SessionData): def gen_batch( session_data: "SessionData", batch_size: int, + label_key: Text, batch_strategy: Text = "sequence", shuffle: bool = False, -) -> Generator[Tuple["np.ndarray", "np.ndarray"], None, None]: +) -> Generator[Tuple, None, None]: """Generate batches.""" if shuffle: session_data = shuffle_session_data(session_data) if batch_strategy == "balanced": - session_data = balance_session_data(session_data, batch_size, shuffle) + session_data = balance_session_data( + session_data, batch_size, shuffle, label_key + ) num_examples = get_number_of_examples(session_data) num_batches = num_examples // batch_size + int(num_examples % batch_size > 0) @@ -335,16 +335,16 @@ def create_tf_dataset( batch_size: Union["tf.Tensor", int], batch_strategy: Text = "sequence", shuffle: bool = False, + label_key: Text = "labels", ) -> "tf.data.Dataset": """Create tf dataset.""" # set batch and sequence length to None - shapes = _get_shape(session_data) - types = tuple([np.float32] * len(shapes)) + shapes, types = _get_shape_and_types(session_data) return tf.data.Dataset.from_generator( lambda batch_size_: gen_batch( - session_data, batch_size_, batch_strategy, shuffle + session_data, batch_size_, label_key, batch_strategy, shuffle ), output_types=types, output_shapes=shapes, @@ -352,8 +352,9 @@ def create_tf_dataset( ) -def _get_shape(session_data: SessionData) -> Tuple: +def _get_shape_and_types(session_data: SessionData) -> Tuple[Tuple, Tuple]: shapes = [] + types = [] def append_shape(v: Union[np.ndarray, scipy.sparse.spmatrix]): if v[0].ndim == 0: @@ -365,12 +366,15 @@ def append_shape(v: Union[np.ndarray, scipy.sparse.spmatrix]): for v in session_data.X.values(): append_shape(v) + types.append(np.float32) for v in session_data.Y.values(): append_shape(v) + types.append(np.float32) for v in session_data.labels.values(): append_shape(v) + types.append(v.dtype) - return tuple(shapes) + return tuple(shapes), tuple(types) def create_iterator_init_datasets( @@ -378,11 +382,16 @@ def create_iterator_init_datasets( eval_session_data: "SessionData", batch_size: Union["tf.Tensor", int], batch_strategy: Text, + label_key: Text, ) -> Tuple["tf.data.Iterator", "tf.Operation", "tf.Operation"]: """Create iterator and init datasets.""" train_dataset = create_tf_dataset( - session_data, batch_size, batch_strategy=batch_strategy, shuffle=True 
+ session_data, + batch_size, + batch_strategy=batch_strategy, + shuffle=True, + label_key=label_key, ) iterator = tf.data.Iterator.from_structure( @@ -395,7 +404,7 @@ def create_iterator_init_datasets( if eval_session_data is not None: eval_init_op = iterator.make_initializer( - create_tf_dataset(eval_session_data, batch_size) + create_tf_dataset(eval_session_data, batch_size, label_key=label_key) ) else: eval_init_op = None diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index b4e800718fe1..e1487ff1947c 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -348,26 +348,29 @@ async def test_gen_batch(self, trained_policy, default_domain): training_data.X, training_data.y ) batch_size = 2 - batch_x, batch_y = next( - train_utils.gen_batch(session_data=session_data, batch_size=batch_size) + batch_x, batch_y, _ = next( + train_utils.gen_batch( + session_data=session_data, batch_size=batch_size, label_key="action_ids" + ) ) assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size assert ( - batch_x[0].shape == session_data.X[0].shape - and batch_y[0].shape == session_data.Y[0].shape + batch_x[0].shape == session_data.X["dialogue_features"][0].shape + and batch_y[0].shape == session_data.Y["bot_features"][0].shape ) - batch_x, batch_y = next( + batch_x, batch_y, _ = next( train_utils.gen_batch( session_data=session_data, batch_size=batch_size, + label_key="action_ids", batch_strategy="balanced", shuffle=True, ) ) assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size assert ( - batch_x[0].shape == session_data.X[0].shape - and batch_y[0].shape == session_data.Y[0].shape + batch_x[0].shape == session_data.X["dialogue_features"][0].shape + and batch_y[0].shape == session_data.Y["bot_features"][0].shape ) From 1dbaa5cff1938ee9ceaca9c90b8977f2ee7d789a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 29 Oct 2019 14:55:42 +0100 Subject: [PATCH 107/239] intent classifier makes use of sparse and dense features. 
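
The classifier now reads both the sparse and the dense text features when they
are present: the sparse matrix is densified and the two blocks are concatenated
along the feature dimension (in batch_to_input during training and in
_extract_features at prediction time). A minimal sketch of that combination
step, with made-up feature values and arbitrary sizes chosen only to show the
layout:

    import numpy as np
    import scipy.sparse

    # stand-ins for one message: a sparse bag-of-words row and a dense embedding row
    x_sparse = scipy.sparse.csr_matrix([[0.0, 1.0, 0.0, 2.0]])
    x_dense = np.array([[0.1, -0.3, 0.7]])

    # densify the sparse part and concatenate along the feature dimension
    x = np.concatenate((x_sparse.toarray(), x_dense), axis=-1)
    assert x.shape == (1, 7)

If only one of the two feature types is set on a message, that one is used on
its own.
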
--- .../embedding_intent_classifier.py | 144 +++++++++++++++--- rasa/utils/train_utils.py | 12 +- 2 files changed, 126 insertions(+), 30 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index ba65c0161865..46e0a65cd34a 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -14,6 +14,7 @@ MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, ) import tensorflow as tf @@ -334,33 +335,58 @@ def _create_session_data( attribute: Text, ) -> "train_utils.SessionData": """Prepare data for training and create a SessionData object""" - X = [] + X_sparse = [] + X_dense = [] Y = [] label_ids = [] for e in training_data.intent_examples: if e.get(attribute): - X.append( - sequence_to_sentence_features( - e.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + if ( + e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + is not None + ): + X_sparse.append( + sequence_to_sentence_features( + e.get( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ + MESSAGE_TEXT_ATTRIBUTE + ] + ) + ) + ) + if ( + e.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + is not None + ): + X_dense.append( + sequence_to_sentence_features( + e.get( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[ + MESSAGE_TEXT_ATTRIBUTE + ] + ) ) ) - .toarray() - .squeeze() - ) # every example should have an intent label_ids.append(label_id_dict[e.get(MESSAGE_INTENT_ATTRIBUTE)]) - X = np.array(X) + X_sparse = np.array(X_sparse) + X_dense = np.array(X_dense) label_ids = np.array(label_ids) for label_id_idx in label_ids: Y.append(self._encoded_all_label_ids[label_id_idx]) Y = np.array(Y) + X = {} + if X_sparse.size > 0: + X["text_features_sparse"] = X_sparse + if X_dense.size > 0: + X["text_features_dense"] = X_dense + return train_utils.SessionData( - {"text_features": X}, {"intent_features": Y}, {"intent_ids": label_ids} + X, {"intent_features": Y}, {"intent_ids": label_ids} ) # tf helpers: @@ -390,7 +416,9 @@ def _create_tf_embed_fnn( ) def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: - self.a_in, self.b_in, _ = self._iterator.get_next() + batch = self._iterator.get_next() + + self.a_in, self.b_in = self.batch_to_input(batch) all_label_ids = tf.constant( self._encoded_all_label_ids, dtype=tf.float32, name="all_label_ids" @@ -432,15 +460,52 @@ def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: self.scale_loss, ) + def batch_to_input(self, batch: Tuple) -> Tuple[tf.Tensor, tf.Tensor]: + """Convert batch input into correct tensors. + + As we do not know what features (sparse and/or dense) were used, we need to + check what features are provided and parse them accordingly. + """ + b_in = batch[-2] + + if len(batch) == 3: + a_in = batch[0] + # Needed if a_in comes from a csr_matrix (e.g. 
sparse features) + if len(a_in.shape) == 3: + a_in = tf.squeeze(a_in, axis=1) + + elif len(batch) == 4: + a_in_sparse = batch[0] + a_in_dense = batch[1] + + # Needed as a_in_sparse comes from a csr_matrix + a_in_sparse = tf.squeeze(a_in_sparse, axis=1) + # Concatenate sparse and dense features + a_in = tf.concat([a_in_sparse, a_in_dense], axis=1) + else: + raise ValueError("Iterator return unexpected number of tensors.") + + return a_in, b_in + def _build_tf_pred_graph( self, session_data: "train_utils.SessionData" ) -> "tf.Tensor": + n_features_1 = ( + session_data.X["text_features_sparse"][0].shape[-1] + if "text_features_sparse" in session_data.X + else 0 + ) + n_features_2 = ( + session_data.X["text_features_dense"][0].shape[-1] + if "text_features_dense" in session_data.X + else 0 + ) self.a_in = tf.placeholder( - tf.float32, (None, session_data.X["text_features"].shape[-1]), name="a" + tf.float32, (None, n_features_1 + n_features_2), name="a" ) self.b_in = tf.placeholder( tf.float32, - (None, None, session_data.Y["intent_features"].shape[-1]), + (None, None, session_data.Y["intent_features"][0].shape[-1]), name="b", ) @@ -618,7 +683,7 @@ def _calculate_message_sim(self, X: np.ndarray) -> Tuple[np.ndarray, List[float] # transform sim to python list for JSON serializing return label_ids, message_sim.tolist() - def predict_label(self, message): + def predict_label(self, message: "Message"): label = {"name": None, "confidence": 0.0} label_ranking = [] @@ -630,18 +695,9 @@ def predict_label(self, message): ) else: - # get features (bag of words) for a message + # get features (bag of words/embeddings) for a message # noinspection PyPep8Naming - X = ( - sequence_to_sentence_features( - message.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) - ) - .toarray() - .squeeze() - .reshape(1, -1) - ) + X = self._extract_features(message) # load tf graph and session label_ids, message_sim = self._calculate_message_sim(X) @@ -661,6 +717,44 @@ def predict_label(self, message): ] return label, label_ranking + def _extract_features(self, message: "Message") -> np.ndarray: + X_sparse = None + X_dense = None + + if ( + message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + is not None + ): + X_sparse = ( + sequence_to_sentence_features( + message.get( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + ) + ) + .toarray() + .squeeze() + .reshape(1, -1) + ) + + if ( + message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + is not None + ): + X_dense = sequence_to_sentence_features( + message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + ).reshape(1, -1) + + if X_sparse is not None and X_dense is not None: + return np.concatenate((X_sparse, X_dense), axis=1) + + if X_sparse is None and X_dense is not None: + return X_dense + + if X_sparse is not None and X_dense is None: + return X_sparse + + raise ValueError("No features found for X.") + def process(self, message: "Message", **kwargs: Any) -> None: """Return the most likely label and its similarity to the input.""" diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e3c1923a502c..ae9c1f9821c6 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -323,9 +323,11 @@ def gen_batch( yield tuple(batch_data) -def sparse_to_dense(examples: Union[np.ndarray, scipy.sparse.csr_matrix]): +def sparse_to_dense( + examples: List[Union[np.ndarray, scipy.sparse.csr_matrix]] +) -> List[np.ndarray]: if isinstance(examples[0], 
scipy.sparse.spmatrix): - return examples.toarray() + return [e.toarray() for e in examples] return examples @@ -356,13 +358,13 @@ def _get_shape_and_types(session_data: SessionData) -> Tuple[Tuple, Tuple]: shapes = [] types = [] - def append_shape(v: Union[np.ndarray, scipy.sparse.spmatrix]): + def append_shape(v: np.ndarray): if v[0].ndim == 0: shapes.append((None)) elif v[0].ndim == 1: - shapes.append((None, v.shape[-1])) + shapes.append((None, v[0].shape[-1])) else: - shapes.append((None, None, v.shape[-1])) + shapes.append((None, None, v[0].shape[-1])) for v in session_data.X.values(): append_shape(v) From f2a859924d441be19d585ef92dbfac3820c30963 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 29 Oct 2019 14:58:18 +0100 Subject: [PATCH 108/239] remove default value for label_key --- rasa/utils/train_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index ae9c1f9821c6..d11f5992b97a 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -335,9 +335,9 @@ def sparse_to_dense( def create_tf_dataset( session_data: "SessionData", batch_size: Union["tf.Tensor", int], + label_key: Text, batch_strategy: Text = "sequence", shuffle: bool = False, - label_key: Text = "labels", ) -> "tf.data.Dataset": """Create tf dataset.""" @@ -391,9 +391,9 @@ def create_iterator_init_datasets( train_dataset = create_tf_dataset( session_data, batch_size, + label_key=label_key, batch_strategy=batch_strategy, shuffle=True, - label_key=label_key, ) iterator = tf.data.Iterator.from_structure( From 9c25095db4893e1412263bedb0b5c7551a0807dc Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 29 Oct 2019 15:06:02 +0100 Subject: [PATCH 109/239] clean up --- rasa/nlu/classifiers/embedding_intent_classifier.py | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 46e0a65cd34a..27c6e78198ba 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -355,6 +355,7 @@ def _create_session_data( ) ) ) + if ( e.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) is not None @@ -368,7 +369,7 @@ def _create_session_data( ) ) ) - # every example should have an intent + label_ids.append(label_id_dict[e.get(MESSAGE_INTENT_ATTRIBUTE)]) X_sparse = np.array(X_sparse) @@ -538,10 +539,12 @@ def _build_tf_pred_graph( def check_input_dimension_consistency(self, session_data): if self.share_hidden_layers: - if ( - session_data.X["text_features"][0].shape[-1] - != session_data.Y["intent_features"][0].shape[-1] - ): + dim = ( + session_data.X["text_features_sparse"][0].shape[-1] + if "text_features_sparse" in session_data.X + else session_data.X["text_features_dense"][0].shape[-1] + ) + if dim != session_data.Y["intent_features"][0].shape[-1]: raise ValueError( "If embeddings are shared " "text features and label features " From dafbaf93b7913050769496bc7a52b648a7cb2494 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 30 Oct 2019 09:15:06 +0100 Subject: [PATCH 110/239] review comments --- rasa/core/policies/embedding_policy.py | 1 - .../embedding_intent_classifier.py | 203 +++++++++--------- rasa/utils/train_utils.py | 6 +- 3 files changed, 107 insertions(+), 103 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 5d815a7c587c..fcd11036cb27 100644 --- 
a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -336,7 +336,6 @@ def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]: def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: """Bulid train graph using iterator.""" - # session data are int counts but we need a float tensors self.a_in, self.b_in, _ = self._iterator.get_next() if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer): # add time dimension if max history featurizer is used diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 27c6e78198ba..ed2f13dc8507 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -2,8 +2,9 @@ import numpy as np import os import pickle +import scipy.sparse import typing -from typing import Any, Dict, List, Optional, Text, Tuple +from typing import Any, Dict, List, Optional, Text, Tuple, Union import warnings from rasa.nlu.featurizers.featurzier import sequence_to_sentence_features @@ -20,6 +21,8 @@ import tensorflow as tf # avoid warning println on contrib import - remove for tf 2 +from utils.train_utils import SessionData + tf.contrib._warning = None logger = logging.getLogger(__name__) @@ -251,7 +254,9 @@ def _create_label_id_dict( } @staticmethod - def _find_example_for_label(label, examples, attribute): + def _find_example_for_label( + label: Text, examples: List["Message"], attribute: Text + ) -> Optional["Message"]: for ex in examples: if ex.get(attribute) == label: return ex @@ -333,7 +338,7 @@ def _create_session_data( training_data: "TrainingData", label_id_dict: Dict[Text, int], attribute: Text, - ) -> "train_utils.SessionData": + ) -> "SessionData": """Prepare data for training and create a SessionData object""" X_sparse = [] X_dense = [] @@ -342,33 +347,12 @@ def _create_session_data( for e in training_data.intent_examples: if e.get(attribute): - if ( - e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - is not None - ): - X_sparse.append( - sequence_to_sentence_features( - e.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ - MESSAGE_TEXT_ATTRIBUTE - ] - ) - ) - ) - - if ( - e.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - is not None - ): - X_dense.append( - sequence_to_sentence_features( - e.get( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[ - MESSAGE_TEXT_ATTRIBUTE - ] - ) - ) - ) + x_sparse, x_dense = self._get_x_features(e) + + if x_sparse is not None: + X_sparse.append(x_sparse) + if x_dense is not None: + X_dense.append(x_dense) label_ids.append(label_id_dict[e.get(MESSAGE_INTENT_ATTRIBUTE)]) @@ -380,16 +364,43 @@ def _create_session_data( Y.append(self._encoded_all_label_ids[label_id_idx]) Y = np.array(Y) - X = {} + X_dict = {} if X_sparse.size > 0: - X["text_features_sparse"] = X_sparse + X_dict["text_features_sparse"] = X_sparse if X_dense.size > 0: - X["text_features_dense"] = X_dense + X_dict["text_features_dense"] = X_dense return train_utils.SessionData( - X, {"intent_features": Y}, {"intent_ids": label_ids} + X_dict, {"intent_features": Y}, {"intent_ids": label_ids} ) + def _get_x_features( + self, message: "Message" + ) -> Tuple[ + Optional[Union[np.ndarray, scipy.sparse.spmatrix]], + Optional[Union[np.ndarray, scipy.sparse.spmatrix]], + ]: + x_sparse = None + x_dense = None + + if ( + message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + is not None + ): + x_sparse = sequence_to_sentence_features( + 
message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + ) + + if ( + message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + is not None + ): + x_dense = sequence_to_sentence_features( + message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + ) + + return x_sparse, x_dense + # tf helpers: def _create_tf_embed_fnn( self, @@ -417,8 +428,8 @@ def _create_tf_embed_fnn( ) def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: + # batch = 1 or 2 a_in values, b_in, intent_ids batch = self._iterator.get_next() - self.a_in, self.b_in = self.batch_to_input(batch) all_label_ids = tf.constant( @@ -467,42 +478,44 @@ def batch_to_input(self, batch: Tuple) -> Tuple[tf.Tensor, tf.Tensor]: As we do not know what features (sparse and/or dense) were used, we need to check what features are provided and parse them accordingly. """ + # batch contains 1 or 2 a_in values, b_in, label_ids b_in = batch[-2] if len(batch) == 3: - a_in = batch[0] - # Needed if a_in comes from a csr_matrix (e.g. sparse features) - if len(a_in.shape) == 3: - a_in = tf.squeeze(a_in, axis=1) - - elif len(batch) == 4: - a_in_sparse = batch[0] - a_in_dense = batch[1] - - # Needed as a_in_sparse comes from a csr_matrix - a_in_sparse = tf.squeeze(a_in_sparse, axis=1) - # Concatenate sparse and dense features - a_in = tf.concat([a_in_sparse, a_in_dense], axis=1) - else: - raise ValueError("Iterator return unexpected number of tensors.") + a_in = self._squeeze_sparse_features(batch[0]) + return a_in, b_in - return a_in, b_in + if len(batch) == 4: + a_in_1 = self._squeeze_sparse_features(batch[0]) + a_in_2 = self._squeeze_sparse_features(batch[1]) + # Concatenate a_in features + a_in = tf.concat([a_in_1, a_in_2], axis=1) + + return a_in, b_in + + raise ValueError("Iterator return unexpected number of tensors.") + + def _squeeze_sparse_features(self, a_in: tf.Tensor) -> tf.Tensor: + # we need to squeeze sparse features as the classifier cannot handle + # sequences + # as sparse features come from a scipy.sparse.csr_matrix they have a + # additional dimension + if len(a_in.shape) == 3: + a_in = tf.squeeze(a_in, axis=1) + return a_in def _build_tf_pred_graph( self, session_data: "train_utils.SessionData" ) -> "tf.Tensor": - n_features_1 = ( - session_data.X["text_features_sparse"][0].shape[-1] - if "text_features_sparse" in session_data.X - else 0 + num_features_sparse = self._get_num_of_features( + session_data, "text_features_sparse" ) - n_features_2 = ( - session_data.X["text_features_dense"][0].shape[-1] - if "text_features_dense" in session_data.X - else 0 + num_features_dense = self._get_num_of_features( + session_data, "text_features_dense" ) + self.a_in = tf.placeholder( - tf.float32, (None, n_features_1 + n_features_2), name="a" + tf.float32, (None, num_features_sparse + num_features_dense), name="a" ) self.b_in = tf.placeholder( tf.float32, @@ -536,23 +549,31 @@ def _build_tf_pred_graph( return train_utils.confidence_from_sim(self.sim_all, self.similarity_type) - def check_input_dimension_consistency(self, session_data): + def _get_num_of_features(self, session_data: "SessionData", x_key: Text) -> int: + return session_data.X[x_key][0].shape[-1] if x_key in session_data.X else 0 + def check_input_dimension_consistency(self, session_data: "SessionData"): if self.share_hidden_layers: - dim = ( - session_data.X["text_features_sparse"][0].shape[-1] - if "text_features_sparse" in session_data.X - else session_data.X["text_features_dense"][0].shape[-1] + num_features_sparse = 
self._get_num_of_features( + session_data, "text_features_sparse" + ) + num_features_dense = self._get_num_of_features( + session_data, "text_features_dense" ) - if dim != session_data.Y["intent_features"][0].shape[-1]: + + if ( + num_features_sparse + num_features_dense + != session_data.Y["intent_features"][0].shape[-1] + ): raise ValueError( "If embeddings are shared " "text features and label features " "must coincide. Check the output dimensions of previous components." ) - def preprocess_train_data(self, training_data): - """Performs sanity checks on training data, extracts encodings for labels and prepares data for training""" + def preprocess_train_data(self, training_data: "TrainingData"): + """Performs sanity checks on training data, extracts encodings for labels and + prepares data for training""" label_id_dict = self._create_label_id_dict( training_data, attribute=MESSAGE_INTENT_ATTRIBUTE @@ -586,7 +607,7 @@ def preprocess_train_data(self, training_data): return session_data - def _check_enough_labels(self, session_data) -> bool: + def _check_enough_labels(self, session_data: "SessionData") -> bool: return len(np.unique(session_data.labels["intent_ids"])) >= 2 def train( @@ -686,7 +707,9 @@ def _calculate_message_sim(self, X: np.ndarray) -> Tuple[np.ndarray, List[float] # transform sim to python list for JSON serializing return label_ids, message_sim.tolist() - def predict_label(self, message: "Message"): + def predict_label( + self, message: "Message" + ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]: label = {"name": None, "confidence": 0.0} label_ranking = [] @@ -721,40 +744,22 @@ def predict_label(self, message: "Message"): return label, label_ranking def _extract_features(self, message: "Message") -> np.ndarray: - X_sparse = None - X_dense = None + x_sparse, x_dense = self._get_x_features(message) - if ( - message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - is not None - ): - X_sparse = ( - sequence_to_sentence_features( - message.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) - ) - .toarray() - .squeeze() - .reshape(1, -1) - ) + if x_sparse is not None: + x_sparse = x_sparse.toarray().squeeze().reshape(1, -1) - if ( - message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - is not None - ): - X_dense = sequence_to_sentence_features( - message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - ).reshape(1, -1) + if x_dense is not None: + x_dense = x_dense.reshape(1, -1) - if X_sparse is not None and X_dense is not None: - return np.concatenate((X_sparse, X_dense), axis=1) + if x_sparse is not None and x_dense is not None: + return np.concatenate((x_sparse, x_dense), axis=-1) - if X_sparse is None and X_dense is not None: - return X_dense + if x_sparse is None and x_dense is not None: + return x_dense - if X_sparse is not None and X_dense is None: - return X_sparse + if x_sparse is not None and x_dense is None: + return x_sparse raise ValueError("No features found for X.") diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index d11f5992b97a..e7decd594143 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -324,10 +324,10 @@ def gen_batch( def sparse_to_dense( - examples: List[Union[np.ndarray, scipy.sparse.csr_matrix]] -) -> List[np.ndarray]: + examples: Union[np.ndarray, List[scipy.sparse.csr_matrix]] +) -> np.ndarray: if isinstance(examples[0], scipy.sparse.spmatrix): - return [e.toarray() for e in examples] + return np.stack([e.toarray() for e in 
examples]) return examples From 3c78a86f9702cd4c05e03ec49f185a665b51d66c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 30 Oct 2019 09:37:25 +0100 Subject: [PATCH 111/239] add more tests --- .../embedding_intent_classifier.py | 15 +++-------- tests/utils/test_train_utils.py | 26 +++++++++++++++++++ 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index ed2f13dc8507..cea0022de6fb 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -11,6 +11,7 @@ from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component from rasa.utils import train_utils +from rasa.utils.train_utils import SessionData from rasa.nlu.constants import ( MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, @@ -21,8 +22,6 @@ import tensorflow as tf # avoid warning println on contrib import - remove for tf 2 -from utils.train_utils import SessionData - tf.contrib._warning = None logger = logging.getLogger(__name__) @@ -370,9 +369,7 @@ def _create_session_data( if X_dense.size > 0: X_dict["text_features_dense"] = X_dense - return train_utils.SessionData( - X_dict, {"intent_features": Y}, {"intent_ids": label_ids} - ) + return SessionData(X_dict, {"intent_features": Y}, {"intent_ids": label_ids}) def _get_x_features( self, message: "Message" @@ -496,17 +493,13 @@ def batch_to_input(self, batch: Tuple) -> Tuple[tf.Tensor, tf.Tensor]: raise ValueError("Iterator return unexpected number of tensors.") def _squeeze_sparse_features(self, a_in: tf.Tensor) -> tf.Tensor: - # we need to squeeze sparse features as the classifier cannot handle - # sequences - # as sparse features come from a scipy.sparse.csr_matrix they have a + # as sparse features come from a scipy.sparse.csr_matrix they have an # additional dimension if len(a_in.shape) == 3: a_in = tf.squeeze(a_in, axis=1) return a_in - def _build_tf_pred_graph( - self, session_data: "train_utils.SessionData" - ) -> "tf.Tensor": + def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": num_features_sparse = self._get_num_of_features( session_data, "text_features_sparse" ) diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 7025abd62bba..1ad2f9a35cad 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -9,6 +9,7 @@ train_val_split, session_data_for_ids, get_number_of_examples, + gen_batch, ) @@ -67,6 +68,12 @@ def test_train_val_split(session_data: SessionData): assert v.shape[0] == 2 +@pytest.mark.parametrize("size", [0, 1, 5]) +def test_train_val_split_incorrect_size(session_data: SessionData, size): + with pytest.raises(ValueError): + train_val_split(session_data, size, 42, "labels") + + def test_session_data_for_ids(session_data: SessionData): filtered_session_data = session_data_for_ids(session_data, np.array([0, 1])) @@ -93,3 +100,22 @@ def test_get_number_of_examples_raises_value_error(session_data: SessionData): session_data.X["dense"] = np.random.randint(5, size=(2, 10)) with pytest.raises(ValueError): get_number_of_examples(session_data) + + +def test_gen_batch(session_data: SessionData): + iterator = gen_batch(session_data, 2, "labels", shuffle=True) + + batch = next(iterator) + assert len(batch) == 4 + assert len(batch[0]) == 2 + + batch = next(iterator) + assert len(batch) == 4 + assert len(batch[0]) == 2 + + batch = next(iterator) + assert len(batch) == 4 + 
assert len(batch[0]) == 1
+
+    with pytest.raises(StopIteration):
+        next(iterator)

From 31196cf3659d38ed537f11c6dace36591a7c8c7d Mon Sep 17 00:00:00 2001
From: Tanja Bergmann
Date: Wed, 30 Oct 2019 13:04:34 +0100
Subject: [PATCH 112/239] add test for balance session data

---
 rasa/core/policies/embedding_policy.py |  3 ++-
 rasa/utils/train_utils.py              | 28 ++++++++++++++++++++++----
 tests/utils/test_train_utils.py        | 12 +++++++++++
 3 files changed, 38 insertions(+), 5 deletions(-)

diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py
index fcd11036cb27..144bbdfcd952 100644
--- a/rasa/core/policies/embedding_policy.py
+++ b/rasa/core/policies/embedding_policy.py
@@ -335,8 +335,9 @@ def _create_tf_dial(self, a_in) -> Tuple["tf.Tensor", "tf.Tensor"]:
 
     def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]:
         """Bulid train graph using iterator."""
-
+        # iterator returns a_in, b_in, action_ids
        self.a_in, self.b_in, _ = self._iterator.get_next()
+
         if isinstance(self.featurizer, MaxHistoryTrackerFeaturizer):
             # add time dimension if max history featurizer is used
             self.b_in = self.b_in[:, tf.newaxis, :]
diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py
index e7decd594143..1df1c0b3d79a 100644
--- a/rasa/utils/train_utils.py
+++ b/rasa/utils/train_utils.py
@@ -130,6 +130,8 @@ def convert_train_test_split(
     # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc.
     # order is kept, so first session_data.X values, then session_data.Y values, and
     # finally session_data.labels values
+
+    # train datasets have an even index
     for i in range(len(session_data.X)):
         X_train[keys[i]] = combine_features(output_values[i * 2], solo_values[i])
 
@@ -142,6 +144,7 @@ def convert_train_test_split(
     ):
         labels_train[keys[i]] = combine_features(output_values[i * 2], solo_values[i])
 
+    # val datasets have an odd index
     for i in range(len(session_data.X)):
         X_val[keys[i]] = output_values[(i * 2) + 1]
 
@@ -268,11 +271,23 @@ def balance_session_data(
         if min(num_data_cycles) > 0:
             break
 
-    new_X = {k: np.concatenate(v) for k, v in new_X.items()}
-    new_Y = {k: np.concatenate(v) for k, v in new_Y.items()}
-    new_labels = {k: np.concatenate(v) for k, v in new_labels.items()}
+    return SessionData(
+        X=concatenate_data(new_X),
+        Y=concatenate_data(new_Y),
+        labels=concatenate_data(new_labels),
+    )
+
 
-    return SessionData(X=new_X, Y=new_Y, labels=new_labels)
+def concatenate_data(
+    data_dict: Dict[Text, Union[np.ndarray, List[scipy.sparse.spmatrix]]]
+) -> Dict[Text, Union[np.ndarray, List[scipy.sparse.spmatrix]]]:
+    new_dict = {}
+    for k, v in data_dict.items():
+        if isinstance(v[0], scipy.sparse.spmatrix):
+            new_dict[k] = scipy.sparse.vstack(v)
+        else:
+            new_dict[k] = np.concatenate(v)
+    return new_dict
 
 
 def get_number_of_examples(session_data: SessionData):
@@ -320,12 +335,17 @@ def gen_batch(
             sparse_to_dense(v[start:end]) for v in session_data.labels.values()
         ]
 
+        # len of batch_data is equal to the number of keys in session data
         yield tuple(batch_data)
 
 
 def sparse_to_dense(
     examples: Union[np.ndarray, List[scipy.sparse.csr_matrix]]
 ) -> np.ndarray:
+    # in case of BOW features it'll be either a 2D dense array or list of sparse
+    # matrices 1xN (because sparse vector doesn't exist)
+    # in case of sequence it'll be either a 3D dense array or a list of sparse
+    # matrices seq_lenxN
     if isinstance(examples[0], scipy.sparse.spmatrix):
         return np.stack([e.toarray() for e in examples])
     return examples
diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py
index 1ad2f9a35cad..3b07b1273bb3 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -10,6 +10,7 @@ session_data_for_ids, get_number_of_examples, gen_batch, + balance_session_data, ) @@ -119,3 +120,14 @@ def test_gen_batch(session_data: SessionData): with pytest.raises(StopIteration): next(iterator) + + +def test_balance_session_data(session_data: SessionData): + session_data.labels["labels"] = np.array([0, 0, 0, 1, 1]) + + balanced_session_data = balance_session_data(session_data, 2, False, "labels") + + labels = balanced_session_data.labels["labels"] + + assert 5 == len(labels) + assert np.all(np.array([0, 0, 1, 0, 1]) == labels) From 9f4ed631becbb2eb7835fb0b3e63b63353611941 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 30 Oct 2019 13:09:22 +0100 Subject: [PATCH 113/239] use given attribute in create session data --- rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index cea0022de6fb..7d68232f74d2 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -353,7 +353,7 @@ def _create_session_data( if x_dense is not None: X_dense.append(x_dense) - label_ids.append(label_id_dict[e.get(MESSAGE_INTENT_ATTRIBUTE)]) + label_ids.append(label_id_dict[e.get(attribute)]) X_sparse = np.array(X_sparse) X_dense = np.array(X_dense) From 7545745200199ddce01ab451007bd219f22892a1 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 31 Oct 2019 15:36:54 +0100 Subject: [PATCH 114/239] gen_batch can handle sequence --- .../embedding_intent_classifier.py | 24 +++++-- rasa/utils/train_utils.py | 69 +++++++++++++++++-- tests/utils/test_train_utils.py | 67 ++++++++++++++---- 3 files changed, 134 insertions(+), 26 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 7d68232f74d2..20de809155f2 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -359,6 +359,8 @@ def _create_session_data( X_dense = np.array(X_dense) label_ids = np.array(label_ids) + # TODO: get Y directly from message (sparse and dense) + # all_encoded_labels should be sparse for label_id_idx in label_ids: Y.append(self._encoded_all_label_ids[label_id_idx]) Y = np.array(Y) @@ -369,6 +371,8 @@ def _create_session_data( if X_dense.size > 0: X_dict["text_features_dense"] = X_dense + # TODO: session data should be dict + # TODO: include mask inside session data return SessionData(X_dict, {"intent_features": Y}, {"intent_ids": label_ids}) def _get_x_features( @@ -384,16 +388,16 @@ def _get_x_features( message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) is not None ): - x_sparse = sequence_to_sentence_features( - message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + x_sparse = message.get( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) if ( message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) is not None ): - x_dense = sequence_to_sentence_features( - message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + x_dense = message.get( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] ) return x_sparse, x_dense @@ -427,8 +431,15 @@ def _create_tf_embed_fnn( def _build_tf_train_graph(self) -> Tuple["tf.Tensor", 
"tf.Tensor"]: # batch = 1 or 2 a_in values, b_in, intent_ids batch = self._iterator.get_next() + + # TODO: convert seq to sentence (sum and mean) + # TODO: convert batch into session data (same keys, but with tensors) + self.a_in, self.b_in = self.batch_to_input(batch) + # TODO _encoded_all_label_ids is sparse add dense layer to convert it to dense + # https://medium.com/dailymotion/how-to-design-deep-learning-models-with-sparse-inputs-in-tensorflow-keras-fd5e754abec1 + # https: // github.com / tensorflow / tensorflow / issues / 9210 # issuecomment-497889961 all_label_ids = tf.constant( self._encoded_all_label_ids, dtype=tf.float32, name="all_label_ids" ) @@ -516,6 +527,9 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": name="b", ) + # TODO check this idea: + self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) + self.message_embed = self._create_tf_embed_fnn( self.a_in, self.hidden_layer_sizes["a"], @@ -573,6 +587,8 @@ def preprocess_train_data(self, training_data: "TrainingData"): ) self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} + # TODO: sparse + dense, maybe dict? + # TODO: can we use somehing else self._encoded_all_label_ids = self._create_encoded_label_ids( training_data, label_id_dict, diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 1df1c0b3d79a..a9943745df7d 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -36,6 +36,7 @@ # namedtuple for all tf session related data +# TODO: use simple dict, no X, Y, lables class SessionData(NamedTuple): X: Dict[Text, np.ndarray] Y: Dict[Text, np.ndarray] @@ -50,6 +51,10 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto return None +# TODO: add method to converst scipy.sparse matrix to indices, values, shapes +# TODO: add method to convert indices, vales, shapes to tf.SparseTensor +# TODO: add wrapper around all denses layers to use https://medium.com/dailymotion/how-to-design-deep-learning-models-with-sparse-inputs-in-tensorflow-keras-fd5e754abec1 + # noinspection PyPep8Naming def train_val_split( session_data: "SessionData", @@ -312,6 +317,9 @@ def gen_batch( ) -> Generator[Tuple, None, None]: """Generate batches.""" + # TODO: should keep everything sequence + # https://github.com/tensorflow/tensorflow/issues/16689 + if shuffle: session_data = shuffle_session_data(session_data) @@ -327,18 +335,64 @@ def gen_batch( start = batch_num * batch_size end = start + batch_size - batch_data = [sparse_to_dense(v[start:end]) for v in session_data.X.values()] - batch_data = batch_data + [ - sparse_to_dense(v[start:end]) for v in session_data.Y.values() - ] - batch_data = batch_data + [ - sparse_to_dense(v[start:end]) for v in session_data.labels.values() - ] + batch_data = [] + for v in session_data.X.values(): + if isinstance(v[0], scipy.sparse.spmatrix): + batch_data.append(get_sparse_values(v[start:end])) + else: + batch_data.append(pad_data(v[start:end])) + + for v in session_data.Y.values(): + if isinstance(v[0], scipy.sparse.spmatrix): + batch_data.append(get_sparse_values(v[start:end])) + else: + batch_data.append(pad_data(v[start:end])) + + for v in session_data.labels.values(): + if isinstance(v[0], scipy.sparse.spmatrix): + batch_data.append(get_sparse_values(v[start:end])) + else: + batch_data.append(pad_data(v[start:end])) # len of batch_data is equal to the number of keys in session data yield tuple(batch_data) +def get_sparse_values(data: np.ndarray) -> np.ndarray: + converted = [] + + # 
TODO padding + + for d in data: + coo = d.tocoo() + indices = np.mat([coo.row, coo.col]).transpose() + converted.append((indices, coo.data, coo.shape)) + + return np.array(converted) + + +def pad_data(data: np.ndarray) -> np.ndarray: + if data[0].ndim == 0: + return data + + data_size = len(data) + feature_len = max([x.shape[-1] for x in data]) + + if data[0].ndim == 1: + data_padded = np.zeros([data_size, feature_len], dtype=data[0].dtype) + for i in range(data_size): + data_padded[i, : data[i].shape[0]] = data[i] + else: + max_seq_len = max([x.shape[0] for x in data]) + data_padded = np.zeros( + [data_size, max_seq_len, feature_len], dtype=data[0].dtype + ) + for i in range(data_size): + data_padded[i, : data[i].shape[0], :] = data[i] + + return data_padded + + def sparse_to_dense( examples: Union[np.ndarray, List[scipy.sparse.csr_matrix]] ) -> np.ndarray: @@ -362,6 +416,7 @@ def create_tf_dataset( """Create tf dataset.""" # set batch and sequence length to None + # TODO: can we remove the shape? shapes, types = _get_shape_and_types(session_data) return tf.data.Dataset.from_generator( diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 3b07b1273bb3..1993df1f01d7 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -18,11 +18,48 @@ async def session_data() -> SessionData: return SessionData( X={ - "dense": np.random.rand(5, 10), - "sparse": scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), + "dense": np.array( + [ + np.random.rand(5, 14), + np.random.rand(2, 14), + np.random.rand(3, 14), + np.random.rand(1, 14), + np.random.rand(3, 14), + ] + ), + "sparse": np.array( + [ + scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + ] + ), + }, + Y={ + "Y": np.array( + [ + np.random.randint(2, size=(5, 10)), + np.random.randint(2, size=(2, 10)), + np.random.randint(2, size=(3, 10)), + np.random.randint(2, size=(1, 10)), + np.random.randint(2, size=(3, 10)), + ] + ) + }, + labels={ + "intent_ids": np.array([0, 1, 0, 1, 1]), + "tag_ids": np.array( + [ + np.array([0, 1, 1, 0, 2]), + np.array([2, 0]), + np.array([0, 1, 1]), + np.array([0, 1]), + np.array([0, 0, 0]), + ] + ), }, - Y={"Y": np.random.randint(2, size=(5, 10))}, - labels={"labels": np.array([0, 1, 0, 0, 0])}, ) @@ -42,12 +79,12 @@ def test_shuffle_session_data(session_data: SessionData): def test_split_session_data_by_label(session_data: SessionData): split_session_data = split_session_data_by_label( - session_data, "labels", np.array([0, 1]) + session_data, "intent_ids", np.array([0, 1]) ) assert len(split_session_data) == 2 for s in split_session_data: - assert len(set(s.labels["labels"])) == 1 + assert len(set(s.labels["intent_ids"])) == 1 def test_split_session_data_by_incorrect_label(session_data: SessionData): @@ -59,7 +96,7 @@ def test_split_session_data_by_incorrect_label(session_data: SessionData): def test_train_val_split(session_data: SessionData): train_session_data, val_session_data = train_val_split( - session_data, 2, 42, "labels" + session_data, 2, 42, "intent_ids" ) for v in train_session_data.X.values(): @@ -72,7 +109,7 @@ def test_train_val_split(session_data: SessionData): @pytest.mark.parametrize("size", [0, 1, 5]) def test_train_val_split_incorrect_size(session_data: SessionData, 
size): with pytest.raises(ValueError): - train_val_split(session_data, size, 42, "labels") + train_val_split(session_data, size, 42, "intent_ids") def test_session_data_for_ids(session_data: SessionData): @@ -104,18 +141,18 @@ def test_get_number_of_examples_raises_value_error(session_data: SessionData): def test_gen_batch(session_data: SessionData): - iterator = gen_batch(session_data, 2, "labels", shuffle=True) + iterator = gen_batch(session_data, 2, "intent_ids", shuffle=True) batch = next(iterator) - assert len(batch) == 4 + assert len(batch) == 5 assert len(batch[0]) == 2 batch = next(iterator) - assert len(batch) == 4 + assert len(batch) == 5 assert len(batch[0]) == 2 batch = next(iterator) - assert len(batch) == 4 + assert len(batch) == 5 assert len(batch[0]) == 1 with pytest.raises(StopIteration): @@ -123,11 +160,11 @@ def test_gen_batch(session_data: SessionData): def test_balance_session_data(session_data: SessionData): - session_data.labels["labels"] = np.array([0, 0, 0, 1, 1]) + session_data.labels["intent_ids"] = np.array([0, 0, 0, 1, 1]) - balanced_session_data = balance_session_data(session_data, 2, False, "labels") + balanced_session_data = balance_session_data(session_data, 2, False, "intent_ids") - labels = balanced_session_data.labels["labels"] + labels = balanced_session_data.labels["intent_ids"] assert 5 == len(labels) assert np.all(np.array([0, 0, 1, 0, 1]) == labels) From d3b48ea3180bd50d0140b05e4e980b7f76c2b6e6 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 31 Oct 2019 16:27:53 +0100 Subject: [PATCH 115/239] session data is simple dict --- rasa/utils/train_utils.py | 145 +++++++++----------------------- tests/utils/test_train_utils.py | 128 +++++++++++++--------------- 2 files changed, 98 insertions(+), 175 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index a9943745df7d..3bc35d181757 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -37,10 +37,7 @@ # namedtuple for all tf session related data # TODO: use simple dict, no X, Y, lables -class SessionData(NamedTuple): - X: Dict[Text, np.ndarray] - Y: Dict[Text, np.ndarray] - labels: Dict[Text, np.ndarray] +SessionData = Dict[Text, np.ndarray] def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]: @@ -63,42 +60,35 @@ def train_val_split( label_key: Text, ) -> Tuple["SessionData", "SessionData"]: """Create random hold out validation set using stratified split.""" - if label_key not in session_data.labels: - raise ValueError(f"Key '{label_key}' not in SessionData.labels.") + if label_key not in session_data: + raise ValueError(f"Key '{label_key}' not in SessionData.") label_counts = dict( - zip(*np.unique(session_data.labels[label_key], return_counts=True, axis=0)) + zip(*np.unique(session_data[label_key], return_counts=True, axis=0)) ) check_train_test_sizes(evaluate_on_num_examples, label_counts, session_data) - counts = np.array([label_counts[label] for label in session_data.labels[label_key]]) + counts = np.array([label_counts[label] for label in session_data[label_key]]) multi_values = [] - [multi_values.append(v[counts > 1]) for v in session_data.X.values()] - [multi_values.append(v[counts > 1]) for v in session_data.Y.values()] - [multi_values.append(v[counts > 1]) for v in session_data.labels.values()] + [multi_values.append(v[counts > 1]) for v in session_data.values()] solo_values = [] - [solo_values.append(v[counts == 1]) for v in session_data.X.values()] - [solo_values.append(v[counts == 1]) for v in 
session_data.Y.values()] - [solo_values.append(v[counts == 1]) for v in session_data.labels.values()] + [solo_values.append(v[counts == 1]) for v in session_data.values()] output_values = train_test_split( *multi_values, test_size=evaluate_on_num_examples, random_state=random_seed, - stratify=session_data.labels[label_key][counts > 1], + stratify=session_data[label_key][counts > 1], ) - X_train, X_val, Y_train, Y_val, labels_train, labels_val = convert_train_test_split( + session_data_train, session_data_val = convert_train_test_split( output_values, session_data, solo_values ) - return ( - SessionData(X_train, Y_train, labels_train), - SessionData(X_val, Y_val, labels_val), - ) + return (session_data_train, session_data_val) def check_train_test_sizes( @@ -123,46 +113,26 @@ def check_train_test_sizes( def convert_train_test_split( output_values: List[Any], session_data: SessionData, solo_values: List[Any] ): - keys = [k for d in session_data for k, v in d.items()] + keys = [k for k, v in session_data.items()] - X_train = {} - Y_train = {} - labels_train = {} - X_val = {} - Y_val = {} - labels_val = {} + session_data_train = {} + session_data_val = {} # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc. # order is kept, so first session_data.X values, then session_data.Y values, and # finally session_data.labels values # train datasets have an even index - for i in range(len(session_data.X)): - X_train[keys[i]] = combine_features(output_values[i * 2], solo_values[i]) - - for i in range(len(session_data.X), len(session_data.X) + len(session_data.Y)): - Y_train[keys[i]] = combine_features(output_values[i * 2], solo_values[i]) - - for i in range( - len(session_data.X) + len(session_data.Y), - len(session_data.X) + len(session_data.Y) + len(session_data.labels), - ): - labels_train[keys[i]] = combine_features(output_values[i * 2], solo_values[i]) + for i in range(len(session_data)): + session_data_train[keys[i]] = combine_features( + output_values[i * 2], solo_values[i] + ) # val datasets have an odd index - for i in range(len(session_data.X)): - X_val[keys[i]] = output_values[(i * 2) + 1] + for i in range(len(session_data)): + session_data_val[keys[i]] = output_values[(i * 2) + 1] - for i in range(len(session_data.X), len(session_data.X) + len(session_data.Y)): - Y_val[keys[i]] = output_values[(i * 2) + 1] - - for i in range( - len(session_data.X) + len(session_data.Y), - len(session_data.X) + len(session_data.Y) + len(session_data.labels), - ): - labels_val[keys[i]] = output_values[(i * 2) + 1] - - return X_train, X_val, Y_train, Y_val, labels_train, labels_val + return session_data_train, session_data_val def combine_features( @@ -190,23 +160,19 @@ def shuffle_session_data(session_data: "SessionData") -> "SessionData": def session_data_for_ids(session_data: SessionData, ids: np.ndarray): """Filter session data by ids.""" - X = {k: v[ids] for k, v in session_data.X.items()} - Y = {k: v[ids] for k, v in session_data.Y.items()} - labels = {k: v[ids] for k, v in session_data.labels.items()} - - return SessionData(X, Y, labels) + return {k: v[ids] for k, v in session_data.items()} def split_session_data_by_label( session_data: "SessionData", label_key: Text, unique_label_ids: "np.ndarray" ) -> List["SessionData"]: """Reorganize session data into a list of session data with the same labels.""" - if label_key not in session_data.labels: + if label_key not in session_data: raise ValueError(f"Key '{label_key}' not in SessionData.labels.") label_data = [] for label_id in 
unique_label_ids: - ids = session_data.labels[label_key] == label_id + ids = session_data[label_key] == label_id label_data.append(session_data_for_ids(session_data, ids)) return label_data @@ -221,13 +187,11 @@ def balance_session_data( by repeating them. Mimics stratified batching, but also takes into account that more populated classes should appear more often. """ - if label_key not in session_data.labels: + if label_key not in session_data: raise ValueError(f"Key '{label_key}' not in SessionData.labels.") - num_examples = get_number_of_examples(session_data) - unique_label_ids, counts_label_ids = np.unique( - session_data.labels[label_key], return_counts=True, axis=0 + session_data[label_key], return_counts=True, axis=0 ) num_label_ids = len(unique_label_ids) @@ -238,9 +202,7 @@ def balance_session_data( num_data_cycles = [0] * num_label_ids skipped = [False] * num_label_ids - new_X = defaultdict(list) - new_Y = defaultdict(list) - new_labels = defaultdict(list) + new_session_data = defaultdict(list) while min(num_data_cycles) == 0: if shuffle: @@ -255,20 +217,15 @@ def balance_session_data( else: skipped[index] = False - index_batch_size = ( - int(counts_label_ids[index] / num_examples * batch_size) + 1 - ) - - for k, v in label_data[index].X.items(): - new_X[k].append(v[data_idx[index] : data_idx[index] + index_batch_size]) - for k, v in label_data[index].Y.items(): - new_Y[k].append(v[data_idx[index] : data_idx[index] + index_batch_size]) - for k, v in label_data[index].labels.items(): - new_labels[k].append( - v[data_idx[index] : data_idx[index] + index_batch_size] - ) + for k, v in label_data[index].items(): + if v[0].ndim == 0: + new_session_data[k].append( + v[data_idx[index] : data_idx[index] + 1][0] + ) + else: + new_session_data[k].append(v[data_idx[index] : data_idx[index] + 1]) - data_idx[index] += index_batch_size + data_idx[index] += 1 if data_idx[index] >= counts_label_ids[index]: num_data_cycles[index] += 1 data_idx[index] = 0 @@ -276,11 +233,9 @@ def balance_session_data( if min(num_data_cycles) > 0: break - return SessionData( - X=concatenate_data(new_X), - Y=concatenate_data(new_Y), - labels=concatenate_data(new_labels), - ) + new_session_data = {k: np.array(v) for k, v in new_session_data.items()} + + return new_session_data def concatenate_data( @@ -296,12 +251,12 @@ def concatenate_data( def get_number_of_examples(session_data: SessionData): - example_lengths = [v.shape[0] for v in session_data.X.values()] + example_lengths = [v.shape[0] for v in session_data.values()] # check if number of examples is the same for all X if not all(length == example_lengths[0] for length in example_lengths): raise ValueError( - f"Number of examples differs for X ({session_data.X.keys()}). There should " + f"Number of examples differs for X ({session_data.keys()}). There should " f"be the same." 
) @@ -336,19 +291,7 @@ def gen_batch( end = start + batch_size batch_data = [] - for v in session_data.X.values(): - if isinstance(v[0], scipy.sparse.spmatrix): - batch_data.append(get_sparse_values(v[start:end])) - else: - batch_data.append(pad_data(v[start:end])) - - for v in session_data.Y.values(): - if isinstance(v[0], scipy.sparse.spmatrix): - batch_data.append(get_sparse_values(v[start:end])) - else: - batch_data.append(pad_data(v[start:end])) - - for v in session_data.labels.values(): + for v in session_data.values(): if isinstance(v[0], scipy.sparse.spmatrix): batch_data.append(get_sparse_values(v[start:end])) else: @@ -441,13 +384,7 @@ def append_shape(v: np.ndarray): else: shapes.append((None, None, v[0].shape[-1])) - for v in session_data.X.values(): - append_shape(v) - types.append(np.float32) - for v in session_data.Y.values(): - append_shape(v) - types.append(np.float32) - for v in session_data.labels.values(): + for v in session_data.values(): append_shape(v) types.append(v.dtype) diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 1993df1f01d7..ca55356edbe9 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -16,65 +16,51 @@ @pytest.fixture async def session_data() -> SessionData: - return SessionData( - X={ - "dense": np.array( - [ - np.random.rand(5, 14), - np.random.rand(2, 14), - np.random.rand(3, 14), - np.random.rand(1, 14), - np.random.rand(3, 14), - ] - ), - "sparse": np.array( - [ - scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), - ] - ), - }, - Y={ - "Y": np.array( - [ - np.random.randint(2, size=(5, 10)), - np.random.randint(2, size=(2, 10)), - np.random.randint(2, size=(3, 10)), - np.random.randint(2, size=(1, 10)), - np.random.randint(2, size=(3, 10)), - ] - ) - }, - labels={ - "intent_ids": np.array([0, 1, 0, 1, 1]), - "tag_ids": np.array( - [ - np.array([0, 1, 1, 0, 2]), - np.array([2, 0]), - np.array([0, 1, 1]), - np.array([0, 1]), - np.array([0, 0, 0]), - ] - ), - }, - ) + return { + "dense": np.array( + [ + np.random.rand(5, 14), + np.random.rand(2, 14), + np.random.rand(3, 14), + np.random.rand(1, 14), + np.random.rand(3, 14), + ] + ), + "sparse": np.array( + [ + scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + ] + ), + "Y": np.array( + [ + np.random.randint(2, size=(5, 10)), + np.random.randint(2, size=(2, 10)), + np.random.randint(2, size=(3, 10)), + np.random.randint(2, size=(1, 10)), + np.random.randint(2, size=(3, 10)), + ] + ), + "intent_ids": np.array([0, 1, 0, 1, 1]), + "tag_ids": np.array( + [ + np.array([0, 1, 1, 0, 2]), + np.array([2, 0]), + np.array([0, 1, 1]), + np.array([0, 1]), + np.array([0, 0, 0]), + ] + ), + } def test_shuffle_session_data(session_data: SessionData): shuffeled_session_data = shuffle_session_data(session_data) - assert np.array(shuffeled_session_data.X.values()) != np.array( - session_data.X.values() - ) - assert np.array(shuffeled_session_data.Y.values()) != np.array( - session_data.Y.values() - ) - assert 
np.array(shuffeled_session_data.labels.values()) != np.array( - session_data.labels.values() - ) + assert np.array(shuffeled_session_data.values()) != np.array(session_data.values()) def test_split_session_data_by_label(session_data: SessionData): @@ -84,7 +70,7 @@ def test_split_session_data_by_label(session_data: SessionData): assert len(split_session_data) == 2 for s in split_session_data: - assert len(set(s.labels["intent_ids"])) == 1 + assert len(set(s["intent_ids"])) == 1 def test_split_session_data_by_incorrect_label(session_data: SessionData): @@ -99,10 +85,10 @@ def test_train_val_split(session_data: SessionData): session_data, 2, 42, "intent_ids" ) - for v in train_session_data.X.values(): + for v in train_session_data.values(): assert v.shape[0] == 3 - for v in val_session_data.X.values(): + for v in val_session_data.values(): assert v.shape[0] == 2 @@ -115,17 +101,13 @@ def test_train_val_split_incorrect_size(session_data: SessionData, size): def test_session_data_for_ids(session_data: SessionData): filtered_session_data = session_data_for_ids(session_data, np.array([0, 1])) - for v in filtered_session_data.X.values(): + for v in filtered_session_data.values(): assert v.shape[0] == 2 - k = list(session_data.X.keys())[0] + k = list(session_data.keys())[0] - assert np.all( - np.array(filtered_session_data.X[k][0]) == np.array(session_data.X[k][0]) - ) - assert np.all( - np.array(filtered_session_data.X[k][1]) == np.array(session_data.X[k][1]) - ) + assert np.all(np.array(filtered_session_data[k][0]) == np.array(session_data[k][0])) + assert np.all(np.array(filtered_session_data[k][1]) == np.array(session_data[k][1])) def test_get_number_of_examples(session_data: SessionData): @@ -135,7 +117,7 @@ def test_get_number_of_examples(session_data: SessionData): def test_get_number_of_examples_raises_value_error(session_data: SessionData): - session_data.X["dense"] = np.random.randint(5, size=(2, 10)) + session_data["dense"] = np.random.randint(5, size=(2, 10)) with pytest.raises(ValueError): get_number_of_examples(session_data) @@ -159,12 +141,16 @@ def test_gen_batch(session_data: SessionData): next(iterator) -def test_balance_session_data(session_data: SessionData): - session_data.labels["intent_ids"] = np.array([0, 0, 0, 1, 1]) +@pytest.mark.parametrize( + "intent_ids, expected_labels", + [([0, 0, 0, 1, 1], [0, 1, 0, 1, 0]), ([0, 0, 0, 0, 1], [0, 1, 0, 0, 1, 0])], +) +def test_balance_session_data(session_data: SessionData, intent_ids, expected_labels): + session_data["intent_ids"] = np.array(intent_ids) balanced_session_data = balance_session_data(session_data, 2, False, "intent_ids") - labels = balanced_session_data.labels["intent_ids"] + labels = balanced_session_data["intent_ids"] - assert 5 == len(labels) - assert np.all(np.array([0, 0, 1, 0, 1]) == labels) + assert len(expected_labels) == len(labels) + assert np.all(expected_labels == labels) From a65f39719e9646ba655d17a8864cbb450d25e818 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 1 Nov 2019 17:41:05 +0100 Subject: [PATCH 116/239] use sparse tensors --- rasa/core/policies/embedding_policy.py | 6 +- .../embedding_intent_classifier.py | 279 +++++++++--------- rasa/utils/train_utils.py | 170 ++++++++--- tests/utils/test_train_utils.py | 7 +- 4 files changed, 278 insertions(+), 184 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 144bbdfcd952..942db2840e4e 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ 
-270,11 +270,7 @@ def _create_session_data( label_ids = None Y = None - return train_utils.SessionData( - X={"dialogue_features": data_X}, - Y={"bot_features": Y}, - labels={"action_ids": label_ids}, - ) + return {"dialogue_features": data_X, "bot_features": Y, "action_ids": label_ids} def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": """Create embedding bot vector.""" diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 20de809155f2..50b9d915c771 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -1,13 +1,14 @@ import logging +from collections import defaultdict, OrderedDict + import numpy as np import os import pickle import scipy.sparse import typing -from typing import Any, Dict, List, Optional, Text, Tuple, Union +from typing import Any, Dict, List, Optional, Text, Tuple import warnings -from rasa.nlu.featurizers.featurzier import sequence_to_sentence_features from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component from rasa.utils import train_utils @@ -263,36 +264,60 @@ def _find_example_for_label( @staticmethod def _check_labels_features_exist( - labels_example: List[Tuple[int, "Message"]], attribute_feature_name: Text + labels_example: List[Tuple[int, "Message"]], attribute: Text ) -> bool: """Check if all labels have features set""" for (label_idx, label_example) in labels_example: - if label_example.get(attribute_feature_name) is None: + if label_example.get( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute] + ) is None and label_example.get( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] + ): return False return True - @staticmethod def _extract_labels_precomputed_features( - label_examples: List[Tuple[int, "Message"]], attribute_feature_name: Text - ) -> np.ndarray: + self, label_examples: List[Tuple[int, "Message"]] + ) -> Dict[int, Dict[Text, Any]]: # Collect precomputed encodings - encoded_id_labels = [ - ( - label_idx, - sequence_to_sentence_features(label_example.get(attribute_feature_name)) - .toarray() - .squeeze(), + sparse_features = [] + dense_features = [] + + for i, e in label_examples: + self._extract_and_add_features( + e, MESSAGE_INTENT_ATTRIBUTE, sparse_features, dense_features ) - for (label_idx, label_example) in label_examples - ] - # Sort the list of tuples based on label_idx - encoded_id_labels = sorted(encoded_id_labels, key=lambda x: x[0]) + encoded_id_labels = defaultdict(dict) + for i, s in zip(label_examples, sparse_features): + indices, data, shape = train_utils.scipy_matrix_to_values(np.array([s])) + sparse_tensor = train_utils.values_to_sparse_tensor(indices, data, shape) + encoded_id_labels[i[0]]["intent_features_sparse"] = sparse_tensor + for i, d in zip(label_examples, dense_features): + encoded_id_labels[i[0]]["intent_features_dense"] = tf.constant(d) + + # Sort the dict based on label_idx + encoded_id_labels = OrderedDict(sorted(encoded_id_labels.items())) - encoded_all_labels = [encoding for (index, encoding) in encoded_id_labels] + return encoded_id_labels + + def _extract_and_add_features( + self, + message: "Message", + attribute: Text, + sparse_features: List[scipy.sparse.spmatrix], + dense_features: List[np.ndarray], + ): + if message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) is not None: + sparse_features.append( + message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) + ) - return np.array(encoded_all_labels) + if 
message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) is not None: + dense_features.append( + message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) + ) def _compute_default_label_features( self, labels_example: List[Tuple[int, "Message"]] @@ -306,7 +331,6 @@ def _create_encoded_label_ids( training_data: "TrainingData", label_id_dict: Dict[Text, int], attribute: Text, - attribute_feature_name: Text, ) -> np.ndarray: """Create matrix with label_ids encoded in rows as bag of words. If the features are already computed, fetch them from the message object else compute a one hot encoding for the label as the feature vector @@ -322,9 +346,9 @@ def _create_encoded_label_ids( labels_example.append((idx, label_example)) # Collect features, precomputed if they exist, else compute on the fly - if self._check_labels_features_exist(labels_example, attribute_feature_name): + if self._check_labels_features_exist(labels_example, attribute): encoded_id_labels = self._extract_labels_precomputed_features( - labels_example, attribute_feature_name + labels_example ) else: encoded_id_labels = self._compute_default_label_features(labels_example) @@ -334,73 +358,46 @@ def _create_encoded_label_ids( # noinspection PyPep8Naming def _create_session_data( self, - training_data: "TrainingData", - label_id_dict: Dict[Text, int], - attribute: Text, + training_data: List["Message"], + label_id_dict: Optional[Dict[Text, int]] = None, + attribute: Optional[Text] = None, ) -> "SessionData": """Prepare data for training and create a SessionData object""" X_sparse = [] X_dense = [] - Y = [] + Y_sparse = [] + Y_dense = [] label_ids = [] - for e in training_data.intent_examples: - if e.get(attribute): - x_sparse, x_dense = self._get_x_features(e) - - if x_sparse is not None: - X_sparse.append(x_sparse) - if x_dense is not None: - X_dense.append(x_dense) + for e in training_data: + self._extract_and_add_features(e, MESSAGE_TEXT_ATTRIBUTE, X_sparse, X_dense) + self._extract_and_add_features( + e, MESSAGE_INTENT_ATTRIBUTE, Y_sparse, Y_dense + ) + if e.get(attribute): label_ids.append(label_id_dict[e.get(attribute)]) X_sparse = np.array(X_sparse) X_dense = np.array(X_dense) + Y_sparse = np.array(Y_sparse) + Y_dense = np.array(Y_dense) label_ids = np.array(label_ids) - # TODO: get Y directly from message (sparse and dense) - # all_encoded_labels should be sparse - for label_id_idx in label_ids: - Y.append(self._encoded_all_label_ids[label_id_idx]) - Y = np.array(Y) - - X_dict = {} - if X_sparse.size > 0: - X_dict["text_features_sparse"] = X_sparse - if X_dense.size > 0: - X_dict["text_features_dense"] = X_dense + session_data = {} + self._add_to_session_data(session_data, "text_features_sparse", X_sparse) + self._add_to_session_data(session_data, "text_features_dense", X_dense) + self._add_to_session_data(session_data, "intent_features_sparse", Y_sparse) + self._add_to_session_data(session_data, "intent_features_dense", Y_dense) + session_data["intent_ids"] = label_ids - # TODO: session data should be dict - # TODO: include mask inside session data - return SessionData(X_dict, {"intent_features": Y}, {"intent_ids": label_ids}) - - def _get_x_features( - self, message: "Message" - ) -> Tuple[ - Optional[Union[np.ndarray, scipy.sparse.spmatrix]], - Optional[Union[np.ndarray, scipy.sparse.spmatrix]], - ]: - x_sparse = None - x_dense = None - - if ( - message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - is not None - ): - x_sparse = message.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] 
- ) - - if ( - message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - is not None - ): - x_dense = message.get( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) + return session_data - return x_sparse, x_dense + def _add_to_session_data( + self, session_data: SessionData, key: Text, data: np.ndarray + ): + if data.size > 0: + session_data[key] = data # tf helpers: def _create_tf_embed_fnn( @@ -428,18 +425,26 @@ def _create_tf_embed_fnn( layer_name_suffix=embed_name, ) - def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: + def _build_tf_train_graph( + self, session_data: SessionData + ) -> Tuple["tf.Tensor", "tf.Tensor"]: # batch = 1 or 2 a_in values, b_in, intent_ids batch = self._iterator.get_next() - # TODO: convert seq to sentence (sum and mean) - # TODO: convert batch into session data (same keys, but with tensors) + batch = train_utils.batch_to_session_data(batch, session_data) + + # TODO shape missmatch + + self.a_in = self.combine_sparse_dense_features(batch, "text_features_") + self.b_in = self.combine_sparse_dense_features(batch, "intent_features_") - self.a_in, self.b_in = self.batch_to_input(batch) + self._encoded_all_label_ids = tf.stack( + [ + self.combine_sparse_dense_features(v, "intent_features_") + for k, v in self._encoded_all_label_ids.items() + ] + ) - # TODO _encoded_all_label_ids is sparse add dense layer to convert it to dense - # https://medium.com/dailymotion/how-to-design-deep-learning-models-with-sparse-inputs-in-tensorflow-keras-fd5e754abec1 - # https: // github.com / tensorflow / tensorflow / issues / 9210 # issuecomment-497889961 all_label_ids = tf.constant( self._encoded_all_label_ids, dtype=tf.float32, name="all_label_ids" ) @@ -480,6 +485,34 @@ def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: self.scale_loss, ) + def combine_sparse_dense_features( + self, batch: Dict[Text, tf.Tensor], key_prefix: Text + ): + key_sparse = f"{key_prefix}sparse" + key_dense = f"{key_prefix}dense" + + # apply mean/sum to convert sequence to sentence features + + if key_dense in batch and key_sparse in batch: + _sparse = tf.math.reduce_sum( + train_utils.tf_dense_layer( + batch[key_sparse], batch[key_sparse].shape[-1], "a", self.C2 + ) + ) + _dense = tf.math.reduce_mean(batch[key_dense], axis=1) + + return tf.concat([_sparse, _dense]) + + if key_dense in batch: + return tf.math.reduce_mean(batch[key_dense], axis=1) + + if key_sparse in batch: + return tf.math.reduce_sum( + train_utils.tf_dense_layer( + batch[key_sparse], batch[key_sparse].shape[-1], "a", self.C2 + ) + ) + def batch_to_input(self, batch: Tuple) -> Tuple[tf.Tensor, tf.Tensor]: """Convert batch input into correct tensors. 
@@ -511,24 +544,20 @@ def _squeeze_sparse_features(self, a_in: tf.Tensor) -> tf.Tensor: return a_in def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": - num_features_sparse = self._get_num_of_features( - session_data, "text_features_sparse" - ) - num_features_dense = self._get_num_of_features( - session_data, "text_features_dense" + num_text_features = self._get_num_of_features(session_data, "text_features_") + num_intent_features = self._get_num_of_features( + session_data, "intent_features_" ) self.a_in = tf.placeholder( - tf.float32, (None, num_features_sparse + num_features_dense), name="a" + tf.float32, (None, None, num_text_features), name="a" ) self.b_in = tf.placeholder( - tf.float32, - (None, None, session_data.Y["intent_features"][0].shape[-1]), - name="b", + tf.float32, (None, None, num_intent_features), name="b" ) # TODO check this idea: - self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) + # self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) self.message_embed = self._create_tf_embed_fnn( self.a_in, @@ -556,22 +585,25 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": return train_utils.confidence_from_sim(self.sim_all, self.similarity_type) - def _get_num_of_features(self, session_data: "SessionData", x_key: Text) -> int: - return session_data.X[x_key][0].shape[-1] if x_key in session_data.X else 0 + def _get_num_of_features( + self, session_data: "SessionData", key_prefix: Text + ) -> int: + num_features = 0 + for k, v in session_data.items(): + if k.startswith(key_prefix): + num_features += v[0].shape[-1] + return num_features def check_input_dimension_consistency(self, session_data: "SessionData"): if self.share_hidden_layers: - num_features_sparse = self._get_num_of_features( - session_data, "text_features_sparse" + num_text_features = self._get_num_of_features( + session_data, "text_features_" ) - num_features_dense = self._get_num_of_features( - session_data, "text_features_dense" + num_intent_features = self._get_num_of_features( + session_data, "intent_features_" ) - if ( - num_features_sparse + num_features_dense - != session_data.Y["intent_features"][0].shape[-1] - ): + if num_text_features != num_intent_features: raise ValueError( "If embeddings are shared " "text features and label features " @@ -587,15 +619,9 @@ def preprocess_train_data(self, training_data: "TrainingData"): ) self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} - # TODO: sparse + dense, maybe dict? 
- # TODO: can we use somehing else + self._encoded_all_label_ids = self._create_encoded_label_ids( - training_data, - label_id_dict, - attribute=MESSAGE_INTENT_ATTRIBUTE, - attribute_feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ - MESSAGE_INTENT_ATTRIBUTE - ], + training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE ) # check if number of negatives is less than number of label_ids @@ -603,13 +629,15 @@ def preprocess_train_data(self, training_data: "TrainingData"): "Check if num_neg {} is smaller than " "number of label_ids {}, " "else set num_neg to the number of label_ids - 1" - "".format(self.num_neg, self._encoded_all_label_ids.shape[0]) + "".format(self.num_neg, len(self._encoded_all_label_ids)) ) # noinspection PyAttributeOutsideInit - self.num_neg = min(self.num_neg, self._encoded_all_label_ids.shape[0] - 1) + self.num_neg = min(self.num_neg, len(self._encoded_all_label_ids) - 1) session_data = self._create_session_data( - training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE + training_data.intent_examples, + label_id_dict, + attribute=MESSAGE_INTENT_ATTRIBUTE, ) self.check_input_dimension_consistency(session_data) @@ -617,7 +645,7 @@ def preprocess_train_data(self, training_data: "TrainingData"): return session_data def _check_enough_labels(self, session_data: "SessionData") -> bool: - return len(np.unique(session_data.labels["intent_ids"])) >= 2 + return len(np.unique(session_data["intent_ids"])) >= 2 def train( self, @@ -676,7 +704,7 @@ def train( self._is_training = tf.placeholder_with_default(False, shape=()) - loss, acc = self._build_tf_train_graph() + loss, acc = self._build_tf_train_graph(session_data) # define which optimizer to use self._train_op = tf.train.AdamOptimizer().minimize(loss) @@ -732,7 +760,8 @@ def predict_label( else: # get features (bag of words/embeddings) for a message # noinspection PyPep8Naming - X = self._extract_features(message) + X = self._create_session_data([message]) + # TODO convert input # load tf graph and session label_ids, message_sim = self._calculate_message_sim(X) @@ -752,26 +781,6 @@ def predict_label( ] return label, label_ranking - def _extract_features(self, message: "Message") -> np.ndarray: - x_sparse, x_dense = self._get_x_features(message) - - if x_sparse is not None: - x_sparse = x_sparse.toarray().squeeze().reshape(1, -1) - - if x_dense is not None: - x_dense = x_dense.reshape(1, -1) - - if x_sparse is not None and x_dense is not None: - return np.concatenate((x_sparse, x_dense), axis=-1) - - if x_sparse is None and x_dense is not None: - return x_dense - - if x_sparse is not None and x_dense is None: - return x_sparse - - raise ValueError("No features found for X.") - def process(self, message: "Message", **kwargs: Any) -> None: """Return the most likely label and its similarity to the input.""" diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 3bc35d181757..c04a73d65f32 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -36,7 +36,6 @@ # namedtuple for all tf session related data -# TODO: use simple dict, no X, Y, lables SessionData = Dict[Text, np.ndarray] @@ -48,10 +47,9 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto return None -# TODO: add method to converst scipy.sparse matrix to indices, values, shapes -# TODO: add method to convert indices, vales, shapes to tf.SparseTensor # TODO: add wrapper around all denses layers to use 
https://medium.com/dailymotion/how-to-design-deep-learning-models-with-sparse-inputs-in-tensorflow-keras-fd5e754abec1 + # noinspection PyPep8Naming def train_val_split( session_data: "SessionData", @@ -218,12 +216,7 @@ def balance_session_data( skipped[index] = False for k, v in label_data[index].items(): - if v[0].ndim == 0: - new_session_data[k].append( - v[data_idx[index] : data_idx[index] + 1][0] - ) - else: - new_session_data[k].append(v[data_idx[index] : data_idx[index] + 1]) + new_session_data[k].append(v[data_idx[index] : data_idx[index] + 1][0]) data_idx[index] += 1 if data_idx[index] >= counts_label_ids[index]: @@ -271,10 +264,6 @@ def gen_batch( shuffle: bool = False, ) -> Generator[Tuple, None, None]: """Generate batches.""" - - # TODO: should keep everything sequence - # https://github.com/tensorflow/tensorflow/issues/16689 - if shuffle: session_data = shuffle_session_data(session_data) @@ -292,26 +281,45 @@ def gen_batch( batch_data = [] for v in session_data.values(): - if isinstance(v[0], scipy.sparse.spmatrix): - batch_data.append(get_sparse_values(v[start:end])) + _data = v[start:end] + if isinstance(_data[0], scipy.sparse.spmatrix): + batch_data = batch_data + scipy_matrix_to_values(_data) else: - batch_data.append(pad_data(v[start:end])) + batch_data.append(pad_data(_data)) # len of batch_data is equal to the number of keys in session data yield tuple(batch_data) -def get_sparse_values(data: np.ndarray) -> np.ndarray: - converted = [] +def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: + seq_len = max([x.shape[0] for x in array_of_sparse]) + coo = [x.tocoo() for x in array_of_sparse] + data = [v for x in array_of_sparse for v in x.data] + + if seq_len == 1: + indices = [ + ids for i, x in enumerate(coo) for ids in zip([i] * len(x.row), x.col) + ] + shape = (len(array_of_sparse), array_of_sparse[0].shape[-1]) + else: + indices = [ + ids + for i, x in enumerate(coo) + for ids in zip([i] * len(x.row), x.row, x.col) + ] + shape = (len(array_of_sparse), seq_len, array_of_sparse[0].shape[-1]) + + return [np.array(indices), np.array(data), shape] - # TODO padding - for d in data: - coo = d.tocoo() - indices = np.mat([coo.row, coo.col]).transpose() - converted.append((indices, coo.data, coo.shape)) +def values_to_sparse_tensor( + indices: np.ndarray, data: np.ndarray, shape: np.ndarray +) -> tf.SparseTensor: + # make sure indices and shape have the correct type + indices = tf.cast(indices, dtype=tf.int64) + shape = tf.cast(shape, dtype=tf.int64) - return np.array(converted) + return tf.SparseTensor(indices, data, shape) def pad_data(data: np.ndarray) -> np.ndarray: @@ -336,16 +344,21 @@ def pad_data(data: np.ndarray) -> np.ndarray: return data_padded -def sparse_to_dense( - examples: Union[np.ndarray, List[scipy.sparse.csr_matrix]] -) -> np.ndarray: - # in case of BOW features it'll be either a 2D dense array or list of sparse - # matrices 1xN (because sparse vector doesn't exist) - # in case of sequence it'll be either a 3D dense array or a list of sparse - # matrices seq_lenxN - if isinstance(examples[0], scipy.sparse.spmatrix): - return np.stack([e.toarray() for e in examples]) - return examples +def batch_to_session_data(batch: Tuple[np.ndarray], session_data: SessionData): + batch_data = {} + idx = 0 + + for k, v in session_data.items(): + if isinstance(v[0], scipy.sparse.spmatrix): + batch_data[k] = values_to_sparse_tensor( + batch[idx], batch[idx + 1], batch[idx + 2] + ) + idx += 3 + else: + batch_data[k] = batch[idx] + idx += 1 + + return 
batch_data # noinspection PyPep8Naming @@ -358,9 +371,8 @@ def create_tf_dataset( ) -> "tf.data.Dataset": """Create tf dataset.""" - # set batch and sequence length to None - # TODO: can we remove the shape? - shapes, types = _get_shape_and_types(session_data) + shapes, types = _get_shapes_types(session_data) + # TODO shapes return tf.data.Dataset.from_generator( lambda batch_size_: gen_batch( @@ -372,21 +384,38 @@ def create_tf_dataset( ) -def _get_shape_and_types(session_data: SessionData) -> Tuple[Tuple, Tuple]: - shapes = [] +def _get_shapes_types(session_data: SessionData) -> Tuple: types = [] + shapes = [] def append_shape(v: np.ndarray): - if v[0].ndim == 0: + if isinstance(v[0], scipy.sparse.spmatrix): + # scipy matrix is converted into indices, data, shape + shapes.append((None, None)) + shapes.append((None)) + shapes.append((None)) + elif v[0].ndim == 0: shapes.append((None)) elif v[0].ndim == 1: shapes.append((None, v[0].shape[-1])) else: shapes.append((None, None, v[0].shape[-1])) + def append_type(v: np.ndarray): + if isinstance(v[0], scipy.sparse.spmatrix): + # scipy matrix is converted into indices, data, shape + # as int64 is not supported in generator use int32 instead + types.append(tf.int32) + types.append(tf.float64) + types.append(tf.int32) + elif v.dtype == np.dtype(np.int64): + types.append(tf.int32) + else: + types.append(v.dtype) + for v in session_data.values(): append_shape(v) - types.append(v.dtype) + append_type(v) return tuple(shapes), tuple(types) @@ -687,6 +716,65 @@ def sample_negatives( ) +def tf_matmul_sparse(inputs: tf.SparseTensor, kernel: tf.Tensor): + def map_function(x): + i, dense_slice = x[0], x[1] + sparse_slice = tf.sparse.reshape( + tf.sparse.slice( + inputs, [i, 0, 0], [1, inputs.dense_shape[1], inputs.dense_shape[2]] + ), + [inputs.dense_shape[1], inputs.dense_shape[2]], + ) + mult_slice = tf.sparse.matmul(sparse_slice, dense_slice) + return mult_slice + + elems = (tf.range(0, inputs.dense_shape[0], delta=1, dtype=tf.int64), kernel) + return tf.map_fn(map_function, elems, dtype=inputs.dtype, back_prop=True) + + +def tf_dense_layer( + inputs: tf.Tensor, + units: int, + name: Text, + C2: int, + activation: Optional[Callable] = tf.nn.relu, + use_bias: bool = True, + kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None, +) -> tf.Tensor: + + if isinstance(inputs, tf.SparseTensor): + # TODO add bias ? 
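+        # sparse inputs are handled without tf.layers.dense: a kernel variable is
+        # created and multiplied in manually (rank-3 sparse tensors go through
+        # tf_matmul_sparse above, rank-2 ones through tf.sparse.matmul)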
+        if len(inputs.shape) == 3:
+            kernel = tf.get_variable(
+                "kernel",
+                shape=[inputs.shape[0], inputs.shape[-1], units],
+                dtype=inputs.dtype,
+            )
+            outputs = tf_matmul_sparse(inputs, kernel)
+        else:
+            kernel = tf.get_variable(
+                "kernel", shape=[inputs.shape[-1], units], dtype=inputs.dtype
+            )
+            outputs = tf.sparse.matmul(inputs, kernel)
+    else:
+        kernel_regularizer = tf.contrib.layers.l2_regularizer(C2)
+        outputs = tf.layers.dense(
+            inputs=inputs,
+            units=units,
+            activation=activation,
+            use_bias=use_bias,
+            kernel_initializer=kernel_initializer,
+            kernel_regularizer=kernel_regularizer,
+            name=name,
+            reuse=tf.AUTO_REUSE,
+        )
+
+    if activation is None:
+        return outputs
+
+    return activation(outputs)
+
+
 def tf_raw_sim(
     a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"]
 ) -> "tf.Tensor":
diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py
index ca55356edbe9..a63bd2806d7d 100644
--- a/tests/utils/test_train_utils.py
+++ b/tests/utils/test_train_utils.py
@@ -126,15 +126,15 @@ def test_gen_batch(session_data: SessionData):
     iterator = gen_batch(session_data, 2, "intent_ids", shuffle=True)
 
     batch = next(iterator)
-    assert len(batch) == 5
+    assert len(batch) == 7
     assert len(batch[0]) == 2
 
     batch = next(iterator)
-    assert len(batch) == 5
+    assert len(batch) == 7
     assert len(batch[0]) == 2
 
     batch = next(iterator)
-    assert len(batch) == 5
+    assert len(batch) == 7
     assert len(batch[0]) == 1
 
     with pytest.raises(StopIteration):
@@ -146,6 +146,7 @@ def test_gen_batch(session_data: SessionData):
     [([0, 0, 0, 1, 1], [0, 1, 0, 1, 0]), ([0, 0, 0, 0, 1], [0, 1, 0, 0, 1, 0])],
 )
 def test_balance_session_data(session_data: SessionData, intent_ids, expected_labels):
+    # TODO improve test
     session_data["intent_ids"] = np.array(intent_ids)
 
     balanced_session_data = balance_session_data(session_data, 2, False, "intent_ids")

From 60bdec261206a0e6c769053627d72ad5eb79427a Mon Sep 17 00:00:00 2001
From: Tanja Bergmann
Date: Mon, 4 Nov 2019 13:17:33 +0100
Subject: [PATCH 117/239] wrap tf.layers.dense with dense_layer function

---
 .../embedding_intent_classifier.py |  7 ++---
 rasa/utils/train_utils.py          | 31 +++++++++----------
 2 files changed, 18 insertions(+), 20 deletions(-)

diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py
index 50b9d915c771..4c86486943c8 100644
--- a/rasa/nlu/classifiers/embedding_intent_classifier.py
+++ b/rasa/nlu/classifiers/embedding_intent_classifier.py
@@ -292,7 +292,9 @@ def _extract_labels_precomputed_features(
         encoded_id_labels = defaultdict(dict)
         for i, s in zip(label_examples, sparse_features):
             indices, data, shape = train_utils.scipy_matrix_to_values(np.array([s]))
-            sparse_tensor = train_utils.values_to_sparse_tensor(indices, data, shape)
+            sparse_tensor = train_utils.values_to_sparse_tensor(
+                indices, tf.constant(data, dtype=tf.float64), shape
+            )
             encoded_id_labels[i[0]]["intent_features_sparse"] = sparse_tensor
         for i, d in zip(label_examples, dense_features):
             encoded_id_labels[i[0]]["intent_features_dense"] = tf.constant(d)
@@ -433,8 +435,6 @@ def _build_tf_train_graph(
 
         batch = train_utils.batch_to_session_data(batch, session_data)
 
-        # TODO shape missmatch
-
         self.a_in = self.combine_sparse_dense_features(batch, "text_features_")
         self.b_in = self.combine_sparse_dense_features(batch, "intent_features_")
 
@@ -654,7 +654,6 @@ def train(
         **kwargs: Any,
     ) -> None:
         """Train the embedding label classifier on a data set."""
-
         logger.debug("Started training embedding classifier.")
 
         # set numpy random 
seed diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index c04a73d65f32..b90aa63deab6 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -47,9 +47,6 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto return None -# TODO: add wrapper around all denses layers to use https://medium.com/dailymotion/how-to-design-deep-learning-models-with-sparse-inputs-in-tensorflow-keras-fd5e754abec1 - - # noinspection PyPep8Naming def train_val_split( session_data: "SessionData", @@ -351,7 +348,7 @@ def batch_to_session_data(batch: Tuple[np.ndarray], session_data: SessionData): for k, v in session_data.items(): if isinstance(v[0], scipy.sparse.spmatrix): batch_data[k] = values_to_sparse_tensor( - batch[idx], batch[idx + 1], batch[idx + 2] + batch[idx], batch[idx + 1], tf.shape(batch[idx + 2]) ) idx += 3 else: @@ -469,18 +466,16 @@ def create_tf_fnn( ) -> "tf.Tensor": """Create nn with hidden layers and name suffix.""" - reg = tf.contrib.layers.l2_regularizer(C2) x = tf.nn.relu(x_in) for i, layer_size in enumerate(layer_sizes): - x = tf.layers.dense( + x = tf_dense_layer( inputs=x, units=layer_size, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, - kernel_regularizer=reg, + C2=C2, name="hidden_layer_{}_{}".format(layer_name_suffix, i), - reuse=tf.AUTO_REUSE, ) x = tf.layers.dropout(x, rate=droprate, training=is_training) return x @@ -511,14 +506,12 @@ def create_tf_embed( ) -> "tf.Tensor": """Create dense embedding layer with a name.""" - reg = tf.contrib.layers.l2_regularizer(C2) - embed_x = tf.layers.dense( + embed_x = tf_dense_layer( inputs=x, units=embed_dim, activation=None, - kernel_regularizer=reg, + C2=C2, name="embed_layer_{}".format(layer_name_suffix), - reuse=tf.AUTO_REUSE, ) # normalize embedding vectors for cosine similarity return tf_normalize_if_cosine(embed_x, similarity_type) @@ -736,7 +729,7 @@ def tf_dense_layer( inputs: tf.Tensor, units: int, name: Text, - C2: int, + C2: float, activation: Optional[Callable] = tf.nn.relu, use_bias: bool = True, kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None, @@ -744,16 +737,22 @@ def tf_dense_layer( if isinstance(inputs, tf.SparseTensor): # TODO add bias ? 
- if len(inputs.shape) == 3: + if len(inputs.dense_shape.shape) == 3: kernel = tf.get_variable( "kernel", - shape=[inputs.shape[0], inputs.shape[-1], units], + shape=[ + inputs.dense_shape.shape[0], + inputs.dense_shape.shape[-1], + units, + ], dtype=inputs.dtype, ) outputs = tf_matmul_sparse(inputs, kernel) else: kernel = tf.get_variable( - "kernel", shape=[inputs.shape[-1], units], dtype=inputs.dtype + "kernel", + shape=[inputs.dense_shape.shape[-1], units], + dtype=inputs.dtype, ) outputs = tf.sparse.matmul(inputs, kernel) else: From c448fe77250a1bb0cf1829b7d87a0f485390c3c8 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 4 Nov 2019 16:34:34 +0100 Subject: [PATCH 118/239] get feature_dim from session data instead of sparse tensor --- .../embedding_intent_classifier.py | 79 ++++++++++--------- rasa/utils/train_utils.py | 42 +++++----- 2 files changed, 62 insertions(+), 59 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 4c86486943c8..e3ae497c41eb 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -290,6 +290,7 @@ def _extract_labels_precomputed_features( ) encoded_id_labels = defaultdict(dict) + # TODO we should use SparseTensorValue outside graphs for i, s in zip(label_examples, sparse_features): indices, data, shape = train_utils.scipy_matrix_to_values(np.array([s])) sparse_tensor = train_utils.values_to_sparse_tensor( @@ -427,6 +428,18 @@ def _create_tf_embed_fnn( layer_name_suffix=embed_name, ) + def _get_feature_dim_batch_size(self, session_data: SessionData) -> Tuple[int, int]: + if "text_features_sparse" in session_data: + return ( + session_data["text_features_sparse"][0].shape[-1], + session_data["text_features_sparse"][0].shape[0], + ) + if "text_features_dense" in session_data: + return ( + session_data["text_features_dense"][0].shape[-1], + session_data["text_features_sparse"][0].shape[0], + ) + def _build_tf_train_graph( self, session_data: SessionData ) -> Tuple["tf.Tensor", "tf.Tensor"]: @@ -435,12 +448,23 @@ def _build_tf_train_graph( batch = train_utils.batch_to_session_data(batch, session_data) - self.a_in = self.combine_sparse_dense_features(batch, "text_features_") - self.b_in = self.combine_sparse_dense_features(batch, "intent_features_") + feature_dim, batch_size = self._get_feature_dim_batch_size(session_data) + + self.a_in = self.combine_sparse_dense_features( + batch, "text_features_", feature_dim=feature_dim, batch_size=batch_size + ) + self.b_in = self.combine_sparse_dense_features( + batch, "intent_features_", feature_dim=feature_dim, batch_size=batch_size + ) self._encoded_all_label_ids = tf.stack( [ - self.combine_sparse_dense_features(v, "intent_features_") + self.combine_sparse_dense_features( + v, + "intent_features_", + feature_dim=feature_dim, + batch_size=batch_size, + ) for k, v in self._encoded_all_label_ids.items() ] ) @@ -486,7 +510,11 @@ def _build_tf_train_graph( ) def combine_sparse_dense_features( - self, batch: Dict[Text, tf.Tensor], key_prefix: Text + self, + batch: Dict[Text, tf.Tensor], + key_prefix: Text, + feature_dim: int, + batch_size: int, ): key_sparse = f"{key_prefix}sparse" key_dense = f"{key_prefix}dense" @@ -496,7 +524,12 @@ def combine_sparse_dense_features( if key_dense in batch and key_sparse in batch: _sparse = tf.math.reduce_sum( train_utils.tf_dense_layer( - batch[key_sparse], batch[key_sparse].shape[-1], "a", self.C2 + batch[key_sparse], + feature_dim, # 
TODO define proper size + "a", + self.C2, + feature_dim=feature_dim, + batch_size=batch_size, ) ) _dense = tf.math.reduce_mean(batch[key_dense], axis=1) @@ -509,40 +542,14 @@ def combine_sparse_dense_features( if key_sparse in batch: return tf.math.reduce_sum( train_utils.tf_dense_layer( - batch[key_sparse], batch[key_sparse].shape[-1], "a", self.C2 + batch[key_sparse], + feature_dim, # TODO define proper size + "a", + self.C2, + feature_dim=feature_dim, ) ) - def batch_to_input(self, batch: Tuple) -> Tuple[tf.Tensor, tf.Tensor]: - """Convert batch input into correct tensors. - - As we do not know what features (sparse and/or dense) were used, we need to - check what features are provided and parse them accordingly. - """ - # batch contains 1 or 2 a_in values, b_in, label_ids - b_in = batch[-2] - - if len(batch) == 3: - a_in = self._squeeze_sparse_features(batch[0]) - return a_in, b_in - - if len(batch) == 4: - a_in_1 = self._squeeze_sparse_features(batch[0]) - a_in_2 = self._squeeze_sparse_features(batch[1]) - # Concatenate a_in features - a_in = tf.concat([a_in_1, a_in_2], axis=1) - - return a_in, b_in - - raise ValueError("Iterator return unexpected number of tensors.") - - def _squeeze_sparse_features(self, a_in: tf.Tensor) -> tf.Tensor: - # as sparse features come from a scipy.sparse.csr_matrix they have an - # additional dimension - if len(a_in.shape) == 3: - a_in = tf.squeeze(a_in, axis=1) - return a_in - def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": num_text_features = self._get_num_of_features(session_data, "text_features_") num_intent_features = self._get_num_of_features( diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index b90aa63deab6..1953fb5712d5 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -306,7 +306,7 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: ] shape = (len(array_of_sparse), seq_len, array_of_sparse[0].shape[-1]) - return [np.array(indices), np.array(data), shape] + return [np.array(indices).astype(np.int64), np.array(data), shape.astype(np.int64)] def values_to_sparse_tensor( @@ -348,7 +348,7 @@ def batch_to_session_data(batch: Tuple[np.ndarray], session_data: SessionData): for k, v in session_data.items(): if isinstance(v[0], scipy.sparse.spmatrix): batch_data[k] = values_to_sparse_tensor( - batch[idx], batch[idx + 1], tf.shape(batch[idx + 2]) + batch[idx], batch[idx + 1], batch[idx + 2] ) idx += 3 else: @@ -388,9 +388,9 @@ def _get_shapes_types(session_data: SessionData) -> Tuple: def append_shape(v: np.ndarray): if isinstance(v[0], scipy.sparse.spmatrix): # scipy matrix is converted into indices, data, shape - shapes.append((None, None)) - shapes.append((None)) - shapes.append((None)) + shapes.append((len(v), v[0].ndim + 1)) + shapes.append((len(v))) + shapes.append((v[0].ndim + 1)) elif v[0].ndim == 0: shapes.append((None)) elif v[0].ndim == 1: @@ -402,11 +402,9 @@ def append_type(v: np.ndarray): if isinstance(v[0], scipy.sparse.spmatrix): # scipy matrix is converted into indices, data, shape # as int64 is not supported in generator use int32 instead - types.append(tf.int32) + types.append(tf.int64) types.append(tf.float64) - types.append(tf.int32) - elif v.dtype == np.dtype(np.int64): - types.append(tf.int32) + types.append(tf.int64) else: types.append(v.dtype) @@ -435,9 +433,7 @@ def create_iterator_init_datasets( ) iterator = tf.data.Iterator.from_structure( - train_dataset.output_types, - train_dataset.output_shapes, - 
output_classes=train_dataset.output_classes, + train_dataset.output_types, train_dataset.output_shapes ) train_init_op = iterator.make_initializer(train_dataset) @@ -733,26 +729,26 @@ def tf_dense_layer( activation: Optional[Callable] = tf.nn.relu, use_bias: bool = True, kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None, + feature_dim: int = 0, + batch_size: int = 0, ) -> tf.Tensor: if isinstance(inputs, tf.SparseTensor): + # TODO kernel should just be 2D ? # TODO add bias ? - if len(inputs.dense_shape.shape) == 3: + # TODO make use of inputs.dense_shape somehow instead of feature_dim (subclass tf.SparseTensor and create additional shape property to be set in init by provided numpy shape) + + if feature_dim < 0: + raise ValueError(f"Cannot create kernel of shape {feature_dim}x{units}.") + + if len(inputs.shape) == 3: kernel = tf.get_variable( - "kernel", - shape=[ - inputs.dense_shape.shape[0], - inputs.dense_shape.shape[-1], - units, - ], - dtype=inputs.dtype, + "kernel", shape=[batch_size, feature_dim, units], dtype=inputs.dtype ) outputs = tf_matmul_sparse(inputs, kernel) else: kernel = tf.get_variable( - "kernel", - shape=[inputs.dense_shape.shape[-1], units], - dtype=inputs.dtype, + "kernel", shape=[feature_dim, units], dtype=inputs.dtype ) outputs = tf.sparse.matmul(inputs, kernel) else: From 093024fa3203485ca7e3e00d429a7908a7a8bed3 Mon Sep 17 00:00:00 2001 From: Vladimir Vlasov Date: Thu, 7 Nov 2019 13:21:07 +0100 Subject: [PATCH 119/239] Update rasa/utils/train_utils.py --- rasa/utils/train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 1953fb5712d5..815a9255f29d 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -306,7 +306,7 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: ] shape = (len(array_of_sparse), seq_len, array_of_sparse[0].shape[-1]) - return [np.array(indices).astype(np.int64), np.array(data), shape.astype(np.int64)] + return [np.array(indices).astype(np.int64), np.array(data), np.array(shape).astype(np.int64)] def values_to_sparse_tensor( From b600c26215905b3ad07ff08315309e583995c419 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 7 Nov 2019 14:57:33 +0100 Subject: [PATCH 120/239] pass last dim of sparse tensor into the SparseTensor directly, separate normal dense and sparse dense layers --- .../embedding_intent_classifier.py | 181 ++++++++---------- rasa/nlu/tokenizers/whitespace_tokenizer.py | 5 +- rasa/utils/train_utils.py | 93 ++++----- 3 files changed, 119 insertions(+), 160 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index e3ae497c41eb..4d314313d288 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -6,7 +6,7 @@ import pickle import scipy.sparse import typing -from typing import Any, Dict, List, Optional, Text, Tuple +from typing import Any, Dict, List, Optional, Text, Tuple, Union import warnings from rasa.nlu.classifiers import LABEL_RANKING_LENGTH @@ -133,6 +133,7 @@ def __init__( self._load_params() + self.dense_dim = 512 # TODO make configurable /extract form dense features # transform numbers to labels self.inverted_label_dict = inverted_label_dict # encode all label_ids with numbers @@ -176,13 +177,13 @@ def _check_old_config_variables(self, config: Dict[Text, Any]) -> None: # init helpers def _load_nn_architecture_params(self, 
config: Dict[Text, Any]) -> None: self.hidden_layer_sizes = { - "a": config["hidden_layers_sizes_a"], - "b": config["hidden_layers_sizes_b"], + "text": config["hidden_layers_sizes_a"], + "intent": config["hidden_layers_sizes_b"], } self.share_hidden_layers = config["share_hidden_layers"] if ( self.share_hidden_layers - and self.hidden_layer_sizes["a"] != self.hidden_layer_sizes["b"] + and self.hidden_layer_sizes["text"] != self.hidden_layer_sizes["intent"] ): raise ValueError( "If hidden layer weights are shared," @@ -290,7 +291,9 @@ def _extract_labels_precomputed_features( ) encoded_id_labels = defaultdict(dict) + # TODO redesign it, we shouldn't use any tf here, conversion to tf should be inside build tf graph # TODO we should use SparseTensorValue outside graphs + # TODO why can't we keep using csr_matrices here? for i, s in zip(label_examples, sparse_features): indices, data, shape = train_utils.scipy_matrix_to_values(np.array([s])) sparse_tensor = train_utils.values_to_sparse_tensor( @@ -428,18 +431,6 @@ def _create_tf_embed_fnn( layer_name_suffix=embed_name, ) - def _get_feature_dim_batch_size(self, session_data: SessionData) -> Tuple[int, int]: - if "text_features_sparse" in session_data: - return ( - session_data["text_features_sparse"][0].shape[-1], - session_data["text_features_sparse"][0].shape[0], - ) - if "text_features_dense" in session_data: - return ( - session_data["text_features_dense"][0].shape[-1], - session_data["text_features_sparse"][0].shape[0], - ) - def _build_tf_train_graph( self, session_data: SessionData ) -> Tuple["tf.Tensor", "tf.Tensor"]: @@ -448,49 +439,44 @@ def _build_tf_train_graph( batch = train_utils.batch_to_session_data(batch, session_data) - feature_dim, batch_size = self._get_feature_dim_batch_size(session_data) - - self.a_in = self.combine_sparse_dense_features( - batch, "text_features_", feature_dim=feature_dim, batch_size=batch_size + a = self.combine_sparse_dense_features( + batch, "text" ) - self.b_in = self.combine_sparse_dense_features( - batch, "intent_features_", feature_dim=feature_dim, batch_size=batch_size + b = self.combine_sparse_dense_features( + batch, "intent" ) - - self._encoded_all_label_ids = tf.stack( + print(b.shape) + all_label_ids = tf.stack( [ - self.combine_sparse_dense_features( - v, - "intent_features_", - feature_dim=feature_dim, - batch_size=batch_size, - ) + self.combine_sparse_dense_features(v, "intent") for k, v in self._encoded_all_label_ids.items() - ] - ) - - all_label_ids = tf.constant( - self._encoded_all_label_ids, dtype=tf.float32, name="all_label_ids" + ], + name="all_label_ids" ) + print(all_label_ids.shape) + exit() + # all_label_ids = tf.constant( + # self._encoded_all_label_ids, dtype=tf.float32, name="all_label_ids" + # ) self.message_embed = self._create_tf_embed_fnn( - self.a_in, - self.hidden_layer_sizes["a"], - fnn_name="a_b" if self.share_hidden_layers else "a", - embed_name="a", + a, + self.hidden_layer_sizes["text"], + fnn_name="text_intent" if self.share_hidden_layers else "text", + embed_name="text", ) self.label_embed = self._create_tf_embed_fnn( - self.b_in, - self.hidden_layer_sizes["b"], - fnn_name="a_b" if self.share_hidden_layers else "b", - embed_name="b", + b, + self.hidden_layer_sizes["intent"], + fnn_name="text_intent" if self.share_hidden_layers else "intent", + embed_name="intent", ) self.all_labels_embed = self._create_tf_embed_fnn( all_label_ids, - self.hidden_layer_sizes["b"], - fnn_name="a_b" if self.share_hidden_layers else "b", - embed_name="b", + 
self.hidden_layer_sizes["intent"], + fnn_name="text_intent" if self.share_hidden_layers else "intent", + embed_name="intent", ) return train_utils.calculate_loss_acc( @@ -511,45 +497,33 @@ def _build_tf_train_graph( def combine_sparse_dense_features( self, - batch: Dict[Text, tf.Tensor], + batch: Dict[Text, Union[tf.Tensor, tf.SparseTensor]], key_prefix: Text, - feature_dim: int, - batch_size: int, ): - key_sparse = f"{key_prefix}sparse" - key_dense = f"{key_prefix}dense" - - # apply mean/sum to convert sequence to sentence features - - if key_dense in batch and key_sparse in batch: - _sparse = tf.math.reduce_sum( - train_utils.tf_dense_layer( - batch[key_sparse], - feature_dim, # TODO define proper size - "a", - self.C2, - feature_dim=feature_dim, - batch_size=batch_size, - ) - ) - _dense = tf.math.reduce_mean(batch[key_dense], axis=1) + key_sparse = f"{key_prefix}_features_sparse" + key_dense = f"{key_prefix}_features_dense" - return tf.concat([_sparse, _dense]) + all_dense = [] if key_dense in batch: - return tf.math.reduce_mean(batch[key_dense], axis=1) + dense_dim = batch[key_dense].shape[-1] + all_dense.append(batch[key_dense]) if key_sparse in batch: - return tf.math.reduce_sum( - train_utils.tf_dense_layer( + all_dense.append( + train_utils.tf_dense_layer_for_sparse( batch[key_sparse], - feature_dim, # TODO define proper size - "a", + self.dense_dim, + key_prefix, self.C2, - feature_dim=feature_dim, ) ) + output = tf.concat(all_dense, axis=-1) + # apply mean to convert sequence to sentence features + output = tf.reduce_mean(output, axis=1) + return output + def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": num_text_features = self._get_num_of_features(session_data, "text_features_") num_intent_features = self._get_num_of_features( @@ -557,10 +531,10 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": ) self.a_in = tf.placeholder( - tf.float32, (None, None, num_text_features), name="a" + tf.float32, (None, None, num_text_features), name="text" ) self.b_in = tf.placeholder( - tf.float32, (None, None, num_intent_features), name="b" + tf.float32, (None, None, num_intent_features), name="intent" ) # TODO check this idea: @@ -568,9 +542,9 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": self.message_embed = self._create_tf_embed_fnn( self.a_in, - self.hidden_layer_sizes["a"], - fnn_name="a_b" if self.share_hidden_layers else "a", - embed_name="a", + self.hidden_layer_sizes["text"], + fnn_name="text_intent" if self.share_hidden_layers else "text", + embed_name="text", ) self.sim_all = train_utils.tf_raw_sim( @@ -581,9 +555,9 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": self.label_embed = self._create_tf_embed_fnn( self.b_in, - self.hidden_layer_sizes["b"], - fnn_name="a_b" if self.share_hidden_layers else "b", - embed_name="b", + self.hidden_layer_sizes["intent"], + fnn_name="text_intent" if self.share_hidden_layers else "intent", + embed_name="intent", ) self.sim = train_utils.tf_raw_sim( @@ -666,30 +640,33 @@ def train( # set numpy random seed np.random.seed(self.random_seed) - session_data = self.preprocess_train_data(training_data) - - possible_to_train = self._check_enough_labels(session_data) + self.graph = tf.Graph() + with self.graph.as_default(): + # TODO we use SparseTensor - in ecoded_all... do we need it? + session_data = self.preprocess_train_data(training_data) - if not possible_to_train: - logger.error( - "Can not train a classifier. 
" - "Need at least 2 different classes. " - "Skipping training of classifier." - ) - return + possible_to_train = self._check_enough_labels(session_data) - if self.evaluate_on_num_examples: - session_data, eval_session_data = train_utils.train_val_split( - session_data, - self.evaluate_on_num_examples, - self.random_seed, - label_key="intent_ids", - ) - else: - eval_session_data = None + if not possible_to_train: + logger.error( + "Can not train a classifier. " + "Need at least 2 different classes. " + "Skipping training of classifier." + ) + return + + if self.evaluate_on_num_examples: + session_data, eval_session_data = train_utils.train_val_split( + session_data, + self.evaluate_on_num_examples, + self.random_seed, + label_key="intent_ids", + ) + else: + eval_session_data = None - self.graph = tf.Graph() - with self.graph.as_default(): + # self.graph = tf.Graph() + # with self.graph.as_default(): # set random seed tf.set_random_seed(self.random_seed) diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index 98cd182a4567..ddaf12575f7d 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -63,8 +63,8 @@ def tokenize( if not self.case_sensitive: text = text.lower() - # remove 'not a word character' if if attribute != MESSAGE_INTENT_ATTRIBUTE: + # remove 'not a word character' if words = re.sub( # there is a space or an end of a string after it r"[^\w#@&]+(?=\s|$)|" @@ -78,6 +78,9 @@ def tokenize( " ", text, ).split() + # if we removed everything like smiles `:)`, use the whole text as 1 token + if not words: + words = [text] else: words = ( text.split(self.intent_split_symbol) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 815a9255f29d..736e62480974 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -83,7 +83,7 @@ def train_val_split( output_values, session_data, solo_values ) - return (session_data_train, session_data_val) + return session_data_train, session_data_val def check_train_test_sizes( @@ -309,12 +309,13 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: return [np.array(indices).astype(np.int64), np.array(data), np.array(shape).astype(np.int64)] +# TODO types, could be tf.Tensor or Tuple for shape def values_to_sparse_tensor( indices: np.ndarray, data: np.ndarray, shape: np.ndarray ) -> tf.SparseTensor: # make sure indices and shape have the correct type - indices = tf.cast(indices, dtype=tf.int64) - shape = tf.cast(shape, dtype=tf.int64) + # indices = tf.cast(indices, dtype=tf.int64) + # shape = tf.cast(shape, dtype=tf.int64) return tf.SparseTensor(indices, data, shape) @@ -341,14 +342,15 @@ def pad_data(data: np.ndarray) -> np.ndarray: return data_padded -def batch_to_session_data(batch: Tuple[np.ndarray], session_data: SessionData): +def batch_to_session_data(batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionData): batch_data = {} idx = 0 for k, v in session_data.items(): if isinstance(v[0], scipy.sparse.spmatrix): + # explicitly substitute last dimension in shape with known static value batch_data[k] = values_to_sparse_tensor( - batch[idx], batch[idx + 1], batch[idx + 2] + batch[idx], batch[idx + 1], [batch[idx + 2][0], batch[idx + 2][1], v[0].shape[-1]] ) idx += 3 else: @@ -462,16 +464,18 @@ def create_tf_fnn( ) -> "tf.Tensor": """Create nn with hidden layers and name suffix.""" + reg = tf.contrib.layers.l2_regularizer(C2) x = tf.nn.relu(x_in) for i, layer_size in 
enumerate(layer_sizes): - x = tf_dense_layer( + x = tf.layers.dense( inputs=x, units=layer_size, activation=activation, use_bias=use_bias, kernel_initializer=kernel_initializer, - C2=C2, + kernel_regularizer=reg, name="hidden_layer_{}_{}".format(layer_name_suffix, i), + reuse=tf.AUTO_REUSE, ) x = tf.layers.dropout(x, rate=droprate, training=is_training) return x @@ -502,12 +506,14 @@ def create_tf_embed( ) -> "tf.Tensor": """Create dense embedding layer with a name.""" - embed_x = tf_dense_layer( + reg = tf.contrib.layers.l2_regularizer(C2) + embed_x = tf.layers.dense( inputs=x, units=embed_dim, activation=None, - C2=C2, + kernel_regularizer=reg, name="embed_layer_{}".format(layer_name_suffix), + reuse=tf.AUTO_REUSE, ) # normalize embedding vectors for cosine similarity return tf_normalize_if_cosine(embed_x, similarity_type) @@ -705,64 +711,37 @@ def sample_negatives( ) -def tf_matmul_sparse(inputs: tf.SparseTensor, kernel: tf.Tensor): - def map_function(x): - i, dense_slice = x[0], x[1] - sparse_slice = tf.sparse.reshape( - tf.sparse.slice( - inputs, [i, 0, 0], [1, inputs.dense_shape[1], inputs.dense_shape[2]] - ), - [inputs.dense_shape[1], inputs.dense_shape[2]], - ) - mult_slice = tf.sparse.matmul(sparse_slice, dense_slice) - return mult_slice - - elems = (tf.range(0, inputs.dense_shape[0], delta=1, dtype=tf.int64), kernel) - return tf.map_fn(map_function, elems, dtype=inputs.dtype, back_prop=True) - - -def tf_dense_layer( - inputs: tf.Tensor, +def tf_dense_layer_for_sparse( + inputs: tf.SparseTensor, units: int, name: Text, C2: float, activation: Optional[Callable] = tf.nn.relu, use_bias: bool = True, - kernel_initializer: Optional["tf.keras.initializers.Initializer"] = None, - feature_dim: int = 0, - batch_size: int = 0, ) -> tf.Tensor: + """Idea from + https://medium.com/dailymotion/how-to-design-deep-learning-models-with-sparse-inputs-in-tensorflow-keras-fd5e754abec1 + """ - if isinstance(inputs, tf.SparseTensor): - # TODO kernel should just be 2D ? - # TODO add bias ? 
- # TODO make use of inputs.dense_shape somehow instead of feature_dim (subclass tf.SparseTensor and create additional shape property to be set in init by provided numpy shape) - - if feature_dim < 0: - raise ValueError(f"Cannot create kernel of shape {feature_dim}x{units}.") + if not isinstance(inputs, tf.SparseTensor): + raise - if len(inputs.shape) == 3: - kernel = tf.get_variable( - "kernel", shape=[batch_size, feature_dim, units], dtype=inputs.dtype - ) - outputs = tf_matmul_sparse(inputs, kernel) - else: - kernel = tf.get_variable( - "kernel", shape=[feature_dim, units], dtype=inputs.dtype - ) - outputs = tf.sparse.matmul(inputs, kernel) - else: + with tf.variable_scope("dense_layer_for_sparse_" + name, reuse=tf.AUTO_REUSE): kernel_regularizer = tf.contrib.layers.l2_regularizer(C2) - outputs = tf.layers.dense( - inputs=inputs, - units=units, - activation=activation, - use_bias=use_bias, - kernel_initializer=kernel_initializer, - kernel_regularizer=kernel_regularizer, - name=name, - reuse=tf.AUTO_REUSE, + kernel = tf.get_variable( + "kernel", shape=[inputs.shape[-1], units], dtype=inputs.dtype, regularizer=kernel_regularizer ) + bias = tf.get_variable("bias", shape=[units, ], dtype=inputs.dtype) + + # outputs will be 2D + outputs = tf.sparse.matmul(tf.sparse.reshape(inputs, [-1, tf.shape(inputs)[-1]]), kernel) + + if len(inputs.shape) == 3: + # reshape back + outputs = tf.reshape(outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1)) + + if use_bias: + outputs = tf.nn.bias_add(outputs, bias) if activation is None: return outputs From d53ffb9922b42efef521d76fa219062dd0749cc8 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 7 Nov 2019 15:19:22 +0100 Subject: [PATCH 121/239] rephrase todo --- rasa/nlu/classifiers/embedding_intent_classifier.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 4d314313d288..ae0e95307671 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -291,9 +291,7 @@ def _extract_labels_precomputed_features( ) encoded_id_labels = defaultdict(dict) - # TODO redesign it, we shouldn't use any tf here, conversion to tf should be inside build tf graph - # TODO we should use SparseTensorValue outside graphs - # TODO why can't we keep using csr_matrices here? 
+ # TODO this should contain the same thing as batch for intents from interator for i, s in zip(label_examples, sparse_features): indices, data, shape = train_utils.scipy_matrix_to_values(np.array([s])) sparse_tensor = train_utils.values_to_sparse_tensor( From 086ee13b3f9cb9d019be4792b9c4eba50c624ebd Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 8 Nov 2019 08:29:52 +0100 Subject: [PATCH 122/239] rephrase todo --- rasa/nlu/classifiers/embedding_intent_classifier.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index ae0e95307671..863e3209c963 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -519,6 +519,7 @@ def combine_sparse_dense_features( output = tf.concat(all_dense, axis=-1) # apply mean to convert sequence to sentence features + # TODO we cannot use reduce_mean, we should use reduce_sum / real_length output = tf.reduce_mean(output, axis=1) return output From e3f8a63fcf031915204180a7047508041262235e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 8 Nov 2019 11:59:43 +0100 Subject: [PATCH 123/239] keep _encoded_all_label_ids scipy.sparse.csr_matrix. --- .../embedding_intent_classifier.py | 71 +++++++++---------- rasa/nlu/constants.py | 2 - rasa/utils/train_utils.py | 41 ++++++++--- 3 files changed, 64 insertions(+), 50 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 863e3209c963..9c6600b9fdf4 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -291,15 +291,11 @@ def _extract_labels_precomputed_features( ) encoded_id_labels = defaultdict(dict) - # TODO this should contain the same thing as batch for intents from interator + for i, s in zip(label_examples, sparse_features): - indices, data, shape = train_utils.scipy_matrix_to_values(np.array([s])) - sparse_tensor = train_utils.values_to_sparse_tensor( - indices, tf.constant(data, dtype=tf.float64), shape - ) - encoded_id_labels[i[0]]["intent_features_sparse"] = sparse_tensor + encoded_id_labels[i[0]]["intent_features_sparse"] = sparse_features for i, d in zip(label_examples, dense_features): - encoded_id_labels[i[0]]["intent_features_dense"] = tf.constant(d) + encoded_id_labels[i[0]]["intent_features_dense"] = dense_features # Sort the dict based on label_idx encoded_id_labels = OrderedDict(sorted(encoded_id_labels.items())) @@ -327,7 +323,7 @@ def _compute_default_label_features( self, labels_example: List[Tuple[int, "Message"]] ) -> np.ndarray: """Compute one-hot representation for the labels""" - + # TODO return np.eye(len(labels_example)) def _create_encoded_label_ids( @@ -359,6 +355,21 @@ def _create_encoded_label_ids( return encoded_id_labels + def labels_to_tensors(self, labels_dict: Dict[Text, Union[np.ndarray]]): + converted_dict = {} + + for k, v in labels_dict.items(): + if isinstance(v[0], scipy.sparse.spmatrix): + indices, values, shape = train_utils.scipy_matrix_to_values(v) + converted_dict[k] = tf.cast( + train_utils.values_to_sparse_tensor(indices, values, shape), + tf.float64, + ) + else: + converted_dict[k] = tf.cast(tf.constant(v), tf.float64) + + return self.combine_sparse_dense_features(converted_dict, "intent") + # noinspection PyPep8Naming def _create_session_data( self, @@ -437,25 +448,13 @@ def _build_tf_train_graph( batch = 
train_utils.batch_to_session_data(batch, session_data) - a = self.combine_sparse_dense_features( - batch, "text" - ) - b = self.combine_sparse_dense_features( - batch, "intent" - ) - print(b.shape) + a = self.combine_sparse_dense_features(batch, "text") + b = self.combine_sparse_dense_features(batch, "intent") + all_label_ids = tf.stack( - [ - self.combine_sparse_dense_features(v, "intent") - for k, v in self._encoded_all_label_ids.items() - ], - name="all_label_ids" + [self.labels_to_tensors(v) for v in self._encoded_all_label_ids.values()], + name="all_label_ids", ) - print(all_label_ids.shape) - exit() - # all_label_ids = tf.constant( - # self._encoded_all_label_ids, dtype=tf.float32, name="all_label_ids" - # ) self.message_embed = self._create_tf_embed_fnn( a, @@ -463,24 +462,24 @@ def _build_tf_train_graph( fnn_name="text_intent" if self.share_hidden_layers else "text", embed_name="text", ) - self.label_embed = self._create_tf_embed_fnn( b, self.hidden_layer_sizes["intent"], fnn_name="text_intent" if self.share_hidden_layers else "intent", embed_name="intent", ) + self.all_labels_embed = self._create_tf_embed_fnn( all_label_ids, self.hidden_layer_sizes["intent"], fnn_name="text_intent" if self.share_hidden_layers else "intent", - embed_name="intent", + embed_name="all_intents", ) return train_utils.calculate_loss_acc( self.message_embed, self.label_embed, - self.b_in, + b, self.all_labels_embed, all_label_ids, self.num_neg, @@ -494,26 +493,20 @@ def _build_tf_train_graph( ) def combine_sparse_dense_features( - self, - batch: Dict[Text, Union[tf.Tensor, tf.SparseTensor]], - key_prefix: Text, - ): + self, batch: Dict[Text, Union[tf.Tensor, tf.SparseTensor]], key_prefix: Text + ) -> tf.Tensor: key_sparse = f"{key_prefix}_features_sparse" key_dense = f"{key_prefix}_features_dense" all_dense = [] if key_dense in batch: - dense_dim = batch[key_dense].shape[-1] all_dense.append(batch[key_dense]) if key_sparse in batch: all_dense.append( train_utils.tf_dense_layer_for_sparse( - batch[key_sparse], - self.dense_dim, - key_prefix, - self.C2, + batch[key_sparse], self.dense_dim, key_prefix, self.C2 ) ) @@ -664,8 +657,8 @@ def train( else: eval_session_data = None - # self.graph = tf.Graph() - # with self.graph.as_default(): + # self.graph = tf.Graph() + # with self.graph.as_default(): # set random seed tf.set_random_seed(self.random_seed) diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index b31f84cc671f..08cc2925a827 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -10,8 +10,6 @@ CLS_TOKEN = "__CLS__" -CLS_TOKEN = "__CLS__" - MESSAGE_ATTRIBUTES = [ MESSAGE_TEXT_ATTRIBUTE, MESSAGE_INTENT_ATTRIBUTE, diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 736e62480974..b761c72bf9e3 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -306,7 +306,11 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: ] shape = (len(array_of_sparse), seq_len, array_of_sparse[0].shape[-1]) - return [np.array(indices).astype(np.int64), np.array(data), np.array(shape).astype(np.int64)] + return [ + np.array(indices).astype(np.int64), + np.array(data), + np.array(shape).astype(np.int64), + ] # TODO types, could be tf.Tensor or Tuple for shape @@ -321,6 +325,10 @@ def values_to_sparse_tensor( def pad_data(data: np.ndarray) -> np.ndarray: + """ + Pad data of different lengths. + Data is padded with zeros. Zeros are added to the beginning of data. 
+ """ if data[0].ndim == 0: return data @@ -342,7 +350,15 @@ def pad_data(data: np.ndarray) -> np.ndarray: return data_padded -def batch_to_session_data(batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionData): +def batch_to_session_data( + batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionData +): + """ + Batch contains any number of batch data. The order is equal to the + key-value pairs in session data. As sparse data were converted into indices, data, + shape before, this methods converts them into sparse tensors. Dense data is + kept. + """ batch_data = {} idx = 0 @@ -350,7 +366,9 @@ def batch_to_session_data(batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], ses if isinstance(v[0], scipy.sparse.spmatrix): # explicitly substitute last dimension in shape with known static value batch_data[k] = values_to_sparse_tensor( - batch[idx], batch[idx + 1], [batch[idx + 2][0], batch[idx + 2][1], v[0].shape[-1]] + batch[idx], + batch[idx + 1], + [batch[idx + 2][0], batch[idx + 2][1], v[0].shape[-1]], ) idx += 3 else: @@ -371,7 +389,6 @@ def create_tf_dataset( """Create tf dataset.""" shapes, types = _get_shapes_types(session_data) - # TODO shapes return tf.data.Dataset.from_generator( lambda batch_size_: gen_batch( @@ -403,7 +420,6 @@ def append_shape(v: np.ndarray): def append_type(v: np.ndarray): if isinstance(v[0], scipy.sparse.spmatrix): # scipy matrix is converted into indices, data, shape - # as int64 is not supported in generator use int32 instead types.append(tf.int64) types.append(tf.float64) types.append(tf.int64) @@ -729,16 +745,23 @@ def tf_dense_layer_for_sparse( with tf.variable_scope("dense_layer_for_sparse_" + name, reuse=tf.AUTO_REUSE): kernel_regularizer = tf.contrib.layers.l2_regularizer(C2) kernel = tf.get_variable( - "kernel", shape=[inputs.shape[-1], units], dtype=inputs.dtype, regularizer=kernel_regularizer + "kernel", + shape=[inputs.shape[-1], units], + dtype=inputs.dtype, + regularizer=kernel_regularizer, ) - bias = tf.get_variable("bias", shape=[units, ], dtype=inputs.dtype) + bias = tf.get_variable("bias", shape=[units], dtype=inputs.dtype) # outputs will be 2D - outputs = tf.sparse.matmul(tf.sparse.reshape(inputs, [-1, tf.shape(inputs)[-1]]), kernel) + outputs = tf.sparse.matmul( + tf.sparse.reshape(inputs, [-1, tf.shape(inputs)[-1]]), kernel + ) if len(inputs.shape) == 3: # reshape back - outputs = tf.reshape(outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1)) + outputs = tf.reshape( + outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1) + ) if use_bias: outputs = tf.nn.bias_add(outputs, bias) From 98829a96120690b3c0aafb11967d111dd0cbd5dd Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 8 Nov 2019 14:32:00 +0100 Subject: [PATCH 124/239] session data values are list of np.ndarray --- .../embedding_intent_classifier.py | 123 +++++++++--------- rasa/utils/train_utils.py | 99 ++++++++------ tests/utils/test_train_utils.py | 90 +++++++------ 3 files changed, 169 insertions(+), 143 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 9c6600b9fdf4..419e25542010 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -290,12 +290,12 @@ def _extract_labels_precomputed_features( e, MESSAGE_INTENT_ATTRIBUTE, sparse_features, dense_features ) - encoded_id_labels = defaultdict(dict) + encoded_id_labels = defaultdict(list) for i, s in zip(label_examples, 
sparse_features): - encoded_id_labels[i[0]]["intent_features_sparse"] = sparse_features + encoded_id_labels[i[0]].append(sparse_features) for i, d in zip(label_examples, dense_features): - encoded_id_labels[i[0]]["intent_features_dense"] = dense_features + encoded_id_labels[i[0]].append(dense_features) # Sort the dict based on label_idx encoded_id_labels = OrderedDict(sorted(encoded_id_labels.items())) @@ -355,20 +355,22 @@ def _create_encoded_label_ids( return encoded_id_labels - def labels_to_tensors(self, labels_dict: Dict[Text, Union[np.ndarray]]): - converted_dict = {} - - for k, v in labels_dict.items(): - if isinstance(v[0], scipy.sparse.spmatrix): - indices, values, shape = train_utils.scipy_matrix_to_values(v) - converted_dict[k] = tf.cast( - train_utils.values_to_sparse_tensor(indices, values, shape), - tf.float64, + def labels_to_tensors(self, features: List[np.ndarray]): + label_features = [] + + for f in features: + if isinstance(f[0], scipy.sparse.spmatrix): + indices, values, shape = train_utils.scipy_matrix_to_values(f) + label_features.append( + tf.cast( + train_utils.values_to_sparse_tensor(indices, values, shape), + tf.float64, + ) ) else: - converted_dict[k] = tf.cast(tf.constant(v), tf.float64) + label_features.append(tf.cast(f, tf.float64)) - return self.combine_sparse_dense_features(converted_dict, "intent") + return self.combine_sparse_dense_features(label_features, "label") # noinspection PyPep8Naming def _create_session_data( @@ -400,19 +402,23 @@ def _create_session_data( label_ids = np.array(label_ids) session_data = {} - self._add_to_session_data(session_data, "text_features_sparse", X_sparse) - self._add_to_session_data(session_data, "text_features_dense", X_dense) - self._add_to_session_data(session_data, "intent_features_sparse", Y_sparse) - self._add_to_session_data(session_data, "intent_features_dense", Y_dense) - session_data["intent_ids"] = label_ids + self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense]) + self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) + session_data["intent_ids"] = [label_ids] return session_data def _add_to_session_data( - self, session_data: SessionData, key: Text, data: np.ndarray + self, session_data: SessionData, key: Text, features: List[np.ndarray] ): - if data.size > 0: - session_data[key] = data + if not features: + return + + session_data[key] = [] + + for data in features: + if data.size > 0: + session_data[key].append(data) # tf helpers: def _create_tf_embed_fnn( @@ -448,8 +454,8 @@ def _build_tf_train_graph( batch = train_utils.batch_to_session_data(batch, session_data) - a = self.combine_sparse_dense_features(batch, "text") - b = self.combine_sparse_dense_features(batch, "intent") + a = self.combine_sparse_dense_features(batch["text_features"], "text") + b = self.combine_sparse_dense_features(batch["intent_features"], "intent") all_label_ids = tf.stack( [self.labels_to_tensors(v) for v in self._encoded_all_label_ids.values()], @@ -493,24 +499,22 @@ def _build_tf_train_graph( ) def combine_sparse_dense_features( - self, batch: Dict[Text, Union[tf.Tensor, tf.SparseTensor]], key_prefix: Text + self, features: List[Union[tf.Tensor, tf.SparseTensor]], name: Text ) -> tf.Tensor: - key_sparse = f"{key_prefix}_features_sparse" - key_dense = f"{key_prefix}_features_dense" - all_dense = [] - - if key_dense in batch: - all_dense.append(batch[key_dense]) + dense_features = [] - if key_sparse in batch: - all_dense.append( - train_utils.tf_dense_layer_for_sparse( - 
batch[key_sparse], self.dense_dim, key_prefix, self.C2 + for f in features: + if isinstance(f, tf.SparseTensor): + dense_features.append( + train_utils.tf_dense_layer_for_sparse( + f, self.dense_dim, name, self.C2 + ) ) - ) + else: + dense_features.append(f) - output = tf.concat(all_dense, axis=-1) + output = tf.concat(dense_features, axis=-1) # apply mean to convert sequence to sentence features # TODO we cannot use reduce_mean, we should use reduce_sum / real_length output = tf.reduce_mean(output, axis=1) @@ -632,33 +636,30 @@ def train( # set numpy random seed np.random.seed(self.random_seed) - self.graph = tf.Graph() - with self.graph.as_default(): - # TODO we use SparseTensor - in ecoded_all... do we need it? - session_data = self.preprocess_train_data(training_data) + session_data = self.preprocess_train_data(training_data) - possible_to_train = self._check_enough_labels(session_data) + possible_to_train = self._check_enough_labels(session_data) - if not possible_to_train: - logger.error( - "Can not train a classifier. " - "Need at least 2 different classes. " - "Skipping training of classifier." - ) - return - - if self.evaluate_on_num_examples: - session_data, eval_session_data = train_utils.train_val_split( - session_data, - self.evaluate_on_num_examples, - self.random_seed, - label_key="intent_ids", - ) - else: - eval_session_data = None + if not possible_to_train: + logger.error( + "Can not train a classifier. " + "Need at least 2 different classes. " + "Skipping training of classifier." + ) + return - # self.graph = tf.Graph() - # with self.graph.as_default(): + if self.evaluate_on_num_examples: + session_data, eval_session_data = train_utils.train_val_split( + session_data, + self.evaluate_on_num_examples, + self.random_seed, + label_key="intent_ids", + ) + else: + eval_session_data = None + + self.graph = tf.Graph() + with self.graph.as_default(): # set random seed tf.set_random_seed(self.random_seed) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index b761c72bf9e3..fe5150a3607c 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -36,7 +36,7 @@ # namedtuple for all tf session related data -SessionData = Dict[Text, np.ndarray] +SessionData = Dict[Text, List[np.ndarray]] def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]: @@ -55,28 +55,36 @@ def train_val_split( label_key: Text, ) -> Tuple["SessionData", "SessionData"]: """Create random hold out validation set using stratified split.""" - if label_key not in session_data: + if label_key not in session_data or len(session_data[label_key]) > 1: raise ValueError(f"Key '{label_key}' not in SessionData.") label_counts = dict( - zip(*np.unique(session_data[label_key], return_counts=True, axis=0)) + zip(*np.unique(session_data[label_key][0], return_counts=True, axis=0)) ) check_train_test_sizes(evaluate_on_num_examples, label_counts, session_data) - counts = np.array([label_counts[label] for label in session_data[label_key]]) + counts = np.array([label_counts[label] for label in session_data[label_key][0]]) multi_values = [] - [multi_values.append(v[counts > 1]) for v in session_data.values()] + [ + multi_values.append(v[counts > 1]) + for values in session_data.values() + for v in values + ] solo_values = [] - [solo_values.append(v[counts == 1]) for v in session_data.values()] + [ + solo_values.append(v[counts == 1]) + for values in session_data.values() + for v in values + ] output_values = train_test_split( *multi_values, test_size=evaluate_on_num_examples, 
random_state=random_seed, - stratify=session_data[label_key][counts > 1], + stratify=session_data[label_key][0][counts > 1], ) session_data_train, session_data_val = convert_train_test_split( @@ -108,10 +116,10 @@ def check_train_test_sizes( def convert_train_test_split( output_values: List[Any], session_data: SessionData, solo_values: List[Any] ): - keys = [k for k, v in session_data.items()] + keys = [k for k in session_data.keys()] - session_data_train = {} - session_data_val = {} + session_data_train = defaultdict(list) + session_data_val = defaultdict(list) # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc. # order is kept, so first session_data.X values, then session_data.Y values, and @@ -119,13 +127,13 @@ def convert_train_test_split( # train datasets have an even index for i in range(len(session_data)): - session_data_train[keys[i]] = combine_features( - output_values[i * 2], solo_values[i] + session_data_train[keys[i]].append( + combine_features(output_values[i * 2], solo_values[i]) ) # val datasets have an odd index for i in range(len(session_data)): - session_data_val[keys[i]] = output_values[(i * 2) + 1] + session_data_val[keys[i]].append(output_values[(i * 2) + 1]) return session_data_train, session_data_val @@ -155,19 +163,23 @@ def shuffle_session_data(session_data: "SessionData") -> "SessionData": def session_data_for_ids(session_data: SessionData, ids: np.ndarray): """Filter session data by ids.""" - return {k: v[ids] for k, v in session_data.items()} + new_session_data = defaultdict(list) + for k, values in session_data.items(): + for v in values: + new_session_data[k].append(v[ids]) + return new_session_data def split_session_data_by_label( session_data: "SessionData", label_key: Text, unique_label_ids: "np.ndarray" ) -> List["SessionData"]: """Reorganize session data into a list of session data with the same labels.""" - if label_key not in session_data: + if label_key not in session_data or len(session_data[label_key]) > 1: raise ValueError(f"Key '{label_key}' not in SessionData.labels.") label_data = [] for label_id in unique_label_ids: - ids = session_data[label_key] == label_id + ids = session_data[label_key][0] == label_id label_data.append(session_data_for_ids(session_data, ids)) return label_data @@ -182,11 +194,11 @@ def balance_session_data( by repeating them. Mimics stratified batching, but also takes into account that more populated classes should appear more often. 
""" - if label_key not in session_data: + if label_key not in session_data or len(session_data[label_key]) > 1: raise ValueError(f"Key '{label_key}' not in SessionData.labels.") unique_label_ids, counts_label_ids = np.unique( - session_data[label_key], return_counts=True, axis=0 + session_data[label_key][0], return_counts=True, axis=0 ) num_label_ids = len(unique_label_ids) @@ -241,7 +253,7 @@ def concatenate_data( def get_number_of_examples(session_data: SessionData): - example_lengths = [v.shape[0] for v in session_data.values()] + example_lengths = [v.shape[0] for values in session_data.values() for v in values] # check if number of examples is the same for all X if not all(length == example_lengths[0] for length in example_lengths): @@ -277,12 +289,13 @@ def gen_batch( end = start + batch_size batch_data = [] - for v in session_data.values(): - _data = v[start:end] - if isinstance(_data[0], scipy.sparse.spmatrix): - batch_data = batch_data + scipy_matrix_to_values(_data) - else: - batch_data.append(pad_data(_data)) + for values in session_data.values(): + for v in values: + _data = v[start:end] + if isinstance(_data[0], scipy.sparse.spmatrix): + batch_data = batch_data + scipy_matrix_to_values(_data) + else: + batch_data.append(pad_data(_data)) # len of batch_data is equal to the number of keys in session data yield tuple(batch_data) @@ -359,21 +372,24 @@ def batch_to_session_data( shape before, this methods converts them into sparse tensors. Dense data is kept. """ - batch_data = {} + batch_data = defaultdict(list) idx = 0 - for k, v in session_data.items(): - if isinstance(v[0], scipy.sparse.spmatrix): - # explicitly substitute last dimension in shape with known static value - batch_data[k] = values_to_sparse_tensor( - batch[idx], - batch[idx + 1], - [batch[idx + 2][0], batch[idx + 2][1], v[0].shape[-1]], - ) - idx += 3 - else: - batch_data[k] = batch[idx] - idx += 1 + for k, values in session_data.items(): + for v in values: + if isinstance(v[0], scipy.sparse.spmatrix): + # explicitly substitute last dimension in shape with known static value + batch_data[k].append( + values_to_sparse_tensor( + batch[idx], + batch[idx + 1], + [batch[idx + 2][0], batch[idx + 2][1], v[0].shape[-1]], + ) + ) + idx += 3 + else: + batch_data[k].append(batch[idx]) + idx += 1 return batch_data @@ -426,9 +442,10 @@ def append_type(v: np.ndarray): else: types.append(v.dtype) - for v in session_data.values(): - append_shape(v) - append_type(v) + for values in session_data.values(): + for v in values: + append_shape(v) + append_type(v) return tuple(shapes), tuple(types) diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index a63bd2806d7d..06cb30b2a89d 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -17,43 +17,49 @@ @pytest.fixture async def session_data() -> SessionData: return { - "dense": np.array( - [ - np.random.rand(5, 14), - np.random.rand(2, 14), - np.random.rand(3, 14), - np.random.rand(1, 14), - np.random.rand(3, 14), - ] - ), - "sparse": np.array( - [ - scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))), - scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), - ] - ), - "Y": np.array( - [ - np.random.randint(2, size=(5, 10)), - np.random.randint(2, size=(2, 10)), - np.random.randint(2, size=(3, 10)), - np.random.randint(2, size=(1, 
10)), - np.random.randint(2, size=(3, 10)), - ] - ), - "intent_ids": np.array([0, 1, 0, 1, 1]), - "tag_ids": np.array( - [ - np.array([0, 1, 1, 0, 2]), - np.array([2, 0]), - np.array([0, 1, 1]), - np.array([0, 1]), - np.array([0, 0, 0]), - ] - ), + "text_features": [ + np.array( + [ + np.random.rand(5, 14), + np.random.rand(2, 14), + np.random.rand(3, 14), + np.random.rand(1, 14), + np.random.rand(3, 14), + ] + ), + np.array( + [ + scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + ] + ), + ], + "intent_features": [ + np.array( + [ + np.random.randint(2, size=(5, 10)), + np.random.randint(2, size=(2, 10)), + np.random.randint(2, size=(3, 10)), + np.random.randint(2, size=(1, 10)), + np.random.randint(2, size=(3, 10)), + ] + ) + ], + "intent_ids": [np.array([0, 1, 0, 1, 1])], + "tag_ids": [ + np.array( + [ + np.array([0, 1, 1, 0, 2]), + np.array([2, 0]), + np.array([0, 1, 1]), + np.array([0, 1]), + np.array([0, 0, 0]), + ] + ) + ], } @@ -85,11 +91,13 @@ def test_train_val_split(session_data: SessionData): session_data, 2, 42, "intent_ids" ) - for v in train_session_data.values(): - assert v.shape[0] == 3 + for values in train_session_data.values(): + for v in values: + assert v.shape[0] == 3 - for v in val_session_data.values(): - assert v.shape[0] == 2 + for values in val_session_data.values(): + for v in values: + assert v.shape[0] == 2 @pytest.mark.parametrize("size", [0, 1, 5]) From f97f6dfb9de7cdef64235b60a24bdac669be659f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 8 Nov 2019 15:25:09 +0100 Subject: [PATCH 125/239] fix encoded all label ids --- .../embedding_intent_classifier.py | 21 ++++++++++++------- rasa/utils/train_utils.py | 16 ++++---------- 2 files changed, 18 insertions(+), 19 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 419e25542010..2b6b28935f8e 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -293,9 +293,9 @@ def _extract_labels_precomputed_features( encoded_id_labels = defaultdict(list) for i, s in zip(label_examples, sparse_features): - encoded_id_labels[i[0]].append(sparse_features) + encoded_id_labels[i[0]].append(s) for i, d in zip(label_examples, dense_features): - encoded_id_labels[i[0]].append(dense_features) + encoded_id_labels[i[0]].append(d) # Sort the dict based on label_idx encoded_id_labels = OrderedDict(sorted(encoded_id_labels.items())) @@ -359,8 +359,10 @@ def labels_to_tensors(self, features: List[np.ndarray]): label_features = [] for f in features: - if isinstance(f[0], scipy.sparse.spmatrix): - indices, values, shape = train_utils.scipy_matrix_to_values(f) + if isinstance(f, scipy.sparse.spmatrix): + indices, values, shape = train_utils.scipy_matrix_to_values( + np.array([f]) + ) label_features.append( tf.cast( train_utils.values_to_sparse_tensor(indices, values, shape), @@ -457,9 +459,14 @@ def _build_tf_train_graph( a = self.combine_sparse_dense_features(batch["text_features"], "text") b = self.combine_sparse_dense_features(batch["intent_features"], "intent") - all_label_ids = tf.stack( - [self.labels_to_tensors(v) for v in self._encoded_all_label_ids.values()], - name="all_label_ids", + all_label_ids = 
tf.squeeze( + tf.stack( + [ + self.labels_to_tensors(v) + for v in self._encoded_all_label_ids.values() + ], + name="all_label_ids", + ) ) self.message_embed = self._create_tf_embed_fnn( diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index fe5150a3607c..942bcc38982e 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -306,18 +306,10 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: coo = [x.tocoo() for x in array_of_sparse] data = [v for x in array_of_sparse for v in x.data] - if seq_len == 1: - indices = [ - ids for i, x in enumerate(coo) for ids in zip([i] * len(x.row), x.col) - ] - shape = (len(array_of_sparse), array_of_sparse[0].shape[-1]) - else: - indices = [ - ids - for i, x in enumerate(coo) - for ids in zip([i] * len(x.row), x.row, x.col) - ] - shape = (len(array_of_sparse), seq_len, array_of_sparse[0].shape[-1]) + indices = [ + ids for i, x in enumerate(coo) for ids in zip([i] * len(x.row), x.row, x.col) + ] + shape = (len(array_of_sparse), seq_len, array_of_sparse[0].shape[-1]) return [ np.array(indices).astype(np.int64), From 5d53eb180c8d3bfa0eb6231e87bb0c4cb68aea69 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 8 Nov 2019 17:54:26 +0100 Subject: [PATCH 126/239] fix train utils methods --- .../embedding_intent_classifier.py | 2 -- rasa/utils/train_utils.py | 13 ++++++++--- tests/utils/test_train_utils.py | 23 ++++++++++++------- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 2b6b28935f8e..dcc4ebdcf1d0 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -451,9 +451,7 @@ def _create_tf_embed_fnn( def _build_tf_train_graph( self, session_data: SessionData ) -> Tuple["tf.Tensor", "tf.Tensor"]: - # batch = 1 or 2 a_in values, b_in, intent_ids batch = self._iterator.get_next() - batch = train_utils.batch_to_session_data(batch, session_data) a = self.combine_sparse_dense_features(batch["text_features"], "text") diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 942bcc38982e..66d66c388ea8 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -224,8 +224,13 @@ def balance_session_data( else: skipped[index] = False - for k, v in label_data[index].items(): - new_session_data[k].append(v[data_idx[index] : data_idx[index] + 1][0]) + for k, values in label_data[index].items(): + for i, v in enumerate(values): + if len(new_session_data[k]) < i + 1: + new_session_data[k].append([]) + new_session_data[k][i].append( + v[data_idx[index] : data_idx[index] + 1][0] + ) data_idx[index] += 1 if data_idx[index] >= counts_label_ids[index]: @@ -235,7 +240,9 @@ def balance_session_data( if min(num_data_cycles) > 0: break - new_session_data = {k: np.array(v) for k, v in new_session_data.items()} + new_session_data = { + k: [np.array(v) for v in values] for k, values in new_session_data.items() + } return new_session_data diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 06cb30b2a89d..c9dff846d485 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -76,7 +76,7 @@ def test_split_session_data_by_label(session_data: SessionData): assert len(split_session_data) == 2 for s in split_session_data: - assert len(set(s["intent_ids"])) == 1 + assert len(set(s["intent_ids"][0])) == 1 def 
test_split_session_data_by_incorrect_label(session_data: SessionData): @@ -109,13 +109,18 @@ def test_train_val_split_incorrect_size(session_data: SessionData, size): def test_session_data_for_ids(session_data: SessionData): filtered_session_data = session_data_for_ids(session_data, np.array([0, 1])) - for v in filtered_session_data.values(): - assert v.shape[0] == 2 + for values in filtered_session_data.values(): + for v in values: + assert v.shape[0] == 2 k = list(session_data.keys())[0] - assert np.all(np.array(filtered_session_data[k][0]) == np.array(session_data[k][0])) - assert np.all(np.array(filtered_session_data[k][1]) == np.array(session_data[k][1])) + assert np.all( + np.array(filtered_session_data[k][0][0]) == np.array(session_data[k][0][0]) + ) + assert np.all( + np.array(filtered_session_data[k][0][1]) == np.array(session_data[k][0][1]) + ) def test_get_number_of_examples(session_data: SessionData): @@ -131,7 +136,9 @@ def test_get_number_of_examples_raises_value_error(session_data: SessionData): def test_gen_batch(session_data: SessionData): - iterator = gen_batch(session_data, 2, "intent_ids", shuffle=True) + iterator = gen_batch( + session_data, 2, "intent_ids", shuffle=True, batch_strategy="balanced" + ) batch = next(iterator) assert len(batch) == 7 @@ -155,11 +162,11 @@ def test_gen_batch(session_data: SessionData): ) def test_balance_session_data(session_data: SessionData, intent_ids, expected_labels): # TODO improve test - session_data["intent_ids"] = np.array(intent_ids) + session_data["intent_ids"] = [np.array(intent_ids)] balanced_session_data = balance_session_data(session_data, 2, False, "intent_ids") - labels = balanced_session_data["intent_ids"] + labels = balanced_session_data["intent_ids"][0] assert len(expected_labels) == len(labels) assert np.all(expected_labels == labels) From f79ed36526fee893d9954344cee41ca333265dba Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 8 Nov 2019 17:54:47 +0100 Subject: [PATCH 127/239] convert encoded_all_labels into a list of sparse,dense --- .../embedding_intent_classifier.py | 81 ++++++++++--------- 1 file changed, 43 insertions(+), 38 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 2b6b28935f8e..7faa3e95fa43 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -265,10 +265,10 @@ def _find_example_for_label( @staticmethod def _check_labels_features_exist( - labels_example: List[Tuple[int, "Message"]], attribute: Text + labels_example: List["Message"], attribute: Text ) -> bool: """Check if all labels have features set""" - for (label_idx, label_example) in labels_example: + for label_example in labels_example: if label_example.get( MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute] ) is None and label_example.get( @@ -277,38 +277,14 @@ def _check_labels_features_exist( return False return True - def _extract_labels_precomputed_features( - self, label_examples: List[Tuple[int, "Message"]] - ) -> Dict[int, Dict[Text, Any]]: - - # Collect precomputed encodings - sparse_features = [] - dense_features = [] - - for i, e in label_examples: - self._extract_and_add_features( - e, MESSAGE_INTENT_ATTRIBUTE, sparse_features, dense_features - ) - - encoded_id_labels = defaultdict(list) - - for i, s in zip(label_examples, sparse_features): - encoded_id_labels[i[0]].append(s) - for i, d in zip(label_examples, dense_features): - encoded_id_labels[i[0]].append(d) - - # Sort the dict 
based on label_idx - encoded_id_labels = OrderedDict(sorted(encoded_id_labels.items())) - - return encoded_id_labels - + @staticmethod def _extract_and_add_features( - self, message: "Message", attribute: Text, sparse_features: List[scipy.sparse.spmatrix], dense_features: List[np.ndarray], ): + # we mutate sparse_features and dense_features if message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) is not None: sparse_features.append( message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) @@ -319,31 +295,59 @@ def _extract_and_add_features( message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) ) + def _extract_labels_precomputed_features( + self, label_examples: List["Message"] + ) -> List[np.ndarray]: + + # Collect precomputed encodings + sparse_features = [] + dense_features = [] + + for e in label_examples: + self._extract_and_add_features( + e, MESSAGE_INTENT_ATTRIBUTE, sparse_features, dense_features + ) + + sparse_features = np.array(sparse_features) + dense_features = np.array(dense_features) + + return [sparse_features, dense_features] + + @staticmethod def _compute_default_label_features( - self, labels_example: List[Tuple[int, "Message"]] - ) -> np.ndarray: + labels_example: List[Tuple[int, "Message"]] + ) -> List[np.ndarray]: """Compute one-hot representation for the labels""" - # TODO - return np.eye(len(labels_example)) + + # TODO check: + # features should be sequences + return [np.expand_dims(np.eye(len(labels_example)), axis=1)] def _create_encoded_label_ids( self, training_data: "TrainingData", label_id_dict: Dict[Text, int], attribute: Text, - ) -> np.ndarray: - """Create matrix with label_ids encoded in rows as bag of words. If the features are already computed, fetch - them from the message object else compute a one hot encoding for the label as the feature vector - Find a training example for each label and get the encoded features from the corresponding Message object""" + ) -> List[np.ndarray]: + """Create matrix with label_ids encoded in rows as bag of words. - labels_example = [] + Find a training example for each label and get the encoded features + from the corresponding Message object. + If the features are already computed, fetch them from the message object + else compute a one hot encoding for the label as the feature vector. 
+ """ # Collect one example for each label + labels_idx_example = [] for label_name, idx in label_id_dict.items(): label_example = self._find_example_for_label( label_name, training_data.intent_examples, attribute ) - labels_example.append((idx, label_example)) + labels_idx_example.append((idx, label_example)) + + # Sort the list of tuples based on label_idx + labels_idx_example = sorted(labels_idx_example, key=lambda x: x[0]) + labels_example = [example for (_, example) in labels_idx_example] # Collect features, precomputed if they exist, else compute on the fly if self._check_labels_features_exist(labels_example, attribute): @@ -459,6 +463,7 @@ def _build_tf_train_graph( a = self.combine_sparse_dense_features(batch["text_features"], "text") b = self.combine_sparse_dense_features(batch["intent_features"], "intent") + # TODO change below: (currantly it'll raise aan error) all_label_ids = tf.squeeze( tf.stack( [ From b208db71a8ef512b20834514efabe83073bb8375 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 8 Nov 2019 19:00:25 +0100 Subject: [PATCH 128/239] create sparse matrices, if no intent features provided --- rasa/nlu/classifiers/embedding_intent_classifier.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index b397d4c3b826..e561a672b044 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -320,8 +320,16 @@ def _compute_default_label_features( """Compute one-hot representation for the labels""" # TODO check: - # features should be sequences - return [np.expand_dims(np.eye(len(labels_example)), axis=1)] + return [ + np.array( + [ + scipy.sparse.csr_matrix( + ([1], ([0], [idx])), shape=(1, len(labels_example)) + ) + for idx in range(len(labels_example)) + ] + ) + ] def _create_encoded_label_ids( self, From ee378521a9d10d79b51ce9e73d274f94f99a262b Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 11 Nov 2019 09:21:41 +0100 Subject: [PATCH 129/239] embedding intent classifier is training. 
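
The training graph now stacks the encoded label features out of the per-key lists in session data, and the dataset signature switches to dynamic (None) dimensions because the number of rows in a sparse batch varies from batch to batch. A rough standalone sketch of the (indices, data, shape) style triple that scipy_matrix_to_values builds for such a batch (toy matrices, illustrative names only, not the real training data):

    import numpy as np
    import scipy.sparse

    batch = [scipy.sparse.eye(3, format="csr"), scipy.sparse.eye(4, format="csr")]
    coo = [m.tocoo() for m in batch]
    indices = np.array(
        [ids for i, m in enumerate(coo) for ids in zip([i] * len(m.row), m.row, m.col)]
    )
    data = np.array([v for m in batch for v in m.data])
    # 7 non-zero entries in this toy batch; the count differs between batches,
    # which is why the generator's output shapes are declared as (None, 3) etc.
    print(indices.shape, data.shape)  # (7, 3) (7,)
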
--- rasa/nlu/classifiers/embedding_intent_classifier.py | 3 ++- rasa/utils/train_utils.py | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index e561a672b044..93baf613120e 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -474,7 +474,8 @@ def _build_tf_train_graph( tf.stack( [ self.labels_to_tensors(v) - for v in self._encoded_all_label_ids.values() + for values in self._encoded_all_label_ids + for v in values ], name="all_label_ids", ) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 66d66c388ea8..0099b242d098 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -422,8 +422,8 @@ def _get_shapes_types(session_data: SessionData) -> Tuple: def append_shape(v: np.ndarray): if isinstance(v[0], scipy.sparse.spmatrix): # scipy matrix is converted into indices, data, shape - shapes.append((len(v), v[0].ndim + 1)) - shapes.append((len(v))) + shapes.append((None, v[0].ndim + 1)) + shapes.append((None)) shapes.append((v[0].ndim + 1)) elif v[0].ndim == 0: shapes.append((None)) From b9256bdcbd3ed8970ef693545142118a296ae15d Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 11 Nov 2019 10:39:50 +0100 Subject: [PATCH 130/239] create session data during prediction. --- .../embedding_intent_classifier.py | 70 ++++++++----------- rasa/utils/train_utils.py | 34 +++++---- 2 files changed, 51 insertions(+), 53 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 93baf613120e..f91f75d88d97 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -118,12 +118,10 @@ def __init__( inverted_label_dict: Optional[Dict[int, Text]] = None, session: Optional["tf.Session"] = None, graph: Optional["tf.Graph"] = None, - message_placeholder: Optional["tf.Tensor"] = None, - label_placeholder: Optional["tf.Tensor"] = None, + batch_placeholder: Optional["tf.Tensor"] = None, similarity_all: Optional["tf.Tensor"] = None, pred_confidence: Optional["tf.Tensor"] = None, similarity: Optional["tf.Tensor"] = None, - message_embed: Optional["tf.Tensor"] = None, label_embed: Optional["tf.Tensor"] = None, all_labels_embed: Optional["tf.Tensor"] = None, ) -> None: @@ -142,14 +140,12 @@ def __init__( # tf related instances self.session = session self.graph = graph - self.a_in = message_placeholder - self.b_in = label_placeholder + self.batch = batch_placeholder self.sim_all = similarity_all self.pred_confidence = pred_confidence self.sim = similarity # persisted embeddings - self.message_embed = message_embed self.label_embed = label_embed self.all_labels_embed = all_labels_embed @@ -418,7 +414,7 @@ def _create_session_data( session_data = {} self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense]) self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) - session_data["intent_ids"] = [label_ids] + self._add_to_session_data(session_data, "intent_ids", [label_ids]) return session_data @@ -463,13 +459,12 @@ def _create_tf_embed_fnn( def _build_tf_train_graph( self, session_data: SessionData ) -> Tuple["tf.Tensor", "tf.Tensor"]: - batch = self._iterator.get_next() - batch = train_utils.batch_to_session_data(batch, session_data) + self.batch = self._iterator.get_next() + batch = 
train_utils.batch_to_session_data(self.batch, session_data) a = self.combine_sparse_dense_features(batch["text_features"], "text") b = self.combine_sparse_dense_features(batch["intent_features"], "intent") - # TODO change below: (currantly it'll raise aan error) all_label_ids = tf.squeeze( tf.stack( [ @@ -481,7 +476,7 @@ def _build_tf_train_graph( ) ) - self.message_embed = self._create_tf_embed_fnn( + message_embed = self._create_tf_embed_fnn( a, self.hidden_layer_sizes["text"], fnn_name="text_intent" if self.share_hidden_layers else "text", @@ -502,7 +497,7 @@ def _build_tf_train_graph( ) return train_utils.calculate_loss_acc( - self.message_embed, + message_embed, self.label_embed, b, self.all_labels_embed, @@ -540,43 +535,44 @@ def combine_sparse_dense_features( return output def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": - num_text_features = self._get_num_of_features(session_data, "text_features_") - num_intent_features = self._get_num_of_features( - session_data, "intent_features_" - ) + shapes, types = train_utils.get_shapes_types(session_data) - self.a_in = tf.placeholder( - tf.float32, (None, None, num_text_features), name="text" - ) - self.b_in = tf.placeholder( - tf.float32, (None, None, num_intent_features), name="intent" - ) + batch_placeholder = [] + for s, t in zip(shapes, types): + batch_placeholder.append(tf.placeholder(t, s)) + + self.batch = tf.tuple(batch_placeholder) + + batch = train_utils.batch_to_session_data(self.batch, session_data) + + a = self.combine_sparse_dense_features(batch["text_features"], "text") + b = self.combine_sparse_dense_features(batch["intent_features"], "intent") # TODO check this idea: # self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) - self.message_embed = self._create_tf_embed_fnn( - self.a_in, + message_embed = self._create_tf_embed_fnn( + a, self.hidden_layer_sizes["text"], fnn_name="text_intent" if self.share_hidden_layers else "text", embed_name="text", ) self.sim_all = train_utils.tf_raw_sim( - self.message_embed[:, tf.newaxis, :], + message_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], None, ) self.label_embed = self._create_tf_embed_fnn( - self.b_in, + b, self.hidden_layer_sizes["intent"], fnn_name="text_intent" if self.share_hidden_layers else "intent", embed_name="intent", ) self.sim = train_utils.tf_raw_sim( - self.message_embed[:, tf.newaxis, :], self.label_embed, None + message_embed[:, tf.newaxis, :], self.label_embed, None ) return train_utils.confidence_from_sim(self.sim_all, self.similarity_type) @@ -729,7 +725,7 @@ def train( def _calculate_message_sim(self, X: np.ndarray) -> Tuple[np.ndarray, List[float]]: """Calculate message similarities""" - message_sim = self.session.run(self.pred_confidence, feed_dict={self.a_in: X}) + message_sim = self.session.run(self.pred_confidence, feed_dict={self.batch: X}) message_sim = message_sim.flatten() # sim is a matrix @@ -753,10 +749,8 @@ def predict_label( ) else: - # get features (bag of words/embeddings) for a message - # noinspection PyPep8Naming - X = self._create_session_data([message]) - # TODO convert input + session_data = self._create_session_data([message]) + X = train_utils.prepare_batch(0, 1, session_data) # load tf graph and session label_ids, message_sim = self._calculate_message_sim(X) @@ -804,8 +798,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: if e.errno != errno.EEXIST: raise with self.graph.as_default(): - train_utils.persist_tensor("message_placeholder", 
self.a_in, self.graph) - train_utils.persist_tensor("label_placeholder", self.b_in, self.graph) + train_utils.persist_tensor("batch_placeholder", self.batch, self.graph) train_utils.persist_tensor("similarity_all", self.sim_all, self.graph) train_utils.persist_tensor( @@ -813,7 +806,6 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: ) train_utils.persist_tensor("similarity", self.sim, self.graph) - train_utils.persist_tensor("message_embed", self.message_embed, self.graph) train_utils.persist_tensor("label_embed", self.label_embed, self.graph) train_utils.persist_tensor( "all_labels_embed", self.all_labels_embed, self.graph @@ -856,14 +848,12 @@ def load( saver.restore(session, checkpoint) - a_in = train_utils.load_tensor("message_placeholder") - b_in = train_utils.load_tensor("label_placeholder") + batch = train_utils.load_tensor("batch_placeholder") sim_all = train_utils.load_tensor("similarity_all") pred_confidence = train_utils.load_tensor("pred_confidence") sim = train_utils.load_tensor("similarity") - message_embed = train_utils.load_tensor("message_embed") label_embed = train_utils.load_tensor("label_embed") all_labels_embed = train_utils.load_tensor("all_labels_embed") @@ -877,12 +867,10 @@ def load( inverted_label_dict=inv_label_dict, session=session, graph=graph, - message_placeholder=a_in, - label_placeholder=b_in, + batch_placeholder=batch, similarity_all=sim_all, pred_confidence=pred_confidence, similarity=sim, - message_embed=message_embed, label_embed=label_embed, all_labels_embed=all_labels_embed, ) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 0099b242d098..e55e626389b3 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -295,17 +295,27 @@ def gen_batch( start = batch_num * batch_size end = start + batch_size - batch_data = [] - for values in session_data.values(): - for v in values: - _data = v[start:end] - if isinstance(_data[0], scipy.sparse.spmatrix): - batch_data = batch_data + scipy_matrix_to_values(_data) - else: - batch_data.append(pad_data(_data)) + yield prepare_batch(start, end, session_data) - # len of batch_data is equal to the number of keys in session data - yield tuple(batch_data) + +def prepare_batch(start: int, end: int, session_data: SessionData): + batch_data = [] + + for values in session_data.values(): + # add None for not present values during processing + if not values: + batch_data.append(None) + continue + + for v in values: + _data = v[start:end] + if isinstance(_data[0], scipy.sparse.spmatrix): + batch_data = batch_data + scipy_matrix_to_values(_data) + else: + batch_data.append(pad_data(_data)) + + # len of batch_data is equal to the number of keys in session data + return tuple(batch_data) def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: @@ -403,7 +413,7 @@ def create_tf_dataset( ) -> "tf.data.Dataset": """Create tf dataset.""" - shapes, types = _get_shapes_types(session_data) + shapes, types = get_shapes_types(session_data) return tf.data.Dataset.from_generator( lambda batch_size_: gen_batch( @@ -415,7 +425,7 @@ def create_tf_dataset( ) -def _get_shapes_types(session_data: SessionData) -> Tuple: +def get_shapes_types(session_data: SessionData) -> Tuple: types = [] shapes = [] From 112f06553ec499b89e9dc50d9e16e689067e8713 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 11 Nov 2019 11:55:28 +0100 Subject: [PATCH 131/239] prediction of embedding intent classifier works. 
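
At prediction time the message is converted into session data, turned into a batch of one, and fed through the tuple of placeholders; the feed_dict is built by zipping the placeholders with the batch arrays. A minimal sketch of that feed pattern, assuming the TF 1.x session API used elsewhere in this file (the graph and names below are stand-ins, not the real model):

    import numpy as np
    import tensorflow as tf

    text_in = tf.placeholder(tf.float64, (None, None, 2))
    mask_in = tf.placeholder(tf.float64, (None, None))
    confidence = tf.reduce_sum(text_in) * tf.reduce_sum(mask_in)  # stand-in graph

    batch = (np.ones((1, 3, 2)), np.ones((1, 3)))
    with tf.Session() as sess:
        # each array in the batch tuple is matched to its placeholder by position
        feed = {p: v for p, v in zip((text_in, mask_in), batch)}
        print(sess.run(confidence, feed_dict=feed))  # 18.0 for these toy inputs
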
--- .../embedding_intent_classifier.py | 45 ++++++++++++++----- rasa/utils/train_utils.py | 15 ++++++- 2 files changed, 48 insertions(+), 12 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index f91f75d88d97..e5ab1a2e5b1b 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -1,5 +1,4 @@ import logging -from collections import defaultdict, OrderedDict import numpy as np import os @@ -124,6 +123,7 @@ def __init__( similarity: Optional["tf.Tensor"] = None, label_embed: Optional["tf.Tensor"] = None, all_labels_embed: Optional["tf.Tensor"] = None, + shapes: Optional[Tuple] = None, ) -> None: """Declare instant variables with default values""" @@ -154,6 +154,8 @@ def __init__( self._train_op = None self._is_training = None + self.shapes = shapes + # config migration warning def _check_old_config_variables(self, config: Dict[Text, Any]) -> None: @@ -535,13 +537,12 @@ def combine_sparse_dense_features( return output def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": - shapes, types = train_utils.get_shapes_types(session_data) + self.shapes, types = train_utils.get_shapes_types(session_data) batch_placeholder = [] - for s, t in zip(shapes, types): + for s, t in zip(self.shapes, types): batch_placeholder.append(tf.placeholder(t, s)) - - self.batch = tf.tuple(batch_placeholder) + self.batch = tuple(batch_placeholder) batch = train_utils.batch_to_session_data(self.batch, session_data) @@ -722,10 +723,12 @@ def train( # process helpers # noinspection PyPep8Naming - def _calculate_message_sim(self, X: np.ndarray) -> Tuple[np.ndarray, List[float]]: + def _calculate_message_sim(self, X: Tuple) -> Tuple[np.ndarray, List[float]]: """Calculate message similarities""" - - message_sim = self.session.run(self.pred_confidence, feed_dict={self.batch: X}) + message_sim = self.session.run( + self.pred_confidence, + feed_dict={_x: _x_in for _x, _x_in in zip(self.batch, X)}, + ) message_sim = message_sim.flatten() # sim is a matrix @@ -750,13 +753,28 @@ def predict_label( else: session_data = self._create_session_data([message]) - X = train_utils.prepare_batch(0, 1, session_data) + batch = train_utils.prepare_batch(0, 1, session_data) + + X = [] + if len(batch) != len(self.shapes): + i = 0 + for s in self.shapes: + if i >= len(batch) or batch[i] is None: + if isinstance(s, tuple): + s = tuple([x if x is not None else 1 for x in s]) + elif s is None: + s = 1 + X.append(np.zeros(s)) + else: + X.append(batch[i]) + i += 1 + X = tuple(X) # load tf graph and session label_ids, message_sim = self._calculate_message_sim(X) # if X contains all zeros do not predict some label - if X.any() and label_ids.size > 0: + if label_ids.size > 0: label = { "name": self.inverted_label_dict[label_ids[0]], "confidence": message_sim[0], @@ -822,6 +840,9 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "wb") as f: pickle.dump(self._tf_config, f) + with open(os.path.join(model_dir, file_name + ".shapes.pkl"), "wb") as f: + pickle.dump(self.shapes, f) + return {"file": file_name} @classmethod @@ -862,6 +883,9 @@ def load( ) as f: inv_label_dict = pickle.load(f) + with open(os.path.join(model_dir, file_name + ".shapes.pkl"), "rb") as f: + shapes = pickle.load(f) + return cls( component_config=meta, inverted_label_dict=inv_label_dict, @@ -873,6 +897,7 @@ def load( 
similarity=sim, label_embed=label_embed, all_labels_embed=all_labels_embed, + shapes=shapes, ) else: diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e55e626389b3..65244ab7e561 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -1223,11 +1223,22 @@ def persist_tensor(name: Text, tensor: "tf.Tensor", graph: "tf.Graph") -> None: if tensor is not None: graph.clear_collection(name) - graph.add_to_collection(name, tensor) + if isinstance(tensor, tuple) or isinstance(tensor, list): + for t in tensor: + graph.add_to_collection(name, t) + else: + graph.add_to_collection(name, tensor) def load_tensor(name: Text) -> Optional["tf.Tensor"]: """Load tensor or set it to None""" tensor_list = tf.get_collection(name) - return tensor_list[0] if tensor_list else None + + if tensor_list is None: + return tensor_list + + if len(tensor_list) == 1: + return tensor_list[0] + + return tensor_list From 26354648d59b3c6c9aad33b68eb51911261e82dc Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 11 Nov 2019 13:25:36 +0100 Subject: [PATCH 132/239] clean up code --- .../embedding_intent_classifier.py | 37 +++++++++++-------- rasa/utils/train_utils.py | 6 +-- 2 files changed, 23 insertions(+), 20 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index e5ab1a2e5b1b..d1d3ca374ce8 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -752,23 +752,10 @@ def predict_label( ) else: + # create session data from message and convert it into a batch of 1 session_data = self._create_session_data([message]) batch = train_utils.prepare_batch(0, 1, session_data) - - X = [] - if len(batch) != len(self.shapes): - i = 0 - for s in self.shapes: - if i >= len(batch) or batch[i] is None: - if isinstance(s, tuple): - s = tuple([x if x is not None else 1 for x in s]) - elif s is None: - s = 1 - X.append(np.zeros(s)) - else: - X.append(batch[i]) - i += 1 - X = tuple(X) + X = self._add_missing_placeholder_tensors(batch) # load tf graph and session label_ids, message_sim = self._calculate_message_sim(X) @@ -788,6 +775,26 @@ def predict_label( ] return label, label_ranking + def _add_missing_placeholder_tensors(self, batch): + if self.shapes is not None and len(batch) == len(self.shapes): + return batch + + X = [] + for i, shape in enumerate(self.shapes): + # if features are not present add dummy tensor + if i >= len(batch) or batch[i] is None: + # shape may contain None, replace None by 1 + if isinstance(shape, tuple): + shape = tuple([x if x is not None else 1 for x in shape]) + elif shape is None: + shape = 1 + # add dummy tensor of shape + X.append(np.zeros(shape)) + else: + X.append(batch[i]) + i += 1 + return tuple(X) + def process(self, message: "Message", **kwargs: Any) -> None: """Return the most likely label and its similarity to the input.""" diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 65244ab7e561..6fb71ef919db 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -335,14 +335,10 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: ] -# TODO types, could be tf.Tensor or Tuple for shape +# TODO types, could be tf.Tensor or Tuple for shape - still relevant? 
def values_to_sparse_tensor( indices: np.ndarray, data: np.ndarray, shape: np.ndarray ) -> tf.SparseTensor: - # make sure indices and shape have the correct type - # indices = tf.cast(indices, dtype=tf.int64) - # shape = tf.cast(shape, dtype=tf.int64) - return tf.SparseTensor(indices, data, shape) From 4104574ca78dead1daa183e216c7cdfc534cf753 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 11 Nov 2019 13:55:33 +0100 Subject: [PATCH 133/239] convert encoded all labels the same way as session data --- .../embedding_intent_classifier.py | 60 +++++++++---------- rasa/utils/train_utils.py | 24 +++++--- 2 files changed, 46 insertions(+), 38 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index f91f75d88d97..8b98e36bf9c0 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -140,7 +140,7 @@ def __init__( # tf related instances self.session = session self.graph = graph - self.batch = batch_placeholder + self.batch_in = batch_placeholder self.sim_all = similarity_all self.pred_confidence = pred_confidence self.sim = similarity @@ -186,8 +186,8 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: "hidden_layer_sizes for a and b must coincide" ) - self.batch_size = config["batch_size"] - self.batch_strategy = config["batch_strategy"] + self.batch_in_size = config["batch_size"] + self.batch_in_strategy = config["batch_strategy"] self.epochs = config["epochs"] @@ -293,7 +293,7 @@ def _extract_and_add_features( def _extract_labels_precomputed_features( self, label_examples: List["Message"] - ) -> List[np.ndarray]: + ) -> "SessionData": # Collect precomputed encodings sparse_features = [] @@ -307,7 +307,10 @@ def _extract_labels_precomputed_features( sparse_features = np.array(sparse_features) dense_features = np.array(dense_features) - return [sparse_features, dense_features] + data = {} + self._add_to_session_data(data, "intent_features", [sparse_features, dense_features]) + + return data @staticmethod def _compute_default_label_features( @@ -459,22 +462,19 @@ def _create_tf_embed_fnn( def _build_tf_train_graph( self, session_data: SessionData ) -> Tuple["tf.Tensor", "tf.Tensor"]: - self.batch = self._iterator.get_next() - batch = train_utils.batch_to_session_data(self.batch, session_data) - a = self.combine_sparse_dense_features(batch["text_features"], "text") - b = self.combine_sparse_dense_features(batch["intent_features"], "intent") + # get in tensors from generator + self.batch_in = self._iterator.get_next() + # convert encoded all labels into the batch format + all_label_ids_batch = train_utils.prepare_batch(self._encoded_all_label_ids) - all_label_ids = tf.squeeze( - tf.stack( - [ - self.labels_to_tensors(v) - for values in self._encoded_all_label_ids - for v in values - ], - name="all_label_ids", - ) - ) + # convert batch format into sparse and dense tensors + batch_in = train_utils.batch_to_session_data(self.batch_in, session_data) + all_label_ids_batch = train_utils.batch_to_session_data(all_label_ids_batch, self._encoded_all_label_ids) + + a = self.combine_sparse_dense_features(batch_in["text_features"], "text") + b = self.combine_sparse_dense_features(batch_in["intent_features"], "intent") + all_label_ids = self.combine_sparse_dense_features(all_label_ids_batch["intent_features"], "intent") message_embed = self._create_tf_embed_fnn( a, @@ -541,12 +541,12 @@ def _build_tf_pred_graph(self, session_data: 
"SessionData") -> "tf.Tensor": for s, t in zip(shapes, types): batch_placeholder.append(tf.placeholder(t, s)) - self.batch = tf.tuple(batch_placeholder) + self.batch_in = tf.tuple(batch_placeholder) - batch = train_utils.batch_to_session_data(self.batch, session_data) + batch_in = train_utils.batch_to_session_data(self.batch_in, session_data) - a = self.combine_sparse_dense_features(batch["text_features"], "text") - b = self.combine_sparse_dense_features(batch["intent_features"], "intent") + a = self.combine_sparse_dense_features(batch_in["text_features"], "text") + b = self.combine_sparse_dense_features(batch_in["intent_features"], "intent") # TODO check this idea: # self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) @@ -689,7 +689,7 @@ def train( session_data, eval_session_data, batch_size_in, - self.batch_strategy, + self.batch_in_strategy, label_key="intent_ids", ) @@ -712,7 +712,7 @@ def train( self.session, self._is_training, self.epochs, - self.batch_size, + self.batch_in_size, self.evaluate_on_num_examples, self.evaluate_every_num_epochs, ) @@ -725,7 +725,7 @@ def train( def _calculate_message_sim(self, X: np.ndarray) -> Tuple[np.ndarray, List[float]]: """Calculate message similarities""" - message_sim = self.session.run(self.pred_confidence, feed_dict={self.batch: X}) + message_sim = self.session.run(self.pred_confidence, feed_dict={self.batch_in: X}) message_sim = message_sim.flatten() # sim is a matrix @@ -750,7 +750,7 @@ def predict_label( else: session_data = self._create_session_data([message]) - X = train_utils.prepare_batch(0, 1, session_data) + X = train_utils.prepare_batch(session_data) # load tf graph and session label_ids, message_sim = self._calculate_message_sim(X) @@ -798,7 +798,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: if e.errno != errno.EEXIST: raise with self.graph.as_default(): - train_utils.persist_tensor("batch_placeholder", self.batch, self.graph) + train_utils.persist_tensor("batch_placeholder", self.batch_in, self.graph) train_utils.persist_tensor("similarity_all", self.sim_all, self.graph) train_utils.persist_tensor( @@ -848,7 +848,7 @@ def load( saver.restore(session, checkpoint) - batch = train_utils.load_tensor("batch_placeholder") + batch_in = train_utils.load_tensor("batch_placeholder") sim_all = train_utils.load_tensor("similarity_all") pred_confidence = train_utils.load_tensor("pred_confidence") @@ -867,7 +867,7 @@ def load( inverted_label_dict=inv_label_dict, session=session, graph=graph, - batch_placeholder=batch, + batch_placeholder=batch_in, similarity_all=sim_all, pred_confidence=pred_confidence, similarity=sim, diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e55e626389b3..4440608ec90a 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -11,8 +11,8 @@ Union, Generator, Callable, + ValuesView, Any, - NamedTuple, ) import numpy as np from tqdm import tqdm @@ -295,10 +295,10 @@ def gen_batch( start = batch_num * batch_size end = start + batch_size - yield prepare_batch(start, end, session_data) + yield prepare_batch(session_data, start, end) -def prepare_batch(start: int, end: int, session_data: SessionData): +def prepare_batch(session_data: SessionData, start: Optional[int] = None, end: Optional[int] = None): batch_data = [] for values in session_data.values(): @@ -308,7 +308,15 @@ def prepare_batch(start: int, end: int, session_data: SessionData): continue for v in values: - _data = v[start:end] + if start is not None and end is not None: 
+ _data = v[start:end] + elif start is not None: + _data = v[start:] + elif end is not None: + _data = v[:end] + else: + _data = v[:] + if isinstance(_data[0], scipy.sparse.spmatrix): batch_data = batch_data + scipy_matrix_to_values(_data) else: @@ -330,7 +338,7 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: return [ np.array(indices).astype(np.int64), - np.array(data), + np.array(data).astype(np.float64), np.array(shape).astype(np.int64), ] @@ -340,8 +348,8 @@ def values_to_sparse_tensor( indices: np.ndarray, data: np.ndarray, shape: np.ndarray ) -> tf.SparseTensor: # make sure indices and shape have the correct type - # indices = tf.cast(indices, dtype=tf.int64) - # shape = tf.cast(shape, dtype=tf.int64) + indices = tf.cast(indices, dtype=tf.int64) + shape = tf.cast(shape, dtype=tf.int64) return tf.SparseTensor(indices, data, shape) @@ -369,7 +377,7 @@ def pad_data(data: np.ndarray) -> np.ndarray: for i in range(data_size): data_padded[i, : data[i].shape[0], :] = data[i] - return data_padded + return data_padded.astype(np.float64) def batch_to_session_data( From 9d665755bae22395687db27150aaf7567f7cc61f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 11 Nov 2019 14:22:59 +0100 Subject: [PATCH 134/239] add mask --- .../embedding_intent_classifier.py | 27 ++++++++++++++++--- rasa/utils/train_utils.py | 11 +++++--- 2 files changed, 31 insertions(+), 7 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index caef926559e5..d14de67e53a7 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -310,7 +310,9 @@ def _extract_labels_precomputed_features( dense_features = np.array(dense_features) data = {} - self._add_to_session_data(data, "intent_features", [sparse_features, dense_features]) + self._add_to_session_data( + data, "intent_features", [sparse_features, dense_features] + ) return data @@ -400,6 +402,10 @@ def _create_session_data( Y_sparse = [] Y_dense = [] label_ids = [] + masks = [] + + # TODO should be variable + max_seq_len = max([len(e.get("tokens")) for e in training_data]) for e in training_data: self._extract_and_add_features(e, MESSAGE_TEXT_ATTRIBUTE, X_sparse, X_dense) @@ -410,13 +416,19 @@ def _create_session_data( if e.get(attribute): label_ids.append(label_id_dict[e.get(attribute)]) + mask = np.zeros(max_seq_len) + mask[0 : len(e.get("tokens"))] = 1 + masks.append(mask) + X_sparse = np.array(X_sparse) X_dense = np.array(X_dense) Y_sparse = np.array(Y_sparse) Y_dense = np.array(Y_dense) label_ids = np.array(label_ids) + masks = np.array(masks) session_data = {} + self._add_to_session_data(session_data, "masks", [masks]) self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense]) self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) self._add_to_session_data(session_data, "intent_ids", [label_ids]) @@ -472,11 +484,15 @@ def _build_tf_train_graph( # convert batch format into sparse and dense tensors batch_in = train_utils.batch_to_session_data(self.batch_in, session_data) - all_label_ids_batch = train_utils.batch_to_session_data(all_label_ids_batch, self._encoded_all_label_ids) + all_label_ids_batch = train_utils.batch_to_session_data( + all_label_ids_batch, self._encoded_all_label_ids + ) a = self.combine_sparse_dense_features(batch_in["text_features"], "text") b = self.combine_sparse_dense_features(batch_in["intent_features"], "intent") - 
all_label_ids = self.combine_sparse_dense_features(all_label_ids_batch["intent_features"], "intent") + all_label_ids = self.combine_sparse_dense_features( + all_label_ids_batch["intent_features"], "intent" + ) message_embed = self._create_tf_embed_fnn( a, @@ -793,6 +809,11 @@ def _add_missing_placeholder_tensors(self, batch): shape = 1 # add dummy tensor of shape X.append(np.zeros(shape)) + # TODO + elif (isinstance(shape, tuple) or isinstance(shape, list)) and batch[ + i + ].shape[-1] != shape[-1]: + X.append(train_utils.pad_data(batch[i], feature_len=shape[-1])) else: X.append(batch[i]) i += 1 diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 2bc51f9559da..2b50a3ee4813 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -298,7 +298,9 @@ def gen_batch( yield prepare_batch(session_data, start, end) -def prepare_batch(session_data: SessionData, start: Optional[int] = None, end: Optional[int] = None): +def prepare_batch( + session_data: SessionData, start: Optional[int] = None, end: Optional[int] = None +): batch_data = [] for values in session_data.values(): @@ -351,7 +353,7 @@ def values_to_sparse_tensor( return tf.SparseTensor(indices, data, shape) -def pad_data(data: np.ndarray) -> np.ndarray: +def pad_data(data: np.ndarray, feature_len: Optional[int] = None) -> np.ndarray: """ Pad data of different lengths. Data is padded with zeros. Zeros are added to the beginning of data. @@ -360,7 +362,8 @@ def pad_data(data: np.ndarray) -> np.ndarray: return data data_size = len(data) - feature_len = max([x.shape[-1] for x in data]) + if feature_len is None: + feature_len = max([x.shape[-1] for x in data]) if data[0].ndim == 1: data_padded = np.zeros([data_size, feature_len], dtype=data[0].dtype) @@ -454,7 +457,7 @@ def append_type(v: np.ndarray): types.append(tf.float64) types.append(tf.int64) else: - types.append(v.dtype) + types.append(v[0].dtype) for values in session_data.values(): for v in values: From 1c4591ee9b27c1d645caddb3397d0c01c423604f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 11 Nov 2019 14:48:00 +0100 Subject: [PATCH 135/239] check if tokens are present --- rasa/nlu/tokenizers/tokenizer.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index bd9ec25d1475..2c19d1d00ae2 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -51,6 +51,7 @@ def add_cls_token( if ( attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] and self.use_cls_token + and tokens ): # +1 to have a space between the last token and the __cls__ token idx = tokens[-1].offset + len(tokens[-1].text) + 1 From 7889033e2a29bc018772946a8e3bb34b4c4ffb76 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 11 Nov 2019 15:43:37 +0100 Subject: [PATCH 136/239] add TODO --- rasa/nlu/classifiers/embedding_intent_classifier.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index d14de67e53a7..253cc6b6e41d 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -430,6 +430,7 @@ def _create_session_data( session_data = {} self._add_to_session_data(session_data, "masks", [masks]) self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense]) + # TODO there might be no features for Y, in this case need to create sparse 1-hot encoding like we do for 
encoded_all_labels self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) self._add_to_session_data(session_data, "intent_ids", [label_ids]) @@ -493,7 +494,9 @@ def _build_tf_train_graph( all_label_ids = self.combine_sparse_dense_features( all_label_ids_batch["intent_features"], "intent" ) - + print(session_data["intent_ids"]) + print(batch_in["intent_ids"]) + exit() message_embed = self._create_tf_embed_fnn( a, self.hidden_layer_sizes["text"], From 6f20dbcbac948fc3596274f0150e556ef0a91f46 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 11 Nov 2019 17:03:44 +0100 Subject: [PATCH 137/239] fix wrong embed layer --- .../embedding_intent_classifier.py | 45 +++---------------- rasa/utils/train_utils.py | 11 ++--- 2 files changed, 13 insertions(+), 43 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 253cc6b6e41d..f34287633b56 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -370,25 +370,6 @@ def _create_encoded_label_ids( return encoded_id_labels - def labels_to_tensors(self, features: List[np.ndarray]): - label_features = [] - - for f in features: - if isinstance(f, scipy.sparse.spmatrix): - indices, values, shape = train_utils.scipy_matrix_to_values( - np.array([f]) - ) - label_features.append( - tf.cast( - train_utils.values_to_sparse_tensor(indices, values, shape), - tf.float64, - ) - ) - else: - label_features.append(tf.cast(f, tf.float64)) - - return self.combine_sparse_dense_features(label_features, "label") - # noinspection PyPep8Naming def _create_session_data( self, @@ -485,18 +466,16 @@ def _build_tf_train_graph( # convert batch format into sparse and dense tensors batch_in = train_utils.batch_to_session_data(self.batch_in, session_data) - all_label_ids_batch = train_utils.batch_to_session_data( + encoded_all_label_ids_batch = train_utils.batch_to_session_data( all_label_ids_batch, self._encoded_all_label_ids ) a = self.combine_sparse_dense_features(batch_in["text_features"], "text") b = self.combine_sparse_dense_features(batch_in["intent_features"], "intent") - all_label_ids = self.combine_sparse_dense_features( - all_label_ids_batch["intent_features"], "intent" + encoded_all_label_ids = self.combine_sparse_dense_features( + encoded_all_label_ids_batch["intent_features"], "intent" ) - print(session_data["intent_ids"]) - print(batch_in["intent_ids"]) - exit() + message_embed = self._create_tf_embed_fnn( a, self.hidden_layer_sizes["text"], @@ -511,10 +490,10 @@ def _build_tf_train_graph( ) self.all_labels_embed = self._create_tf_embed_fnn( - all_label_ids, + encoded_all_label_ids, self.hidden_layer_sizes["intent"], fnn_name="text_intent" if self.share_hidden_layers else "intent", - embed_name="all_intents", + embed_name="intent", ) return train_utils.calculate_loss_acc( @@ -522,7 +501,7 @@ def _build_tf_train_graph( self.label_embed, b, self.all_labels_embed, - all_label_ids, + encoded_all_label_ids, self.num_neg, None, self.loss_type, @@ -637,16 +616,6 @@ def preprocess_train_data(self, training_data: "TrainingData"): training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE ) - # check if number of negatives is less than number of label_ids - logger.debug( - "Check if num_neg {} is smaller than " - "number of label_ids {}, " - "else set num_neg to the number of label_ids - 1" - "".format(self.num_neg, len(self._encoded_all_label_ids)) - ) - # noinspection PyAttributeOutsideInit - 
self.num_neg = min(self.num_neg, len(self._encoded_all_label_ids) - 1) - session_data = self._create_session_data( training_data.intent_examples, label_id_dict, diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 2b50a3ee4813..1243b65c404a 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -685,15 +685,16 @@ def _tf_calc_iou_mask( pos_b: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" ) -> "tf.Tensor": """Calculate IOU mask for given indices""" - pos_b_in_flat = tf.expand_dims(pos_b, -2) neg_b_in_flat = _tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) - intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) - union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) + return tf.cast(tf.reduce_all(tf.equal(neg_b_in_flat, pos_b_in_flat), axis=-1), pos_b_in_flat.dtype) - iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum(union_b_in_flat, -1) - return 1.0 - tf.nn.relu(tf.sign(1.0 - iou)) + # intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) + # union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) + # + # iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum(union_b_in_flat, -1) + # return 1.0 - tf.nn.relu(tf.sign(1.0 - iou)) def _tf_get_negs( From 6cf4385afe09d3beee67a2713f1b0dde18fcafa5 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 11 Nov 2019 17:11:11 +0100 Subject: [PATCH 138/239] more consistent var naming --- .../embedding_intent_classifier.py | 31 +++++++++---------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index f34287633b56..39967010b709 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -135,7 +135,7 @@ def __init__( # transform numbers to labels self.inverted_label_dict = inverted_label_dict # encode all label_ids with numbers - self._encoded_all_label_ids = None + self._label_data = None # tf related instances self.session = session @@ -462,18 +462,16 @@ def _build_tf_train_graph( # get in tensors from generator self.batch_in = self._iterator.get_next() # convert encoded all labels into the batch format - all_label_ids_batch = train_utils.prepare_batch(self._encoded_all_label_ids) + label_batch = train_utils.prepare_batch(self._label_data) # convert batch format into sparse and dense tensors - batch_in = train_utils.batch_to_session_data(self.batch_in, session_data) - encoded_all_label_ids_batch = train_utils.batch_to_session_data( - all_label_ids_batch, self._encoded_all_label_ids - ) + batch_data = train_utils.batch_to_session_data(self.batch_in, session_data) + label_data = train_utils.batch_to_session_data(label_batch, self._label_data) - a = self.combine_sparse_dense_features(batch_in["text_features"], "text") - b = self.combine_sparse_dense_features(batch_in["intent_features"], "intent") - encoded_all_label_ids = self.combine_sparse_dense_features( - encoded_all_label_ids_batch["intent_features"], "intent" + a = self.combine_sparse_dense_features(batch_data["text_features"], "text") + b = self.combine_sparse_dense_features(batch_data["intent_features"], "intent") + all_bs = self.combine_sparse_dense_features( + label_data["intent_features"], "intent" ) message_embed = self._create_tf_embed_fnn( @@ -488,9 +486,8 @@ def _build_tf_train_graph( fnn_name="text_intent" if self.share_hidden_layers else "intent", embed_name="intent", ) - self.all_labels_embed = 
self._create_tf_embed_fnn( - encoded_all_label_ids, + all_bs, self.hidden_layer_sizes["intent"], fnn_name="text_intent" if self.share_hidden_layers else "intent", embed_name="intent", @@ -501,7 +498,7 @@ def _build_tf_train_graph( self.label_embed, b, self.all_labels_embed, - encoded_all_label_ids, + all_bs, self.num_neg, None, self.loss_type, @@ -543,10 +540,10 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": self.batch_in = tf.tuple(batch_placeholder) - batch_in = train_utils.batch_to_session_data(self.batch_in, session_data) + batch_data = train_utils.batch_to_session_data(self.batch_in, session_data) - a = self.combine_sparse_dense_features(batch_in["text_features"], "text") - b = self.combine_sparse_dense_features(batch_in["intent_features"], "intent") + a = self.combine_sparse_dense_features(batch_data["text_features"], "text") + b = self.combine_sparse_dense_features(batch_data["intent_features"], "intent") # TODO check this idea: # self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) @@ -612,7 +609,7 @@ def preprocess_train_data(self, training_data: "TrainingData"): self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} - self._encoded_all_label_ids = self._create_encoded_label_ids( + self._label_data = self._create_encoded_label_ids( training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE ) From bcc52c114c603789d2b83d7ea18df32e200c61b6 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 09:05:08 +0100 Subject: [PATCH 139/239] fix balance session data --- rasa/utils/train_utils.py | 19 +++++++++++++------ tests/utils/test_train_utils.py | 18 +++++++----------- 2 files changed, 20 insertions(+), 17 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 1243b65c404a..a3f68d148622 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -210,6 +210,7 @@ def balance_session_data( skipped = [False] * num_label_ids new_session_data = defaultdict(list) + num_examples = get_number_of_examples(session_data) while min(num_data_cycles) == 0: if shuffle: @@ -224,15 +225,19 @@ def balance_session_data( else: skipped[index] = False + index_batch_size = ( + int(counts_label_ids[index] / num_examples * batch_size) + 1 + ) + for k, values in label_data[index].items(): for i, v in enumerate(values): if len(new_session_data[k]) < i + 1: new_session_data[k].append([]) new_session_data[k][i].append( - v[data_idx[index] : data_idx[index] + 1][0] + v[data_idx[index] : data_idx[index] + index_batch_size] ) - data_idx[index] += 1 + data_idx[index] += index_batch_size if data_idx[index] >= counts_label_ids[index]: num_data_cycles[index] += 1 data_idx[index] = 0 @@ -240,11 +245,13 @@ def balance_session_data( if min(num_data_cycles) > 0: break - new_session_data = { - k: [np.array(v) for v in values] for k, values in new_session_data.items() - } + updated = {} + for k, values in new_session_data.items(): + updated[k] = [] + for v in values: + updated[k].append(np.concatenate(np.array(v))) - return new_session_data + return updated def concatenate_data( diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index c9dff846d485..7c0ba8e400be 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -156,17 +156,13 @@ def test_gen_batch(session_data: SessionData): next(iterator) -@pytest.mark.parametrize( - "intent_ids, expected_labels", - [([0, 0, 0, 1, 1], [0, 1, 0, 1, 0]), ([0, 0, 0, 0, 1], [0, 1, 0, 0, 1, 0])], -) -def 
test_balance_session_data(session_data: SessionData, intent_ids, expected_labels): - # TODO improve test - session_data["intent_ids"] = [np.array(intent_ids)] - +def test_balance_session_data(session_data: SessionData): balanced_session_data = balance_session_data(session_data, 2, False, "intent_ids") - labels = balanced_session_data["intent_ids"][0] + for k, values in session_data.items(): + assert k in balanced_session_data + + for i, v in enumerate(values): + assert len(v) == len(balanced_session_data[k][i]) - assert len(expected_labels) == len(labels) - assert np.all(expected_labels == labels) + assert np.all(balanced_session_data["intent_ids"][0] == np.array([0, 1, 1, 0, 1])) From c161a435f73701a2a512049016f9648710422d93 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 09:31:10 +0100 Subject: [PATCH 140/239] add comments --- .../embedding_intent_classifier.py | 6 ++- rasa/utils/train_utils.py | 54 +++++++------------ 2 files changed, 24 insertions(+), 36 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 39967010b709..f3e43a7b5b12 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -461,6 +461,7 @@ def _build_tf_train_graph( # get in tensors from generator self.batch_in = self._iterator.get_next() + # convert encoded all labels into the batch format label_batch = train_utils.prepare_batch(self._label_data) @@ -764,12 +765,13 @@ def predict_label( return label, label_ranking def _add_missing_placeholder_tensors(self, batch): + # check if all data is already present if self.shapes is not None and len(batch) == len(self.shapes): return batch X = [] + # if features are not present add dummy tensor for i, shape in enumerate(self.shapes): - # if features are not present add dummy tensor if i >= len(batch) or batch[i] is None: # shape may contain None, replace None by 1 if isinstance(shape, tuple): @@ -778,7 +780,7 @@ def _add_missing_placeholder_tensors(self, batch): shape = 1 # add dummy tensor of shape X.append(np.zeros(shape)) - # TODO + # TODO mask elif (isinstance(shape, tuple) or isinstance(shape, list)) and batch[ i ].shape[-1] != shape[-1]: diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index a3f68d148622..9ae3c6e73440 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -2,18 +2,7 @@ import logging import scipy.sparse import typing -from typing import ( - List, - Optional, - Text, - Dict, - Tuple, - Union, - Generator, - Callable, - ValuesView, - Any, -) +from typing import List, Optional, Text, Dict, Tuple, Union, Generator, Callable, Any import numpy as np from tqdm import tqdm from sklearn.model_selection import train_test_split @@ -103,8 +92,9 @@ def check_train_test_sizes( if evaluate_on_num_examples >= num_examples - len(label_counts): raise ValueError( - f"Validation set of {evaluate_on_num_examples} is too large. Remaining train set " - "should be at least equal to number of classes {len(label_counts)}." + f"Validation set of {evaluate_on_num_examples} is too large. Remaining " + f"train set should be at least equal to number of classes " + f"{len(label_counts)}." ) elif evaluate_on_num_examples < len(label_counts): raise ValueError( @@ -122,8 +112,7 @@ def convert_train_test_split( session_data_val = defaultdict(list) # output_values = x_train, x_val, y_train, y_val, z_train, z_val, etc. 
- # order is kept, so first session_data.X values, then session_data.Y values, and - # finally session_data.labels values + # order is kept, e.g. same order as session data keys # train datasets have an even index for i in range(len(session_data)): @@ -142,6 +131,7 @@ def combine_features( feature_1: Union[np.ndarray, scipy.sparse.spmatrix], feature_2: Union[np.ndarray, scipy.sparse.spmatrix], ) -> Union[np.ndarray, scipy.sparse.spmatrix]: + """Concatenate features.""" if isinstance(feature_1, scipy.sparse.spmatrix) and isinstance( feature_2, scipy.sparse.spmatrix ): @@ -175,7 +165,7 @@ def split_session_data_by_label( ) -> List["SessionData"]: """Reorganize session data into a list of session data with the same labels.""" if label_key not in session_data or len(session_data[label_key]) > 1: - raise ValueError(f"Key '{label_key}' not in SessionData.labels.") + raise ValueError(f"Key '{label_key}' not in SessionData.") label_data = [] for label_id in unique_label_ids: @@ -195,7 +185,7 @@ def balance_session_data( that more populated classes should appear more often. """ if label_key not in session_data or len(session_data[label_key]) > 1: - raise ValueError(f"Key '{label_key}' not in SessionData.labels.") + raise ValueError(f"Key '{label_key}' not in SessionData.") unique_label_ids, counts_label_ids = np.unique( session_data[label_key][0], return_counts=True, axis=0 @@ -254,19 +244,10 @@ def balance_session_data( return updated -def concatenate_data( - data_dict: Dict[Text, Union[np.ndarray, List[scipy.sparse.spmatrix]]] -) -> Dict[Text, Union[np.ndarray, List[scipy.sparse.spmatrix]]]: - new_dict = {} - for k, v in data_dict.items(): - if isinstance(v[0], scipy.sparse.spmatrix): - new_dict[k] = scipy.sparse.vstack(v) - else: - new_dict[k] = np.concatenate(v) - return new_dict - - def get_number_of_examples(session_data: SessionData): + """Obtain number of examples in session data. + Raise a ValueError if number of examples differ for different data in session data. + """ example_lengths = [v.shape[0] for values in session_data.values() for v in values] # check if number of examples is the same for all X @@ -308,6 +289,7 @@ def gen_batch( def prepare_batch( session_data: SessionData, start: Optional[int] = None, end: Optional[int] = None ): + """Slices session data into batch using given start and end value.""" batch_data = [] for values in session_data.values(): @@ -336,6 +318,7 @@ def prepare_batch( def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: + """Convert a scipy matrix into inidces, data, and shape.""" seq_len = max([x.shape[0] for x in array_of_sparse]) coo = [x.tocoo() for x in array_of_sparse] data = [v for x in array_of_sparse for v in x.data] @@ -352,11 +335,10 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: ] -# TODO types, could be tf.Tensor or Tuple for shape - still relevant? 
def values_to_sparse_tensor( - indices: np.ndarray, data: np.ndarray, shape: np.ndarray + indices: np.ndarray, data: np.ndarray, shape: Union[np.ndarray, List] ) -> tf.SparseTensor: - + """Create a Sparse Tensor from given indices, data, and shape.""" return tf.SparseTensor(indices, data, shape) @@ -441,6 +423,7 @@ def create_tf_dataset( def get_shapes_types(session_data: SessionData) -> Tuple: + """Extract shapes and types from session data.""" types = [] shapes = [] @@ -695,7 +678,10 @@ def _tf_calc_iou_mask( pos_b_in_flat = tf.expand_dims(pos_b, -2) neg_b_in_flat = _tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) - return tf.cast(tf.reduce_all(tf.equal(neg_b_in_flat, pos_b_in_flat), axis=-1), pos_b_in_flat.dtype) + return tf.cast( + tf.reduce_all(tf.equal(neg_b_in_flat, pos_b_in_flat), axis=-1), + pos_b_in_flat.dtype, + ) # intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) # union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) From c7e3251b5d355124d3633e86abc4220fee4fe19a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 10:00:31 +0100 Subject: [PATCH 141/239] extract dense_dim from dense features --- .../embedding_intent_classifier.py | 38 ++++++++++++++----- .../selectors/embedding_response_selector.py | 25 ++++-------- 2 files changed, 35 insertions(+), 28 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index f3e43a7b5b12..21f6f6cc1737 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -78,6 +78,8 @@ class EmbeddingIntentClassifier(Component): # set random seed to any int to get reproducible results "random_seed": None, # embedding parameters + # default dense dimension used if no dense features are present + "dense_dim": 512, # dimension size of embedding vectors "embed_dim": 20, # the type of the similarity @@ -131,7 +133,6 @@ def __init__( self._load_params() - self.dense_dim = 512 # TODO make configurable /extract form dense features # transform numbers to labels self.inverted_label_dict = inverted_label_dict # encode all label_ids with numbers @@ -198,6 +199,7 @@ def _load_nn_architecture_params(self, config: Dict[Text, Any]) -> None: def _load_embedding_params(self, config: Dict[Text, Any]) -> None: self.embed_dim = config["embed_dim"] self.num_neg = config["num_neg"] + self.dense_dim = config["dense_dim"] self.similarity_type = config["similarity_type"] self.loss_type = config["loss_type"] @@ -469,10 +471,14 @@ def _build_tf_train_graph( batch_data = train_utils.batch_to_session_data(self.batch_in, session_data) label_data = train_utils.batch_to_session_data(label_batch, self._label_data) - a = self.combine_sparse_dense_features(batch_data["text_features"], "text") - b = self.combine_sparse_dense_features(batch_data["intent_features"], "intent") + a = self.combine_sparse_dense_features( + batch_data["text_features"], session_data["text_features"], "text" + ) + b = self.combine_sparse_dense_features( + batch_data["intent_features"], session_data["intent_features"], "intent" + ) all_bs = self.combine_sparse_dense_features( - label_data["intent_features"], "intent" + label_data["intent_features"], self._label_data["intent_features"], "intent" ) message_embed = self._create_tf_embed_fnn( @@ -511,17 +517,25 @@ def _build_tf_train_graph( ) def combine_sparse_dense_features( - self, features: List[Union[tf.Tensor, tf.SparseTensor]], name: Text + self, + features: List[Union[tf.Tensor, 
tf.SparseTensor]], + session_data: List[np.ndarray], + name: Text, ) -> tf.Tensor: dense_features = [] + dense_dim = self.dense_dim + # if dense features are present use the feature dimension of the dense features + for d in session_data: + if not isinstance(d[0], scipy.sparse.spmatrix): + dense_dim = d[0].shape[-1] + break + for f in features: if isinstance(f, tf.SparseTensor): dense_features.append( - train_utils.tf_dense_layer_for_sparse( - f, self.dense_dim, name, self.C2 - ) + train_utils.tf_dense_layer_for_sparse(f, dense_dim, name, self.C2) ) else: dense_features.append(f) @@ -543,8 +557,12 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": batch_data = train_utils.batch_to_session_data(self.batch_in, session_data) - a = self.combine_sparse_dense_features(batch_data["text_features"], "text") - b = self.combine_sparse_dense_features(batch_data["intent_features"], "intent") + a = self.combine_sparse_dense_features( + batch_data["text_features"], session_data["text_features"], "text" + ) + b = self.combine_sparse_dense_features( + batch_data["intent_features"], session_data["intent_features"], "intent" + ) # TODO check this idea: # self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py index 271c32c89b80..2f05b9766ad1 100644 --- a/rasa/nlu/selectors/embedding_response_selector.py +++ b/rasa/nlu/selectors/embedding_response_selector.py @@ -67,6 +67,8 @@ class ResponseSelector(EmbeddingIntentClassifier): # set random seed to any int to get reproducible results "random_seed": None, # embedding parameters + # default dense dimension used if no dense features are present + "dense_dim": 512, # dimension size of embedding vectors "embed_dim": 20, # the type of the similarity @@ -141,27 +143,14 @@ def preprocess_train_data(self, training_data): ) self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} - self._encoded_all_label_ids = self._create_encoded_label_ids( - training_data, - label_id_dict, - attribute=MESSAGE_RESPONSE_ATTRIBUTE, - attribute_feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ - MESSAGE_RESPONSE_ATTRIBUTE - ], - ) - - # check if number of negatives is less than number of label_ids - logger.debug( - "Check if num_neg {} is smaller than " - "number of label_ids {}, " - "else set num_neg to the number of label_ids - 1" - "".format(self.num_neg, self._encoded_all_label_ids.shape[0]) + self._label_data = self._create_encoded_label_ids( + training_data, label_id_dict, attribute=MESSAGE_RESPONSE_ATTRIBUTE ) - # noinspection PyAttributeOutsideInit - self.num_neg = min(self.num_neg, self._encoded_all_label_ids.shape[0] - 1) session_data = self._create_session_data( - training_data, label_id_dict, attribute=MESSAGE_RESPONSE_ATTRIBUTE + training_data.intent_examples, + label_id_dict, + attribute=MESSAGE_RESPONSE_ATTRIBUTE, ) self.check_input_dimension_consistency(session_data) From b98bab6bac6ae489ed315c064f6ff63863b4d5cc Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 10:10:31 +0100 Subject: [PATCH 142/239] Fix test_train test. 
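
The maximum sequence length for the mask is now derived from the sparse text features instead of the tokens, and it falls back to a fixed value when there is nothing to measure. A rough standalone sketch of that guard (the dictionary key is made up; the real code looks features up via MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], and the fallback of 25 is the value used in the patch):

    import scipy.sparse

    examples = [
        {"text_sparse_features": scipy.sparse.csr_matrix((3, 10))},
        {"text_sparse_features": scipy.sparse.csr_matrix((5, 10))},
    ]
    # number of rows of each sparse feature matrix = sequence length of that example
    seq_len = [e["text_sparse_features"].shape[0] for e in examples]
    # fall back to a fixed length when no sequence lengths could be collected
    max_seq_len = max(seq_len) if seq_len else 25
    print(max_seq_len)  # 5 here, 25 when the list is empty
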
--- rasa/nlu/classifiers/embedding_intent_classifier.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 21f6f6cc1737..5b7fe90562cf 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -388,7 +388,12 @@ def _create_session_data( masks = [] # TODO should be variable - max_seq_len = max([len(e.get("tokens")) for e in training_data]) + # TODO what if not present? use default value? raise error? + seq_len = [ + len(e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE])) + for e in training_data + ] + max_seq_len = max(seq_len) if seq_len else 25 for e in training_data: self._extract_and_add_features(e, MESSAGE_TEXT_ATTRIBUTE, X_sparse, X_dense) From 615bb62a3ae176917e823543fe4af915c916c9ec Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 11:16:12 +0100 Subject: [PATCH 143/239] _compute_default_label_features works as expected --- .../embedding_intent_classifier.py | 73 ++++++++++--------- .../selectors/embedding_response_selector.py | 2 +- tests/nlu/training/test_train.py | 21 ++++++ 3 files changed, 62 insertions(+), 34 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 5b7fe90562cf..63db659fb0c6 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -279,52 +279,49 @@ def _check_labels_features_exist( @staticmethod def _extract_and_add_features( - message: "Message", - attribute: Text, - sparse_features: List[scipy.sparse.spmatrix], - dense_features: List[np.ndarray], - ): + message: "Message", attribute: Text + ) -> Tuple[Optional[scipy.sparse.spmatrix], Optional[np.ndarray]]: + sparse_features = None + dense_features = None + # we mutate sparse_features and dense_features if message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) is not None: - sparse_features.append( - message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) + sparse_features = message.get( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute] ) if message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) is not None: - dense_features.append( - message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) - ) + dense_features = message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) + + return sparse_features, dense_features def _extract_labels_precomputed_features( self, label_examples: List["Message"] - ) -> "SessionData": + ) -> List[np.ndarray]: # Collect precomputed encodings sparse_features = [] dense_features = [] for e in label_examples: - self._extract_and_add_features( - e, MESSAGE_INTENT_ATTRIBUTE, sparse_features, dense_features + _sparse, _dense = self._extract_and_add_features( + e, MESSAGE_INTENT_ATTRIBUTE ) + if _sparse is not None: + sparse_features.append(_sparse) + if _dense is not None: + dense_features.append(_dense) sparse_features = np.array(sparse_features) dense_features = np.array(dense_features) - data = {} - self._add_to_session_data( - data, "intent_features", [sparse_features, dense_features] - ) - - return data + return [sparse_features, dense_features] @staticmethod def _compute_default_label_features( - labels_example: List[Tuple[int, "Message"]] + labels_example: List["Message"] ) -> List[np.ndarray]: """Compute one-hot representation for the labels""" - - # TODO check: return [ np.array( [ @@ 
-336,12 +333,12 @@ def _compute_default_label_features( ) ] - def _create_encoded_label_ids( + def _create_label_data( self, training_data: "TrainingData", label_id_dict: Dict[Text, int], attribute: Text, - ) -> List[np.ndarray]: + ) -> "SessionData": """Create matrix with label_ids encoded in rows as bag of words. Find a training example for each label and get the encoded features @@ -364,13 +361,14 @@ def _create_encoded_label_ids( # Collect features, precomputed if they exist, else compute on the fly if self._check_labels_features_exist(labels_example, attribute): - encoded_id_labels = self._extract_labels_precomputed_features( - labels_example - ) + features = self._extract_labels_precomputed_features(labels_example) else: - encoded_id_labels = self._compute_default_label_features(labels_example) + features = self._compute_default_label_features(labels_example) + + label_data = {} + self._add_to_session_data(label_data, "intent_features", features) - return encoded_id_labels + return label_data # noinspection PyPep8Naming def _create_session_data( @@ -396,10 +394,19 @@ def _create_session_data( max_seq_len = max(seq_len) if seq_len else 25 for e in training_data: - self._extract_and_add_features(e, MESSAGE_TEXT_ATTRIBUTE, X_sparse, X_dense) - self._extract_and_add_features( - e, MESSAGE_INTENT_ATTRIBUTE, Y_sparse, Y_dense + _sparse, _dense = self._extract_and_add_features(e, MESSAGE_TEXT_ATTRIBUTE) + if _sparse is not None: + X_sparse.append(_sparse) + if _dense is not None: + X_dense.append(_dense) + + _sparse, _dense = self._extract_and_add_features( + e, MESSAGE_INTENT_ATTRIBUTE ) + if _sparse is not None: + Y_sparse.append(_sparse) + if _dense is not None: + Y_dense.append(_dense) if e.get(attribute): label_ids.append(label_id_dict[e.get(attribute)]) @@ -633,7 +640,7 @@ def preprocess_train_data(self, training_data: "TrainingData"): self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} - self._label_data = self._create_encoded_label_ids( + self._label_data = self._create_label_data( training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE ) diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py index 2f05b9766ad1..9e0a363501b6 100644 --- a/rasa/nlu/selectors/embedding_response_selector.py +++ b/rasa/nlu/selectors/embedding_response_selector.py @@ -143,7 +143,7 @@ def preprocess_train_data(self, training_data): ) self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} - self._label_data = self._create_encoded_label_ids( + self._label_data = self._create_label_data( training_data, label_id_dict, attribute=MESSAGE_RESPONSE_ATTRIBUTE ) diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py index d0f711efd60e..7c902bcfdddf 100644 --- a/tests/nlu/training/test_train.py +++ b/tests/nlu/training/test_train.py @@ -2,6 +2,8 @@ import os import pytest +from nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier +from nlu.training_data import Message from rasa.nlu import registry, train from rasa.nlu.config import RasaNLUModelConfig @@ -243,3 +245,22 @@ async def test_train_model_training_data_persisted(component_builder, tmpdir): loaded = Interpreter.load(persisted_path, component_builder) assert loaded.pipeline assert loaded.model_metadata.get("training_data") is not None + + +def test_compute_default_label_features(): + label_features = [ + Message("test a"), + Message("test b"), + Message("test c"), + Message("test d"), + ] + + output = 
EmbeddingIntentClassifier._compute_default_label_features(label_features) + + output = output[0] + + assert output.size == len(label_features) + for i, o in enumerate(output): + assert o.data[0] == 1 + assert o.indices[0] == i + assert o.shape == (1, len(label_features)) From 2ef87447f8d1a17c842f93539265b8f0294545a9 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 12 Nov 2019 11:24:00 +0100 Subject: [PATCH 144/239] fix len error' --- rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +- rasa/utils/train_utils.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 5b7fe90562cf..e6243a743261 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -390,7 +390,7 @@ def _create_session_data( # TODO should be variable # TODO what if not present? use default value? raise error? seq_len = [ - len(e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE])) + e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]).shape[0] for e in training_data ] max_seq_len = max(seq_len) if seq_len else 25 diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 9ae3c6e73440..e89859580d43 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -784,6 +784,9 @@ def tf_dense_layer_for_sparse( outputs = tf.sparse.matmul( tf.sparse.reshape(inputs, [-1, tf.shape(inputs)[-1]]), kernel ) + # outputs = tf.matmul( + # tf.reshape(tf.sparse.to_dense(inputs, validate_indices=False), [-1, tf.shape(inputs)[-1]]), kernel, a_is_sparse=True + # ) if len(inputs.shape) == 3: # reshape back From 718aff02a009450534c3b3881eb1bfc12f8fca13 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 13:01:36 +0100 Subject: [PATCH 145/239] use default label features if not present --- .../classifiers/embedding_intent_classifier.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 01bd4d08b388..7bb59b36607c 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -370,6 +370,16 @@ def _create_label_data( return label_data + def use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: + return [ + np.array( + [ + self._label_data["intent_features"][0][label_id] + for label_id in label_ids + ] + ) + ] + # noinspection PyPep8Naming def _create_session_data( self, @@ -425,10 +435,13 @@ def _create_session_data( session_data = {} self._add_to_session_data(session_data, "masks", [masks]) self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense]) - # TODO there might be no features for Y, in this case need to create sparse 1-hot encoding like we do for encoded_all_labels self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) self._add_to_session_data(session_data, "intent_ids", [label_ids]) + if "intent_features" not in session_data: + # no intent features are present, get default features from _label_data + session_data["intent_features"] = self.use_default_label_features(label_ids) + return session_data def _add_to_session_data( From 220d6d02ef2e6d0ba0e2b8275678e3fdfdb2eb83 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 13:12:09 +0100 Subject: [PATCH 146/239] correct use of session data in 
policy --- rasa/core/policies/embedding_policy.py | 6 +++--- rasa/nlu/classifiers/embedding_intent_classifier.py | 3 +-- rasa/utils/train_utils.py | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index b010a254d17e..db2bd117d295 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -370,12 +370,12 @@ def _create_tf_placeholders(self, session_data: "train_utils.SessionData") -> No dialogue_len = None # use dynamic time self.a_in = tf.placeholder( dtype=tf.float32, - shape=(None, dialogue_len, session_data.X["dialogue_features"].shape[-1]), + shape=(None, dialogue_len, session_data["dialogue_features"][0].shape[-1]), name="a", ) self.b_in = tf.placeholder( dtype=tf.float32, - shape=(None, dialogue_len, None, session_data.Y["bot_features"].shape[-1]), + shape=(None, dialogue_len, None, session_data["bot_features"][0].shape[-1]), name="b", ) @@ -544,7 +544,7 @@ def tf_feed_dict_for_prediction( data_X = self.featurizer.create_X([tracker], domain) session_data = self._create_session_data(data_X) - return {self.a_in: session_data.X["dialogue_features"]} + return {self.a_in: session_data["dialogue_features"][0]} def predict_action_probabilities( self, tracker: "DialogueStateTracker", domain: "Domain" diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 7bb59b36607c..3cd5eb4c4c7a 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -589,8 +589,7 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": batch_data["intent_features"], session_data["intent_features"], "intent" ) - # TODO check this idea: - # self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) + self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) message_embed = self._create_tf_embed_fnn( a, diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index b5bc77474e1a..76fe5ed54543 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) -# namedtuple for all tf session related data +# dictionary for all tf session related data SessionData = Dict[Text, List[np.ndarray]] From 18fe94fb0d3a0eb7fbf8c80157c0845a52ce1c77 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 13:18:35 +0100 Subject: [PATCH 147/239] Use coo_matrix. 
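Featurizers now return ``scipy.sparse.coo_matrix`` instead of
``scipy.sparse.csr_matrix``. COO stores the non-zero entries as plain
row/column/value triplets, which is the layout ``tf.SparseTensor`` expects, so
the conversion in ``train_utils`` only needs to call ``tocoo()`` when a matrix
arrives in another format. Rough illustration (standalone example, not
repository code):

    import numpy as np
    import scipy.sparse

    dense = np.array([[0, 1, 0], [2, 0, 0]])
    coo = scipy.sparse.coo_matrix(dense)

    print(coo.row)   # [0 1]
    print(coo.col)   # [1 0]
    print(coo.data)  # [1 2]

    # indices/values in exactly the form a SparseTensor needs
    indices = np.stack([coo.row, coo.col], axis=1)  # [[0 1], [1 0]]

    # CSR output from third-party code can still be converted on demand
    csr = scipy.sparse.csr_matrix(dense)
    assert np.array_equal(csr.tocoo().toarray(), coo.toarray())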
--- rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +- rasa/nlu/featurizers/featurzier.py | 2 +- .../sparse_featurizer/count_vectors_featurizer.py | 6 +++--- rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py | 4 ++-- rasa/utils/train_utils.py | 5 ++++- 5 files changed, 11 insertions(+), 8 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 3cd5eb4c4c7a..94a46ac9d14f 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -325,7 +325,7 @@ def _compute_default_label_features( return [ np.array( [ - scipy.sparse.csr_matrix( + scipy.sparse.coo_matrix( ([1], ([0], [idx])), shape=(1, len(labels_example)) ) for idx in range(len(labels_example)) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index a35cace93ff3..edd5a3e2f46a 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -17,7 +17,7 @@ def sequence_to_sentence_features( return None if isinstance(features, scipy.sparse.spmatrix): - return scipy.sparse.csr_matrix(features.sum(axis=0)) + return scipy.sparse.coo_matrix(features.sum(axis=0)) return np.mean(features, axis=0) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 8d6a5593e2bd..7ac463008a53 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -391,7 +391,7 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) def _get_featurized_attribute( self, attribute: Text, attribute_texts: List[Text] - ) -> Optional[List[scipy.sparse.csr_matrix]]: + ) -> Optional[List[scipy.sparse.coo_matrix]]: """Return features of a particular attribute for complete data""" if self._check_attribute_vocabulary(attribute): @@ -406,7 +406,7 @@ def _get_text_sequence(text: Text) -> List[Text]: def _create_sequence( self, attribute: Text, attribute_texts: List[Text] - ) -> List[scipy.sparse.csr_matrix]: + ) -> List[scipy.sparse.coo_matrix]: texts = [self._get_text_sequence(text) for text in attribute_texts] X = [] @@ -414,7 +414,7 @@ def _create_sequence( for i, tokens in enumerate(texts): x = self.vectorizers[attribute].transform(tokens) x.sort_indices() - X.append(x) + X.append(x.tocoo()) return X diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index c156ba359494..71a6d0f168ff 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -79,7 +79,7 @@ def _add_lookup_table_regexes( def _features_for_patterns( self, message: Message, attribute: Text - ) -> scipy.sparse.csr_matrix: + ) -> scipy.sparse.coo_matrix: """Checks which known patterns match the message. 
Given a sentence, returns a vector of {1,0} values indicating which @@ -105,7 +105,7 @@ def _features_for_patterns( t.set("pattern", patterns) - return scipy.sparse.csr_matrix(vec) + return scipy.sparse.coo_matrix(vec) def _generate_lookup_regex( self, lookup_table: Dict[Text, Union[Text, List]] diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 76fe5ed54543..1ae4ade49841 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -320,7 +320,10 @@ def prepare_batch( def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: """Convert a scipy matrix into inidces, data, and shape.""" seq_len = max([x.shape[0] for x in array_of_sparse]) - coo = [x.tocoo() for x in array_of_sparse] + if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix): + coo = [x.tocoo() for x in array_of_sparse] + else: + coo = array_of_sparse data = [v for x in array_of_sparse for v in x.data] indices = [ From c56db96d3c4aa2ce10b4cb716604cbf9ccca9820 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 13:23:12 +0100 Subject: [PATCH 148/239] Update Changelog --- CHANGELOG.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0fde08ba5d49..7823cb73f2cf 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -26,7 +26,9 @@ Changed - All featurizers in ``rasa.nlu`` return a sequence - Renamed the feature name ``ner_features`` to ``text_dense_features`` in ``CRFEntityExtractor``. The ``text_dense_features`` are created by any dense featurizer. -- Values of ``SessionData`` are dictionaries instead of ``np.ndarray`` +- ``SessionData`` is a dictionary instead of namedtuple with ``np.ndarray`` values. +- Keep sparse features as long as possible: Batch generator will output tensors of indices, data, shape for sparse data + so that they can be converted into ``SparseTensor`` inside the graph. - Do not retrain the entire Core model if only the ``templates`` section of the domain is changed. 
- Upgraded ``jsonschema`` version From d22055ca0d342ffe017c515e6ecb781f8efa9cdf Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 14:12:00 +0100 Subject: [PATCH 149/239] clean up --- rasa/core/policies/embedding_policy.py | 10 ++- .../embedding_intent_classifier.py | 10 +-- rasa/utils/train_utils.py | 13 ++- tests/core/test_policies.py | 8 +- tests/nlu/classifiers/__init__.py | 0 .../test_embedding_intent_classifier.py | 87 +++++++++++++++++++ tests/nlu/training/test_train.py | 19 ---- 7 files changed, 110 insertions(+), 37 deletions(-) create mode 100644 tests/nlu/classifiers/__init__.py create mode 100644 tests/nlu/classifiers/test_embedding_intent_classifier.py diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index db2bd117d295..a353185d1e7b 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -253,12 +253,14 @@ def _label_features_for_Y(self, label_ids: "np.ndarray") -> "np.ndarray": def _create_session_data( self, data_X: "np.ndarray", data_Y: Optional["np.ndarray"] = None ) -> "train_utils.SessionData": - """Combine all tf session related data into a named tuple""" + """Combine all tf session related data into dict.""" + data_X = data_X.astype(np.float32) if data_Y is not None: # training time label_ids = self._label_ids_for_Y(data_Y) Y = self._label_features_for_Y(label_ids) + Y = Y.astype(np.float32) # idea taken from sklearn's stratify split if label_ids.ndim == 2: @@ -270,7 +272,11 @@ def _create_session_data( label_ids = None Y = None - return {"dialogue_features": data_X, "bot_features": Y, "action_ids": label_ids} + return { + "dialogue_features": [data_X], + "bot_features": [Y], + "action_ids": [label_ids], + } def _create_tf_bot_embed(self, b_in: "tf.Tensor") -> "tf.Tensor": """Create embedding bot vector.""" diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 94a46ac9d14f..e06d65ee6827 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -269,10 +269,11 @@ def _check_labels_features_exist( ) -> bool: """Check if all labels have features set""" for label_example in labels_example: - if label_example.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute] - ) is None and label_example.get( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] + if ( + label_example.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) + is None + and label_example.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) + is None ): return False return True @@ -284,7 +285,6 @@ def _extract_and_add_features( sparse_features = None dense_features = None - # we mutate sparse_features and dense_features if message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) is not None: sparse_features = message.get( MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute] diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 1ae4ade49841..0546518e7491 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -235,13 +235,12 @@ def balance_session_data( if min(num_data_cycles) > 0: break - updated = {} + final_session_data = defaultdict(list) for k, values in new_session_data.items(): - updated[k] = [] for v in values: - updated[k].append(np.concatenate(np.array(v))) + final_session_data[k].append(np.concatenate(np.array(v))) - return updated + return final_session_data def get_number_of_examples(session_data: SessionData): @@ -250,11 +249,11 @@ 
def get_number_of_examples(session_data: SessionData): """ example_lengths = [v.shape[0] for values in session_data.values() for v in values] - # check if number of examples is the same for all X + # check if number of examples is the same for all values if not all(length == example_lengths[0] for length in example_lengths): raise ValueError( - f"Number of examples differs for X ({session_data.keys()}). There should " - f"be the same." + f"Number of examples differs for keys '{session_data.keys()}'. Number of " + f"examples should be the same for all data in session data." ) return example_lengths[0] diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 293a5ce92e5a..6f376c475be3 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -355,8 +355,8 @@ async def test_gen_batch(self, trained_policy, default_domain): ) assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size assert ( - batch_x[0].shape == session_data.X["dialogue_features"][0].shape - and batch_y[0].shape == session_data.Y["bot_features"][0].shape + batch_x[0].shape == session_data["dialogue_features"][0][0].shape + and batch_y[0].shape == session_data["bot_features"][0][0].shape ) batch_x, batch_y, _ = next( train_utils.gen_batch( @@ -369,8 +369,8 @@ async def test_gen_batch(self, trained_policy, default_domain): ) assert batch_x.shape[0] == batch_size and batch_y.shape[0] == batch_size assert ( - batch_x[0].shape == session_data.X["dialogue_features"][0].shape - and batch_y[0].shape == session_data.Y["bot_features"][0].shape + batch_x[0].shape == session_data["dialogue_features"][0][0].shape + and batch_y[0].shape == session_data["bot_features"][0][0].shape ) diff --git a/tests/nlu/classifiers/__init__.py b/tests/nlu/classifiers/__init__.py new file mode 100644 index 000000000000..e69de29bb2d1 diff --git a/tests/nlu/classifiers/test_embedding_intent_classifier.py b/tests/nlu/classifiers/test_embedding_intent_classifier.py new file mode 100644 index 000000000000..5f95702e35be --- /dev/null +++ b/tests/nlu/classifiers/test_embedding_intent_classifier.py @@ -0,0 +1,87 @@ +import numpy as np +import pytest + +from rasa.nlu.constants import ( + MESSAGE_TEXT_ATTRIBUTE, + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + MESSAGE_INTENT_ATTRIBUTE, +) +from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier +from rasa.nlu.training_data import Message + + +def test_compute_default_label_features(): + label_features = [ + Message("test a"), + Message("test b"), + Message("test c"), + Message("test d"), + ] + + output = EmbeddingIntentClassifier._compute_default_label_features(label_features) + + output = output[0] + + assert output.size == len(label_features) + for i, o in enumerate(output): + assert o.data[0] == 1 + assert o.indices[0] == i + assert o.shape == (1, len(label_features)) + + +@pytest.mark.parametrize( + "messages, expected", + [ + ( + [ + Message( + "test a", + data={ + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ + MESSAGE_TEXT_ATTRIBUTE + ]: np.zeros(1), + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[ + MESSAGE_TEXT_ATTRIBUTE + ]: np.zeros(1), + }, + ), + Message( + "test b", + data={ + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ + MESSAGE_TEXT_ATTRIBUTE + ]: np.zeros(1), + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[ + MESSAGE_TEXT_ATTRIBUTE + ]: np.zeros(1), + }, + ), + ], + True, + ), + ( + [ + Message( + "test a", + data={ + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ + MESSAGE_INTENT_ATTRIBUTE + ]: np.zeros(1), + 
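Session data now carries a mask per feature key: for every example a column of
ones with one entry per token. After padding, the mask makes it possible to
average only over real token positions instead of over the padded sequence
length, replacing the earlier ``tf.reduce_mean``. A small numpy sketch of the
idea (the helper name is made up for illustration):

    import numpy as np

    def masked_mean(features, mask):
        """features: (batch, seq_len, dim), mask: (batch, seq_len, 1)."""
        summed = np.sum(features * mask, axis=1)
        return summed / np.sum(mask, axis=1)

    # one example with 2 real tokens followed by 2 padded positions
    features = np.array([[[1.0, 1.0], [3.0, 3.0], [0.0, 0.0], [0.0, 0.0]]])
    mask = np.array([[[1.0], [1.0], [0.0], [0.0]]])

    print(masked_mean(features, mask))  # [[2. 2.]]
    # a plain mean over all four positions would give [[1. 1.]]

The prediction path additionally remembers how many placeholders each
session-data key occupies (``batch_tuple_sizes``), so a batch built from a
single message can substitute ``None`` for feature types that are absent at
prediction time.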
MESSAGE_VECTOR_DENSE_FEATURE_NAMES[ + MESSAGE_INTENT_ATTRIBUTE + ]: np.zeros(1), + }, + ) + ], + False, + ), + ], +) +def test_check_labels_features_exist(messages, expected): + attribute = MESSAGE_TEXT_ATTRIBUTE + + assert ( + EmbeddingIntentClassifier._check_labels_features_exist(messages, attribute) + == expected + ) diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py index 0ef996f01715..b2149b6b42e4 100644 --- a/tests/nlu/training/test_train.py +++ b/tests/nlu/training/test_train.py @@ -243,22 +243,3 @@ async def test_train_model_training_data_persisted(component_builder, tmpdir): loaded = Interpreter.load(persisted_path, component_builder) assert loaded.pipeline assert loaded.model_metadata.get("training_data") is not None - - -def test_compute_default_label_features(): - label_features = [ - Message("test a"), - Message("test b"), - Message("test c"), - Message("test d"), - ] - - output = EmbeddingIntentClassifier._compute_default_label_features(label_features) - - output = output[0] - - assert output.size == len(label_features) - for i, o in enumerate(output): - assert o.data[0] == 1 - assert o.indices[0] == i - assert o.shape == (1, len(label_features)) From ad8695ad2120e05f7b9536cd6e67b3c3f11be700 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 12 Nov 2019 15:17:03 +0100 Subject: [PATCH 150/239] Fix imports. --- tests/nlu/training/test_train.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/nlu/training/test_train.py b/tests/nlu/training/test_train.py index b2149b6b42e4..2bc2d24897a3 100644 --- a/tests/nlu/training/test_train.py +++ b/tests/nlu/training/test_train.py @@ -1,7 +1,5 @@ import os import pytest -from nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier -from nlu.training_data import Message from rasa.nlu import registry, train from rasa.nlu.config import RasaNLUModelConfig From 1c835c50537322db4fce01803286157f73dce9b7 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 12 Nov 2019 18:14:50 +0100 Subject: [PATCH 151/239] add masks, update prediction batch creation --- rasa/core/channels/slack.py | 2 +- rasa/core/training/interactive.py | 2 +- .../embedding_intent_classifier.py | 123 ++++++++---------- rasa/server.py | 18 +-- rasa/utils/train_utils.py | 27 +++- 5 files changed, 87 insertions(+), 85 deletions(-) diff --git a/rasa/core/channels/slack.py b/rasa/core/channels/slack.py index d21e57e65e8d..57ce1da95600 100644 --- a/rasa/core/channels/slack.py +++ b/rasa/core/channels/slack.py @@ -204,7 +204,7 @@ def _sanitize_user_message(text, uids_to_remove) -> Text: # but is a good first approximation for regex, replacement in [ (fr"<@{uid_to_remove}>\s", ""), - (fr"\s<@{uid_to_remove}>", "",), # a bit arbitrary but probably OK + (fr"\s<@{uid_to_remove}>", ""), # a bit arbitrary but probably OK (fr"<@{uid_to_remove}>", " "), ]: text = re.sub(regex, replacement, text) diff --git a/rasa/core/training/interactive.py b/rasa/core/training/interactive.py index 84ed3e88572d..0a674f0f0533 100644 --- a/rasa/core/training/interactive.py +++ b/rasa/core/training/interactive.py @@ -130,7 +130,7 @@ async def send_message( } return await endpoint.request( - json=payload, method="post", subpath=f"/conversations/{sender_id}/messages", + json=payload, method="post", subpath=f"/conversations/{sender_id}/messages" ) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index e06d65ee6827..c321558dce4a 100644 --- 
a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -125,7 +125,7 @@ def __init__( similarity: Optional["tf.Tensor"] = None, label_embed: Optional["tf.Tensor"] = None, all_labels_embed: Optional["tf.Tensor"] = None, - shapes: Optional[Tuple] = None, + batch_tuple_sizes: Optional[Dict] = None, ) -> None: """Declare instant variables with default values""" @@ -155,7 +155,7 @@ def __init__( self._train_op = None self._is_training = None - self.shapes = shapes + self.batch_tuple_sizes = batch_tuple_sizes # config migration warning def _check_old_config_variables(self, config: Dict[Text, Any]) -> None: @@ -367,6 +367,7 @@ def _create_label_data( label_data = {} self._add_to_session_data(label_data, "intent_features", features) + self._add_mask_to_session_data(label_data, "intent_mask", "intent_features") return label_data @@ -393,15 +394,6 @@ def _create_session_data( Y_sparse = [] Y_dense = [] label_ids = [] - masks = [] - - # TODO should be variable - # TODO what if not present? use default value? raise error? - seq_len = [ - e.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]).shape[0] - for e in training_data - ] - max_seq_len = max(seq_len) if seq_len else 25 for e in training_data: _sparse, _dense = self._extract_and_add_features(e, MESSAGE_TEXT_ATTRIBUTE) @@ -421,31 +413,29 @@ def _create_session_data( if e.get(attribute): label_ids.append(label_id_dict[e.get(attribute)]) - mask = np.zeros(max_seq_len) - mask[0 : len(e.get("tokens"))] = 1 - masks.append(mask) - X_sparse = np.array(X_sparse) X_dense = np.array(X_dense) Y_sparse = np.array(Y_sparse) Y_dense = np.array(Y_dense) label_ids = np.array(label_ids) - masks = np.array(masks) session_data = {} - self._add_to_session_data(session_data, "masks", [masks]) self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense]) self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) self._add_to_session_data(session_data, "intent_ids", [label_ids]) + self._add_mask_to_session_data(session_data, "text_mask", "text_features") + self._add_mask_to_session_data(session_data, "intent_mask", "intent_features") + if "intent_features" not in session_data: # no intent features are present, get default features from _label_data session_data["intent_features"] = self.use_default_label_features(label_ids) return session_data + @staticmethod def _add_to_session_data( - self, session_data: SessionData, key: Text, features: List[np.ndarray] + session_data: SessionData, key: Text, features: List[np.ndarray] ): if not features: return @@ -456,6 +446,16 @@ def _add_to_session_data( if data.size > 0: session_data[key].append(data) + @staticmethod + def _add_mask_to_session_data(session_data: SessionData, key: Text, from_key: Text): + session_data[key] = [] + + for data in session_data[from_key]: + if data.size > 0: + mask = np.array([np.ones((x.shape[0], 1)) for x in data]) + session_data[key].append(mask) + break + # tf helpers: def _create_tf_embed_fnn( self, @@ -497,13 +497,13 @@ def _build_tf_train_graph( label_data = train_utils.batch_to_session_data(label_batch, self._label_data) a = self.combine_sparse_dense_features( - batch_data["text_features"], session_data["text_features"], "text" + batch_data["text_features"], batch_data["text_mask"][0], "text" ) b = self.combine_sparse_dense_features( - batch_data["intent_features"], session_data["intent_features"], "intent" + batch_data["intent_features"], batch_data["intent_mask"][0], "intent" ) all_bs 
= self.combine_sparse_dense_features( - label_data["intent_features"], self._label_data["intent_features"], "intent" + label_data["intent_features"], label_data["intent_mask"][0], "intent" ) message_embed = self._create_tf_embed_fnn( @@ -544,7 +544,7 @@ def _build_tf_train_graph( def combine_sparse_dense_features( self, features: List[Union[tf.Tensor, tf.SparseTensor]], - session_data: List[np.ndarray], + mask: tf.Tensor, name: Text, ) -> tf.Tensor: @@ -552,9 +552,9 @@ def combine_sparse_dense_features( dense_dim = self.dense_dim # if dense features are present use the feature dimension of the dense features - for d in session_data: - if not isinstance(d[0], scipy.sparse.spmatrix): - dense_dim = d[0].shape[-1] + for f in features: + if not isinstance(f, tf.SparseTensor): + dense_dim = f.shape[-1] break for f in features: @@ -567,15 +567,17 @@ def combine_sparse_dense_features( output = tf.concat(dense_features, axis=-1) # apply mean to convert sequence to sentence features - # TODO we cannot use reduce_mean, we should use reduce_sum / real_length - output = tf.reduce_mean(output, axis=1) + output = tf.reduce_sum(output, axis=1) / tf.reduce_sum(mask, axis=1) return output def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": - self.shapes, types = train_utils.get_shapes_types(session_data) + # save the amount of placeholders attributed to session data keys + self.batch_tuple_sizes = train_utils.session_data_to_tuple_sizes(session_data) + + shapes, types = train_utils.get_shapes_types(session_data) batch_placeholder = [] - for s, t in zip(self.shapes, types): + for s, t in zip(shapes, types): batch_placeholder.append(tf.placeholder(t, s)) self.batch_in = tf.tuple(batch_placeholder) @@ -583,10 +585,10 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": batch_data = train_utils.batch_to_session_data(self.batch_in, session_data) a = self.combine_sparse_dense_features( - batch_data["text_features"], session_data["text_features"], "text" + batch_data["text_features"], batch_data["text_mask"][0], "text" ) b = self.combine_sparse_dense_features( - batch_data["intent_features"], session_data["intent_features"], "intent" + batch_data["intent_features"], batch_data["intent_mask"][0], "intent" ) self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) @@ -617,9 +619,8 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": return train_utils.confidence_from_sim(self.sim_all, self.similarity_type) - def _get_num_of_features( - self, session_data: "SessionData", key_prefix: Text - ) -> int: + @staticmethod + def _get_num_of_features(session_data: "SessionData", key_prefix: Text) -> int: num_features = 0 for k, v in session_data.items(): if k.startswith(key_prefix): @@ -752,12 +753,14 @@ def train( # process helpers # noinspection PyPep8Naming - def _calculate_message_sim(self, X: Tuple) -> Tuple[np.ndarray, List[float]]: + def _calculate_message_sim(self, batch: Tuple) -> Tuple[np.ndarray, List[float]]: """Calculate message similarities""" message_sim = self.session.run( self.pred_confidence, - feed_dict={_x: _x_in for _x, _x_in in zip(self.batch_in, X)}, + feed_dict={ + _x_in: _x for _x_in, _x in zip(self.batch_in, batch) if _x is not None + }, ) message_sim = message_sim.flatten() # sim is a matrix @@ -784,12 +787,12 @@ def predict_label( else: # create session data from message and convert it into a batch of 1 session_data = self._create_session_data([message]) - - batch = 
train_utils.prepare_batch(session_data) - X = self._add_missing_placeholder_tensors(batch) + batch = train_utils.prepare_batch( + session_data, tuple_sizes=self.batch_tuple_sizes + ) # load tf graph and session - label_ids, message_sim = self._calculate_message_sim(X) + label_ids, message_sim = self._calculate_message_sim(batch) # if X contains all zeros do not predict some label if label_ids.size > 0: @@ -806,32 +809,6 @@ def predict_label( ] return label, label_ranking - def _add_missing_placeholder_tensors(self, batch): - # check if all data is already present - if self.shapes is not None and len(batch) == len(self.shapes): - return batch - - X = [] - # if features are not present add dummy tensor - for i, shape in enumerate(self.shapes): - if i >= len(batch) or batch[i] is None: - # shape may contain None, replace None by 1 - if isinstance(shape, tuple): - shape = tuple([x if x is not None else 1 for x in shape]) - elif shape is None: - shape = 1 - # add dummy tensor of shape - X.append(np.zeros(shape)) - # TODO mask - elif (isinstance(shape, tuple) or isinstance(shape, list)) and batch[ - i - ].shape[-1] != shape[-1]: - X.append(train_utils.pad_data(batch[i], feature_len=shape[-1])) - else: - X.append(batch[i]) - i += 1 - return tuple(X) - def process(self, message: "Message", **kwargs: Any) -> None: """Return the most likely label and its similarity to the input.""" @@ -884,8 +861,10 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: with open(os.path.join(model_dir, file_name + ".tf_config.pkl"), "wb") as f: pickle.dump(self._tf_config, f) - with open(os.path.join(model_dir, file_name + ".shapes.pkl"), "wb") as f: - pickle.dump(self.shapes, f) + with open( + os.path.join(model_dir, file_name + ".batch_tuple_sizes.pkl"), "wb" + ) as f: + pickle.dump(self.batch_tuple_sizes, f) return {"file": file_name} @@ -927,8 +906,10 @@ def load( ) as f: inv_label_dict = pickle.load(f) - with open(os.path.join(model_dir, file_name + ".shapes.pkl"), "rb") as f: - shapes = pickle.load(f) + with open( + os.path.join(model_dir, file_name + ".batch_tuple_sizes.pkl"), "rb" + ) as f: + batch_tuple_sizes = pickle.load(f) return cls( component_config=meta, @@ -941,7 +922,7 @@ def load( similarity=sim, label_embed=label_embed, all_labels_embed=all_labels_embed, - shapes=shapes, + batch_tuple_sizes=batch_tuple_sizes, ) else: diff --git a/rasa/server.py b/rasa/server.py index 8943e3b06f98..608a5a258e8b 100644 --- a/rasa/server.py +++ b/rasa/server.py @@ -442,7 +442,7 @@ async def retrieve_tracker(request: Request, conversation_id: Text): except Exception as e: logger.debug(traceback.format_exc()) raise ErrorResponse( - 500, "ConversationError", f"An unexpected error occurred. Error: {e}", + 500, "ConversationError", f"An unexpected error occurred. Error: {e}" ) @app.post("/conversations//tracker/events") @@ -488,7 +488,7 @@ async def append_events(request: Request, conversation_id: Text): except Exception as e: logger.debug(traceback.format_exc()) raise ErrorResponse( - 500, "ConversationError", f"An unexpected error occurred. Error: {e}", + 500, "ConversationError", f"An unexpected error occurred. Error: {e}" ) @app.put("/conversations//tracker/events") @@ -517,7 +517,7 @@ async def replace_events(request: Request, conversation_id: Text): except Exception as e: logger.debug(traceback.format_exc()) raise ErrorResponse( - 500, "ConversationError", f"An unexpected error occurred. Error: {e}", + 500, "ConversationError", f"An unexpected error occurred. 
Error: {e}" ) @app.get("/conversations//story") @@ -541,7 +541,7 @@ async def retrieve_story(request: Request, conversation_id: Text): except Exception as e: logger.debug(traceback.format_exc()) raise ErrorResponse( - 500, "ConversationError", f"An unexpected error occurred. Error: {e}", + 500, "ConversationError", f"An unexpected error occurred. Error: {e}" ) @app.post("/conversations//execute") @@ -579,7 +579,7 @@ async def execute_action(request: Request, conversation_id: Text): except Exception as e: logger.debug(traceback.format_exc()) raise ErrorResponse( - 500, "ConversationError", f"An unexpected error occurred. Error: {e}", + 500, "ConversationError", f"An unexpected error occurred. Error: {e}" ) tracker = get_tracker(app.agent, conversation_id) @@ -606,7 +606,7 @@ async def predict(request: Request, conversation_id: Text): except Exception as e: logger.debug(traceback.format_exc()) raise ErrorResponse( - 500, "ConversationError", f"An unexpected error occurred. Error: {e}", + 500, "ConversationError", f"An unexpected error occurred. Error: {e}" ) @app.post("/conversations//messages") @@ -646,7 +646,7 @@ async def add_message(request: Request, conversation_id: Text): except Exception as e: logger.debug(traceback.format_exc()) raise ErrorResponse( - 500, "ConversationError", f"An unexpected error occurred. Error: {e}", + 500, "ConversationError", f"An unexpected error occurred. Error: {e}" ) @app.post("/model/train") @@ -865,7 +865,7 @@ async def tracker_predict(request: Request): except Exception as e: logger.debug(traceback.format_exc()) raise ErrorResponse( - 500, "PredictionError", f"An unexpected error occurred. Error: {e}", + 500, "PredictionError", f"An unexpected error occurred. Error: {e}" ) @app.post("/model/parse") @@ -889,7 +889,7 @@ async def parse(request: Request): except Exception as e: logger.debug(traceback.format_exc()) raise ErrorResponse( - 400, "ParsingError", f"An unexpected error occurred. Error: {e}", + 400, "ParsingError", f"An unexpected error occurred. 
Error: {e}" ) response_data = emulator.normalise_response_json(parsed_data) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 0546518e7491..14834d037371 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -286,15 +286,21 @@ def gen_batch( def prepare_batch( - session_data: SessionData, start: Optional[int] = None, end: Optional[int] = None + session_data: SessionData, + start: Optional[int] = None, + end: Optional[int] = None, + tuple_sizes: Dict[Text:int] = None, ): """Slices session data into batch using given start and end value.""" batch_data = [] - for values in session_data.values(): + for key, values in session_data.items(): # add None for not present values during processing if not values: - batch_data.append(None) + if tuple_sizes: + batch_data += [None] * tuple_sizes[key] + else: + batch_data.append(None) continue for v in values: @@ -402,6 +408,21 @@ def batch_to_session_data( return batch_data +def session_data_to_tuple_sizes(session_data: SessionData) -> Dict[Text:int]: + batch_sizes = {} + + for k, values in session_data.items(): + idx = 0 + for v in values: + if isinstance(v[0], scipy.sparse.spmatrix): + idx += 3 + else: + idx += 1 + batch_sizes[k] = idx + + return batch_sizes + + # noinspection PyPep8Naming def create_tf_dataset( session_data: "SessionData", From ff0c707236e0639c75cc0711d0eb75f08eabe715 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 12 Nov 2019 18:17:53 +0100 Subject: [PATCH 152/239] fix types --- rasa/utils/train_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 14834d037371..32caaf026b48 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -289,7 +289,7 @@ def prepare_batch( session_data: SessionData, start: Optional[int] = None, end: Optional[int] = None, - tuple_sizes: Dict[Text:int] = None, + tuple_sizes: Dict[Text, int] = None, ): """Slices session data into batch using given start and end value.""" batch_data = [] @@ -408,7 +408,7 @@ def batch_to_session_data( return batch_data -def session_data_to_tuple_sizes(session_data: SessionData) -> Dict[Text:int]: +def session_data_to_tuple_sizes(session_data: SessionData) -> Dict[Text, int]: batch_sizes = {} for k, values in session_data.items(): From 597265bfc5cfef162cee788bb52c0c0e4a934a43 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Tue, 12 Nov 2019 21:08:11 +0100 Subject: [PATCH 153/239] merge helper methods --- .../embedding_intent_classifier.py | 8 +++---- rasa/utils/train_utils.py | 21 +++++-------------- 2 files changed, 8 insertions(+), 21 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index c321558dce4a..2c3aa9412b6d 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -493,8 +493,8 @@ def _build_tf_train_graph( label_batch = train_utils.prepare_batch(self._label_data) # convert batch format into sparse and dense tensors - batch_data = train_utils.batch_to_session_data(self.batch_in, session_data) - label_data = train_utils.batch_to_session_data(label_batch, self._label_data) + batch_data, _ = train_utils.batch_to_session_data(self.batch_in, session_data) + label_data, _ = train_utils.batch_to_session_data(label_batch, self._label_data) a = self.combine_sparse_dense_features( batch_data["text_features"], batch_data["text_mask"][0], "text" @@ -571,8 +571,6 @@ def 
combine_sparse_dense_features( return output def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": - # save the amount of placeholders attributed to session data keys - self.batch_tuple_sizes = train_utils.session_data_to_tuple_sizes(session_data) shapes, types = train_utils.get_shapes_types(session_data) @@ -582,7 +580,7 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": self.batch_in = tf.tuple(batch_placeholder) - batch_data = train_utils.batch_to_session_data(self.batch_in, session_data) + batch_data, self.batch_tuple_sizes = train_utils.batch_to_session_data(self.batch_in, session_data) a = self.combine_sparse_dense_features( batch_data["text_features"], batch_data["text_mask"][0], "text" diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 32caaf026b48..2a6b86ecd94c 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -387,6 +387,8 @@ def batch_to_session_data( kept. """ batch_data = defaultdict(list) + # save the amount of placeholders attributed to session data keys + batch_sizes = defaultdict(int) idx = 0 for k, values in session_data.items(): @@ -401,26 +403,13 @@ def batch_to_session_data( ) ) idx += 3 + batch_sizes[k] += 3 else: batch_data[k].append(batch[idx]) idx += 1 + batch_sizes[k] += 1 - return batch_data - - -def session_data_to_tuple_sizes(session_data: SessionData) -> Dict[Text, int]: - batch_sizes = {} - - for k, values in session_data.items(): - idx = 0 - for v in values: - if isinstance(v[0], scipy.sparse.spmatrix): - idx += 3 - else: - idx += 1 - batch_sizes[k] = idx - - return batch_sizes + return batch_data, batch_sizes # noinspection PyPep8Naming From 29a9c7fe30a2b54f8e60691c3f0dfddbbc0b7294 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 Nov 2019 08:44:00 +0100 Subject: [PATCH 154/239] some refactoring --- .../embedding_intent_classifier.py | 60 ++++++++++--------- rasa/utils/train_utils.py | 2 +- 2 files changed, 32 insertions(+), 30 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 2c3aa9412b6d..1cf62971c390 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -424,13 +424,13 @@ def _create_session_data( self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) self._add_to_session_data(session_data, "intent_ids", [label_ids]) - self._add_mask_to_session_data(session_data, "text_mask", "text_features") - self._add_mask_to_session_data(session_data, "intent_mask", "intent_features") - if "intent_features" not in session_data: # no intent features are present, get default features from _label_data session_data["intent_features"] = self.use_default_label_features(label_ids) + self._add_mask_to_session_data(session_data, "text_mask", "text_features") + self._add_mask_to_session_data(session_data, "intent_mask", "intent_features") + return session_data @staticmethod @@ -448,12 +448,10 @@ def _add_to_session_data( @staticmethod def _add_mask_to_session_data(session_data: SessionData, key: Text, from_key: Text): - session_data[key] = [] - for data in session_data[from_key]: if data.size > 0: mask = np.array([np.ones((x.shape[0], 1)) for x in data]) - session_data[key].append(mask) + session_data[key] = [mask] break # tf helpers: @@ -580,7 +578,9 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": self.batch_in = tf.tuple(batch_placeholder) - batch_data, 
self.batch_tuple_sizes = train_utils.batch_to_session_data(self.batch_in, session_data) + batch_data, self.batch_tuple_sizes = train_utils.batch_to_session_data( + self.batch_in, session_data + ) a = self.combine_sparse_dense_features( batch_data["text_features"], batch_data["text_mask"][0], "text" @@ -775,36 +775,38 @@ def predict_label( label = {"name": None, "confidence": 0.0} label_ranking = [] + if self.session is None: logger.error( "There is no trained tf.session: " "component is either not trained or " "didn't receive enough training data" ) + return label, label_ranking - else: - # create session data from message and convert it into a batch of 1 - session_data = self._create_session_data([message]) - batch = train_utils.prepare_batch( - session_data, tuple_sizes=self.batch_tuple_sizes - ) + # create session data from message and convert it into a batch of 1 + session_data = self._create_session_data([message]) + batch = train_utils.prepare_batch( + session_data, tuple_sizes=self.batch_tuple_sizes + ) + + # load tf graph and session + label_ids, message_sim = self._calculate_message_sim(batch) + + # if X contains all zeros do not predict some label + if label_ids.size > 0: + label = { + "name": self.inverted_label_dict[label_ids[0]], + "confidence": message_sim[0], + } + + ranking = list(zip(list(label_ids), message_sim)) + ranking = ranking[:LABEL_RANKING_LENGTH] + label_ranking = [ + {"name": self.inverted_label_dict[label_idx], "confidence": score} + for label_idx, score in ranking + ] - # load tf graph and session - label_ids, message_sim = self._calculate_message_sim(batch) - - # if X contains all zeros do not predict some label - if label_ids.size > 0: - label = { - "name": self.inverted_label_dict[label_ids[0]], - "confidence": message_sim[0], - } - - ranking = list(zip(list(label_ids), message_sim)) - ranking = ranking[:LABEL_RANKING_LENGTH] - label_ranking = [ - {"name": self.inverted_label_dict[label_idx], "confidence": score} - for label_idx, score in ranking - ] return label, label_ranking def process(self, message: "Message", **kwargs: Any) -> None: diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 2a6b86ecd94c..c2f843422031 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -379,7 +379,7 @@ def pad_data(data: np.ndarray, feature_len: Optional[int] = None) -> np.ndarray: def batch_to_session_data( batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionData -): +) -> Tuple[Dict[Text, List[tf.Tensor]], Dict[Text, int]]: """ Batch contains any number of batch data. The order is equal to the key-value pairs in session data. 
As sparse data were converted into indices, data, From 4c03841e8158d5462e1da6ca6200dab8ec81a11f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 Nov 2019 09:03:12 +0100 Subject: [PATCH 155/239] add test for get number of features --- .../embedding_intent_classifier.py | 11 +++---- .../test_embedding_intent_classifier.py | 32 +++++++++++++++++++ 2 files changed, 37 insertions(+), 6 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 1cf62971c390..f63561782fe1 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -620,18 +620,17 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": @staticmethod def _get_num_of_features(session_data: "SessionData", key_prefix: Text) -> int: num_features = 0 - for k, v in session_data.items(): + for k, values in session_data.items(): if k.startswith(key_prefix): - num_features += v[0].shape[-1] + for v in values: + num_features += v[0].shape[-1] return num_features def check_input_dimension_consistency(self, session_data: "SessionData"): if self.share_hidden_layers: - num_text_features = self._get_num_of_features( - session_data, "text_features_" - ) + num_text_features = self._get_num_of_features(session_data, "text_features") num_intent_features = self._get_num_of_features( - session_data, "intent_features_" + session_data, "intent_features" ) if num_text_features != num_intent_features: diff --git a/tests/nlu/classifiers/test_embedding_intent_classifier.py b/tests/nlu/classifiers/test_embedding_intent_classifier.py index 5f95702e35be..6e746fae6f3c 100644 --- a/tests/nlu/classifiers/test_embedding_intent_classifier.py +++ b/tests/nlu/classifiers/test_embedding_intent_classifier.py @@ -1,5 +1,6 @@ import numpy as np import pytest +import scipy.sparse from rasa.nlu.constants import ( MESSAGE_TEXT_ATTRIBUTE, @@ -30,6 +31,37 @@ def test_compute_default_label_features(): assert o.shape == (1, len(label_features)) +def test_get_num_of_features(): + session_data = { + "text_features": [ + np.array( + [ + np.random.rand(5, 14), + np.random.rand(2, 14), + np.random.rand(3, 14), + np.random.rand(1, 14), + np.random.rand(3, 14), + ] + ), + np.array( + [ + scipy.sparse.csr_matrix(np.random.randint(5, size=(5, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(2, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(1, 10))), + scipy.sparse.csr_matrix(np.random.randint(5, size=(3, 10))), + ] + ), + ] + } + + num_features = EmbeddingIntentClassifier._get_num_of_features( + session_data, "text_features" + ) + + assert num_features == 24 + + @pytest.mark.parametrize( "messages, expected", [ From fa7b50c36d7c799ee32c2247e357b36bda600ba8 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 13 Nov 2019 10:12:41 +0100 Subject: [PATCH 156/239] set initial tuple size to zero --- rasa/utils/train_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 2a6b86ecd94c..658c9c1fde25 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -393,6 +393,7 @@ def batch_to_session_data( for k, values in session_data.items(): for v in values: + batch_sizes[k] = 0 if isinstance(v[0], scipy.sparse.spmatrix): # explicitly substitute last dimension in shape with known static value batch_data[k].append( From 35e2bdb8ee7bbbd3e6979a0819b5a10eac11adbd Mon 
Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 13 Nov 2019 10:13:41 +0100 Subject: [PATCH 157/239] rename the variable --- rasa/utils/train_utils.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 74f9d9fd0aef..64fcde691791 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -388,12 +388,12 @@ def batch_to_session_data( """ batch_data = defaultdict(list) # save the amount of placeholders attributed to session data keys - batch_sizes = defaultdict(int) + tuple_sizes = defaultdict(int) idx = 0 for k, values in session_data.items(): for v in values: - batch_sizes[k] = 0 + tuple_sizes[k] = 0 if isinstance(v[0], scipy.sparse.spmatrix): # explicitly substitute last dimension in shape with known static value batch_data[k].append( @@ -404,13 +404,13 @@ def batch_to_session_data( ) ) idx += 3 - batch_sizes[k] += 3 + tuple_sizes[k] += 3 else: batch_data[k].append(batch[idx]) idx += 1 - batch_sizes[k] += 1 + tuple_sizes[k] += 1 - return batch_data, batch_sizes + return batch_data, tuple_sizes # noinspection PyPep8Naming From 9bbe1c1bd17b37283269956e373fbb3e7d043b13 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 Nov 2019 10:44:30 +0100 Subject: [PATCH 158/239] formatting --- rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index f63561782fe1..e030b0f38356 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -319,7 +319,7 @@ def _extract_labels_precomputed_features( @staticmethod def _compute_default_label_features( - labels_example: List["Message"] + labels_example: List["Message"], ) -> List[np.ndarray]: """Compute one-hot representation for the labels""" return [ From f383cf02135f1cf58dfe8fc4865bc4f6b686553e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 Nov 2019 10:55:25 +0100 Subject: [PATCH 159/239] Update cli startup test --- tests/cli/test_cli.py | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/tests/cli/test_cli.py b/tests/cli/test_cli.py index 1a7a3607453c..dc221349b647 100644 --- a/tests/cli/test_cli.py +++ b/tests/cli/test_cli.py @@ -3,21 +3,26 @@ from _pytest.pytester import RunResult -@pytest.mark.repeat(3) def test_cli_start(run: Callable[..., RunResult]): """ - Startup of cli should not take longer than n seconds + Measures an average startup time and checks that it + does not deviate more than x seconds from 5. """ import time - start = time.time() - run("--help") - end = time.time() + durations = [] - duration = end - start + for i in range(5): + start = time.time() + run("--help") + end = time.time() + + durations.append(end - start) + + avg_duration = sum(durations) / len(durations) # When run in parallel, it takes a little longer - assert duration <= 5 + assert avg_duration - 5 <= 2 def test_data_convert_help(run: Callable[..., RunResult]): From 7ab1f970ba348d71c2ba6308e616f3974d287adc Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 Nov 2019 12:11:39 +0100 Subject: [PATCH 160/239] fix test. 
--- .../test_embedding_intent_classifier.py | 3 ++- .../test_count_vectors_featurizer.py | 24 +++++++++---------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/tests/nlu/classifiers/test_embedding_intent_classifier.py b/tests/nlu/classifiers/test_embedding_intent_classifier.py index 6e746fae6f3c..44691cbf0fa7 100644 --- a/tests/nlu/classifiers/test_embedding_intent_classifier.py +++ b/tests/nlu/classifiers/test_embedding_intent_classifier.py @@ -26,8 +26,9 @@ def test_compute_default_label_features(): assert output.size == len(label_features) for i, o in enumerate(output): + assert isinstance(o, scipy.sparse.coo_matrix) assert o.data[0] == 1 - assert o.indices[0] == i + assert o.col[0] == i assert o.shape == (1, len(label_features)) diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index 2f8a9c374af7..fa90bf2d43bf 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -31,11 +31,11 @@ def test_count_vector_featurizer(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert isinstance(test_message.get("text_sparse_features"), scipy.sparse.csr_matrix) + assert isinstance(test_message.get("text_sparse_features"), scipy.sparse.coo_matrix) - actual = test_message.get("text_sparse_features")[0].toarray() + actual = test_message.get("text_sparse_features").toarray() - assert np.all(actual == expected) + assert np.all(actual[0] == expected) @pytest.mark.parametrize( @@ -65,14 +65,14 @@ def test_count_vector_featurizer_attribute_featurization( if intent_features: assert ( - train_message.get("intent_sparse_features")[0].toarray() == intent_features + train_message.get("intent_sparse_features").toarray()[0] == intent_features ) else: assert train_message.get("intent_sparse_features") is None if response_features: assert ( - train_message.get("response_sparse_features")[0].toarray() + train_message.get("response_sparse_features").toarray()[0] == response_features ) else: @@ -113,13 +113,13 @@ def test_count_vector_featurizer_shared_vocab( ftr.train(data) assert np.all( - train_message.get("text_sparse_features")[0].toarray() == text_features + train_message.get("text_sparse_features").toarray()[0] == text_features ) assert np.all( - train_message.get("intent_sparse_features")[0].toarray() == intent_features + train_message.get("intent_sparse_features").toarray()[0] == intent_features ) assert np.all( - train_message.get("response_sparse_features")[0].toarray() == response_features + train_message.get("response_sparse_features").toarray()[0] == response_features ) @@ -149,7 +149,7 @@ def test_count_vector_featurizer_oov_token(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all(test_message.get("text_sparse_features")[0].toarray() == expected) + assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected) @pytest.mark.parametrize( @@ -182,7 +182,7 @@ def test_count_vector_featurizer_oov_words(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all(test_message.get("text_sparse_features")[0].toarray() == expected) + assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected) @pytest.mark.parametrize( @@ -223,7 +223,7 @@ def test_count_vector_featurizer_using_tokens(tokens, expected): ftr.process(test_message) - assert np.all(test_message.get("text_sparse_features")[0].toarray() == 
expected) + assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected) @pytest.mark.parametrize( @@ -249,7 +249,7 @@ def test_count_vector_featurizer_char(sentence, expected): test_message = Message(sentence) ftr.process(test_message) - assert np.all(test_message.get("text_sparse_features")[0].toarray() == expected) + assert np.all(test_message.get("text_sparse_features").toarray()[0] == expected) def test_count_vector_featurizer_persist_load(tmpdir): From 2a5428673cd94a5e9eeddc6f1120d1dfb0e52331 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 13 Nov 2019 16:56:13 +0100 Subject: [PATCH 161/239] fix different sequence lengths in sparse and dense features --- .../embedding_intent_classifier.py | 48 ++-- .../count_vectors_featurizer.py | 205 +++++++++--------- rasa/utils/train_utils.py | 61 +++--- 3 files changed, 172 insertions(+), 142 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index e030b0f38356..e08e95d6ab98 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -11,7 +11,7 @@ from rasa.nlu.classifiers import LABEL_RANKING_LENGTH from rasa.nlu.components import Component from rasa.utils import train_utils -from rasa.utils.train_utils import SessionData +from rasa.utils.train_utils import SessionDataType from rasa.nlu.constants import ( MESSAGE_INTENT_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE, @@ -157,8 +157,9 @@ def __init__( self.batch_tuple_sizes = batch_tuple_sizes - # config migration warning - def _check_old_config_variables(self, config: Dict[Text, Any]) -> None: + @staticmethod + def _check_old_config_variables(config: Dict[Text, Any]) -> None: + """Config migration warning""" removed_tokenization_params = [ "intent_tokenization_flag", @@ -168,8 +169,9 @@ def _check_old_config_variables(self, config: Dict[Text, Any]) -> None: if removed_param in config: warnings.warn( "Intent tokenization has been moved to Tokenizer components. " - "Your config still mentions '{}'. Tokenization may fail if you specify the parameter here." - "Please specify the parameter 'intent_tokenization_flag' and 'intent_split_symbol' in the " + "Your config still mentions '{}'. Tokenization may fail if " + "you specify the parameter here. Please specify the parameter " + "'intent_tokenization_flag' and 'intent_split_symbol' in the " "tokenizer of your NLU pipeline".format(removed_param) ) @@ -293,6 +295,13 @@ def _extract_and_add_features( if message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) is not None: dense_features = message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) + if sparse_features is not None and dense_features is not None: + if sparse_features.shape[0] != dense_features.shape[0]: + raise ValueError( + f"Sequence dimensions for sparse and dense features " + f"don't coincide in '{message.text}'" + ) + return sparse_features, dense_features def _extract_labels_precomputed_features( @@ -338,7 +347,7 @@ def _create_label_data( training_data: "TrainingData", label_id_dict: Dict[Text, int], attribute: Text, - ) -> "SessionData": + ) -> "SessionDataType": """Create matrix with label_ids encoded in rows as bag of words. 
Find a training example for each label and get the encoded features @@ -387,8 +396,8 @@ def _create_session_data( training_data: List["Message"], label_id_dict: Optional[Dict[Text, int]] = None, attribute: Optional[Text] = None, - ) -> "SessionData": - """Prepare data for training and create a SessionData object""" + ) -> "SessionDataType": + """Prepare data for training and create a SessionDataType object""" X_sparse = [] X_dense = [] Y_sparse = [] @@ -435,7 +444,7 @@ def _create_session_data( @staticmethod def _add_to_session_data( - session_data: SessionData, key: Text, features: List[np.ndarray] + session_data: SessionDataType, key: Text, features: List[np.ndarray] ): if not features: return @@ -447,7 +456,10 @@ def _add_to_session_data( session_data[key].append(data) @staticmethod - def _add_mask_to_session_data(session_data: SessionData, key: Text, from_key: Text): + def _add_mask_to_session_data( + session_data: SessionDataType, key: Text, from_key: Text + ): + for data in session_data[from_key]: if data.size > 0: mask = np.array([np.ones((x.shape[0], 1)) for x in data]) @@ -481,7 +493,7 @@ def _create_tf_embed_fnn( ) def _build_tf_train_graph( - self, session_data: SessionData + self, session_data: SessionDataType ) -> Tuple["tf.Tensor", "tf.Tensor"]: # get in tensors from generator @@ -568,7 +580,7 @@ def combine_sparse_dense_features( output = tf.reduce_sum(output, axis=1) / tf.reduce_sum(mask, axis=1) return output - def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": + def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor": shapes, types = train_utils.get_shapes_types(session_data) @@ -618,7 +630,7 @@ def _build_tf_pred_graph(self, session_data: "SessionData") -> "tf.Tensor": return train_utils.confidence_from_sim(self.sim_all, self.similarity_type) @staticmethod - def _get_num_of_features(session_data: "SessionData", key_prefix: Text) -> int: + def _get_num_of_features(session_data: "SessionDataType", key_prefix: Text) -> int: num_features = 0 for k, values in session_data.items(): if k.startswith(key_prefix): @@ -626,7 +638,7 @@ def _get_num_of_features(session_data: "SessionData", key_prefix: Text) -> int: num_features += v[0].shape[-1] return num_features - def check_input_dimension_consistency(self, session_data: "SessionData"): + def check_input_dimension_consistency(self, session_data: "SessionDataType"): if self.share_hidden_layers: num_text_features = self._get_num_of_features(session_data, "text_features") num_intent_features = self._get_num_of_features( @@ -664,7 +676,8 @@ def preprocess_train_data(self, training_data: "TrainingData"): return session_data - def _check_enough_labels(self, session_data: "SessionData") -> bool: + @staticmethod + def _check_enough_labels(session_data: "SessionDataType") -> bool: return len(np.unique(session_data["intent_ids"])) >= 2 def train( @@ -674,6 +687,7 @@ def train( **kwargs: Any, ) -> None: """Train the embedding label classifier on a data set.""" + logger.debug("Started training embedding classifier.") # set numpy random seed @@ -750,7 +764,9 @@ def train( # process helpers # noinspection PyPep8Naming - def _calculate_message_sim(self, batch: Tuple) -> Tuple[np.ndarray, List[float]]: + def _calculate_message_sim( + self, batch: Tuple[np.ndarray] + ) -> Tuple[np.ndarray, List[float]]: """Calculate message similarities""" message_sim = self.session.run( diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py 
b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 7ac463008a53..68ddb65a283a 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -37,7 +37,9 @@ class CountVectorsFeaturizer(Featurizer): for attribute in MESSAGE_ATTRIBUTES ] - requires = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + requires = [ + MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES + ] defaults = { # whether to use a shared vocab @@ -210,87 +212,83 @@ def __init__( # declare class instance for CountVectorizer self.vectorizers = vectorizers - def _get_message_text_by_attribute( - self, message: "Message", attribute: Text = MESSAGE_TEXT_ATTRIBUTE - ) -> Text: - """Get processed text of attribute of a message""" - - if message.get(attribute) is None: - # return empty string since sklearn countvectorizer does not like None - # object while training and predicting - return "" - - tokens = self._get_message_tokens_by_attribute(message, attribute) - - text = self._process_text(tokens, attribute) - - text = self._replace_with_oov_token(text, attribute) + @staticmethod + def _get_message_tokens_by_attribute( + message: "Message", attribute: Text + ) -> List[Text]: + """Get text tokens of an attribute of a message""" + if message.get(MESSAGE_TOKENS_NAMES[attribute]): + return [t.lemma for t in message.get(MESSAGE_TOKENS_NAMES[attribute])] - return text + return message.get(attribute).split() - def _process_text( + def _process_tokens( self, tokens: List[Text], attribute: Text = MESSAGE_TEXT_ATTRIBUTE - ) -> Text: + ) -> List[Text]: """Apply processing and cleaning steps to text""" - text = " ".join(tokens) - if attribute == MESSAGE_INTENT_ATTRIBUTE: # Don't do any processing for intent attribute. 
Treat them as whole labels - return text + return tokens # replace all digits with NUMBER token - text = re.sub(r"\b[0-9]+\b", "__NUMBER__", text) + tokens = [re.sub(r"\b[0-9]+\b", "__NUMBER__", text) for text in tokens] # convert to lowercase if necessary if self.lowercase: - text = text.lower() - return text + tokens = [text.lower() for text in tokens] + return tokens - def _replace_with_oov_token(self, text: Text, attribute: Text) -> Text: + def _replace_with_oov_token( + self, tokens: List[Text], attribute: Text + ) -> List[Text]: """Replace OOV words with OOV token""" if self.OOV_token and self.analyzer == "word": - text_tokens = text.split() if self._check_attribute_vocabulary( attribute ) and self.OOV_token in self._get_attribute_vocabulary(attribute): # CountVectorizer is trained, process for prediction - text_tokens = [ + tokens = [ t if t in self._get_attribute_vocabulary_tokens(attribute) else self.OOV_token - for t in text_tokens + for t in tokens ] elif self.OOV_words: # CountVectorizer is not trained, process for train - text_tokens = [ - self.OOV_token if t in self.OOV_words else t for t in text_tokens - ] - text = " ".join(text_tokens) - return text + tokens = [self.OOV_token if t in self.OOV_words else t for t in tokens] - @staticmethod - def _get_message_tokens_by_attribute( - message: "Message", attribute: Text + return tokens + + def _get_processed_message_tokens_by_attribute( + self, message: "Message", attribute: Text = MESSAGE_TEXT_ATTRIBUTE ) -> List[Text]: - """Get text tokens of an attribute of a message""" - if message.get(MESSAGE_TOKENS_NAMES[attribute]): - return [t.lemma for t in message.get(MESSAGE_TOKENS_NAMES[attribute])] + """Get processed text of attribute of a message""" - return message.get(attribute).split() + if message.get(attribute) is None: + # return empty string since sklearn countvectorizer does not like None + # object while training and predicting + return [""] + + tokens = self._get_message_tokens_by_attribute(message, attribute) + + tokens = self._process_tokens(tokens, attribute) + + tokens = self._replace_with_oov_token(tokens, attribute) + + return tokens # noinspection PyPep8Naming - def _check_OOV_present(self, examples): + def _check_OOV_present(self, all_tokens: List[List[Text]]): """Check if an OOV word is present""" if self.OOV_token and not self.OOV_words: - for t in examples: - if ( - t is None - or self.OOV_token in t - or (self.lowercase and self.OOV_token in t.lower()) - ): - return + for tokens in all_tokens: + for text in tokens: + if self.OOV_token in text or ( + self.lowercase and self.OOV_token in text.lower() + ): + return logger.warning( "OOV_token='{}' was given, but it is not present " @@ -299,35 +297,34 @@ def _check_OOV_present(self, examples): "".format(self.OOV_token) ) - def _set_attribute_features( - self, attribute: Text, attribute_features: List, training_data: "TrainingData" - ): - """Set computed features of the attribute to corresponding message objects""" - for i, example in enumerate(training_data.training_examples): - # create bag for each example - example.set( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], - self._combine_with_existing_sparse_features( - example, - attribute_features[i], - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], - ), - ) - - def _get_all_attributes_processed_texts( + def _get_all_attributes_processed_tokens( self, training_data: "TrainingData" - ) -> Dict[Text, List[Text]]: + ) -> Dict[Text, List[List[Text]]]: """Get processed text for all attributes of examples in training 
data""" - processed_attribute_texts = {} + processed_attribute_tokens = {} for attribute in self._attributes: - attribute_texts = [ - self._get_message_text_by_attribute(example, attribute) + all_tokens = [ + self._get_processed_message_tokens_by_attribute(example, attribute) for example in training_data.training_examples ] - self._check_OOV_present(attribute_texts) - processed_attribute_texts[attribute] = attribute_texts - return processed_attribute_texts + if attribute in SPACY_FEATURIZABLE_ATTRIBUTES: + # check for oov tokens only in text based attributes + self._check_OOV_present(all_tokens) + processed_attribute_tokens[attribute] = all_tokens + return processed_attribute_tokens + + @staticmethod + def _convert_attribute_tokens_to_texts( + attribute_tokens: Dict[Text, List[List[Text]]] + ) -> Dict[Text, List[Text]]: + attribute_texts = {} + for attribute in attribute_tokens.keys(): + attribute_texts[attribute] = [ + " ".join(tokens) for tokens in attribute_tokens[attribute] + ] + + return attribute_texts def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): """Construct the vectorizers and train them with a shared vocab""" @@ -389,35 +386,44 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) "training a CountVectorizer for it.".format(attribute) ) - def _get_featurized_attribute( - self, attribute: Text, attribute_texts: List[Text] - ) -> Optional[List[scipy.sparse.coo_matrix]]: - """Return features of a particular attribute for complete data""" - - if self._check_attribute_vocabulary(attribute): - # count vectorizer was trained - return self._create_sequence(attribute, attribute_texts) - else: - return None - - @staticmethod - def _get_text_sequence(text: Text) -> List[Text]: - return text.split() - def _create_sequence( - self, attribute: Text, attribute_texts: List[Text] + self, attribute: Text, all_tokens: List[List[Text]] ) -> List[scipy.sparse.coo_matrix]: - texts = [self._get_text_sequence(text) for text in attribute_texts] - X = [] - for i, tokens in enumerate(texts): + for i, tokens in enumerate(all_tokens): x = self.vectorizers[attribute].transform(tokens) x.sort_indices() X.append(x.tocoo()) return X + def _get_featurized_attribute( + self, attribute: Text, all_tokens: List[List[Text]] + ) -> Optional[List[scipy.sparse.coo_matrix]]: + """Return features of a particular attribute for complete data""" + + if self._check_attribute_vocabulary(attribute): + # count vectorizer was trained + return self._create_sequence(attribute, all_tokens) + else: + return None + + def _set_attribute_features( + self, attribute: Text, attribute_features: List, training_data: "TrainingData" + ): + """Set computed features of the attribute to corresponding message objects""" + for i, example in enumerate(training_data.training_examples): + # create bag for each example + example.set( + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + self._combine_with_existing_sparse_features( + example, + attribute_features[i], + MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + ), + ) + def train( self, training_data: TrainingData, cfg: RasaNLUModelConfig = None, **kwargs: Any ) -> None: @@ -433,21 +439,24 @@ def train( self.OOV_words = [t.lemma_ for w in self.OOV_words for t in spacy_nlp(w)] # process sentences and collect data for all attributes - processed_attribute_texts = self._get_all_attributes_processed_texts( + processed_attribute_tokens = self._get_all_attributes_processed_tokens( training_data ) # train for all attributes + attribute_texts = 
self._convert_attribute_tokens_to_texts( + processed_attribute_tokens + ) if self.use_shared_vocab: - self._train_with_shared_vocab(processed_attribute_texts) + self._train_with_shared_vocab(attribute_texts) else: - self._train_with_independent_vocab(processed_attribute_texts) + self._train_with_independent_vocab(attribute_texts) # transform for all attributes for attribute in self._attributes: attribute_features = self._get_featurized_attribute( - attribute, processed_attribute_texts[attribute] + attribute, processed_attribute_tokens[attribute] ) if attribute_features is not None: @@ -467,10 +476,12 @@ def process(self, message: Message, **kwargs: Any) -> None: return attribute = MESSAGE_TEXT_ATTRIBUTE - message_text = self._get_message_text_by_attribute(message, attribute=attribute) + message_tokens = self._get_processed_message_tokens_by_attribute( + message, attribute + ) # features shape (1, seq, dim) - features = self._create_sequence(attribute, [message_text]) + features = self._create_sequence(attribute, [message_tokens]) message.set( MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 64fcde691791..626d7e3b0f10 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -24,8 +24,8 @@ logger = logging.getLogger(__name__) -# dictionary for all tf session related data -SessionData = Dict[Text, List[np.ndarray]] +# type for all tf session related data +SessionDataType = Dict[Text, List[np.ndarray]] def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]: @@ -38,14 +38,14 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto # noinspection PyPep8Naming def train_val_split( - session_data: "SessionData", + session_data: SessionDataType, evaluate_on_num_examples: int, random_seed: int, label_key: Text, -) -> Tuple["SessionData", "SessionData"]: +) -> Tuple[SessionDataType, SessionDataType]: """Create random hold out validation set using stratified split.""" if label_key not in session_data or len(session_data[label_key]) > 1: - raise ValueError(f"Key '{label_key}' not in SessionData.") + raise ValueError(f"Key '{label_key}' not in SessionDataType.") label_counts = dict( zip(*np.unique(session_data[label_key][0], return_counts=True, axis=0)) @@ -86,7 +86,7 @@ def train_val_split( def check_train_test_sizes( evaluate_on_num_examples: int, label_counts: Dict[Any, int], - session_data: SessionData, + session_data: SessionDataType, ): num_examples = get_number_of_examples(session_data) @@ -104,7 +104,7 @@ def check_train_test_sizes( def convert_train_test_split( - output_values: List[Any], session_data: SessionData, solo_values: List[Any] + output_values: List[Any], session_data: SessionDataType, solo_values: List[Any] ): keys = [k for k in session_data.keys()] @@ -144,14 +144,14 @@ def combine_features( return np.concatenate([feature_1, feature_2]) -def shuffle_session_data(session_data: "SessionData") -> "SessionData": +def shuffle_session_data(session_data: SessionDataType) -> SessionDataType: """Shuffle session data.""" data_points = get_number_of_examples(session_data) ids = np.random.permutation(data_points) return session_data_for_ids(session_data, ids) -def session_data_for_ids(session_data: SessionData, ids: np.ndarray): +def session_data_for_ids(session_data: SessionDataType, ids: np.ndarray): """Filter session data by ids.""" new_session_data = defaultdict(list) for k, values in session_data.items(): @@ -161,11 +161,11 @@ def 
session_data_for_ids(session_data: SessionData, ids: np.ndarray): def split_session_data_by_label( - session_data: "SessionData", label_key: Text, unique_label_ids: "np.ndarray" -) -> List["SessionData"]: + session_data: SessionDataType, label_key: Text, unique_label_ids: "np.ndarray" +) -> List[SessionDataType]: """Reorganize session data into a list of session data with the same labels.""" if label_key not in session_data or len(session_data[label_key]) > 1: - raise ValueError(f"Key '{label_key}' not in SessionData.") + raise ValueError(f"Key '{label_key}' not in SessionDataType.") label_data = [] for label_id in unique_label_ids: @@ -176,8 +176,8 @@ def split_session_data_by_label( # noinspection PyPep8Naming def balance_session_data( - session_data: "SessionData", batch_size: int, shuffle: bool, label_key: Text -) -> "SessionData": + session_data: SessionDataType, batch_size: int, shuffle: bool, label_key: Text +) -> SessionDataType: """Mix session data to account for class imbalance. This batching strategy puts rare classes approximately in every other batch, @@ -185,7 +185,7 @@ def balance_session_data( that more populated classes should appear more often. """ if label_key not in session_data or len(session_data[label_key]) > 1: - raise ValueError(f"Key '{label_key}' not in SessionData.") + raise ValueError(f"Key '{label_key}' not in SessionDataType.") unique_label_ids, counts_label_ids = np.unique( session_data[label_key][0], return_counts=True, axis=0 @@ -243,7 +243,7 @@ def balance_session_data( return final_session_data -def get_number_of_examples(session_data: SessionData): +def get_number_of_examples(session_data: SessionDataType): """Obtain number of examples in session data. Raise a ValueError if number of examples differ for different data in session data. 
""" @@ -260,7 +260,7 @@ def get_number_of_examples(session_data: SessionData): def gen_batch( - session_data: "SessionData", + session_data: SessionDataType, batch_size: int, label_key: Text, batch_strategy: Text = "sequence", @@ -286,12 +286,13 @@ def gen_batch( def prepare_batch( - session_data: SessionData, + session_data: SessionDataType, start: Optional[int] = None, end: Optional[int] = None, tuple_sizes: Dict[Text, int] = None, -): +) -> Tuple[np.ndarray]: """Slices session data into batch using given start and end value.""" + batch_data = [] for key, values in session_data.items(): @@ -324,7 +325,8 @@ def prepare_batch( def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: """Convert a scipy matrix into inidces, data, and shape.""" - seq_len = max([x.shape[0] for x in array_of_sparse]) + max_seq_len = max([x.shape[0] for x in array_of_sparse]) + if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix): coo = [x.tocoo() for x in array_of_sparse] else: @@ -334,7 +336,7 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: indices = [ ids for i, x in enumerate(coo) for ids in zip([i] * len(x.row), x.row, x.col) ] - shape = (len(array_of_sparse), seq_len, array_of_sparse[0].shape[-1]) + shape = (len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1]) return [ np.array(indices).astype(np.int64), @@ -368,6 +370,7 @@ def pad_data(data: np.ndarray, feature_len: Optional[int] = None) -> np.ndarray: data_padded[i, : data[i].shape[0]] = data[i] else: max_seq_len = max([x.shape[0] for x in data]) + data_padded = np.zeros( [data_size, max_seq_len, feature_len], dtype=data[0].dtype ) @@ -378,7 +381,7 @@ def pad_data(data: np.ndarray, feature_len: Optional[int] = None) -> np.ndarray: def batch_to_session_data( - batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionData + batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionDataType ) -> Tuple[Dict[Text, List[tf.Tensor]], Dict[Text, int]]: """ Batch contains any number of batch data. 
The order is equal to the @@ -415,7 +418,7 @@ def batch_to_session_data( # noinspection PyPep8Naming def create_tf_dataset( - session_data: "SessionData", + session_data: SessionDataType, batch_size: Union["tf.Tensor", int], label_key: Text, batch_strategy: Text = "sequence", @@ -435,7 +438,7 @@ def create_tf_dataset( ) -def get_shapes_types(session_data: SessionData) -> Tuple: +def get_shapes_types(session_data: SessionDataType) -> Tuple: """Extract shapes and types from session data.""" types = [] shapes = [] @@ -444,10 +447,10 @@ def append_shape(v: np.ndarray): if isinstance(v[0], scipy.sparse.spmatrix): # scipy matrix is converted into indices, data, shape shapes.append((None, v[0].ndim + 1)) - shapes.append((None)) + shapes.append((None,)) shapes.append((v[0].ndim + 1)) elif v[0].ndim == 0: - shapes.append((None)) + shapes.append((None,)) elif v[0].ndim == 1: shapes.append((None, v[0].shape[-1])) else: @@ -460,7 +463,7 @@ def append_type(v: np.ndarray): types.append(tf.float64) types.append(tf.int64) else: - types.append(v[0].dtype) + types.append(tf.float64) for values in session_data.values(): for v in values: @@ -471,8 +474,8 @@ def append_type(v: np.ndarray): def create_iterator_init_datasets( - session_data: "SessionData", - eval_session_data: "SessionData", + session_data: SessionDataType, + eval_session_data: SessionDataType, batch_size: Union["tf.Tensor", int], batch_strategy: Text, label_key: Text, From 307e064e96c001a757acf6e960feff428cb3fb47 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 13 Nov 2019 17:32:12 +0100 Subject: [PATCH 162/239] cosmetic changes --- .../embedding_intent_classifier.py | 172 +++++++++--------- rasa/utils/train_utils.py | 138 +++++++------- 2 files changed, 153 insertions(+), 157 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index e08e95d6ab98..cdee45ed6f24 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -113,50 +113,6 @@ class EmbeddingIntentClassifier(Component): } # end default properties (DOC MARKER - don't remove) - def __init__( - self, - component_config: Optional[Dict[Text, Any]] = None, - inverted_label_dict: Optional[Dict[int, Text]] = None, - session: Optional["tf.Session"] = None, - graph: Optional["tf.Graph"] = None, - batch_placeholder: Optional["tf.Tensor"] = None, - similarity_all: Optional["tf.Tensor"] = None, - pred_confidence: Optional["tf.Tensor"] = None, - similarity: Optional["tf.Tensor"] = None, - label_embed: Optional["tf.Tensor"] = None, - all_labels_embed: Optional["tf.Tensor"] = None, - batch_tuple_sizes: Optional[Dict] = None, - ) -> None: - """Declare instant variables with default values""" - - super().__init__(component_config) - - self._load_params() - - # transform numbers to labels - self.inverted_label_dict = inverted_label_dict - # encode all label_ids with numbers - self._label_data = None - - # tf related instances - self.session = session - self.graph = graph - self.batch_in = batch_placeholder - self.sim_all = similarity_all - self.pred_confidence = pred_confidence - self.sim = similarity - - # persisted embeddings - self.label_embed = label_embed - self.all_labels_embed = all_labels_embed - - # internal tf instances - self._iterator = None - self._train_op = None - self._is_training = None - - self.batch_tuple_sizes = batch_tuple_sizes - @staticmethod def _check_old_config_variables(config: Dict[Text, Any]) -> None: """Config migration 
warning""" @@ -242,6 +198,53 @@ def _load_params(self) -> None: def required_packages(cls) -> List[Text]: return ["tensorflow"] + def __init__( + self, + component_config: Optional[Dict[Text, Any]] = None, + inverted_label_dict: Optional[Dict[int, Text]] = None, + session: Optional["tf.Session"] = None, + graph: Optional["tf.Graph"] = None, + batch_placeholder: Optional["tf.Tensor"] = None, + similarity_all: Optional["tf.Tensor"] = None, + pred_confidence: Optional["tf.Tensor"] = None, + similarity: Optional["tf.Tensor"] = None, + message_embed: Optional["tf.Tensor"] = None, + label_embed: Optional["tf.Tensor"] = None, + all_labels_embed: Optional["tf.Tensor"] = None, + batch_tuple_sizes: Optional[Dict] = None, + ) -> None: + """Declare instant variables with default values""" + + super().__init__(component_config) + + self._load_params() + + # transform numbers to labels + self.inverted_label_dict = inverted_label_dict + # encode all label_ids with numbers + self._label_data = None + + # tf related instances + self.session = session + self.graph = graph + self.batch_in = batch_placeholder + self.sim_all = similarity_all + self.pred_confidence = pred_confidence + self.sim = similarity + + # persisted embeddings + self.message_embed = message_embed + self.label_embed = label_embed + self.all_labels_embed = all_labels_embed + + # keep the input tuple sizes in self.batch_in + self.batch_tuple_sizes = batch_tuple_sizes + + # internal tf instances + self._iterator = None + self._train_op = None + self._is_training = None + # training data helpers: @staticmethod def _create_label_id_dict( @@ -492,13 +495,41 @@ def _create_tf_embed_fnn( layer_name_suffix=embed_name, ) + def combine_sparse_dense_features( + self, + features: List[Union[tf.Tensor, tf.SparseTensor]], + mask: tf.Tensor, + name: Text, + ) -> tf.Tensor: + + dense_features = [] + + dense_dim = self.dense_dim + # if dense features are present use the feature dimension of the dense features + for f in features: + if not isinstance(f, tf.SparseTensor): + dense_dim = f.shape[-1] + break + + for f in features: + if isinstance(f, tf.SparseTensor): + dense_features.append( + train_utils.tf_dense_layer_for_sparse(f, dense_dim, name, self.C2) + ) + else: + dense_features.append(f) + + output = tf.concat(dense_features, axis=-1) + # apply mean to convert sequence to sentence features + output = tf.reduce_sum(output, axis=1) / tf.reduce_sum(mask, axis=1) + return output + def _build_tf_train_graph( self, session_data: SessionDataType ) -> Tuple["tf.Tensor", "tf.Tensor"]: # get in tensors from generator self.batch_in = self._iterator.get_next() - # convert encoded all labels into the batch format label_batch = train_utils.prepare_batch(self._label_data) @@ -516,7 +547,7 @@ def _build_tf_train_graph( label_data["intent_features"], label_data["intent_mask"][0], "intent" ) - message_embed = self._create_tf_embed_fnn( + self.message_embed = self._create_tf_embed_fnn( a, self.hidden_layer_sizes["text"], fnn_name="text_intent" if self.share_hidden_layers else "text", @@ -536,7 +567,7 @@ def _build_tf_train_graph( ) return train_utils.calculate_loss_acc( - message_embed, + self.message_embed, self.label_embed, b, self.all_labels_embed, @@ -551,35 +582,6 @@ def _build_tf_train_graph( self.scale_loss, ) - def combine_sparse_dense_features( - self, - features: List[Union[tf.Tensor, tf.SparseTensor]], - mask: tf.Tensor, - name: Text, - ) -> tf.Tensor: - - dense_features = [] - - dense_dim = self.dense_dim - # if dense features are present use the feature 
dimension of the dense features - for f in features: - if not isinstance(f, tf.SparseTensor): - dense_dim = f.shape[-1] - break - - for f in features: - if isinstance(f, tf.SparseTensor): - dense_features.append( - train_utils.tf_dense_layer_for_sparse(f, dense_dim, name, self.C2) - ) - else: - dense_features.append(f) - - output = tf.concat(dense_features, axis=-1) - # apply mean to convert sequence to sentence features - output = tf.reduce_sum(output, axis=1) / tf.reduce_sum(mask, axis=1) - return output - def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor": shapes, types = train_utils.get_shapes_types(session_data) @@ -603,7 +605,7 @@ def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor": self.all_labels_embed = tf.constant(self.session.run(self.all_labels_embed)) - message_embed = self._create_tf_embed_fnn( + self.message_embed = self._create_tf_embed_fnn( a, self.hidden_layer_sizes["text"], fnn_name="text_intent" if self.share_hidden_layers else "text", @@ -611,7 +613,7 @@ def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor": ) self.sim_all = train_utils.tf_raw_sim( - message_embed[:, tf.newaxis, :], + self.message_embed[:, tf.newaxis, :], self.all_labels_embed[tf.newaxis, :, :], None, ) @@ -624,18 +626,17 @@ def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor": ) self.sim = train_utils.tf_raw_sim( - message_embed[:, tf.newaxis, :], self.label_embed, None + self.message_embed[:, tf.newaxis, :], self.label_embed, None ) return train_utils.confidence_from_sim(self.sim_all, self.similarity_type) @staticmethod - def _get_num_of_features(session_data: "SessionDataType", key_prefix: Text) -> int: + def _get_num_of_features(session_data: "SessionDataType", key: Text) -> int: num_features = 0 - for k, values in session_data.items(): - if k.startswith(key_prefix): - for v in values: - num_features += v[0].shape[-1] + for data in session_data[key]: + if data.size > 0: + num_features += data[0].shape[-1] return num_features def check_input_dimension_consistency(self, session_data: "SessionDataType"): @@ -860,6 +861,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Dict[Text, Any]: ) train_utils.persist_tensor("similarity", self.sim, self.graph) + train_utils.persist_tensor("message_embed", self.message_embed, self.graph) train_utils.persist_tensor("label_embed", self.label_embed, self.graph) train_utils.persist_tensor( "all_labels_embed", self.all_labels_embed, self.graph @@ -913,6 +915,7 @@ def load( pred_confidence = train_utils.load_tensor("pred_confidence") sim = train_utils.load_tensor("similarity") + message_embed = train_utils.load_tensor("message_embed") label_embed = train_utils.load_tensor("label_embed") all_labels_embed = train_utils.load_tensor("all_labels_embed") @@ -935,6 +938,7 @@ def load( similarity_all=sim_all, pred_confidence=pred_confidence, similarity=sim, + message_embed=message_embed, label_embed=label_embed, all_labels_embed=all_labels_embed, batch_tuple_sizes=batch_tuple_sizes, diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 626d7e3b0f10..1d930876bb3d 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -21,6 +21,7 @@ # avoid warning println on contrib import - remove for tf 2 tf.contrib._warning = None + logger = logging.getLogger(__name__) @@ -44,6 +45,7 @@ def train_val_split( label_key: Text, ) -> Tuple[SessionDataType, SessionDataType]: """Create random hold out validation set using stratified split.""" + if 
label_key not in session_data or len(session_data[label_key]) > 1: raise ValueError(f"Key '{label_key}' not in SessionDataType.") @@ -132,6 +134,7 @@ def combine_features( feature_2: Union[np.ndarray, scipy.sparse.spmatrix], ) -> Union[np.ndarray, scipy.sparse.spmatrix]: """Concatenate features.""" + if isinstance(feature_1, scipy.sparse.spmatrix) and isinstance( feature_2, scipy.sparse.spmatrix ): @@ -345,13 +348,6 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: ] -def values_to_sparse_tensor( - indices: np.ndarray, data: np.ndarray, shape: Union[np.ndarray, List] -) -> tf.SparseTensor: - """Create a Sparse Tensor from given indices, data, and shape.""" - return tf.SparseTensor(indices, data, shape) - - def pad_data(data: np.ndarray, feature_len: Optional[int] = None) -> np.ndarray: """ Pad data of different lengths. @@ -383,40 +379,41 @@ def pad_data(data: np.ndarray, feature_len: Optional[int] = None) -> np.ndarray: def batch_to_session_data( batch: Union[Tuple[np.ndarray], Tuple[tf.Tensor]], session_data: SessionDataType ) -> Tuple[Dict[Text, List[tf.Tensor]], Dict[Text, int]]: - """ + """Convert input batch tensors into batch data format. + Batch contains any number of batch data. The order is equal to the key-value pairs in session data. As sparse data were converted into indices, data, shape before, this methods converts them into sparse tensors. Dense data is kept. """ + batch_data = defaultdict(list) # save the amount of placeholders attributed to session data keys tuple_sizes = defaultdict(int) - idx = 0 + idx = 0 for k, values in session_data.items(): for v in values: tuple_sizes[k] = 0 if isinstance(v[0], scipy.sparse.spmatrix): # explicitly substitute last dimension in shape with known static value batch_data[k].append( - values_to_sparse_tensor( + tf.SparseTensor( batch[idx], batch[idx + 1], [batch[idx + 2][0], batch[idx + 2][1], v[0].shape[-1]], ) ) - idx += 3 tuple_sizes[k] += 3 + idx += 3 else: batch_data[k].append(batch[idx]) - idx += 1 tuple_sizes[k] += 1 + idx += 1 return batch_data, tuple_sizes -# noinspection PyPep8Naming def create_tf_dataset( session_data: SessionDataType, batch_size: Union["tf.Tensor", int], @@ -440,6 +437,7 @@ def create_tf_dataset( def get_shapes_types(session_data: SessionDataType) -> Tuple: """Extract shapes and types from session data.""" + types = [] shapes = [] @@ -506,6 +504,50 @@ def create_iterator_init_datasets( return iterator, train_init_op, eval_init_op +# noinspection PyPep8Naming +def tf_dense_layer_for_sparse( + inputs: tf.SparseTensor, + units: int, + name: Text, + C2: float, + activation: Optional[Callable] = tf.nn.relu, + use_bias: bool = True, +) -> tf.Tensor: + """Dense layer for sparse input tensor""" + + if not isinstance(inputs, tf.SparseTensor): + raise ValueError("Input tensor should be sparse.") + + with tf.variable_scope("dense_layer_for_sparse_" + name, reuse=tf.AUTO_REUSE): + kernel_regularizer = tf.contrib.layers.l2_regularizer(C2) + kernel = tf.get_variable( + "kernel", + shape=[inputs.shape[-1], units], + dtype=inputs.dtype, + regularizer=kernel_regularizer, + ) + bias = tf.get_variable("bias", shape=[units], dtype=inputs.dtype) + + # outputs will be 2D + outputs = tf.sparse.matmul( + tf.sparse.reshape(inputs, [-1, int(inputs.shape[-1])]), kernel + ) + + if len(inputs.shape) == 3: + # reshape back + outputs = tf.reshape( + outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1) + ) + + if use_bias: + outputs = tf.nn.bias_add(outputs, bias) + + if activation is None: + return outputs 
+ + return activation(outputs) + + # noinspection PyPep8Naming def create_tf_fnn( x_in: "tf.Tensor", @@ -687,10 +729,14 @@ def _tf_sample_neg( return tf.batch_gather(tiled_all_bs, neg_ids) -def _tf_calc_iou_mask( +def _tf_get_bad_mask( pos_b: "tf.Tensor", all_bs: "tf.Tensor", neg_ids: "tf.Tensor" ) -> "tf.Tensor": - """Calculate IOU mask for given indices""" + """Calculate bad mask for given indices. + + Checks that input features are different for positive negative samples. + """ + pos_b_in_flat = tf.expand_dims(pos_b, -2) neg_b_in_flat = _tf_sample_neg(tf.shape(pos_b)[0], all_bs, neg_ids) @@ -699,12 +745,6 @@ def _tf_calc_iou_mask( pos_b_in_flat.dtype, ) - # intersection_b_in_flat = tf.minimum(neg_b_in_flat, pos_b_in_flat) - # union_b_in_flat = tf.maximum(neg_b_in_flat, pos_b_in_flat) - # - # iou = tf.reduce_sum(intersection_b_in_flat, -1) / tf.reduce_sum(union_b_in_flat, -1) - # return 1.0 - tf.nn.relu(tf.sign(1.0 - iou)) - def _tf_get_negs( all_embed: "tf.Tensor", all_raw: "tf.Tensor", raw_pos: "tf.Tensor", num_neg: int @@ -731,7 +771,7 @@ def _tf_get_negs( ) neg_ids = shuffled_indices[:, :num_neg] - bad_negs = _tf_calc_iou_mask(raw_flat, all_raw, neg_ids) + bad_negs = _tf_get_bad_mask(raw_flat, all_raw, neg_ids) if len(raw_pos.shape) == 3: bad_negs = tf.reshape(bad_negs, (batch_size, seq_length, -1)) @@ -771,54 +811,6 @@ def sample_negatives( ) -def tf_dense_layer_for_sparse( - inputs: tf.SparseTensor, - units: int, - name: Text, - C2: float, - activation: Optional[Callable] = tf.nn.relu, - use_bias: bool = True, -) -> tf.Tensor: - """Idea from - https://medium.com/dailymotion/how-to-design-deep-learning-models-with-sparse-inputs-in-tensorflow-keras-fd5e754abec1 - """ - - if not isinstance(inputs, tf.SparseTensor): - raise - - with tf.variable_scope("dense_layer_for_sparse_" + name, reuse=tf.AUTO_REUSE): - kernel_regularizer = tf.contrib.layers.l2_regularizer(C2) - kernel = tf.get_variable( - "kernel", - shape=[inputs.shape[-1], units], - dtype=inputs.dtype, - regularizer=kernel_regularizer, - ) - bias = tf.get_variable("bias", shape=[units], dtype=inputs.dtype) - - # outputs will be 2D - outputs = tf.sparse.matmul( - tf.sparse.reshape(inputs, [-1, tf.shape(inputs)[-1]]), kernel - ) - # outputs = tf.matmul( - # tf.reshape(tf.sparse.to_dense(inputs, validate_indices=False), [-1, tf.shape(inputs)[-1]]), kernel, a_is_sparse=True - # ) - - if len(inputs.shape) == 3: - # reshape back - outputs = tf.reshape( - outputs, (tf.shape(inputs)[0], tf.shape(inputs)[1], -1) - ) - - if use_bias: - outputs = tf.nn.bias_add(outputs, bias) - - if activation is None: - return outputs - - return activation(outputs) - - def tf_raw_sim( a: "tf.Tensor", b: "tf.Tensor", mask: Optional["tf.Tensor"] ) -> "tf.Tensor": @@ -1233,7 +1225,7 @@ def extract_attention(attention_weights) -> Optional["tf.Tensor"]: return tf.concat(attention, 0) -def persist_tensor(name: Text, tensor: "tf.Tensor", graph: "tf.Graph") -> None: +def persist_tensor(name: Text, tensor: Union["tf.Tensor", Tuple["tf.Tensor"], List["tf.Tensor"]], graph: "tf.Graph") -> None: """Add tensor to collection if it is not None""" if tensor is not None: @@ -1250,8 +1242,8 @@ def load_tensor(name: Text) -> Optional["tf.Tensor"]: tensor_list = tf.get_collection(name) - if tensor_list is None: - return tensor_list + if not tensor_list: + return None if len(tensor_list) == 1: return tensor_list[0] From 7c46caafd4efd490422f09da07d9d5b41d364882 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 13 Nov 2019 17:33:14 +0100 Subject: [PATCH 163/239] black --- 
rasa/utils/train_utils.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 1d930876bb3d..50c6b0bdf246 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -1225,7 +1225,11 @@ def extract_attention(attention_weights) -> Optional["tf.Tensor"]: return tf.concat(attention, 0) -def persist_tensor(name: Text, tensor: Union["tf.Tensor", Tuple["tf.Tensor"], List["tf.Tensor"]], graph: "tf.Graph") -> None: +def persist_tensor( + name: Text, + tensor: Union["tf.Tensor", Tuple["tf.Tensor"], List["tf.Tensor"]], + graph: "tf.Graph", +) -> None: """Add tensor to collection if it is not None""" if tensor is not None: @@ -1237,7 +1241,7 @@ def persist_tensor(name: Text, tensor: Union["tf.Tensor", Tuple["tf.Tensor"], Li graph.add_to_collection(name, tensor) -def load_tensor(name: Text) -> Optional["tf.Tensor"]: +def load_tensor(name: Text) -> Optional[Union["tf.Tensor", List["tf.Tensor"]]]: """Load tensor or set it to None""" tensor_list = tf.get_collection(name) From 468ef3ca703a1fc14d486479c241fd584ad5b399 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Wed, 13 Nov 2019 18:05:22 +0100 Subject: [PATCH 164/239] fix default Y features --- .../embedding_intent_classifier.py | 19 +++++++++++-------- .../count_vectors_featurizer.py | 7 ++++--- 2 files changed, 15 insertions(+), 11 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index cdee45ed6f24..4a9a3f98d62b 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -54,7 +54,7 @@ class EmbeddingIntentClassifier(Component): provides = ["intent", "intent_ranking"] - requires = [MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [] # default properties (DOC MARKER - don't remove) defaults = { @@ -398,9 +398,10 @@ def _create_session_data( self, training_data: List["Message"], label_id_dict: Optional[Dict[Text, int]] = None, - attribute: Optional[Text] = None, + label_attribute: Optional[Text] = None, ) -> "SessionDataType": """Prepare data for training and create a SessionDataType object""" + X_sparse = [] X_dense = [] Y_sparse = [] @@ -422,8 +423,8 @@ def _create_session_data( if _dense is not None: Y_dense.append(_dense) - if e.get(attribute): - label_ids.append(label_id_dict[e.get(attribute)]) + if label_attribute and e.get(label_attribute): + label_ids.append(label_id_dict[e.get(label_attribute)]) X_sparse = np.array(X_sparse) X_dense = np.array(X_dense) @@ -436,8 +437,8 @@ def _create_session_data( self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) self._add_to_session_data(session_data, "intent_ids", [label_ids]) - if "intent_features" not in session_data: - # no intent features are present, get default features from _label_data + if label_attribute and not session_data["intent_features"]: + # no label features are present, get default features from _label_data session_data["intent_features"] = self.use_default_label_features(label_ids) self._add_mask_to_session_data(session_data, "text_mask", "text_features") @@ -463,10 +464,12 @@ def _add_mask_to_session_data( session_data: SessionDataType, key: Text, from_key: Text ): + session_data[key] = [] + for data in session_data[from_key]: if data.size > 0: mask = np.array([np.ones((x.shape[0], 1)) for x in data]) - session_data[key] = [mask] + session_data[key].append(mask) break # tf helpers: @@ -670,7 +673,7 
@@ def preprocess_train_data(self, training_data: "TrainingData"): session_data = self._create_session_data( training_data.intent_examples, label_id_dict, - attribute=MESSAGE_INTENT_ATTRIBUTE, + label_attribute=MESSAGE_INTENT_ATTRIBUTE, ) self.check_input_dimension_consistency(session_data) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 68ddb65a283a..80498fd07b2a 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -282,7 +282,7 @@ def _get_processed_message_tokens_by_attribute( # noinspection PyPep8Naming def _check_OOV_present(self, all_tokens: List[List[Text]]): """Check if an OOV word is present""" - if self.OOV_token and not self.OOV_words: + if self.OOV_token and not self.OOV_words and all_tokens: for tokens in all_tokens: for text in tokens: if self.OOV_token in text or ( @@ -382,8 +382,9 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) ) else: logger.debug( - "No text provided for {} attribute in any messages of training data. Skipping " - "training a CountVectorizer for it.".format(attribute) + "No text provided for {} attribute in any messages of " + "training data. Skipping training a CountVectorizer " + "for it.".format(attribute) ) def _create_sequence( From b2391cfbc8d46cb2d6b150ab063a3cc4f8869db3 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 Nov 2019 19:52:40 +0100 Subject: [PATCH 165/239] use f strings --- .../embedding_intent_classifier.py | 15 ++++---- .../count_vectors_featurizer.py | 37 ++++++++++--------- rasa/utils/train_utils.py | 21 ++++------- 3 files changed, 34 insertions(+), 39 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 4a9a3f98d62b..90efb2ad2066 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -124,11 +124,11 @@ def _check_old_config_variables(config: Dict[Text, Any]) -> None: for removed_param in removed_tokenization_params: if removed_param in config: warnings.warn( - "Intent tokenization has been moved to Tokenizer components. " - "Your config still mentions '{}'. Tokenization may fail if " - "you specify the parameter here. Please specify the parameter " - "'intent_tokenization_flag' and 'intent_split_symbol' in the " - "tokenizer of your NLU pipeline".format(removed_param) + f"Intent tokenization has been moved to Tokenizer components. " + f"Your config still mentions '{removed_param}'. Tokenization may " + f"fail if you specify the parameter here. Please specify the " + f"parameter 'intent_tokenization_flag' and 'intent_split_symbol' " + f"in the tokenizer of your NLU pipeline." ) # init helpers @@ -949,8 +949,7 @@ def load( else: logger.warning( - "Failed to load nlu model. Maybe path {} " - "doesn't exist" - "".format(os.path.abspath(model_dir)) + f"Failed to load nlu model. 
Maybe path {os.path.abspath(model_dir)} " + f"doesn't exist" ) return cls(component_config=meta) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 80498fd07b2a..4201587b5570 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -180,7 +180,7 @@ def _check_analyzer(self): ) @staticmethod - def _attributes(analyzer): + def _attributes_for(analyzer): """Create a list of attributes that should be featurized.""" # intents should be featurized only by word level count vectorizer @@ -207,7 +207,7 @@ def __init__( self._check_analyzer() # set which attributes to featurize - self._attributes = self._attributes(self.analyzer) + self._attributes = self._attributes_for(self.analyzer) # declare class instance for CountVectorizer self.vectorizers = vectorizers @@ -237,6 +237,7 @@ def _process_tokens( # convert to lowercase if necessary if self.lowercase: tokens = [text.lower() for text in tokens] + return tokens def _replace_with_oov_token( @@ -245,9 +246,10 @@ def _replace_with_oov_token( """Replace OOV words with OOV token""" if self.OOV_token and self.analyzer == "word": - if self._check_attribute_vocabulary( + vocabulary_exists = self._check_attribute_vocabulary(attribute) + if vocabulary_exists and self.OOV_token in self._get_attribute_vocabulary( attribute - ) and self.OOV_token in self._get_attribute_vocabulary(attribute): + ): # CountVectorizer is trained, process for prediction tokens = [ t @@ -272,9 +274,7 @@ def _get_processed_message_tokens_by_attribute( return [""] tokens = self._get_message_tokens_by_attribute(message, attribute) - tokens = self._process_tokens(tokens, attribute) - tokens = self._replace_with_oov_token(tokens, attribute) return tokens @@ -291,10 +291,9 @@ def _check_OOV_present(self, all_tokens: List[List[Text]]): return logger.warning( - "OOV_token='{}' was given, but it is not present " - "in the training data. All unseen words " - "will be ignored during prediction." - "".format(self.OOV_token) + f"OOV_token='{self.OOV_token}' was given, but it is not present " + f"in the training data. All unseen words will be ignored during " + f"prediction." ) def _get_all_attributes_processed_tokens( @@ -312,6 +311,7 @@ def _get_all_attributes_processed_tokens( # check for oov tokens only in text based attributes self._check_OOV_present(all_tokens) processed_attribute_tokens[attribute] = all_tokens + return processed_attribute_tokens @staticmethod @@ -319,6 +319,7 @@ def _convert_attribute_tokens_to_texts( attribute_tokens: Dict[Text, List[List[Text]]] ) -> Dict[Text, List[Text]]: attribute_texts = {} + for attribute in attribute_tokens.keys(): attribute_texts[attribute] = [ " ".join(tokens) for tokens in attribute_tokens[attribute] @@ -353,7 +354,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): ) @staticmethod - def _attribute_texts_is_non_empty(attribute_texts): + def _attribute_texts_is_non_empty(attribute_texts: List[Text]) -> bool: return any(attribute_texts) def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]): @@ -377,14 +378,14 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) self.vectorizers[attribute].fit(attribute_texts[attribute]) except ValueError: logger.warning( - "Unable to train CountVectorizer for message attribute {}. 
" - "Leaving an untrained CountVectorizer for it".format(attribute) + f"Unable to train CountVectorizer for message " + f"attribute {attribute}. Leaving an untrained " + f"CountVectorizer for it." ) else: logger.debug( - "No text provided for {} attribute in any messages of " - "training data. Skipping training a CountVectorizer " - "for it.".format(attribute) + f"No text provided for {attribute} attribute in any messages of " + f"training data. Skipping training a CountVectorizer for it." ) def _create_sequence( @@ -565,7 +566,7 @@ def _create_shared_vocab_vectorizers( attribute_vectorizers = {} - for attribute in cls._attributes(analyzer): + for attribute in cls._attributes_for(analyzer): attribute_vectorizers[attribute] = shared_vectorizer return attribute_vectorizers @@ -588,7 +589,7 @@ def _create_independent_vocab_vectorizers( attribute_vectorizers = {} - for attribute in cls._attributes(analyzer): + for attribute in cls._attributes_for(analyzer): attribute_vocabulary = vocabulary[attribute] if vocabulary else None diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 50c6b0bdf246..667d916f666d 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -588,9 +588,8 @@ def tf_normalize_if_cosine(x: "tf.Tensor", similarity_type: Text) -> "tf.Tensor" return x else: raise ValueError( - "Wrong similarity type '{}', " - "should be 'cosine' or 'inner'" - "".format(similarity_type) + f"Wrong similarity type '{similarity_type}', " + f"should be 'cosine' or 'inner'" ) @@ -1004,9 +1003,7 @@ def choose_loss( ) else: raise ValueError( - "Wrong loss type '{}', " - "should be 'margin' or 'softmax'" - "".format(loss_type) + f"Wrong loss type '{loss_type}', " f"should be 'margin' or 'softmax'" ) @@ -1145,8 +1142,8 @@ def train_tf_dataset( if evaluate_on_num_examples: logger.info( - "Validation accuracy is calculated every {} epochs" - "".format(evaluate_every_num_epochs) + f"Validation accuracy is calculated every {evaluate_every_num_epochs} " + f"epochs." 
) pbar = tqdm(range(epochs), desc="Epochs", disable=is_logging_disabled()) @@ -1199,14 +1196,12 @@ def train_tf_dataset( pbar.set_postfix(postfix_dict) final_message = ( - "Finished training embedding policy, " - "train loss={:.3f}, train accuracy={:.3f}" - "".format(train_loss, train_acc) + f"Finished training embedding policy, " + f"train loss={train_loss:.3f}, train accuracy={train_acc:.3f}" ) if eval_init_op is not None: final_message += ( - ", validation loss={:.3f}, validation accuracy={:.3f}" - "".format(val_loss, val_acc) + f", validation loss={val_loss:.3f}, validation accuracy={val_acc:.3f}" ) logger.info(final_message) From ed2b72fcbcf179da09d586dd07db6c3894e25431 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 13 Nov 2019 20:22:05 +0100 Subject: [PATCH 166/239] formatting --- rasa/nlu/classifiers/embedding_intent_classifier.py | 4 +++- rasa/nlu/selectors/embedding_response_selector.py | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 90efb2ad2066..a04f87054b3d 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -437,7 +437,9 @@ def _create_session_data( self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) self._add_to_session_data(session_data, "intent_ids", [label_ids]) - if label_attribute and not session_data["intent_features"]: + if label_attribute and ( + "intent_features" not in session_data or not session_data["intent_features"] + ): # no label features are present, get default features from _label_data session_data["intent_features"] = self.use_default_label_features(label_ids) diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py index 0653f7949630..9e1bf79d54d9 100644 --- a/rasa/nlu/selectors/embedding_response_selector.py +++ b/rasa/nlu/selectors/embedding_response_selector.py @@ -150,7 +150,7 @@ def preprocess_train_data(self, training_data): session_data = self._create_session_data( training_data.intent_examples, label_id_dict, - attribute=MESSAGE_RESPONSE_ATTRIBUTE, + label_attribute=MESSAGE_RESPONSE_ATTRIBUTE, ) self.check_input_dimension_consistency(session_data) From 38d83c3f94a7d138941e1eb746d51ac2409cc07e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 14 Nov 2019 09:18:42 +0100 Subject: [PATCH 167/239] fix types --- rasa/core/policies/embedding_policy.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index a353185d1e7b..e7a991da5279 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -252,7 +252,7 @@ def _label_features_for_Y(self, label_ids: "np.ndarray") -> "np.ndarray": # noinspection PyPep8Naming def _create_session_data( self, data_X: "np.ndarray", data_Y: Optional["np.ndarray"] = None - ) -> "train_utils.SessionData": + ) -> "train_utils.SessionDataType": """Combine all tf session related data into dict.""" data_X = data_X.astype(np.float32) @@ -370,7 +370,9 @@ def _build_tf_train_graph(self) -> Tuple["tf.Tensor", "tf.Tensor"]: ) # prepare for prediction - def _create_tf_placeholders(self, session_data: "train_utils.SessionData") -> None: + def _create_tf_placeholders( + self, session_data: "train_utils.SessionDataType" + ) -> None: """Create placeholders for prediction.""" dialogue_len = None # 
use dynamic time @@ -386,7 +388,7 @@ def _create_tf_placeholders(self, session_data: "train_utils.SessionData") -> No ) def _build_tf_pred_graph( - self, session_data: "train_utils.SessionData" + self, session_data: "train_utils.SessionDataType" ) -> "tf.Tensor": """Rebuild tf graph for prediction.""" From 5fdd2510c57d3f653f7d52ead5ddb64c3b958bdf Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 14 Nov 2019 09:31:28 +0100 Subject: [PATCH 168/239] store tuple sizes correctly --- rasa/utils/train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 667d916f666d..e3861402d9ca 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -393,8 +393,8 @@ def batch_to_session_data( idx = 0 for k, values in session_data.items(): + tuple_sizes[k] = 0 for v in values: - tuple_sizes[k] = 0 if isinstance(v[0], scipy.sparse.spmatrix): # explicitly substitute last dimension in shape with known static value batch_data[k].append( From 75b0e69061023bfd18b1c148f0c4fdb8e570c12b Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 14 Nov 2019 09:54:15 +0100 Subject: [PATCH 169/239] use float32 everywhere --- rasa/core/policies/embedding_policy.py | 3 --- rasa/utils/train_utils.py | 8 ++++---- tests/core/test_policies.py | 2 -- 3 files changed, 4 insertions(+), 9 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index e7a991da5279..c4540158a779 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -254,13 +254,10 @@ def _create_session_data( self, data_X: "np.ndarray", data_Y: Optional["np.ndarray"] = None ) -> "train_utils.SessionDataType": """Combine all tf session related data into dict.""" - data_X = data_X.astype(np.float32) - if data_Y is not None: # training time label_ids = self._label_ids_for_Y(data_Y) Y = self._label_features_for_Y(label_ids) - Y = Y.astype(np.float32) # idea taken from sklearn's stratify split if label_ids.ndim == 2: diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e3861402d9ca..e14a416cc30d 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -343,7 +343,7 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: return [ np.array(indices).astype(np.int64), - np.array(data).astype(np.float64), + np.array(data).astype(np.float32), np.array(shape).astype(np.int64), ] @@ -373,7 +373,7 @@ def pad_data(data: np.ndarray, feature_len: Optional[int] = None) -> np.ndarray: for i in range(data_size): data_padded[i, : data[i].shape[0], :] = data[i] - return data_padded.astype(np.float64) + return data_padded.astype(np.float32) def batch_to_session_data( @@ -458,10 +458,10 @@ def append_type(v: np.ndarray): if isinstance(v[0], scipy.sparse.spmatrix): # scipy matrix is converted into indices, data, shape types.append(tf.int64) - types.append(tf.float64) + types.append(tf.float32) types.append(tf.int64) else: - types.append(tf.float64) + types.append(tf.float32) for values in session_data.values(): for v in values: diff --git a/tests/core/test_policies.py b/tests/core/test_policies.py index 6f376c475be3..75b5bf9daea2 100644 --- a/tests/core/test_policies.py +++ b/tests/core/test_policies.py @@ -1,10 +1,8 @@ -import asyncio from unittest.mock import patch import numpy as np import pytest -import rasa.utils.io from rasa.utils import train_utils from rasa.core import training from rasa.core.actions.action import ( From 
38e8b81ab0caacc987165b3a63f38c328bf37315 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 14 Nov 2019 10:11:56 +0100 Subject: [PATCH 170/239] fix docstrings --- .../nlu/classifiers/embedding_intent_classifier.py | 10 +++++++--- rasa/utils/train_utils.py | 14 ++++++++++++-- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index a04f87054b3d..e05fb81f77ad 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -273,6 +273,7 @@ def _check_labels_features_exist( labels_example: List["Message"], attribute: Text ) -> bool: """Check if all labels have features set""" + for label_example in labels_example: if ( label_example.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) @@ -310,8 +311,8 @@ def _extract_and_add_features( def _extract_labels_precomputed_features( self, label_examples: List["Message"] ) -> List[np.ndarray]: + """Collect precomputed encodings""" - # Collect precomputed encodings sparse_features = [] dense_features = [] @@ -334,6 +335,7 @@ def _compute_default_label_features( labels_example: List["Message"], ) -> List[np.ndarray]: """Compute one-hot representation for the labels""" + return [ np.array( [ @@ -659,8 +661,10 @@ def check_input_dimension_consistency(self, session_data: "SessionDataType"): ) def preprocess_train_data(self, training_data: "TrainingData"): - """Performs sanity checks on training data, extracts encodings for labels and - prepares data for training""" + """Prepares data for training. + + Performs sanity checks on training data, extracts encodings for labels. + """ label_id_dict = self._create_label_id_dict( training_data, attribute=MESSAGE_INTENT_ATTRIBUTE diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e14a416cc30d..6d4e0c785e4d 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -31,6 +31,7 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto]: """Prepare `tf.compat.v1.ConfigProto` for training""" + if config.get("tf_config") is not None: return tf.compat.v1.ConfigProto(**config.pop("tf_config")) else: @@ -149,6 +150,7 @@ def combine_features( def shuffle_session_data(session_data: SessionDataType) -> SessionDataType: """Shuffle session data.""" + data_points = get_number_of_examples(session_data) ids = np.random.permutation(data_points) return session_data_for_ids(session_data, ids) @@ -156,6 +158,7 @@ def shuffle_session_data(session_data: SessionDataType) -> SessionDataType: def session_data_for_ids(session_data: SessionDataType, ids: np.ndarray): """Filter session data by ids.""" + new_session_data = defaultdict(list) for k, values in session_data.items(): for v in values: @@ -167,6 +170,7 @@ def split_session_data_by_label( session_data: SessionDataType, label_key: Text, unique_label_ids: "np.ndarray" ) -> List[SessionDataType]: """Reorganize session data into a list of session data with the same labels.""" + if label_key not in session_data or len(session_data[label_key]) > 1: raise ValueError(f"Key '{label_key}' not in SessionDataType.") @@ -187,6 +191,7 @@ def balance_session_data( by repeating them. Mimics stratified batching, but also takes into account that more populated classes should appear more often. 
""" + if label_key not in session_data or len(session_data[label_key]) > 1: raise ValueError(f"Key '{label_key}' not in SessionDataType.") @@ -248,8 +253,10 @@ def balance_session_data( def get_number_of_examples(session_data: SessionDataType): """Obtain number of examples in session data. + Raise a ValueError if number of examples differ for different data in session data. """ + example_lengths = [v.shape[0] for values in session_data.values() for v in values] # check if number of examples is the same for all values @@ -270,6 +277,7 @@ def gen_batch( shuffle: bool = False, ) -> Generator[Tuple, None, None]: """Generate batches.""" + if shuffle: session_data = shuffle_session_data(session_data) @@ -328,6 +336,7 @@ def prepare_batch( def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: """Convert a scipy matrix into inidces, data, and shape.""" + max_seq_len = max([x.shape[0] for x in array_of_sparse]) if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix): @@ -349,10 +358,11 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: def pad_data(data: np.ndarray, feature_len: Optional[int] = None) -> np.ndarray: - """ - Pad data of different lengths. + """Pad data of different lengths. + Data is padded with zeros. Zeros are added to the beginning of data. """ + if data[0].ndim == 0: return data From 6e472a99fe07e7a43f6962ee7b38584951aacc97 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 14 Nov 2019 10:51:17 +0100 Subject: [PATCH 171/239] fix label_ids in core --- rasa/core/policies/embedding_policy.py | 6 ----- rasa/utils/train_utils.py | 37 +++++++++++++++++--------- 2 files changed, 24 insertions(+), 19 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index c4540158a779..98285d9aec03 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -258,12 +258,6 @@ def _create_session_data( # training time label_ids = self._label_ids_for_Y(data_Y) Y = self._label_features_for_Y(label_ids) - - # idea taken from sklearn's stratify split - if label_ids.ndim == 2: - # for multi-label y, map each distinct row to a string repr - # using join because str(row) uses an ellipsis if len(row) > 1000 - label_ids = np.array([" ".join(row.astype("str")) for row in label_ids]) else: # prediction time label_ids = None diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 6d4e0c785e4d..96deaf8d136c 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -48,15 +48,20 @@ def train_val_split( """Create random hold out validation set using stratified split.""" if label_key not in session_data or len(session_data[label_key]) > 1: - raise ValueError(f"Key '{label_key}' not in SessionDataType.") + raise ValueError(f"Key '{label_key}' not in SessionData.") - label_counts = dict( - zip(*np.unique(session_data[label_key][0], return_counts=True, axis=0)) - ) + label_ids = session_data[label_key][0] + # idea taken from sklearn's stratify split + if label_ids.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + label_ids = np.array([" ".join(row.astype("str")) for row in label_ids]) + + label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0))) check_train_test_sizes(evaluate_on_num_examples, label_counts, session_data) - counts = np.array([label_counts[label] for label in session_data[label_key][0]]) + counts = 
np.array([label_counts[label] for label in label_ids]) multi_values = [] [ @@ -76,7 +81,7 @@ def train_val_split( *multi_values, test_size=evaluate_on_num_examples, random_state=random_seed, - stratify=session_data[label_key][0][counts > 1], + stratify=label_ids[counts > 1], ) session_data_train, session_data_val = convert_train_test_split( @@ -167,16 +172,15 @@ def session_data_for_ids(session_data: SessionDataType, ids: np.ndarray): def split_session_data_by_label( - session_data: SessionDataType, label_key: Text, unique_label_ids: "np.ndarray" + session_data: SessionDataType, + label_ids: "np.ndarray", + unique_label_ids: "np.ndarray", ) -> List[SessionDataType]: """Reorganize session data into a list of session data with the same labels.""" - if label_key not in session_data or len(session_data[label_key]) > 1: - raise ValueError(f"Key '{label_key}' not in SessionDataType.") - label_data = [] for label_id in unique_label_ids: - ids = session_data[label_key][0] == label_id + ids = label_ids == label_id label_data.append(session_data_for_ids(session_data, ids)) return label_data @@ -195,13 +199,20 @@ def balance_session_data( if label_key not in session_data or len(session_data[label_key]) > 1: raise ValueError(f"Key '{label_key}' not in SessionDataType.") + label_ids = session_data[label_key][0] + # idea taken from sklearn's stratify split + if label_ids.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + label_ids = np.array([" ".join(row.astype("str")) for row in label_ids]) + unique_label_ids, counts_label_ids = np.unique( - session_data[label_key][0], return_counts=True, axis=0 + label_ids, return_counts=True, axis=0 ) num_label_ids = len(unique_label_ids) # need to call every time, so that the data is shuffled inside each class - label_data = split_session_data_by_label(session_data, label_key, unique_label_ids) + label_data = split_session_data_by_label(session_data, label_ids, unique_label_ids) data_idx = [0] * num_label_ids num_data_cycles = [0] * num_label_ids From 5fdf957f905e8356edf963c55d8ce93e04689cf8 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 14 Nov 2019 10:54:13 +0100 Subject: [PATCH 172/239] use helper method --- rasa/utils/train_utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 96deaf8d136c..9853f8af8e06 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -38,6 +38,16 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto return None +def create_label_ids(label_ids): + # idea taken from sklearn's stratify split + if label_ids.ndim == 2: + # for multi-label y, map each distinct row to a string repr + # using join because str(row) uses an ellipsis if len(row) > 1000 + label_ids = np.array([" ".join(row.astype("str")) for row in label_ids]) + + return label_ids + + # noinspection PyPep8Naming def train_val_split( session_data: SessionDataType, @@ -50,12 +60,7 @@ def train_val_split( if label_key not in session_data or len(session_data[label_key]) > 1: raise ValueError(f"Key '{label_key}' not in SessionData.") - label_ids = session_data[label_key][0] - # idea taken from sklearn's stratify split - if label_ids.ndim == 2: - # for multi-label y, map each distinct row to a string repr - # using join because str(row) uses an ellipsis if len(row) > 1000 - label_ids = np.array([" ".join(row.astype("str")) for row in 
label_ids]) + label_ids = create_label_ids(session_data[label_key][0]) label_counts = dict(zip(*np.unique(label_ids, return_counts=True, axis=0))) @@ -199,12 +204,7 @@ def balance_session_data( if label_key not in session_data or len(session_data[label_key]) > 1: raise ValueError(f"Key '{label_key}' not in SessionDataType.") - label_ids = session_data[label_key][0] - # idea taken from sklearn's stratify split - if label_ids.ndim == 2: - # for multi-label y, map each distinct row to a string repr - # using join because str(row) uses an ellipsis if len(row) > 1000 - label_ids = np.array([" ".join(row.astype("str")) for row in label_ids]) + label_ids = create_label_ids(session_data[label_key][0]) unique_label_ids, counts_label_ids = np.unique( label_ids, return_counts=True, axis=0 From 6aaf3ce10a587956cb60941532854b0ed610d0e5 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 14 Nov 2019 12:43:41 +0100 Subject: [PATCH 173/239] fix dynamic seq in label_id --- rasa/core/policies/embedding_policy.py | 3 +++ .../classifiers/embedding_intent_classifier.py | 8 +++++++- rasa/utils/train_utils.py | 16 ++++++++++------ 3 files changed, 20 insertions(+), 7 deletions(-) diff --git a/rasa/core/policies/embedding_policy.py b/rasa/core/policies/embedding_policy.py index 98285d9aec03..e817f1010171 100644 --- a/rasa/core/policies/embedding_policy.py +++ b/rasa/core/policies/embedding_policy.py @@ -258,6 +258,9 @@ def _create_session_data( # training time label_ids = self._label_ids_for_Y(data_Y) Y = self._label_features_for_Y(label_ids) + # explicitly add last dimension to label_ids + # to track correctly dynamic sequences + label_ids = np.expand_dims(label_ids, -1) else: # prediction time label_ids = None diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index e05fb81f77ad..df015dd3701f 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -437,7 +437,11 @@ def _create_session_data( session_data = {} self._add_to_session_data(session_data, "text_features", [X_sparse, X_dense]) self._add_to_session_data(session_data, "intent_features", [Y_sparse, Y_dense]) - self._add_to_session_data(session_data, "intent_ids", [label_ids]) + # explicitly add last dimension to label_ids + # to track correctly dynamic sequences + self._add_to_session_data( + session_data, "intent_ids", [np.expand_dims(label_ids, -1)] + ) if label_attribute and ( "intent_features" not in session_data or not session_data["intent_features"] @@ -472,6 +476,8 @@ def _add_mask_to_session_data( for data in session_data[from_key]: if data.size > 0: + # explicitly add last dimension to mask + # to track correctly dynamic sequences mask = np.array([np.ones((x.shape[0], 1)) for x in data]) session_data[key].append(mask) break diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 9853f8af8e06..58708c5536c0 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -38,14 +38,18 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto return None -def create_label_ids(label_ids): - # idea taken from sklearn's stratify split - if label_ids.ndim == 2: +def create_label_ids(label_ids: "np.ndarray") -> "np.ndarray": + """Convert various size label_ids into single dim array.""" + + if label_ids.ndim == 1: + return label_ids + elif label_ids.ndim == 2 and label_ids.shape[-1] == 1: + return label_ids[:, 0] + else: + # idea taken from sklearn's stratify split # for 
multi-label y, map each distinct row to a string repr # using join because str(row) uses an ellipsis if len(row) > 1000 - label_ids = np.array([" ".join(row.astype("str")) for row in label_ids]) - - return label_ids + return np.array([" ".join(row.astype("str")) for row in label_ids[:, :, 0]]) # noinspection PyPep8Naming From 4f9ecf7bbcb90a84473718b42d54e467e88ace91 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 14 Nov 2019 12:51:40 +0100 Subject: [PATCH 174/239] raise if unsupported label_id dims --- rasa/utils/train_utils.py | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 58708c5536c0..b65359b0a493 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -39,18 +39,23 @@ def load_tf_config(config: Dict[Text, Any]) -> Optional[tf.compat.v1.ConfigProto def create_label_ids(label_ids: "np.ndarray") -> "np.ndarray": - """Convert various size label_ids into single dim array.""" + """Convert various size label_ids into single dim array. + + for multi-label y, map each distinct row to a string repr + using join because str(row) uses an ellipsis if len(row) > 1000. + Idea taken from sklearn's stratify split. + """ if label_ids.ndim == 1: return label_ids elif label_ids.ndim == 2 and label_ids.shape[-1] == 1: return label_ids[:, 0] - else: - # idea taken from sklearn's stratify split - # for multi-label y, map each distinct row to a string repr - # using join because str(row) uses an ellipsis if len(row) > 1000 + elif label_ids.ndim == 2: + return np.array([" ".join(row.astype("str")) for row in label_ids]) + elif label_ids.ndim == 3 and label_ids.shape[-1] == 1: return np.array([" ".join(row.astype("str")) for row in label_ids[:, :, 0]]) - + else: + raise ValueError("Unsupported label_ids dimensions") # noinspection PyPep8Naming def train_val_split( From 015c4d91800f8cd11744054d688b5a02993d5456 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 14 Nov 2019 12:52:07 +0100 Subject: [PATCH 175/239] black --- rasa/utils/train_utils.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index b65359b0a493..c8662fa13cf5 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -57,6 +57,7 @@ def create_label_ids(label_ids: "np.ndarray") -> "np.ndarray": else: raise ValueError("Unsupported label_ids dimensions") + # noinspection PyPep8Naming def train_val_split( session_data: SessionDataType, From 6004d6508343cf8ec884f3dd50e2e0a62f36c78d Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 14 Nov 2019 13:52:28 +0100 Subject: [PATCH 176/239] fix import --- tests/utils/test_train_utils.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 7c0ba8e400be..3f03a0d4782b 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -3,7 +3,7 @@ import numpy as np from rasa.utils.train_utils import ( - SessionData, + SessionDataType, shuffle_session_data, split_session_data_by_label, train_val_split, @@ -15,7 +15,7 @@ @pytest.fixture -async def session_data() -> SessionData: +async def session_data() -> SessionDataType: return { "text_features": [ np.array( @@ -63,13 +63,13 @@ async def session_data() -> SessionData: } -def test_shuffle_session_data(session_data: SessionData): +def test_shuffle_session_data(session_data: SessionDataType): shuffeled_session_data = shuffle_session_data(session_data) 
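
The dimensionality handling in create_label_ids above is easier to follow with concrete inputs. A minimal sketch, assuming only numpy and mirroring the branches of the helper introduced in the patch above (the helper name and toy arrays here are illustrative, not taken from the patch):

    import numpy as np

    def to_stratify_ids(label_ids: np.ndarray) -> np.ndarray:
        # same branching as create_label_ids in rasa/utils/train_utils.py
        if label_ids.ndim == 1:
            return label_ids
        if label_ids.ndim == 2 and label_ids.shape[-1] == 1:
            return label_ids[:, 0]
        if label_ids.ndim == 2:
            # multi-label rows become string keys usable for stratification
            return np.array([" ".join(row.astype("str")) for row in label_ids])
        if label_ids.ndim == 3 and label_ids.shape[-1] == 1:
            return np.array(
                [" ".join(row.astype("str")) for row in label_ids[:, :, 0]]
            )
        raise ValueError("Unsupported label_ids dimensions")

    print(to_stratify_ids(np.array([2, 0, 1])))         # [2 0 1]
    print(to_stratify_ids(np.array([[2], [0], [1]])))   # [2 0 1]
    print(to_stratify_ids(np.array([[1, 0], [0, 1]])))  # ['1 0' '0 1']
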
assert np.array(shuffeled_session_data.values()) != np.array(session_data.values()) -def test_split_session_data_by_label(session_data: SessionData): +def test_split_session_data_by_label(session_data: SessionDataType): split_session_data = split_session_data_by_label( session_data, "intent_ids", np.array([0, 1]) ) @@ -79,14 +79,14 @@ def test_split_session_data_by_label(session_data: SessionData): assert len(set(s["intent_ids"][0])) == 1 -def test_split_session_data_by_incorrect_label(session_data: SessionData): +def test_split_session_data_by_incorrect_label(session_data: SessionDataType): with pytest.raises(ValueError): split_session_data_by_label( session_data, "not-existing", np.array([1, 2, 3, 4, 5]) ) -def test_train_val_split(session_data: SessionData): +def test_train_val_split(session_data: SessionDataType): train_session_data, val_session_data = train_val_split( session_data, 2, 42, "intent_ids" ) @@ -101,12 +101,12 @@ def test_train_val_split(session_data: SessionData): @pytest.mark.parametrize("size", [0, 1, 5]) -def test_train_val_split_incorrect_size(session_data: SessionData, size): +def test_train_val_split_incorrect_size(session_data: SessionDataType, size): with pytest.raises(ValueError): train_val_split(session_data, size, 42, "intent_ids") -def test_session_data_for_ids(session_data: SessionData): +def test_session_data_for_ids(session_data: SessionDataType): filtered_session_data = session_data_for_ids(session_data, np.array([0, 1])) for values in filtered_session_data.values(): @@ -123,19 +123,19 @@ def test_session_data_for_ids(session_data: SessionData): ) -def test_get_number_of_examples(session_data: SessionData): +def test_get_number_of_examples(session_data: SessionDataType): num = get_number_of_examples(session_data) assert num == 5 -def test_get_number_of_examples_raises_value_error(session_data: SessionData): +def test_get_number_of_examples_raises_value_error(session_data: SessionDataType): session_data["dense"] = np.random.randint(5, size=(2, 10)) with pytest.raises(ValueError): get_number_of_examples(session_data) -def test_gen_batch(session_data: SessionData): +def test_gen_batch(session_data: SessionDataType): iterator = gen_batch( session_data, 2, "intent_ids", shuffle=True, batch_strategy="balanced" ) @@ -156,7 +156,7 @@ def test_gen_batch(session_data: SessionData): next(iterator) -def test_balance_session_data(session_data: SessionData): +def test_balance_session_data(session_data: SessionDataType): balanced_session_data = balance_session_data(session_data, 2, False, "intent_ids") for k, values in session_data.items(): From 5c050da69835cca2828e5571fabb7fada08c4cfe Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Thu, 14 Nov 2019 15:13:05 +0100 Subject: [PATCH 177/239] fix split session data tests --- rasa/utils/train_utils.py | 6 ++++-- tests/utils/test_train_utils.py | 13 +++---------- 2 files changed, 7 insertions(+), 12 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index c8662fa13cf5..f9d0e879f44a 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -186,7 +186,7 @@ def session_data_for_ids(session_data: SessionDataType, ids: np.ndarray): return new_session_data -def split_session_data_by_label( +def split_session_data_by_label_ids( session_data: SessionDataType, label_ids: "np.ndarray", unique_label_ids: "np.ndarray", @@ -222,7 +222,9 @@ def balance_session_data( num_label_ids = len(unique_label_ids) # need to call every time, so that the data is shuffled inside each class - label_data = 
split_session_data_by_label(session_data, label_ids, unique_label_ids) + label_data = split_session_data_by_label_ids( + session_data, label_ids, unique_label_ids + ) data_idx = [0] * num_label_ids num_data_cycles = [0] * num_label_ids diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index 3f03a0d4782b..dcad2a8790c2 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -5,7 +5,7 @@ from rasa.utils.train_utils import ( SessionDataType, shuffle_session_data, - split_session_data_by_label, + split_session_data_by_label_ids, train_val_split, session_data_for_ids, get_number_of_examples, @@ -70,8 +70,8 @@ def test_shuffle_session_data(session_data: SessionDataType): def test_split_session_data_by_label(session_data: SessionDataType): - split_session_data = split_session_data_by_label( - session_data, "intent_ids", np.array([0, 1]) + split_session_data = split_session_data_by_label_ids( + session_data, session_data["intent_ids"][0], np.array([0, 1]) ) assert len(split_session_data) == 2 @@ -79,13 +79,6 @@ def test_split_session_data_by_label(session_data: SessionDataType): assert len(set(s["intent_ids"][0])) == 1 -def test_split_session_data_by_incorrect_label(session_data: SessionDataType): - with pytest.raises(ValueError): - split_session_data_by_label( - session_data, "not-existing", np.array([1, 2, 3, 4, 5]) - ) - - def test_train_val_split(session_data: SessionDataType): train_session_data, val_session_data = train_val_split( session_data, 2, 42, "intent_ids" From b40d6f41d58c9f0d1179c18a6245663ab9f20f33 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 15 Nov 2019 09:53:49 +0100 Subject: [PATCH 178/239] slightly cleaner sparse to indicies code --- rasa/utils/train_utils.py | 51 ++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 28 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index f9d0e879f44a..2c18c33f4194 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -324,7 +324,7 @@ def prepare_batch( start: Optional[int] = None, end: Optional[int] = None, tuple_sizes: Dict[Text, int] = None, -) -> Tuple[np.ndarray]: +) -> Tuple[Optional[np.ndarray]]: """Slices session data into batch using given start and end value.""" batch_data = [] @@ -349,9 +349,9 @@ def prepare_batch( _data = v[:] if isinstance(_data[0], scipy.sparse.spmatrix): - batch_data = batch_data + scipy_matrix_to_values(_data) + batch_data += scipy_matrix_to_values(_data) else: - batch_data.append(pad_data(_data)) + batch_data.append(pad_dense_data(_data)) # len of batch_data is equal to the number of keys in session data return tuple(batch_data) @@ -360,17 +360,17 @@ def prepare_batch( def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: """Convert a scipy matrix into inidces, data, and shape.""" + if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix): + array_of_sparse = [x.tocoo() for x in array_of_sparse] + max_seq_len = max([x.shape[0] for x in array_of_sparse]) - if not isinstance(array_of_sparse[0], scipy.sparse.coo_matrix): - coo = [x.tocoo() for x in array_of_sparse] - else: - coo = array_of_sparse - data = [v for x in array_of_sparse for v in x.data] + indices = [] + data = [] + for i, x in enumerate(array_of_sparse): + indices.extend(list(zip([i] * len(x.row), x.row, x.col))) + data.extend(x.data) - indices = [ - ids for i, x in enumerate(coo) for ids in zip([i] * len(x.row), x.row, x.col) - ] shape = (len(array_of_sparse), max_seq_len, 
array_of_sparse[0].shape[-1]) return [ @@ -380,33 +380,28 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: ] -def pad_data(data: np.ndarray, feature_len: Optional[int] = None) -> np.ndarray: +def pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray: """Pad data of different lengths. - Data is padded with zeros. Zeros are added to the beginning of data. + Data is padded with zeros. Zeros are added to the end of data. """ - if data[0].ndim == 0: - return data - - data_size = len(data) - if feature_len is None: - feature_len = max([x.shape[-1] for x in data]) - - if data[0].ndim == 1: - data_padded = np.zeros([data_size, feature_len], dtype=data[0].dtype) - for i in range(data_size): - data_padded[i, : data[i].shape[0]] = data[i] + if array_of_dense[0].ndim < 2: + # data doesn't contain a sequence + return array_of_dense else: - max_seq_len = max([x.shape[0] for x in data]) + # data contains dynamic sequence dimension + data_size = len(array_of_dense) + max_seq_len = max([x.shape[0] for x in array_of_dense]) data_padded = np.zeros( - [data_size, max_seq_len, feature_len], dtype=data[0].dtype + [data_size, max_seq_len, array_of_dense[0].shape[-1]], + dtype=array_of_dense[0].dtype, ) for i in range(data_size): - data_padded[i, : data[i].shape[0], :] = data[i] + data_padded[i, : array_of_dense[i].shape[0], :] = array_of_dense[i] - return data_padded.astype(np.float32) + return data_padded.astype(np.float32) def batch_to_session_data( From af54fbcd328bfedab23d87de360aebefdcd86289 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 15 Nov 2019 09:57:25 +0100 Subject: [PATCH 179/239] use extend --- rasa/utils/train_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 2c18c33f4194..98ac04e1f1ee 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -349,7 +349,7 @@ def prepare_batch( _data = v[:] if isinstance(_data[0], scipy.sparse.spmatrix): - batch_data += scipy_matrix_to_values(_data) + batch_data.extend(scipy_matrix_to_values(_data)) else: batch_data.append(pad_dense_data(_data)) From 5d435a3920568598d14990d32244a72402d24d0b Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 15 Nov 2019 10:02:51 +0100 Subject: [PATCH 180/239] remove else --- rasa/utils/train_utils.py | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index 98ac04e1f1ee..bc9806a2d637 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -383,25 +383,24 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: def pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray: """Pad data of different lengths. - Data is padded with zeros. Zeros are added to the end of data. + Sequential data is padded with zeros. Zeros are added to the end of data. 
""" if array_of_dense[0].ndim < 2: # data doesn't contain a sequence return array_of_dense - else: - # data contains dynamic sequence dimension - data_size = len(array_of_dense) - max_seq_len = max([x.shape[0] for x in array_of_dense]) - data_padded = np.zeros( - [data_size, max_seq_len, array_of_dense[0].shape[-1]], - dtype=array_of_dense[0].dtype, - ) - for i in range(data_size): - data_padded[i, : array_of_dense[i].shape[0], :] = array_of_dense[i] + data_size = len(array_of_dense) + max_seq_len = max([x.shape[0] for x in array_of_dense]) + + data_padded = np.zeros( + [data_size, max_seq_len, array_of_dense[0].shape[-1]], + dtype=array_of_dense[0].dtype, + ) + for i in range(data_size): + data_padded[i, : array_of_dense[i].shape[0], :] = array_of_dense[i] - return data_padded.astype(np.float32) + return data_padded.astype(np.float32) def batch_to_session_data( From 8d41e5e93f4fa472873bf54fade9a92b66d50483 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Fri, 15 Nov 2019 10:46:46 +0100 Subject: [PATCH 181/239] use numpy stack --- rasa/utils/train_utils.py | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index bc9806a2d637..d8063ae54d1b 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -365,18 +365,20 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: max_seq_len = max([x.shape[0] for x in array_of_sparse]) - indices = [] - data = [] - for i, x in enumerate(array_of_sparse): - indices.extend(list(zip([i] * len(x.row), x.row, x.col))) - data.extend(x.data) + indices = np.hstack( + [ + np.vstack([i * np.ones_like(x.row), x.row, x.col]) + for i, x in enumerate(array_of_sparse) + ] + ).T + data = np.hstack([x.data for x in array_of_sparse]) - shape = (len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1]) + shape = np.array((len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1])) return [ - np.array(indices).astype(np.int64), - np.array(data).astype(np.float32), - np.array(shape).astype(np.int64), + indices.astype(np.int64), + data.astype(np.float32), + shape.astype(np.int64), ] From 308b487f858026930915de337f21d387a2bbfc73 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 15 Nov 2019 14:52:39 +0100 Subject: [PATCH 182/239] fix split train val --- rasa/utils/train_utils.py | 26 +++++++++++++------------- tests/utils/test_train_utils.py | 6 ++++++ 2 files changed, 19 insertions(+), 13 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index d8063ae54d1b..f621f9fb324a 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -129,8 +129,6 @@ def check_train_test_sizes( def convert_train_test_split( output_values: List[Any], session_data: SessionDataType, solo_values: List[Any] ): - keys = [k for k in session_data.keys()] - session_data_train = defaultdict(list) session_data_val = defaultdict(list) @@ -138,14 +136,20 @@ def convert_train_test_split( # order is kept, e.g. 
same order as session data keys # train datasets have an even index - for i in range(len(session_data)): - session_data_train[keys[i]].append( - combine_features(output_values[i * 2], solo_values[i]) - ) + index = 0 + for key, values in session_data.items(): + for _ in range(len(values)): + session_data_train[key].append( + combine_features(output_values[index * 2], solo_values[index]) + ) + index += 1 # val datasets have an odd index - for i in range(len(session_data)): - session_data_val[keys[i]].append(output_values[(i * 2) + 1]) + index = 0 + for key, values in session_data.items(): + for _ in range(len(values)): + session_data_val[key].append(output_values[(index * 2) + 1]) + index += 1 return session_data_train, session_data_val @@ -375,11 +379,7 @@ def scipy_matrix_to_values(array_of_sparse: np.ndarray) -> List[np.ndarray]: shape = np.array((len(array_of_sparse), max_seq_len, array_of_sparse[0].shape[-1])) - return [ - indices.astype(np.int64), - data.astype(np.float32), - shape.astype(np.int64), - ] + return [indices.astype(np.int64), data.astype(np.float32), shape.astype(np.int64)] def pad_dense_data(array_of_dense: np.ndarray) -> np.ndarray: diff --git a/tests/utils/test_train_utils.py b/tests/utils/test_train_utils.py index dcad2a8790c2..bdbb89cde492 100644 --- a/tests/utils/test_train_utils.py +++ b/tests/utils/test_train_utils.py @@ -84,6 +84,12 @@ def test_train_val_split(session_data: SessionDataType): session_data, 2, 42, "intent_ids" ) + for k, values in session_data.items(): + assert len(values) == len(train_session_data[k]) + assert len(values) == len(val_session_data[k]) + for i, v in enumerate(values): + assert v[0].dtype == train_session_data[k][i][0].dtype + for values in train_session_data.values(): for v in values: assert v.shape[0] == 3 From 62d9e60c3358b7c978002f261982167561d954b1 Mon Sep 17 00:00:00 2001 From: Vladimir Vlasov Date: Wed, 20 Nov 2019 14:16:13 +0100 Subject: [PATCH 183/239] mask combined input before averaging --- rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index df015dd3701f..6a554014bf1b 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -532,7 +532,7 @@ def combine_sparse_dense_features( else: dense_features.append(f) - output = tf.concat(dense_features, axis=-1) + output = tf.concat(dense_features, axis=-1) * mask # apply mean to convert sequence to sentence features output = tf.reduce_sum(output, axis=1) / tf.reduce_sum(mask, axis=1) return output From 931d5eb2620ed311a788bef9ebf8f5cf8de73c97 Mon Sep 17 00:00:00 2001 From: Vova Vv Date: Mon, 25 Nov 2019 11:27:23 +0100 Subject: [PATCH 184/239] fix oov token warning --- .../sparse_featurizer/count_vectors_featurizer.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 7ec9f2c95dd2..3b5cdc751954 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -291,11 +291,13 @@ def _check_OOV_present(self, all_tokens: List[List[Text]]): ): return - warnings.warn( - f"OOV_token='{self.OOV_token}' was given, but it is not present " - "in the training data. 
All unseen words " - "will be ignored during prediction." - ) + if any(text for tokens in all_tokens for text in tokens): + # if there is some text in tokens, warn if there is no oov token + logger.warning( + f"OOV_token='{self.OOV_token}' was given, but it is not present " + "in the training data. All unseen words " + "will be ignored during prediction." + ) def _get_all_attributes_processed_tokens( self, training_data: "TrainingData" From 232176c852b8c9f9366d4ed597d08806a94513e6 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 27 Nov 2019 17:21:42 +0100 Subject: [PATCH 185/239] move convert featurizer to dense featurizers --- .../convert_featurizer.py | 20 ++++++++++--------- rasa/nlu/registry.py | 2 +- 2 files changed, 12 insertions(+), 10 deletions(-) rename rasa/nlu/featurizers/{ => dense_featurizer}/convert_featurizer.py (83%) diff --git a/rasa/nlu/featurizers/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py similarity index 83% rename from rasa/nlu/featurizers/convert_featurizer.py rename to rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 614feeef3b1a..a0ea4fd03613 100644 --- a/rasa/nlu/featurizers/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -1,11 +1,11 @@ import logging -from rasa.nlu.featurizers import Featurizer +from rasa.nlu.featurizers.featurzier import Featurizer from typing import Any, Dict, List, Optional, Text, Tuple from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_FEATURE_NAMES, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, ) import numpy as np @@ -17,7 +17,7 @@ class ConveRTFeaturizer(Featurizer): provides = [ - MESSAGE_VECTOR_FEATURE_NAMES[attribute] + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] @@ -96,11 +96,11 @@ def train( for index, ex in enumerate(batch_examples): ex.set( - MESSAGE_VECTOR_FEATURE_NAMES[attribute], - self._combine_with_existing_features( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], + self._combine_with_existing_dense_features( ex, batch_features[index], - MESSAGE_VECTOR_FEATURE_NAMES[attribute], + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], ), ) @@ -110,8 +110,10 @@ def process(self, message: Message, **kwargs: Any) -> None: feats = self._compute_features([message])[0] message.set( - MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], - self._combine_with_existing_features( - message, feats, MESSAGE_VECTOR_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + self._combine_with_existing_dense_features( + message, + feats, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], ), ) diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index b8d9838681a0..82fbb2531f21 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -25,7 +25,7 @@ from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer -from rasa.nlu.featurizers.convert_featurizer import ConveRTFeaturizer +from nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer from rasa.nlu.model import Metadata from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer from rasa.nlu.tokenizers.mitie_tokenizer 
import MitieTokenizer From 7c87d601e67ce13a965ed6f8c44c0d27b9ec969c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 27 Nov 2019 17:30:21 +0100 Subject: [PATCH 186/239] add future warning to ngram featurizer --- rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 1 + rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py | 6 ++++-- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index a0ea4fd03613..8036280c9d07 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -23,6 +23,7 @@ class ConveRTFeaturizer(Featurizer): def _load_model(self) -> None: + # needed in order to load model import tensorflow_text import tensorflow_hub as tfhub diff --git a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py index e33d3ba1bb0c..1cdb220042ee 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py @@ -1,4 +1,5 @@ import logging +import warnings from typing import Any, Dict, Optional, Text @@ -11,7 +12,7 @@ class NGramFeaturizer(Featurizer): def __init__(self, component_config: Optional[Dict[Text, Any]] = None): super(NGramFeaturizer, self).__init__(component_config) - logger.warning( + warnings.warn( "DEPRECATION warning: Using `NGramFeaturizer` is deprecated. " "Please use `CountVectorsFeaturizer` instead. The following settings" "should match the previous `NGramFeaturizer`:" @@ -21,5 +22,6 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None): " min_ngram: 3" " max_ngram: 17" " max_features: 10" - " min_df: 5" + " min_df: 5", + FutureWarning, ) From a81d0a8dec8c5ce4e35269d5cf891c66fe879067 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 27 Nov 2019 17:34:05 +0100 Subject: [PATCH 187/239] set default value of use_cls_token to false --- rasa/nlu/tokenizers/jieba_tokenizer.py | 2 +- rasa/nlu/tokenizers/mitie_tokenizer.py | 2 +- rasa/nlu/tokenizers/spacy_tokenizer.py | 2 +- rasa/nlu/tokenizers/whitespace_tokenizer.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 9a53729cd410..126caceaa5c8 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -37,7 +37,7 @@ class JiebaTokenizer(Tokenizer): # Symbol on which intent should be split "intent_split_symbol": "_", # add __CLS__ token to the end of the list of tokens - "use_cls_token": True, + "use_cls_token": False, } # default don't load custom dictionary def __init__(self, component_config: Dict[Text, Any] = None) -> None: diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index e17d49c3bab7..5bbfda74e502 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -18,7 +18,7 @@ class MitieTokenizer(Tokenizer): defaults = { # add __CLS__ token to the end of the list of tokens - "use_cls_token": True + "use_cls_token": False } @classmethod diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index f4ca504ad653..19a61bde6070 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -29,7 +29,7 @@ class SpacyTokenizer(Tokenizer): 
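
The FutureWarning added above spells out the CountVectorsFeaturizer settings that replace the deprecated NGramFeaturizer. A minimal sketch of the equivalent component, assuming the config-dict constructor used by the tests elsewhere in this patch series:

    from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import (
        CountVectorsFeaturizer,
    )

    # settings listed in the deprecation warning above
    featurizer = CountVectorsFeaturizer(
        {
            "analyzer": "char_wb",
            "min_ngram": 3,
            "max_ngram": 17,
            "max_features": 10,
            "min_df": 5,
        }
    )
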
defaults = { # add __CLS__ token to the end of the list of tokens - "use_cls_token": True + "use_cls_token": False } def train( diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index fa029b0b43f2..e819588c28ff 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -24,7 +24,7 @@ class WhitespaceTokenizer(Tokenizer): # Text will be tokenized with case sensitive as default "case_sensitive": True, # add __CLS__ token to the end of the list of tokens - "use_cls_token": True, + "use_cls_token": False, } def __init__(self, component_config: Dict[Text, Any] = None) -> None: From 4471dee3611878a29bc3d11f0d4f0df07060187b Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 28 Nov 2019 15:34:03 +0100 Subject: [PATCH 188/239] fix import (add root) --- rasa/nlu/registry.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index a3006db75954..f43dd28ddb40 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -25,7 +25,7 @@ from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer -from nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer +from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ConveRTFeaturizer from rasa.nlu.model import Metadata from rasa.nlu.tokenizers.jieba_tokenizer import JiebaTokenizer from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer From e48d7a5398ddcce5c1d2fb4905117225a1781983 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 28 Nov 2019 17:21:01 +0100 Subject: [PATCH 189/239] add return_sequence flag --- .../dense_featurizer/convert_featurizer.py | 4 +-- .../dense_featurizer/mitie_featurizer.py | 30 ++++++++++++++++--- .../dense_featurizer/spacy_featurizer.py | 21 +++++++++++-- .../count_vectors_featurizer.py | 17 +++++++++-- .../sparse_featurizer/regex_featurizer.py | 23 ++++++++++++-- .../test_count_vectors_featurizer.py | 30 +++++++++++++++---- .../nlu/featurizers/test_mitie_featurizer.py | 2 +- .../nlu/featurizers/test_regex_featurizer.py | 4 +-- .../nlu/featurizers/test_spacy_featurizer.py | 8 ++--- 9 files changed, 112 insertions(+), 27 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 8036280c9d07..339a20e0e432 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -100,7 +100,7 @@ def train( MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], self._combine_with_existing_dense_features( ex, - batch_features[index], + [batch_features[index]], MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], ), ) @@ -114,7 +114,7 @@ def process(self, message: Message, **kwargs: Any) -> None: MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], self._combine_with_existing_dense_features( message, - feats, + [feats], MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], ), ) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 5efb9cfe4a84..77d42c9be743 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ 
-1,6 +1,6 @@ import numpy as np import typing -from typing import Any, List, Text +from typing import Any, List, Text, Dict from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.featurzier import Featurizer @@ -29,6 +29,19 @@ class MitieFeaturizer(Featurizer): "mitie_feature_extractor" ] + defaults = { + # if True return a sequence of features (return vector has size + # token-size x feature-dimension) + # if False token-size will be equal to 1 + "return_sequence": False + } + + def __init__(self, component_config: Dict[Text, Any] = None): + + super().__init__(component_config) + + self.return_sequence = component_config["return_sequence"] + @classmethod def required_packages(cls) -> List[Text]: return ["mitie", "numpy"] @@ -99,7 +112,16 @@ def features_for_tokens( feature_extractor: "mitie.total_word_feature_extractor", ) -> np.ndarray: - vec = [] + if self.return_sequence: + vec = [] + for token in tokens: + vec.append(feature_extractor.get_feature_vector(token.text)) + return np.array(vec) + + vec = np.zeros(self.ndim(feature_extractor)) for token in tokens: - vec.append(feature_extractor.get_feature_vector(token.text)) - return np.array(vec) + vec += feature_extractor.get_feature_vector(token.text) + if tokens: + return vec / len(tokens) + else: + return vec diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 75b8f486a6c9..b368b82626f6 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -1,6 +1,6 @@ import numpy as np import typing -from typing import Any, Optional +from typing import Any, Optional, Dict, Text from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.featurzier import Featurizer @@ -31,9 +31,24 @@ class SpacyFeaturizer(Featurizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + [MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES] + defaults = { + # if True return a sequence of features (return vector has size + # token-size x feature-dimension) + # if False token-size will be equal to 1 + "return_sequence": False + } + + def __init__(self, component_config: Dict[Text, Any] = None): + + super().__init__(component_config) + + self.return_sequence = component_config["return_sequence"] + def _features_for_doc(self, doc: "Doc") -> np.ndarray: - """Feature vector for a single document / sentence.""" - return np.array([t.vector for t in doc]) + """Feature vector for a single document / sentence / tokens.""" + if self.return_sequence: + return np.array([t.vector for t in doc]) + return np.array([doc.vector]) def train( self, diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 3b5cdc751954..f1dd05c82a07 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -77,6 +77,10 @@ class CountVectorsFeaturizer(Featurizer): # will be converted to lowercase if lowercase is True "OOV_token": None, # string or None "OOV_words": [], # string or list of strings + # if True return a sequence of features (return vector has size + # token-size x feature-dimension) + # if False token-size will be equal to 1 + "return_sequence": False, } @classmethod @@ -117,6 +121,9 @@ def _load_count_vect_params(self): # if convert all characters to lowercase self.lowercase = 
self.component_config["lowercase"] + # whether to return a sequence or not + self.return_sequence = self.component_config["return_sequence"] + # noinspection PyPep8Naming def _load_OOV_params(self): self.OOV_token = self.component_config["OOV_token"] @@ -397,9 +404,13 @@ def _create_sequence( X = [] for i, tokens in enumerate(all_tokens): - x = self.vectorizers[attribute].transform(tokens) - x.sort_indices() - X.append(x.tocoo()) + if self.return_sequence: + x = self.vectorizers[attribute].transform(tokens) + x.sort_indices() + X.append(x.tocoo()) + else: + x = self.vectorizers[attribute].transform([" ".join(tokens)]) + X.append(x.tocoo()) return X diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 71a6d0f168ff..1790e227baa0 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -31,6 +31,13 @@ class RegexFeaturizer(Featurizer): requires = [MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + defaults = { + # if True return a sequence of features (return vector has size + # token-size x feature-dimension) + # if False token-size will be equal to 1 + "return_sequence": False + } + def __init__( self, component_config: Dict[Text, Any] = None, @@ -44,6 +51,8 @@ def __init__( lookup_tables = lookup_tables or [] self._add_lookup_table_regexes(lookup_tables) + self.return_sequence = component_config["return_sequence"] + def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any ) -> None: @@ -88,7 +97,12 @@ def _features_for_patterns( relating the name of the regex to whether it was matched.""" tokens = message.get(MESSAGE_TOKENS_NAMES[attribute], []) - vec = np.zeros([len(tokens), len(self.known_patterns)]) + if self.return_sequence: + seq_length = len(tokens) + else: + seq_length = 1 + + vec = np.zeros([seq_length, len(self.known_patterns)]) for pattern_index, pattern in enumerate(self.known_patterns): matches = re.finditer(pattern["pattern"], message.text) @@ -98,10 +112,15 @@ def _features_for_patterns( patterns = t.get("pattern", default={}) patterns[pattern["name"]] = False + if self.return_sequence: + seq_index = token_index + else: + seq_index = 1 + for match in matches: if t.offset < match.end() and t.end > match.start(): patterns[pattern["name"]] = True - vec[token_index][pattern_index] = 1.0 + vec[seq_index][pattern_index] = 1.0 t.set("pattern", patterns) diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index fa90bf2d43bf..07236b49967a 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -21,7 +21,9 @@ def test_count_vector_featurizer(sentence, expected): CountVectorsFeaturizer, ) - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + ftr = CountVectorsFeaturizer( + {"token_pattern": r"(?u)\b\w+\b", "return_sequence": True} + ) train_message = Message(sentence) # this is needed for a valid training example train_message.set("intent", "bla") @@ -53,7 +55,9 @@ def test_count_vector_featurizer_attribute_featurization( CountVectorsFeaturizer, ) - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + ftr = CountVectorsFeaturizer( + {"token_pattern": r"(?u)\b\w+\b", "return_sequence": True} + ) train_message = Message(sentence) # this is needed for a valid training example @@ -101,7 +105,11 @@ def 
test_count_vector_featurizer_shared_vocab( ) ftr = CountVectorsFeaturizer( - {"token_pattern": r"(?u)\b\w+\b", "use_shared_vocab": True} + { + "token_pattern": r"(?u)\b\w+\b", + "use_shared_vocab": True, + "return_sequence": True, + } ) train_message = Message(sentence) @@ -138,7 +146,11 @@ def test_count_vector_featurizer_oov_token(sentence, expected): ) ftr = CountVectorsFeaturizer( - {"token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__"} + { + "token_pattern": r"(?u)\b\w+\b", + "OOV_token": "__oov__", + "return_sequence": True, + } ) train_message = Message(sentence) # this is needed for a valid training example @@ -171,6 +183,7 @@ def test_count_vector_featurizer_oov_words(sentence, expected): "token_pattern": r"(?u)\b\w+\b", "OOV_token": "__oov__", "OOV_words": ["oov_word0", "OOV_word1"], + "return_sequence": True, } ) train_message = Message(sentence) @@ -202,7 +215,9 @@ def test_count_vector_featurizer_using_tokens(tokens, expected): CountVectorsFeaturizer, ) - ftr = CountVectorsFeaturizer({"token_pattern": r"(?u)\b\w+\b"}) + ftr = CountVectorsFeaturizer( + {"token_pattern": r"(?u)\b\w+\b", "return_sequence": True} + ) # using empty string instead of real text string to make sure # count vector only can come from `tokens` feature. @@ -239,7 +254,9 @@ def test_count_vector_featurizer_char(sentence, expected): CountVectorsFeaturizer, ) - ftr = CountVectorsFeaturizer({"min_ngram": 1, "max_ngram": 2, "analyzer": "char"}) + ftr = CountVectorsFeaturizer( + {"min_ngram": 1, "max_ngram": 2, "analyzer": "char", "return_sequence": True} + ) train_message = Message(sentence) # this is needed for a valid training example train_message.set("intent", "bla") @@ -269,6 +286,7 @@ def test_count_vector_featurizer_persist_load(tmpdir): "max_ngram": 3, "max_features": 10, "lowercase": False, + "return_sequence": True, } train_ftr = CountVectorsFeaturizer(config) diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index be5df1209b9a..24efee2dee22 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -7,7 +7,7 @@ def test_mitie_featurizer(mitie_feature_extractor, default_config): from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer - mitie_component_config = {"name": "MitieFeaturizer"} + mitie_component_config = {"name": "MitieFeaturizer", "return_sequence": True} ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) sentence = "Hey how are you today" diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 40a9f70b4f37..e0371291298a 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -55,7 +55,7 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, ] - ftr = RegexFeaturizer(known_patterns=patterns) + ftr = RegexFeaturizer({"return_sequence": True}, known_patterns=patterns) # adds tokens to the message tokenizer = SpacyTokenizer({"use_cls_token": False}) @@ -104,7 +104,7 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): }, {"name": "plates", "elements": "data/test/lookup_tables/plates.txt"}, ] - ftr = RegexFeaturizer(lookup_tables=lookups) + ftr = RegexFeaturizer({"return_sequence": True}, lookup_tables=lookups) # adds tokens to the message 
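
The return_sequence flag added to the featurizers above only changes the shape of the produced features: one row per token versus a single collapsed sentence vector. A toy illustration, assuming only numpy; the numbers are made up, and the collapse-by-averaging shown here matches the Mitie and spaCy changes above (the count-vectors featurizer instead transforms the joined token string):

    import numpy as np

    # pretend per-token feature vectors for a three-token message
    token_vectors = np.array([[1.0, 0.0], [0.0, 1.0], [1.0, 1.0]])

    # return_sequence=True keeps the sequence: shape (number of tokens, feature dim)
    sequence_features = token_vectors
    print(sequence_features.shape)  # (3, 2)

    # return_sequence=False collapses to a single sentence vector: shape (1, feature dim)
    sentence_features = token_vectors.mean(axis=0, keepdims=True)
    print(sentence_features.shape)  # (1, 2)
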
component_config = {"name": "SpacyTokenizer", "use_cls_token": False} diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index 077286fd1150..ea716568ba06 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -11,7 +11,7 @@ def test_spacy_featurizer(sentence, spacy_nlp): from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer - ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig()) + ftr = SpacyFeaturizer.create({"return_sequence": True}, RasaNLUModelConfig()) doc = spacy_nlp(sentence) vecs = ftr._features_for_doc(doc) @@ -49,7 +49,7 @@ def test_spacy_intent_featurizer(spacy_nlp_component): td = training_data.load_data("data/examples/rasa/demo-rasa.json") spacy_nlp_component.train(td, config=None) - spacy_featurizer = SpacyFeaturizer() + spacy_featurizer = SpacyFeaturizer({"return_sequence": True}) spacy_featurizer.train(td, config=None) intent_features_exist = np.array( @@ -73,7 +73,7 @@ def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): doc = spacy_nlp(sentence) token_vectors = [t.vector for t in doc] - spacy_config = {} + spacy_config = {"return_sequence": True} ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) greet = {"intent": "greet", "text_features": [0.5]} @@ -97,7 +97,7 @@ def test_spacy_featurizer_casing(spacy_nlp): # retrieves vectors. For compressed spacy models (e.g. models # ending in _sm) this test will most likely fail. - ftr = SpacyFeaturizer.create({}, RasaNLUModelConfig()) + ftr = SpacyFeaturizer.create({"return_sequence": True}, RasaNLUModelConfig()) td = training_data.load_data("data/examples/rasa/demo-rasa.json") for e in td.intent_examples: From bcfb0ad5fe956f2a242381df3d8a3061409bf97e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 28 Nov 2019 17:27:18 +0100 Subject: [PATCH 190/239] convert featurizer returns seq of 1 --- rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 8036280c9d07..339a20e0e432 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -100,7 +100,7 @@ def train( MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], self._combine_with_existing_dense_features( ex, - batch_features[index], + [batch_features[index]], MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], ), ) @@ -114,7 +114,7 @@ def process(self, message: Message, **kwargs: Any) -> None: MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], self._combine_with_existing_dense_features( message, - feats, + [feats], MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], ), ) From f2b9e4fb52011e4925c9ccf478ac83c493ac74b8 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 29 Nov 2019 11:59:27 +0100 Subject: [PATCH 191/239] fix return_sequence not found in config --- rasa/nlu/extractors/crf_entity_extractor.py | 8 +++-- .../dense_featurizer/mitie_featurizer.py | 2 +- .../dense_featurizer/spacy_featurizer.py | 2 +- .../sparse_featurizer/regex_featurizer.py | 4 +-- rasa/nlu/registry.py | 4 +-- .../test_count_vectors_featurizer.py | 33 +++++++++++++++++++ .../nlu/featurizers/test_spacy_featurizer.py | 28 +++++++++++++++- 7 files changed, 72 insertions(+), 9 deletions(-) diff --git a/rasa/nlu/extractors/crf_entity_extractor.py 
b/rasa/nlu/extractors/crf_entity_extractor.py index 545521dd84b3..abe4344807fd 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -596,8 +596,12 @@ def __get_dense_features(message: Message) -> Optional[List[Any]]: tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) if len(tokens) != len(features): - warn_string = f"Number of word embeddings ({len(features)}) does not match number of tokens ({len(tokens)})" - raise Exception(warn_string) + warnings.warn( + f"Number of features ({len(features)}) for attribute " + f"'{MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]}' " + f"does not match number of tokens ({len(tokens)})" + ) + return None # convert to python-crfsuite feature format features_out = [] diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 77d42c9be743..5ad04b655c17 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -40,7 +40,7 @@ def __init__(self, component_config: Dict[Text, Any] = None): super().__init__(component_config) - self.return_sequence = component_config["return_sequence"] + self.return_sequence = self.component_config["return_sequence"] @classmethod def required_packages(cls) -> List[Text]: diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index b368b82626f6..6348604e9b71 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -42,7 +42,7 @@ def __init__(self, component_config: Dict[Text, Any] = None): super().__init__(component_config) - self.return_sequence = component_config["return_sequence"] + self.return_sequence = self.component_config["return_sequence"] def _features_for_doc(self, doc: "Doc") -> np.ndarray: """Feature vector for a single document / sentence / tokens.""" diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 1790e227baa0..9f5751746b79 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -51,7 +51,7 @@ def __init__( lookup_tables = lookup_tables or [] self._add_lookup_table_regexes(lookup_tables) - self.return_sequence = component_config["return_sequence"] + self.return_sequence = self.component_config["return_sequence"] def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -115,7 +115,7 @@ def _features_for_patterns( if self.return_sequence: seq_index = token_index else: - seq_index = 1 + seq_index = 0 for match in matches: if t.offset < match.end() and t.end > match.start(): diff --git a/rasa/nlu/registry.py b/rasa/nlu/registry.py index f43dd28ddb40..34a17a930bcc 100644 --- a/rasa/nlu/registry.py +++ b/rasa/nlu/registry.py @@ -108,7 +108,7 @@ registered_pipeline_templates = { "pretrained_embeddings_spacy": [ {"name": "SpacyNLP"}, - {"name": "SpacyTokenizer", "use_cls_token": False}, + {"name": "SpacyTokenizer"}, {"name": "SpacyFeaturizer"}, {"name": "RegexFeaturizer"}, {"name": "CRFEntityExtractor"}, @@ -117,7 +117,7 @@ ], "keyword": [{"name": "KeywordIntentClassifier"}], "supervised_embeddings": [ - {"name": "WhitespaceTokenizer", "use_cls_token": False}, + {"name": "WhitespaceTokenizer"}, {"name": "RegexFeaturizer"}, 
{"name": "CRFEntityExtractor"}, {"name": "EntitySynonymMapper"}, diff --git a/tests/nlu/featurizers/test_count_vectors_featurizer.py b/tests/nlu/featurizers/test_count_vectors_featurizer.py index 07236b49967a..66d3b473b810 100644 --- a/tests/nlu/featurizers/test_count_vectors_featurizer.py +++ b/tests/nlu/featurizers/test_count_vectors_featurizer.py @@ -40,6 +40,39 @@ def test_count_vector_featurizer(sentence, expected): assert np.all(actual[0] == expected) +@pytest.mark.parametrize( + "sentence, expected", + [ + ("hello hello hello hello hello ", [[5]]), + ("hello goodbye hello", [[1, 2]]), + ("a b c d e f", [[1, 1, 1, 1, 1, 1]]), + ("a 1 2", [[2, 1]]), + ], +) +def test_count_vector_featurizer_no_sequence(sentence, expected): + from rasa.nlu.featurizers.sparse_featurizer.count_vectors_featurizer import ( + CountVectorsFeaturizer, + ) + + ftr = CountVectorsFeaturizer( + {"token_pattern": r"(?u)\b\w+\b", "return_sequence": False} + ) + train_message = Message(sentence) + # this is needed for a valid training example + train_message.set("intent", "bla") + data = TrainingData([train_message]) + ftr.train(data) + + test_message = Message(sentence) + ftr.process(test_message) + + assert isinstance(test_message.get("text_sparse_features"), scipy.sparse.coo_matrix) + + actual = test_message.get("text_sparse_features").toarray() + + assert np.all(actual == expected) + + @pytest.mark.parametrize( "sentence, intent, response, intent_features, response_features", [ diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index ea716568ba06..c3ed0d147e62 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -67,7 +67,7 @@ def test_spacy_intent_featurizer(spacy_nlp_component): "sentence, expected", [("hey how are you today", [-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])], ) -def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): +def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp): from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer doc = spacy_nlp(sentence) @@ -89,6 +89,32 @@ def test_spacy_ner_featurizer(sentence, expected, spacy_nlp): assert np.allclose(vecs, expected, atol=1e-4) +@pytest.mark.parametrize( + "sentence, expected", + [("hey how are you today", [-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])], +) +def test_spacy_featurizer_no_sequence(sentence, expected, spacy_nlp): + from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer + + doc = spacy_nlp(sentence) + token_vectors = [t.vector for t in doc] + + spacy_config = {"return_sequence": False} + ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) + + greet = {"intent": "greet", "text_features": [0.5]} + + message = Message(sentence, greet) + message.set("spacy_doc", doc) + + ftr._set_spacy_features(message) + + vecs = message.get("text_dense_features") + + assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) + assert np.allclose(vecs, expected, atol=1e-4) + + def test_spacy_featurizer_casing(spacy_nlp): from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer From a9360e3507ab0cdefe4ed0da37a71c4e514217c3 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 29 Nov 2019 16:25:33 +0100 Subject: [PATCH 192/239] convert featurizer return seq of 1 --- rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git 
a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 339a20e0e432..c6f7a11872c4 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -100,7 +100,7 @@ def train( MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], self._combine_with_existing_dense_features( ex, - [batch_features[index]], + np.expand_dims(batch_features[index], axis=0), MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], ), ) @@ -114,7 +114,7 @@ def process(self, message: Message, **kwargs: Any) -> None: MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], self._combine_with_existing_dense_features( message, - [feats], + np.expand_dims(feats, axis=0), MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], ), ) From 2850813467ff4f62f4816acd6094b7c783e07642 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 29 Nov 2019 16:41:31 +0100 Subject: [PATCH 193/239] add more tests --- .../dense_featurizer/mitie_featurizer.py | 7 +++-- .../nlu/featurizers/test_mitie_featurizer.py | 13 ++++++++ .../nlu/featurizers/test_regex_featurizer.py | 30 ++++++++++++++++++- .../nlu/featurizers/test_spacy_featurizer.py | 14 +++++---- 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 5ad04b655c17..435900b1f661 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -121,7 +121,8 @@ def features_for_tokens( vec = np.zeros(self.ndim(feature_extractor)) for token in tokens: vec += feature_extractor.get_feature_vector(token.text) + if tokens: - return vec / len(tokens) - else: - return vec + vec = vec / len(tokens) + + return np.expand_dims(vec, axis=0) diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index 24efee2dee22..094c57ed7e01 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -4,6 +4,19 @@ from rasa.nlu.config import RasaNLUModelConfig +def test_mitie_featurizer_no_sequence(mitie_feature_extractor, default_config): + from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer + + mitie_component_config = {"name": "MitieFeaturizer", "return_sequence": False} + ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) + sentence = "Hey how are you today" + tokens = MitieTokenizer().tokenize(sentence) + vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor)[0] + print(vecs) + expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) + assert np.allclose(vecs[:5], expected, atol=1e-5) + + def test_mitie_featurizer(mitie_feature_extractor, default_config): from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index e0371291298a..958a7032b549 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -114,7 +114,6 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): tokenizer.process(message) result = ftr._features_for_patterns(message, MESSAGE_TEXT_ATTRIBUTE) - print(result.toarray()) assert np.allclose(result.toarray(), expected, atol=1e-10) # the tokenizer 
should have added tokens @@ -124,3 +123,32 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): token_matches = token.get("pattern").values() num_matches = sum(token_matches) assert num_matches == labeled_tokens.count(i) + + +@pytest.mark.parametrize( + "sentence, expected ", + [ + ("hey how are you today", [0.0, 1.0, 0.0]), + ("hey 456 how are you", [1.0, 1.0, 0.0]), + ("blah balh random eh", [0.0, 0.0, 0.0]), + ("a 1 digit number", [1.0, 0.0, 1.0]), + ], +) +def test_regex_featurizer_no_sequence(sentence, expected, spacy_nlp): + from rasa.nlu.featurizers.sparse_featurizer.regex_featurizer import RegexFeaturizer + + patterns = [ + {"pattern": "[0-9]+", "name": "number", "usage": "intent"}, + {"pattern": "\\bhey*", "name": "hello", "usage": "intent"}, + {"pattern": "[0-1]+", "name": "binary", "usage": "intent"}, + ] + ftr = RegexFeaturizer({"return_sequence": False}, known_patterns=patterns) + + # adds tokens to the message + tokenizer = SpacyTokenizer() + message = Message(sentence) + message.set("spacy_doc", spacy_nlp(sentence)) + tokenizer.process(message) + + result = ftr._features_for_patterns(message, MESSAGE_TEXT_ATTRIBUTE) + assert np.allclose(result.toarray()[0], expected, atol=1e-10) diff --git a/tests/nlu/featurizers/test_spacy_featurizer.py b/tests/nlu/featurizers/test_spacy_featurizer.py index c3ed0d147e62..536075ad1900 100644 --- a/tests/nlu/featurizers/test_spacy_featurizer.py +++ b/tests/nlu/featurizers/test_spacy_featurizer.py @@ -91,13 +91,17 @@ def test_spacy_featurizer_sequence(sentence, expected, spacy_nlp): @pytest.mark.parametrize( "sentence, expected", - [("hey how are you today", [-0.28451, 0.31007, -0.57039, -0.073056, -0.17322])], + [ + ( + "hey how are you today", + [-0.19649599, 0.32493639, -0.37408298, -0.10622784, 0.062756], + ) + ], ) def test_spacy_featurizer_no_sequence(sentence, expected, spacy_nlp): from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer doc = spacy_nlp(sentence) - token_vectors = [t.vector for t in doc] spacy_config = {"return_sequence": False} ftr = SpacyFeaturizer.create(spacy_config, RasaNLUModelConfig()) @@ -109,10 +113,10 @@ def test_spacy_featurizer_no_sequence(sentence, expected, spacy_nlp): ftr._set_spacy_features(message) - vecs = message.get("text_dense_features") + vecs = message.get("text_dense_features")[0] - assert np.allclose(token_vectors[0][:5], vecs, atol=1e-4) - assert np.allclose(vecs, expected, atol=1e-4) + assert np.allclose(doc.vector, vecs, atol=1e-4) + assert np.allclose(expected, vecs[:5], atol=1e-4) def test_spacy_featurizer_casing(spacy_nlp): From 32586f3d7f2442b101640fba3d3f3e1f5275338e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 29 Nov 2019 16:52:14 +0100 Subject: [PATCH 194/239] add test for convert featurizer --- .../featurizers/test_convert_featurizer.py | 32 ++ .../nlu/featurizers/test_mitie_featurizer.py | 310 +----------------- 2 files changed, 50 insertions(+), 292 deletions(-) create mode 100644 tests/nlu/featurizers/test_convert_featurizer.py diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py new file mode 100644 index 000000000000..978b98fd5f91 --- /dev/null +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -0,0 +1,32 @@ +import numpy as np + +from nlu.constants import ( + MESSAGE_TEXT_ATTRIBUTE, + MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + MESSAGE_TOKENS_NAMES, +) +from rasa.nlu.training_data import Message +from rasa.nlu.tokenizers.whitespace_tokenizer import 
WhitespaceTokenizer +from rasa.nlu.config import RasaNLUModelConfig + + +def test_convert_featurizer(mitie_feature_extractor, default_config): + from rasa.nlu.featurizers.dense_featurizer.convert_featurizer import ( + ConveRTFeaturizer, + ) + + component_config = {"name": "ConveRTFeaturizer", "return_sequence": False} + featurizer = ConveRTFeaturizer.create(component_config, RasaNLUModelConfig()) + + sentence = "Hey how are you today ?" + message = Message(sentence) + tokens = WhitespaceTokenizer().tokenize(sentence) + message.set(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], tokens) + + featurizer.process(message) + + vecs = message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE])[0] + + expected = np.array([1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]) + + assert np.allclose(vecs[:5], expected, atol=1e-5) diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index 094c57ed7e01..b155c5507ee8 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -4,308 +4,34 @@ from rasa.nlu.config import RasaNLUModelConfig -def test_mitie_featurizer_no_sequence(mitie_feature_extractor, default_config): +def test_mitie_featurizer(mitie_feature_extractor, default_config): from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer - mitie_component_config = {"name": "MitieFeaturizer", "return_sequence": False} - ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) + component_config = {"name": "MitieFeaturizer", "return_sequence": True} + featurizer = MitieFeaturizer.create(component_config, RasaNLUModelConfig()) + sentence = "Hey how are you today" + tokens = MitieTokenizer().tokenize(sentence) - vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor)[0] - print(vecs) - expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) + + vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)[0] + + expected = np.array( + [0.00000000e00, -5.12735510e00, 4.39929873e-01, -5.60760403e00, -8.26445103e00] + ) assert np.allclose(vecs[:5], expected, atol=1e-5) -def test_mitie_featurizer(mitie_feature_extractor, default_config): +def test_mitie_featurizer_no_sequence(mitie_feature_extractor, default_config): from rasa.nlu.featurizers.dense_featurizer.mitie_featurizer import MitieFeaturizer - mitie_component_config = {"name": "MitieFeaturizer", "return_sequence": True} - ftr = MitieFeaturizer.create(mitie_component_config, RasaNLUModelConfig()) + component_config = {"name": "MitieFeaturizer", "return_sequence": False} + featurizer = MitieFeaturizer.create(component_config, RasaNLUModelConfig()) sentence = "Hey how are you today" + tokens = MitieTokenizer().tokenize(sentence) - mitie_component_config = {"name": "MitieTokenizer", "use_cls_token": False} - tokens = MitieTokenizer(mitie_component_config).tokenize(sentence) - - vecs = ftr.features_for_tokens(tokens, mitie_feature_extractor) - - print(vecs[0]) + vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)[0] - expected = np.array( - [ - 0.00000000e00, - -5.12735510e00, - 4.39929873e-01, - -5.60760403e00, - -8.26445103e00, - -2.82586724e-01, - -1.00573087e00, - 6.16612673e00, - 4.51831198e00, - -4.36631250e00, - -4.30110741e00, - 1.45059049e00, - -1.43656611e00, - -2.57342124e00, - 4.97706032e00, - 8.10247707e00, - -3.96101475e-01, - 7.00332284e-01, - 5.96706450e-01, - -2.48464966e00, - 4.15551476e-02, - -1.87830377e00, 
- 2.42095375e00, - -5.44092357e-01, - 2.67223120e00, - -2.00578022e00, - 1.22963965e00, - 2.41412973e00, - -1.68471694e00, - 1.87547529e00, - -6.68652773e-01, - 3.51532221e-01, - -2.20869064e-01, - -1.27275801e00, - -4.55570340e-01, - 1.30415881e00, - -7.04453290e-01, - 2.64976263e00, - 1.26680803e00, - 2.04671100e-01, - 2.10326767e00, - -9.23435837e-02, - -1.68740237e00, - -1.97914347e-01, - -1.41813803e00, - -1.66212067e-01, - -3.05680543e-01, - -9.69038725e-01, - 9.14714634e-01, - 7.34257996e-01, - -2.09392056e-01, - 1.55312046e-01, - 8.92380118e-01, - -9.96749043e-01, - 4.89007890e-01, - 1.47573900e00, - -1.83904159e00, - -3.72725785e-01, - 2.75361121e-01, - -5.25904536e-01, - 1.30387291e-01, - 7.00960279e-01, - 6.50017858e-01, - -4.50371534e-01, - -6.38834953e-01, - 6.73773468e-01, - 8.81396413e-01, - -6.75141394e-01, - 4.86862332e-01, - 5.01379609e-01, - 4.88356948e-01, - -4.53982174e-01, - -7.30833590e-01, - 8.64133000e-01, - -1.71588242e-01, - 6.58954322e-01, - 2.18044650e-02, - 7.73006380e-01, - 4.51285660e-01, - -8.16508114e-01, - 1.95529416e-01, - 4.02571887e-01, - 3.07720363e-01, - -1.18403830e-01, - -1.73695073e-01, - 1.27457175e-02, - 4.45014611e-02, - -2.24128217e-01, - -1.06355086e-01, - 6.02598965e-01, - -3.68196577e-01, - -5.17653847e00, - 5.86267173e-01, - -2.78645813e-01, - -2.67106779e-02, - 2.77676725e00, - 2.26144981e00, - -1.74828792e00, - 4.56249189e00, - 1.75182652e00, - -4.38783467e-01, - -1.09945142e00, - -2.11774683e00, - -2.76408410e00, - -1.55349958e00, - -3.79859638e00, - 3.96455169e00, - -2.52921015e-01, - -1.92639256e00, - -4.89389300e-01, - -4.02990580e00, - -8.72295380e00, - -1.46801007e00, - -2.31932306e00, - 1.67305171e00, - -3.19912529e00, - 1.86834311e00, - 2.06363127e-01, - 4.57791597e-01, - -2.40873742e00, - 1.95506680e00, - -3.92530274e00, - -2.98407483e00, - -1.78072822e00, - 1.29415095e00, - 1.00851856e-01, - -1.08310506e-01, - 1.16931573e-01, - 1.38969958e00, - -7.87991047e-01, - -1.70851195e00, - 3.38014960e00, - -2.66119748e-01, - 2.83784223e00, - 3.44787151e-01, - 1.87817657e00, - 7.69976914e-01, - 5.02131760e-01, - 1.00641572e00, - 2.05512595e00, - 8.01849067e-01, - -8.64741862e-01, - -2.41731501e00, - -6.90070271e-01, - 8.99859846e-01, - 2.59272814e-01, - 2.12083149e00, - 1.71254003e00, - 2.31126094e00, - 1.05681944e00, - -8.90498281e-01, - 5.30907393e-01, - 2.41127789e-01, - -3.24536473e-01, - -5.03312349e-01, - -3.45470524e00, - 7.23897219e-01, - 3.49540949e00, - -1.54396147e-01, - 1.96257377e00, - -8.16661939e-02, - -1.42608774e00, - -6.39269233e-01, - 7.82996774e-01, - 2.48106170e00, - 9.45179760e-01, - -8.31814110e-01, - -7.13138878e-01, - -1.56903923e00, - 1.44644022e00, - -1.24463284e00, - -5.90117991e-01, - -1.30865097e00, - 1.70658243e00, - 3.14512819e-01, - 5.01549184e-01, - -3.24578106e-01, - 2.81532764e-01, - 6.94498479e-01, - 1.65341794e00, - -1.78533092e-01, - -1.36791408e00, - -3.05325389e-01, - 1.57340133e00, - -8.41358781e-01, - 5.52713513e-01, - -7.22983599e-01, - 4.10806626e-01, - -5.17388061e-02, - 1.05758071e00, - 2.37797365e-01, - -8.51634622e-01, - -1.79594696e-01, - -4.38443124e-01, - 9.10361111e-02, - 1.02692962e-01, - 6.27609611e-01, - 5.56623459e-01, - 5.40035427e-01, - 1.44254386e00, - 8.21452856e-01, - 7.96434343e-01, - -6.25197291e-01, - 3.09273601e-03, - -6.24552667e-02, - -1.03001225e00, - 3.47646058e-01, - -3.60108972e-01, - 7.73691535e-01, - -8.22658122e-01, - -6.71157479e-01, - 9.70521867e-01, - 2.73865640e-01, - 1.69602585e00, - 5.74894428e-01, - 5.25952458e-01, - -2.73797333e-01, - 2.24092394e-01, - 
-3.01282465e-01, - -8.98015559e-01, - -3.54814857e-01, - -5.35844207e-01, - -2.62837589e-01, - 7.90212154e-01, - 1.64234906e-01, - 1.01651788e00, - 1.22546232e00, - -3.33948851e-01, - -4.89927202e-01, - -1.12350926e-01, - -1.36075035e-01, - -9.49754834e-01, - -5.68806455e-02, - 2.42536068e-01, - -2.81865031e-01, - 5.53327501e-02, - 7.90774226e-01, - 2.38684490e-02, - -9.46886778e-01, - 5.67425728e-01, - 1.11705333e-01, - -8.51398855e-02, - -3.33825350e-01, - 2.33040452e-01, - -4.90594149e-01, - 6.75024092e-03, - 3.73918623e-01, - -3.34260643e-01, - -7.60734856e-01, - -5.63092679e-02, - 4.10971254e-01, - -2.63164580e-01, - 8.54819715e-02, - -4.20097411e-02, - -9.82390791e-02, - 3.80521566e-01, - 2.33330190e-01, - 4.47722435e-01, - -3.42616737e-02, - -5.51659703e-01, - 5.68716228e-01, - -2.82406271e-01, - 6.78738177e-01, - -8.50788295e-01, - 1.21547759e-01, - -3.42155367e-01, - -5.10491610e-01, - -6.96370900e-01, - 2.27460936e-02, - -3.87611985e-01, - -1.09960282e00, - ] - ) - - assert np.allclose(vecs[0], expected, atol=1e-5) + expected = np.array([0.0, -4.4551446, 0.26073121, -1.46632245, -1.84205751]) + assert np.allclose(vecs[:5], expected, atol=1e-5) From 5121edc704dfcecdd3626e2de4d38f027c176fda Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 29 Nov 2019 16:56:17 +0100 Subject: [PATCH 195/239] fix default pipeline test --- tests/nlu/base/test_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nlu/base/test_config.py b/tests/nlu/base/test_config.py index f6453e49404e..be729075adb3 100644 --- a/tests/nlu/base/test_config.py +++ b/tests/nlu/base/test_config.py @@ -65,7 +65,7 @@ def test_set_attr_on_component(): cfg = config.load("sample_configs/config_pretrained_embeddings_spacy.yml") cfg.set_component_attr(6, C=324) - assert cfg.for_component(1) == {"name": "SpacyTokenizer", "use_cls_token": False} + assert cfg.for_component(1) == {"name": "SpacyTokenizer"} assert cfg.for_component(6) == {"name": "SklearnIntentClassifier", "C": 324} From 3c20e33ec3560acd06d364ece337a673e28e0f8a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 29 Nov 2019 17:02:00 +0100 Subject: [PATCH 196/239] refactor mitie featurizer --- .../dense_featurizer/mitie_featurizer.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 435900b1f661..3e7b093f76f7 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -106,6 +106,16 @@ def _mitie_feature_extractor(self, **kwargs): ) return mitie_feature_extractor + def _features_as_sequence( + self, + tokens: List[Token], + feature_extractor: "mitie.total_word_feature_extractor", + ) -> np.ndarray: + vec = [] + for token in tokens: + vec.append(feature_extractor.get_feature_vector(token.text)) + return np.array(vec) + def features_for_tokens( self, tokens: List[Token], @@ -113,10 +123,7 @@ def features_for_tokens( ) -> np.ndarray: if self.return_sequence: - vec = [] - for token in tokens: - vec.append(feature_extractor.get_feature_vector(token.text)) - return np.array(vec) + return self._features_as_sequence(tokens, feature_extractor) vec = np.zeros(self.ndim(feature_extractor)) for token in tokens: From aac64a83e476a7162e3427b8968db060a0ccee37 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Fri, 29 Nov 2019 18:18:32 +0100 Subject: [PATCH 197/239] fix import --- 
tests/nlu/featurizers/test_convert_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py index 978b98fd5f91..e46881449f72 100644 --- a/tests/nlu/featurizers/test_convert_featurizer.py +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -1,6 +1,6 @@ import numpy as np -from nlu.constants import ( +from rasa.nlu.constants import ( MESSAGE_TEXT_ATTRIBUTE, MESSAGE_VECTOR_DENSE_FEATURE_NAMES, MESSAGE_TOKENS_NAMES, From a4b454b9cc58d1eb60a64650493889b947db261c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 2 Dec 2019 13:17:37 +0100 Subject: [PATCH 198/239] Add warning to convert featurizer. --- rasa/nlu/classifiers/embedding_intent_classifier.py | 2 +- rasa/nlu/extractors/crf_entity_extractor.py | 5 ++--- rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 6 ++++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index c25f76f94198..594d05e95f3e 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -305,7 +305,7 @@ def _extract_and_add_features( if sparse_features.shape[0] != dense_features.shape[0]: raise ValueError( f"Sequence dimensions for sparse and dense features " - f"don't coincide in '{message.text}'" + f"don't coincide in '{message.text}' for attribute '{attribute}'." ) return sparse_features, dense_features diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index abe4344807fd..710134edada2 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -596,12 +596,11 @@ def __get_dense_features(message: Message) -> Optional[List[Any]]: tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) if len(tokens) != len(features): - warnings.warn( + raise ValueError( f"Number of features ({len(features)}) for attribute " f"'{MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]}' " - f"does not match number of tokens ({len(tokens)})" + f"does not match number of tokens ({len(tokens)})." ) - return None # convert to python-crfsuite feature format features_out = [] diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index c6f7a11872c4..364cd65e9545 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -45,6 +45,12 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: self._load_model() + logger.warning( + f"ConveRTFeaturizer always returns a feature vector of size " + f"(1 x feature-dimensions. If you use any other featurizer with " + f"'return_sequence' equal to True, training will fail." 
+ ) + @classmethod def required_packages(cls) -> List[Text]: return ["tensorflow_text", "tensorflow_hub"] From 303ef4cc0ebfdda2c28be61c85fd6a8c5826131d Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 2 Dec 2019 13:22:51 +0100 Subject: [PATCH 199/239] update warning in crf entity extractor --- rasa/nlu/extractors/crf_entity_extractor.py | 7 +++++-- .../nlu/featurizers/dense_featurizer/convert_featurizer.py | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 710134edada2..427e55c37339 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -596,11 +596,14 @@ def __get_dense_features(message: Message) -> Optional[List[Any]]: tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) if len(tokens) != len(features): - raise ValueError( + warnings.warn( f"Number of features ({len(features)}) for attribute " f"'{MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]}' " - f"does not match number of tokens ({len(tokens)})." + f"does not match number of tokens ({len(tokens)}). Set " + f"'return_sequence' to true in the corresponding featurizer in order " + f"to make use of the features in 'CRFEntityExtractor'." ) + return None # convert to python-crfsuite feature format features_out = [] diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 364cd65e9545..fc8da63e80e0 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -47,7 +47,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: logger.warning( f"ConveRTFeaturizer always returns a feature vector of size " - f"(1 x feature-dimensions. If you use any other featurizer with " + f"(1 x feature-dimensions). If you use any other featurizer with " f"'return_sequence' equal to True, training will fail." ) From b4e1e04d538f4701d6c7e8f9fd9b1c64b84f86c2 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 2 Dec 2019 14:39:06 +0100 Subject: [PATCH 200/239] Add empty documentation page. --- ...aturization.rst => core-featurization.rst} | 6 ++--- docs/api/nlu-featurization.rst | 22 +++++++++++++++++++ docs/index.rst | 3 ++- 3 files changed, 27 insertions(+), 4 deletions(-) rename docs/api/{featurization.rst => core-featurization.rst} (98%) create mode 100644 docs/api/nlu-featurization.rst diff --git a/docs/api/featurization.rst b/docs/api/core-featurization.rst similarity index 98% rename from docs/api/featurization.rst rename to docs/api/core-featurization.rst index 90021cb68b07..2267b6882f80 100644 --- a/docs/api/featurization.rst +++ b/docs/api/core-featurization.rst @@ -1,10 +1,10 @@ :desc: Find out how to apply machine learning algorithms to conversational AI using vector representations of conversations with Rasa. -.. _featurization: +.. _featurization_conversations: -Featurization -============== +Featurization of Conversations +============================== .. edit-link:: diff --git a/docs/api/nlu-featurization.rst b/docs/api/nlu-featurization.rst new file mode 100644 index 000000000000..7d6ba11755bc --- /dev/null +++ b/docs/api/nlu-featurization.rst @@ -0,0 +1,22 @@ +:desc: Find out how to apply machine learning algorithms to conversational AI + using vector representations of conversations with Rasa. + +.. 
_featurization_utterance: + +Featurization of Utterances +=========================== + +.. edit-link:: + +TODO + + +Sparse Featurizers +^^^^^^^^^^^^^^^^^^ + +TODO + +Dense Featurizers +^^^^^^^^^^^^^^^^^ + +TODO \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index c205cd584691..3e91c3ef1ebc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -90,7 +90,8 @@ Understand messages, hold conversations, and connect to messaging channels and A api/event-brokers api/lock-stores api/training-data-importers - api/featurization + api/nlu-featurization + api/core-featurization migration-guide changelog From 2e32d7e34197e15bc31c92fedbc695355675bcfa Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 2 Dec 2019 15:25:01 +0100 Subject: [PATCH 201/239] update documentation --- docs/api/nlu-featurization.rst | 22 ------ docs/index.rst | 1 - docs/nlu/components.rst | 131 +++++++++++++++++++-------------- docs/nlu/entity-extraction.rst | 14 ++++ 4 files changed, 88 insertions(+), 80 deletions(-) delete mode 100644 docs/api/nlu-featurization.rst diff --git a/docs/api/nlu-featurization.rst b/docs/api/nlu-featurization.rst deleted file mode 100644 index 7d6ba11755bc..000000000000 --- a/docs/api/nlu-featurization.rst +++ /dev/null @@ -1,22 +0,0 @@ -:desc: Find out how to apply machine learning algorithms to conversational AI - using vector representations of conversations with Rasa. - -.. _featurization_utterance: - -Featurization of Utterances -=========================== - -.. edit-link:: - -TODO - - -Sparse Featurizers -^^^^^^^^^^^^^^^^^^ - -TODO - -Dense Featurizers -^^^^^^^^^^^^^^^^^ - -TODO \ No newline at end of file diff --git a/docs/index.rst b/docs/index.rst index 3e91c3ef1ebc..d2f1b039a02c 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -90,7 +90,6 @@ Understand messages, hold conversations, and connect to messaging channels and A api/event-brokers api/lock-stores api/training-data-importers - api/nlu-featurization api/core-featurization migration-guide changelog diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 7f898772a9f8..1f4417592796 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -85,12 +85,26 @@ SpacyNLP Featurizers ----------- +Featurizers are divided into two different categories: sparse featurizers and dense featurizers. +Sparse featurizers are featurizers that return feature vectors with a lot of missing values, e.g. zeros. +As those feature vectors would normally take up a lot of memory, we store them as sparse features. +Sparse features only store the values that are not zero and their positions in the vector. +Thus, we save a lot of memroy and are able to train on larger dataset. + +By default all featurizers will return a matrix of length (1 x feature-dimension). +All featurizer (except the ``ConveRTFeaturizer``) have the option to return a sequence instead. +In case the flag ``"return_sequence"`` is set to ``True`` the returned matrix of a featurizer will have the size +(token-length x feature-dimenstion). +So, the returned vector will have an entry for every token. + + MitieFeaturizer ~~~~~~~~~~~~~~~ :Short: MITIE intent featurizer :Outputs: nothing, used as an input to intent classifiers that need intent features (e.g. ``SklearnIntentClassifier``) :Requires: :ref:`MitieNLP` +:Type: Dense featurizer :Description: Creates feature for intent classification using the MITIE featurizer. 
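As a minimal sketch of how the ``return_sequence`` flag described in this section could be configured: only the flag and the component names are taken from the diffs in this series, the overall pipeline composition is illustrative.

.. code-block:: yaml

    pipeline:
    - name: "WhitespaceTokenizer"
    - name: "CountVectorsFeaturizer"
      # emit a (token-length x feature-dimension) matrix instead of (1 x feature-dimension)
      return_sequence: True
    - name: "EmbeddingIntentClassifier"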
@@ -114,6 +128,7 @@ SpacyFeaturizer :Short: spacy intent featurizer :Outputs: nothing, used as an input to intent classifiers that need intent features (e.g. ``SklearnIntentClassifier``) :Requires: :ref:`SpacyNLP` +:Type: Dense featurizer :Description: Creates feature for intent classification using the spacy featurizer. :Configuration: @@ -123,28 +138,37 @@ SpacyFeaturizer pipeline: - name: "SpacyFeaturizer" -NGramFeaturizer -~~~~~~~~~~~~~~~ -:Short: Appends char-ngram features to feature vector -:Outputs: nothing, appends its features to an existing feature vector generated by another intent featurizer -:Requires: :ref:`SpacyNLP` +ConveRTFeaturizer +~~~~~~~~~~~~~~~~~ + +:Short: + Creates a vector representation of user message and response (if specified) using + `ConveRT `_ model. +:Outputs: + nothing, used as an input to intent classifiers and response selectors that need intent features and response + features respectively (e.g. ``EmbeddingIntentClassifier`` and ``ResponseSelector``) +:Requires: nothing +:Type: Dense featurizer :Description: - This featurizer appends character ngram features to a feature vector. During training the component looks for the - most common character sequences (e.g. ``app`` or ``ing``). The added features represent a boolean flag if the - character sequence is present in the word sequence or not. + Creates features for intent classification and response selection. + Uses the `default signature `_ to compute vector + representations of input text. + + .. warning:: + Since ``ConveRT`` model is trained only on an english corpus of conversations, this featurizer should only + be used if your training data is in english language. - .. note:: There needs to be another intent featurizer previous to this one in the pipeline! + .. note:: + To use ``ConveRTFeaturizer`` you should install ``tensorflow_text==1.15.1`` and ``tensorflow_hub==0.6.0``. + Otherwise, you can also do a pip install of Rasa with ``pip install rasa[convert]`` :Configuration: .. code-block:: yaml pipeline: - - name: "NGramFeaturizer" - # Maximum number of ngrams to use when augmenting - # feature vectors with character ngrams - max_number_of_ngrams: 10 + - name: "ConveRTFeaturizer" RegexFeaturizer @@ -153,12 +177,14 @@ RegexFeaturizer :Short: regex feature creation to support intent and entity classification :Outputs: ``text_features`` and ``tokens.pattern`` :Requires: nothing +:Type: Sparse featurizer :Description: - During training, the regex intent featurizer creates a list of `regular expressions` defined in the training data format. - For each regex, a feature will be set marking whether this expression was found in the input, which will later be fed into intent classifier / entity - extractor to simplify classification (assuming the classifier has learned during the training phase, that this set - feature indicates a certain intent). Regex features for entity extraction are currently only supported by the - ``CRFEntityExtractor`` component! + During training, the regex intent featurizer creates a list of `regular expressions` defined in the training + data format. + For each regex, a feature will be set marking whether this expression was found in the input, which will later + be fed into intent classifier / entity extractor to simplify classification (assuming the classifier has learned + during the training phase, that this set feature indicates a certain intent). + Regex features for entity extraction are currently only supported by the ``CRFEntityExtractor`` component! .. 
note:: There needs to be a tokenizer previous to this featurizer in the pipeline! @@ -172,6 +198,7 @@ CountVectorsFeaturizer need bag-of-words representation of intent features (e.g. ``EmbeddingIntentClassifier``) :Requires: nothing +:Type: Dense featurizer :Description: Creates bag-of-words representation of user message and label features using `sklearn's CountVectorizer `_. @@ -205,25 +232,31 @@ CountVectorsFeaturizer Since the training is performed on limited vocabulary data, it cannot be guaranteed that during prediction an algorithm will not encounter an unknown word (a word that were not seen during training). - In order to teach an algorithm how to treat unknown words, some words in training data can be substituted by generic word ``OOV_token``. + In order to teach an algorithm how to treat unknown words, some words in training data can be substituted + by generic word ``OOV_token``. In this case during prediction all unknown words will be treated as this generic word ``OOV_token``. - For example, one might create separate intent ``outofscope`` in the training data containing messages of different number of ``OOV_token`` s and - maybe some additional general words. Then an algorithm will likely classify a message with unknown words as this intent ``outofscope``. + For example, one might create separate intent ``outofscope`` in the training data containing messages of + different number of ``OOV_token`` s and maybe some additional general words. + Then an algorithm will likely classify a message with unknown words as this intent ``outofscope``. .. note:: This featurizer creates a bag-of-words representation by **counting** words, so the number of ``OOV_token`` in the sentence might be important. - - ``OOV_token`` set a keyword for unseen words; if training data contains ``OOV_token`` as words in some messages, - during prediction the words that were not seen during training will be substituted with provided ``OOV_token``; - if ``OOV_token=None`` (default behaviour) words that were not seen during training will be ignored during prediction time; - - ``OOV_words`` set a list of words to be treated as ``OOV_token`` during training; if a list of words that should be treated - as Out-Of-Vacabulary is known, it can be set to ``OOV_words`` instead of manually changing it in trainig data or using custom preprocessor. + - ``OOV_token`` set a keyword for unseen words; if training data contains ``OOV_token`` as words in some + messages, during prediction the words that were not seen during training will be substituted with + provided ``OOV_token``; if ``OOV_token=None`` (default behaviour) words that were not seen during + training will be ignored during prediction time; + - ``OOV_words`` set a list of words to be treated as ``OOV_token`` during training; if a list of words + that should be treated as Out-Of-Vacabulary is known, it can be set to ``OOV_words`` instead of manually + changing it in trainig data or using custom preprocessor. .. note:: - Providing ``OOV_words`` is optional, training data can contain ``OOV_token`` input manually or by custom additional preprocessor. - Unseen words will be substituted with ``OOV_token`` **only** if this token is present in the training data or ``OOV_words`` list is provided. + Providing ``OOV_words`` is optional, training data can contain ``OOV_token`` input manually or by custom + additional preprocessor. + Unseen words will be substituted with ``OOV_token`` **only** if this token is present in the training + data or ``OOV_words`` list is provided. 
Sharing Vocabulary between user message and labels: @@ -270,29 +303,6 @@ CountVectorsFeaturizer OOV_words: [] # list of strings -ConveRTFeaturizer -~~~~~~~~~~~~~~~~~ - -:Short: Creates a vector representation of user message and response (if specified) using `ConveRT `_ model. -:Outputs: nothing, used as an input to intent classifiers and response selectors that need intent features and response features respectively(e.g. ``EmbeddingIntentClassifier`` and ``ResponseSelector``) -:Requires: nothing -:Description: - Creates features for intent classification and response selection. - Uses the `default signature `_ to compute vector representations of input text. - - .. warning:: - Since ``ConveRT`` model is trained only on an english corpus of conversations, this featurizer should only be used if your training data is in english language. - - .. note:: - To use ``ConveRTFeaturizer`` you should install ``tensorflow_text==1.15.1`` and ``tensorflow_hub==0.6.0``. Otherwise, you can also do a pip install of Rasa with ``pip install rasa[convert]`` - -:Configuration: - - .. code-block:: yaml - - pipeline: - - name: "ConveRTFeaturizer" - Intent Classifiers ------------------ @@ -300,7 +310,9 @@ Intent Classifiers MitieIntentClassifier ~~~~~~~~~~~~~~~~~~~~~ -:Short: MITIE intent classifier (using a `text categorizer `_) +:Short: + MITIE intent classifier (using a + `text categorizer `_) :Outputs: ``intent`` :Requires: A tokenizer and a featurizer :Output-Example: @@ -313,7 +325,8 @@ MitieIntentClassifier :Description: This classifier uses MITIE to perform intent classification. The underlying classifier - is using a multi-class linear SVM with a sparse linear kernel (see `MITIE trainer code `_). + is using a multi-class linear SVM with a sparse linear kernel (see + `MITIE trainer code `_). :Configuration: @@ -349,7 +362,8 @@ SklearnIntentClassifier :Description: The sklearn intent classifier trains a linear SVM which gets optimized using a grid search. In addition to other classifiers it also provides rankings of the labels that did not "win". The spacy intent classifier - needs to be preceded by a featurizer in the pipeline. This featurizer creates the features used for the classification. + needs to be preceded by a featurizer in the pipeline. This featurizer creates the features used for the + classification. :Configuration: During the training of the SVM a hyperparameter search is run to @@ -472,7 +486,8 @@ EmbeddingIntentClassifier .. note:: For ``cosine`` similarity ``mu_pos`` and ``mu_neg`` should be between ``-1`` and ``1``. - .. note:: There is an option to use linearly increasing batch size. The idea comes from ``_. + .. note:: There is an option to use linearly increasing batch size. The idea comes from + ``_. In order to do it pass a list to ``batch_size``, e.g. ``"batch_size": [64, 256]`` (default behaviour). If constant ``batch_size`` is required, pass an ``int``, e.g. ``"batch_size": 64``. @@ -508,7 +523,8 @@ KeywordIntentClassifier :Description: This classifier works by searching a message for keywords. - The matching is case sensitive by default and searches only for exact matches of the keyword-string in the user message. + The matching is case sensitive by default and searches only for exact matches of the keyword-string in the user + message. The keywords for an intent are the examples of that intent in the NLU training data. This means the entire example is the keyword, not the individual words in the example. 
@@ -804,7 +820,8 @@ CRFEntityExtractor neighbouring entity tags: the most likely set of tags is then calculated and returned. If POS features are used (pos or pos2), spaCy has to be installed. If you want to use additional features, such as pre-trained word embeddings, from any provided dense - featurizer, use ``"text_dense_features"``. + featurizer, use ``"text_dense_features"``. Make sure to set ``"return_sequence"`` to + ``True`` in the corresponding featurizer. :Configuration: .. code-block:: yaml diff --git a/docs/nlu/entity-extraction.rst b/docs/nlu/entity-extraction.rst index 58671a316b4c..2110bf016990 100644 --- a/docs/nlu/entity-extraction.rst +++ b/docs/nlu/entity-extraction.rst @@ -149,3 +149,17 @@ associate that with a location entity. If you just want to match regular expressions exactly, you can do this in your code, as a postprocessing step after receiving the response from Rasa NLU. + + +Passing Custom Features to ``CRFEntityExtractor`` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +If you want to pass custom features, such as pre-trained word embeddings, to ``CRFEntityExtractor``, you can +add any dense featurizer to the pipeline before the ``CRFEntityExtractor``. +Make sure to set ``"return_sequence"`` to ``True`` for the corresponding dense featurizer. +``CRFEntityExtractor`` automatically finds the additional dense features and checks if the dense features are an +iterable of ``len(tokens)``, where each entry is a vector. +A warning will be shown in case the check fails. +However, ``CRFEntityExtractor`` will continue to train just without the additional custom features. +In case dense features are present ``CRFEntityExtractor`` will pass the dense feature vectors to ``sklearn_crfsuite`` +and used for training. From 24e92b628994e1292a08948733ca2e9de1e2191d Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 3 Dec 2019 15:11:26 +0100 Subject: [PATCH 202/239] raise value error if seq dimension does not match --- rasa/nlu/extractors/crf_entity_extractor.py | 2 +- .../dense_featurizer/convert_featurizer.py | 2 +- .../dense_featurizer/spacy_featurizer.py | 2 +- rasa/nlu/featurizers/featurzier.py | 18 ++++++++++ .../count_vectors_featurizer.py | 12 +++---- tests/nlu/featurizers/test_featurizer.py | 33 +++++++++++++++++++ 6 files changed, 60 insertions(+), 9 deletions(-) diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 427e55c37339..ee7db21b657c 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -592,7 +592,7 @@ def __get_dense_features(message: Message) -> Optional[List[Any]]: ) if features is None: - return features + return None tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) if len(tokens) != len(features): diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index fc8da63e80e0..dda5aeb0a162 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -45,7 +45,7 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: self._load_model() - logger.warning( + logger.debug( f"ConveRTFeaturizer always returns a feature vector of size " f"(1 x feature-dimensions). If you use any other featurizer with " f"'return_sequence' equal to True, training will fail." 
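A configuration sketch for the "Passing Custom Features to ``CRFEntityExtractor``" section added above: ``return_sequence`` and ``"text_dense_features"`` come from the docs and diffs in this series, while the remaining CRF feature names are assumed defaults.

.. code-block:: yaml

    pipeline:
    - name: "SpacyNLP"
    - name: "SpacyTokenizer"
    - name: "SpacyFeaturizer"
      # one dense vector per token, so CRFEntityExtractor can align features and tokens
      return_sequence: True
    - name: "CRFEntityExtractor"
      features: [["low", "title"], ["low", "text_dense_features"], ["low", "title"]]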
diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 6348604e9b71..e300d3aa6fa8 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -81,7 +81,7 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): if cls_token_used: # cls token is used, need to append a vector - cls_token_vec = np.zeros([1, fs.shape[-1]]) + cls_token_vec = np.mean(fs, axis=0, keepdims=True) fs = np.concatenate([fs, cls_token_vec]) features = self._combine_with_existing_dense_features( diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index edd5a3e2f46a..cd72499c84bf 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -30,6 +30,16 @@ def _combine_with_existing_dense_features( feature_name: Text = MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], ) -> Any: if message.get(feature_name) is not None: + + if len(message.get(feature_name)) != len(additional_features): + raise ValueError( + f"Cannot concatenate dense features as sequence dimension does not " + f"match: {len(message.get(feature_name))} != " + f"{len(additional_features)}." + f"Make sure to set 'return_sequence' to the same value for all your" + f"featurizers." + ) + return np.concatenate( (message.get(feature_name), additional_features), axis=-1 ) @@ -47,6 +57,14 @@ def _combine_with_existing_sparse_features( if message.get(feature_name) is not None: from scipy.sparse import hstack + if message.get(feature_name).shape[0] != additional_features.shape[0]: + raise ValueError( + f"Cannot concatenate sparse features as sequence dimension does not " + f"match: {message.get(feature_name).shape[0]} != " + f"{additional_features.shape[0]}." + f"Make sure to set 'return_sequence' to the same value for all your" + f"featurizers." 
+ ) return hstack([message.get(feature_name), additional_features]) else: return additional_features diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index f1dd05c82a07..12ce7bf83e5f 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -404,13 +404,13 @@ def _create_sequence( X = [] for i, tokens in enumerate(all_tokens): + input = tokens if self.return_sequence: - x = self.vectorizers[attribute].transform(tokens) - x.sort_indices() - X.append(x.tocoo()) - else: - x = self.vectorizers[attribute].transform([" ".join(tokens)]) - X.append(x.tocoo()) + input = [" ".join(tokens)] + + x = self.vectorizers[attribute].transform(input) + x.sort_indices() + X.append(x.tocoo()) return X diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 803c559d7433..0d8939cbe288 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -30,6 +30,22 @@ def test_combine_with_existing_dense_features(): assert np.all(expected_features == actual_features) +def test_combine_with_existing_dense_features_shape_mismatch(): + featurizer = Featurizer() + attribute = MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + + existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] + new_features = [[0, 1]] + + message = Message("This is a text.") + message.set(attribute, existing_features) + + with pytest.raises(ValueError): + featurizer._combine_with_existing_dense_features( + message, new_features, attribute + ) + + def test_combine_with_existing_sparse_features(): featurizer = Featurizer() @@ -50,6 +66,23 @@ def test_combine_with_existing_sparse_features(): assert np.all(expected_features == actual_features) +def test_combine_with_existing_sparse_features_shape_mismatch(): + + featurizer = Featurizer() + attribute = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + + existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) + new_features = scipy.sparse.csr_matrix([[0, 1]]) + + message = Message("This is a text.") + message.set(attribute, existing_features) + + with pytest.raises(ValueError): + featurizer._combine_with_existing_sparse_features( + message, new_features, attribute + ) + + @pytest.mark.parametrize( "features, expected", [ From 388fb6e7f21179301d45657459a9973b9b97523b Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 4 Dec 2019 10:33:04 +0100 Subject: [PATCH 203/239] take mean vec for cls token in mitie --- .../dense_featurizer/mitie_featurizer.py | 37 +++++++++++++++---- .../dense_featurizer/spacy_featurizer.py | 2 +- .../nlu/featurizers/test_mitie_featurizer.py | 5 ++- 3 files changed, 33 insertions(+), 11 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 3e7b093f76f7..e09af8930105 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -15,6 +15,7 @@ MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + CLS_TOKEN, ) @@ -111,20 +112,17 @@ def _features_as_sequence( tokens: List[Token], feature_extractor: "mitie.total_word_feature_extractor", ) -> np.ndarray: - vec = [] + features = [] for token in tokens: - vec.append(feature_extractor.get_feature_vector(token.text)) - return 
np.array(vec) + features.append(feature_extractor.get_feature_vector(token.text)) - def features_for_tokens( + return np.array(features) + + def _features_as_non_sequence( self, tokens: List[Token], feature_extractor: "mitie.total_word_feature_extractor", ) -> np.ndarray: - - if self.return_sequence: - return self._features_as_sequence(tokens, feature_extractor) - vec = np.zeros(self.ndim(feature_extractor)) for token in tokens: vec += feature_extractor.get_feature_vector(token.text) @@ -133,3 +131,26 @@ def features_for_tokens( vec = vec / len(tokens) return np.expand_dims(vec, axis=0) + + def features_for_tokens( + self, + tokens: List[Token], + feature_extractor: "mitie.total_word_feature_extractor", + ) -> np.ndarray: + cls_token_used = tokens[-1].text == CLS_TOKEN if tokens else False + + input_tokens = tokens + if cls_token_used: + input_tokens = tokens[:-1] + + if self.return_sequence: + features = self._features_as_sequence(input_tokens, feature_extractor) + else: + features = self._features_as_non_sequence(input_tokens, feature_extractor) + + if cls_token_used and self.return_sequence: + # cls token is used, need to append a vector + cls_token_vec = np.mean(features, axis=0, keepdims=True) + features = np.concatenate([features, cls_token_vec]) + + return features diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index e300d3aa6fa8..772c75a13d6a 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -79,7 +79,7 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): if message_attribute_doc is not None: fs = self._features_for_doc(message_attribute_doc) - if cls_token_used: + if cls_token_used and self.return_sequence: # cls token is used, need to append a vector cls_token_vec = np.mean(fs, axis=0, keepdims=True) fs = np.concatenate([fs, cls_token_vec]) diff --git a/tests/nlu/featurizers/test_mitie_featurizer.py b/tests/nlu/featurizers/test_mitie_featurizer.py index b155c5507ee8..fa9a581de7d1 100644 --- a/tests/nlu/featurizers/test_mitie_featurizer.py +++ b/tests/nlu/featurizers/test_mitie_featurizer.py @@ -2,6 +2,7 @@ from rasa.nlu.tokenizers.mitie_tokenizer import MitieTokenizer from rasa.nlu.config import RasaNLUModelConfig +from rasa.nlu.constants import CLS_TOKEN def test_mitie_featurizer(mitie_feature_extractor, default_config): @@ -10,7 +11,7 @@ def test_mitie_featurizer(mitie_feature_extractor, default_config): component_config = {"name": "MitieFeaturizer", "return_sequence": True} featurizer = MitieFeaturizer.create(component_config, RasaNLUModelConfig()) - sentence = "Hey how are you today" + sentence = f"Hey how are you today {CLS_TOKEN}" tokens = MitieTokenizer().tokenize(sentence) @@ -28,7 +29,7 @@ def test_mitie_featurizer_no_sequence(mitie_feature_extractor, default_config): component_config = {"name": "MitieFeaturizer", "return_sequence": False} featurizer = MitieFeaturizer.create(component_config, RasaNLUModelConfig()) - sentence = "Hey how are you today" + sentence = f"Hey how are you today {CLS_TOKEN}" tokens = MitieTokenizer().tokenize(sentence) vecs = featurizer.features_for_tokens(tokens, mitie_feature_extractor)[0] From d5579a367d973888687eecf61eb2647ddcec5f1d Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 4 Dec 2019 14:44:47 +0100 Subject: [PATCH 204/239] fix bug in count vector featurizer --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 2 +- 1 
file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 12ce7bf83e5f..73077e854610 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -405,7 +405,7 @@ def _create_sequence( for i, tokens in enumerate(all_tokens): input = tokens - if self.return_sequence: + if not self.return_sequence: input = [" ".join(tokens)] x = self.vectorizers[attribute].transform(input) From 0a37a610d38302dd0c583fee4d345410a4c171c4 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 4 Dec 2019 14:54:26 +0100 Subject: [PATCH 205/239] review comments --- .../dense_featurizer/mitie_featurizer.py | 41 +++++-------------- .../dense_featurizer/spacy_featurizer.py | 8 ++-- 2 files changed, 14 insertions(+), 35 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index e09af8930105..84dd7e14b069 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -107,31 +107,6 @@ def _mitie_feature_extractor(self, **kwargs): ) return mitie_feature_extractor - def _features_as_sequence( - self, - tokens: List[Token], - feature_extractor: "mitie.total_word_feature_extractor", - ) -> np.ndarray: - features = [] - for token in tokens: - features.append(feature_extractor.get_feature_vector(token.text)) - - return np.array(features) - - def _features_as_non_sequence( - self, - tokens: List[Token], - feature_extractor: "mitie.total_word_feature_extractor", - ) -> np.ndarray: - vec = np.zeros(self.ndim(feature_extractor)) - for token in tokens: - vec += feature_extractor.get_feature_vector(token.text) - - if tokens: - vec = vec / len(tokens) - - return np.expand_dims(vec, axis=0) - def features_for_tokens( self, tokens: List[Token], @@ -139,18 +114,22 @@ def features_for_tokens( ) -> np.ndarray: cls_token_used = tokens[-1].text == CLS_TOKEN if tokens else False - input_tokens = tokens + tokens_without_cls = tokens if cls_token_used: - input_tokens = tokens[:-1] + tokens_without_cls = tokens[:-1] - if self.return_sequence: - features = self._features_as_sequence(input_tokens, feature_extractor) - else: - features = self._features_as_non_sequence(input_tokens, feature_extractor) + # calculate features + features = [] + for token in tokens_without_cls: + features.append(feature_extractor.get_feature_vector(token.text)) + features = np.array(features) if cls_token_used and self.return_sequence: # cls token is used, need to append a vector cls_token_vec = np.mean(features, axis=0, keepdims=True) features = np.concatenate([features, cls_token_vec]) + if not self.return_sequence: + features = np.mean(features, axis=0, keepdims=True) + return features diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 772c75a13d6a..7a5228996848 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -77,14 +77,14 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): cls_token_used = tokens[-1].text == CLS_TOKEN if tokens else False if message_attribute_doc is not None: - fs = self._features_for_doc(message_attribute_doc) + features = 
self._features_for_doc(message_attribute_doc) if cls_token_used and self.return_sequence: # cls token is used, need to append a vector - cls_token_vec = np.mean(fs, axis=0, keepdims=True) - fs = np.concatenate([fs, cls_token_vec]) + cls_token_vec = np.mean(features, axis=0, keepdims=True) + features = np.concatenate([features, cls_token_vec]) features = self._combine_with_existing_dense_features( - message, fs, MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] + message, features, MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] ) message.set(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], features) From afec4a914e3bb16e871591c50540d66f35b77d0a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 9 Dec 2019 15:40:21 +0100 Subject: [PATCH 206/239] add comment to count vectors about input to vectorizer --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 73077e854610..8665ef79b91e 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -404,6 +404,10 @@ def _create_sequence( X = [] for i, tokens in enumerate(all_tokens): + # vectorizer.transform returns a sparse matrix of size + # [n_samples, n_features] + # set input to list of tokens if sequence should be returned + # otherwise join all tokens to a single string and pass that as a list input = tokens if not self.return_sequence: input = [" ".join(tokens)] From f6507ca27fba624178ccbe2a26419a16b1f17ba2 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 9 Dec 2019 16:03:39 +0100 Subject: [PATCH 207/239] throw error is return seq is true for convert featurizer --- .../dense_featurizer/convert_featurizer.py | 17 +++++++++++++++++ rasa/nlu/featurizers/featurzier.py | 13 ++++++++++++- 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index dda5aeb0a162..c70b35722c8f 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -21,6 +21,13 @@ class ConveRTFeaturizer(Featurizer): for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + defaults = { + # if True return a sequence of features (return vector has size + # token-size x feature-dimension) + # if False token-size will be equal to 1 + "return_sequence": False + } + def _load_model(self) -> None: # needed in order to load model @@ -45,6 +52,16 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: self._load_model() + self.return_sequence = self.component_config["return_sequence"] + + if self.return_sequence: + raise NotImplementedError( + f"ConveRTFeaturizer always returns a feature vector of size " + f"(1 x feature-dimensions). ConveRTFeaturizer cannot return a " + f"proper sequence right now. ConveRTFeaturizer can only be used" + f"with 'return_sequence' set to False." + ) + logger.debug( f"ConveRTFeaturizer always returns a feature vector of size " f"(1 x feature-dimensions). 
If you use any other featurizer with " diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index cd72499c84bf..11588f707657 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -1,6 +1,6 @@ import numpy as np import scipy.sparse -from typing import Any, Text, List, Union, Optional +from typing import Any, Text, List, Union, Optional, Dict from rasa.nlu.training_data import Message from rasa.nlu.components import Component from rasa.nlu.constants import ( @@ -23,6 +23,17 @@ def sequence_to_sentence_features( class Featurizer(Component): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: + super(Featurizer, self).__init__(component_config) + + try: + self.return_sequence = self.component_config["return_sequence"] + except KeyError: + raise KeyError( + "No default value for 'return_sequence' was set. Please, " + "add it to the default dict of the featurizer." + ) + @staticmethod def _combine_with_existing_dense_features( message: Message, From de9a5edbefe85451249ea00c15a64fc704f12943 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 9 Dec 2019 16:45:46 +0100 Subject: [PATCH 208/239] update warnings --- rasa/nlu/featurizers/featurzier.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index 11588f707657..cbc77e715993 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -46,8 +46,8 @@ def _combine_with_existing_dense_features( raise ValueError( f"Cannot concatenate dense features as sequence dimension does not " f"match: {len(message.get(feature_name))} != " - f"{len(additional_features)}." - f"Make sure to set 'return_sequence' to the same value for all your" + f"{len(additional_features)}. " + f"Make sure to set 'return_sequence' to the same value for all your " f"featurizers." ) @@ -72,8 +72,8 @@ def _combine_with_existing_sparse_features( raise ValueError( f"Cannot concatenate sparse features as sequence dimension does not " f"match: {message.get(feature_name).shape[0]} != " - f"{additional_features.shape[0]}." - f"Make sure to set 'return_sequence' to the same value for all your" + f"{additional_features.shape[0]}. " + f"Make sure to set 'return_sequence' to the same value for all your " f"featurizers." ) return hstack([message.get(feature_name), additional_features]) From c01673c7b7e7fc3a64a78e7921443ce1da6fb623 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 9 Dec 2019 16:51:36 +0100 Subject: [PATCH 209/239] update warning --- .../featurizers/dense_featurizer/convert_featurizer.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index c70b35722c8f..2b08006189c0 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -57,9 +57,11 @@ def __init__(self, component_config: Dict[Text, Any] = None) -> None: if self.return_sequence: raise NotImplementedError( f"ConveRTFeaturizer always returns a feature vector of size " - f"(1 x feature-dimensions). ConveRTFeaturizer cannot return a " - f"proper sequence right now. ConveRTFeaturizer can only be used" - f"with 'return_sequence' set to False." + f"(1 x feature-dimensions). It cannot return a proper sequence " + f"right now. 
ConveRTFeaturizer can only be used " + f"with 'return_sequence' set to False. Also, any other featurizer " + f"used next to ConveRTFeaturizer should have the flag " + f"'return_sequence' set to False." ) logger.debug( From 3ae7626b4734dd2c65e7e4ce3a03dd40e3118cca Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 9 Dec 2019 17:33:02 +0100 Subject: [PATCH 210/239] fix tests --- tests/nlu/featurizers/test_featurizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index 0d8939cbe288..e1ce8bc5297b 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -13,7 +13,7 @@ def test_combine_with_existing_dense_features(): - featurizer = Featurizer() + featurizer = Featurizer({"return_sequence": False}) attribute = MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] @@ -31,7 +31,7 @@ def test_combine_with_existing_dense_features(): def test_combine_with_existing_dense_features_shape_mismatch(): - featurizer = Featurizer() + featurizer = Featurizer({"return_sequence": False}) attribute = MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] @@ -48,7 +48,7 @@ def test_combine_with_existing_dense_features_shape_mismatch(): def test_combine_with_existing_sparse_features(): - featurizer = Featurizer() + featurizer = Featurizer({"return_sequence": False}) attribute = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) @@ -68,7 +68,7 @@ def test_combine_with_existing_sparse_features(): def test_combine_with_existing_sparse_features_shape_mismatch(): - featurizer = Featurizer() + featurizer = Featurizer({"return_sequence": False}) attribute = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) From fcf0474c412c6ac18a6c63fec59373f277eba3be Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 10 Dec 2019 11:14:10 +0100 Subject: [PATCH 211/239] remove default values from example configs --- examples/formbot/config.yml | 1 - examples/restaurantbot/config.yml | 1 - rasa/core/test.py | 1 - 3 files changed, 3 deletions(-) diff --git a/examples/formbot/config.yml b/examples/formbot/config.yml index 3cf3f4a14fe5..3aa0e7577759 100644 --- a/examples/formbot/config.yml +++ b/examples/formbot/config.yml @@ -2,7 +2,6 @@ language: en pipeline: - name: WhitespaceTokenizer - use_cls_token: False - name: CRFEntityExtractor - name: EntitySynonymMapper - name: CountVectorsFeaturizer diff --git a/examples/restaurantbot/config.yml b/examples/restaurantbot/config.yml index 52eb0709829e..b06666b8f0dd 100644 --- a/examples/restaurantbot/config.yml +++ b/examples/restaurantbot/config.yml @@ -3,7 +3,6 @@ language: en pipeline: - name: "SpacyNLP" - name: "SpacyTokenizer" - use_cls_token: False - name: "SpacyFeaturizer" - name: "SklearnIntentClassifier" - name: "CRFEntityExtractor" diff --git a/rasa/core/test.py b/rasa/core/test.py index 7ae42d9d2d82..39e53cddb042 100644 --- a/rasa/core/test.py +++ b/rasa/core/test.py @@ -25,7 +25,6 @@ except ImportError: matplotlib.use("agg") - logger = logging.getLogger(__name__) StoryEvalution = namedtuple( From 79e0cebad860ad39d994e0104aeec17584c70a08 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 10 Dec 2019 13:56:39 +0100 Subject: [PATCH 212/239] fix 
import --- rasa/nlu/extractors/mitie_entity_extractor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py index 10fa4e88d454..2a60c7139888 100644 --- a/rasa/nlu/extractors/mitie_entity_extractor.py +++ b/rasa/nlu/extractors/mitie_entity_extractor.py @@ -12,7 +12,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData logger = logging.getLogger(__name__) From e47176a249e5c8f013a1a0b665e5946e92a7b3d4 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 10 Dec 2019 14:26:26 +0100 Subject: [PATCH 213/239] update documentatioon --- docs/nlu/components.rst | 17 ++++++++++++----- docs/nlu/entity-extraction.rst | 6 +++--- rasa/nlu/extractors/mitie_entity_extractor.py | 2 +- 3 files changed, 16 insertions(+), 9 deletions(-) diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 1f4417592796..e6ebd1ae552e 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -88,14 +88,15 @@ Featurizers Featurizers are divided into two different categories: sparse featurizers and dense featurizers. Sparse featurizers are featurizers that return feature vectors with a lot of missing values, e.g. zeros. As those feature vectors would normally take up a lot of memory, we store them as sparse features. -Sparse features only store the values that are not zero and their positions in the vector. +Sparse features only store the values that are non zero and their positions in the vector. Thus, we save a lot of memroy and are able to train on larger dataset. By default all featurizers will return a matrix of length (1 x feature-dimension). All featurizer (except the ``ConveRTFeaturizer``) have the option to return a sequence instead. In case the flag ``"return_sequence"`` is set to ``True`` the returned matrix of a featurizer will have the size -(token-length x feature-dimenstion). -So, the returned vector will have an entry for every token. +(token-length x feature-dimension). +So, the returned matrix will have an entry for every token. +Otherwise, the matrix will just have on entry for the complete utterance. MitieFeaturizer @@ -163,6 +164,12 @@ ConveRTFeaturizer To use ``ConveRTFeaturizer`` you should install ``tensorflow_text==1.15.1`` and ``tensorflow_hub==0.6.0``. Otherwise, you can also do a pip install of Rasa with ``pip install rasa[convert]`` + .. warning:: + If you set the option ``"return_sequence"`` to ``True``, Rasa will raise an error informing you that this + option is currently not supported. Do not use this featurizer in combination with any other featurizer that + has the option ``"return_sequence"`` set to ``True`` as training will fail. However, you can use this + featurizer with any other featurizer as long as ``"return_sequence"`` is set to ``False`` for all of them. + :Configuration: .. code-block:: yaml @@ -192,13 +199,13 @@ RegexFeaturizer CountVectorsFeaturizer ~~~~~~~~~~~~~~~~~~~~~~ -:Short: Creates bag-of-words representation of user message and label(intent and response) features +:Short: Creates bag-of-words representation of user message and label (intent and response) features :Outputs: nothing, used as an input to intent classifiers that need bag-of-words representation of intent features (e.g. 
``EmbeddingIntentClassifier``) :Requires: nothing -:Type: Dense featurizer +:Type: Sparse featurizer :Description: Creates bag-of-words representation of user message and label features using `sklearn's CountVectorizer `_. diff --git a/docs/nlu/entity-extraction.rst b/docs/nlu/entity-extraction.rst index 2110bf016990..c76305d5e0e1 100644 --- a/docs/nlu/entity-extraction.rst +++ b/docs/nlu/entity-extraction.rst @@ -155,11 +155,11 @@ Passing Custom Features to ``CRFEntityExtractor`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ If you want to pass custom features, such as pre-trained word embeddings, to ``CRFEntityExtractor``, you can -add any dense featurizer to the pipeline before the ``CRFEntityExtractor``. +add any dense featurizer (except ``ConveRTFeaturizer``) to the pipeline before the ``CRFEntityExtractor``. Make sure to set ``"return_sequence"`` to ``True`` for the corresponding dense featurizer. ``CRFEntityExtractor`` automatically finds the additional dense features and checks if the dense features are an iterable of ``len(tokens)``, where each entry is a vector. A warning will be shown in case the check fails. However, ``CRFEntityExtractor`` will continue to train just without the additional custom features. -In case dense features are present ``CRFEntityExtractor`` will pass the dense feature vectors to ``sklearn_crfsuite`` -and used for training. +In case dense features are present, ``CRFEntityExtractor`` will pass the dense features to ``sklearn_crfsuite`` +and use them for training. diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py index 10fa4e88d454..2a60c7139888 100644 --- a/rasa/nlu/extractors/mitie_entity_extractor.py +++ b/rasa/nlu/extractors/mitie_entity_extractor.py @@ -12,7 +12,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata -from rasa.nlu.tokenizers import Token +from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData logger = logging.getLogger(__name__) From e3ed14fa6497e7f678267fe545ca600bd072f660 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 10 Dec 2019 14:33:40 +0100 Subject: [PATCH 214/239] fix links --- docs/core/policies.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/core/policies.rst b/docs/core/policies.rst index 1934deb67a66..6285df3d2ea8 100644 --- a/docs/core/policies.rst +++ b/docs/core/policies.rst @@ -70,7 +70,7 @@ in the policy configuration yaml file. Only the ``MaxHistoryTrackerFeaturizer`` uses a max history, whereas the ``FullDialogueTrackerFeaturizer`` always looks at - the full conversation history. See :ref:`featurization` for details. + the full conversation history. See :ref:`featurization_conversations` for details. As an example, let's say you have an ``out_of_scope`` intent which describes off-topic user messages. If your bot sees this intent multiple @@ -218,7 +218,7 @@ following steps: It is recommended to use ``state_featurizer=LabelTokenizerSingleStateFeaturizer(...)`` -(see :ref:`featurization` for details). +(see :ref:`featurization_conversations` for details). **Configuration:** @@ -308,7 +308,7 @@ It is recommended to use Default ``max_history`` for this policy is ``None`` which means it'll use the ``FullDialogueTrackerFeaturizer``. We recommend to set ``max_history`` to some finite value in order to use ``MaxHistoryTrackerFeaturizer`` - for **faster training**. 
See :ref:`featurization` for details. + for **faster training**. See :ref:`featurization_conversations` for details. We recommend to increase ``batch_size`` for ``MaxHistoryTrackerFeaturizer`` (e.g. ``"batch_size": [32, 64]``) From 225f1e48aeb278e5c55b1e2eabc89bf2cdd293e9 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Tue, 10 Dec 2019 15:42:28 +0100 Subject: [PATCH 215/239] reduce complexity --- changelog/4935.feature.rst | 15 ++++++++ .../dense_featurizer/mitie_featurizer.py | 34 ++++++++++--------- rasa/utils/train_utils.py | 13 ++++--- 3 files changed, 41 insertions(+), 21 deletions(-) create mode 100644 changelog/4935.feature.rst diff --git a/changelog/4935.feature.rst b/changelog/4935.feature.rst new file mode 100644 index 000000000000..662c2b74e44d --- /dev/null +++ b/changelog/4935.feature.rst @@ -0,0 +1,15 @@ +Add option ``use_cls_token`` to all tokenizers. If it is set to ``True``, the token ``__CLS__`` will be added to +the end of the list of tokens. + +Add option ``return_sequence`` to all featurizers. By default all featurizers return a matrix of size +(1 x feature-dimension). If the option ``return_sequence`` is set to ``True``, the corresponding featurizer will return +a matrix of size (token-length x feature-dimension). + +Split featurizers into sparse and dense featurizers. + +Deprecate ``NGramFeaturizer``. Please use ``CountVectorsFeaturizer`` instead. + +To use custom features in the ``CRFEntityExtractor`` use ``text_dense_features`` instead of ``ner_features``. If +``text_dense_features`` are present in the feature set, the ``CRFEntityExtractor`` will automatically make use of +them. Just make sure to add a dense featurizer in front of the ``CRFEntityExtractor`` in your pipeline and set the +flag ``return_sequence`` to ``True`` for that featurizer. 
\ No newline at end of file diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 09c18c27e2ff..0f185734ade6 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -61,23 +61,25 @@ def train( mitie_feature_extractor = self._mitie_feature_extractor(**kwargs) for example in training_data.intent_examples: - for attribute in MESSAGE_ATTRIBUTES: - - attribute_tokens = self.get_tokens_by_attribute(example, attribute) - if attribute_tokens is not None: - - features = self.features_for_tokens( - attribute_tokens, mitie_feature_extractor - ) - example.set( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], - self._combine_with_existing_dense_features( - example, - features, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], - ), - ) + self.process_training_example( + example, attribute, mitie_feature_extractor + ) + + def process_training_example( + self, example: Message, attribute: Text, mitie_feature_extractor: Any + ): + attribute_tokens = self.get_tokens_by_attribute(example, attribute) + if attribute_tokens is not None: + features = self.features_for_tokens( + attribute_tokens, mitie_feature_extractor + ) + example.set( + MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], + self._combine_with_existing_dense_features( + example, features, MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] + ), + ) def process(self, message: Message, **kwargs: Any) -> None: diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index f621f9fb324a..e939b7433595 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -48,14 +48,17 @@ def create_label_ids(label_ids: "np.ndarray") -> "np.ndarray": if label_ids.ndim == 1: return label_ids - elif label_ids.ndim == 2 and label_ids.shape[-1] == 1: + + if label_ids.ndim == 2 and label_ids.shape[-1] == 1: return label_ids[:, 0] - elif label_ids.ndim == 2: + + if label_ids.ndim == 2: return np.array([" ".join(row.astype("str")) for row in label_ids]) - elif label_ids.ndim == 3 and label_ids.shape[-1] == 1: + + if label_ids.ndim == 3 and label_ids.shape[-1] == 1: return np.array([" ".join(row.astype("str")) for row in label_ids[:, :, 0]]) - else: - raise ValueError("Unsupported label_ids dimensions") + + raise ValueError("Unsupported label_ids dimensions") # noinspection PyPep8Naming From d434f04a280ffbf9ed9b144aa8cdb57fece4480c Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 11 Dec 2019 08:20:04 +0100 Subject: [PATCH 216/239] update featurization link --- docs/migration-guide.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/migration-guide.rst b/docs/migration-guide.rst index fe933e449a30..a8776048f36c 100644 --- a/docs/migration-guide.rst +++ b/docs/migration-guide.rst @@ -37,7 +37,7 @@ General - Default ``max_history`` for ``EmbeddingPolicy`` is ``None`` which means it'll use the ``FullDialogueTrackerFeaturizer``. We recommend to set ``max_history`` to some finite value in order to use ``MaxHistoryTrackerFeaturizer`` - for **faster training**. See :ref:`featurization` for details. + for **faster training**. See :ref:`featurization_conversations` for details. We recommend to increase ``batch_size`` for ``MaxHistoryTrackerFeaturizer`` (e.g. ``"batch_size": [32, 64]``) - **Compare** mode of ``rasa train core`` allows the whole core config comparison. 
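The net behaviour that patches 203-215 converge on for the dense featurizers is easiest to see outside the diff context. The sketch below is illustrative only: it mirrors the logic of ``MitieFeaturizer.features_for_tokens`` and the spaCy featurizer after the "review comments" refactor, but ``featurize_tokens``, ``vector_for`` and the toy embedding table are invented names for the example and are not part of the Rasa code base.

.. code-block:: python

    import numpy as np
    from typing import Callable, List

    CLS_TOKEN = "__CLS__"

    def featurize_tokens(
        tokens: List[str],
        vector_for: Callable[[str], np.ndarray],
        return_sequence: bool,
    ) -> np.ndarray:
        # The tokenizer may have appended __CLS__; strip it before the word
        # vector lookup, since it has no pre-trained embedding of its own.
        cls_token_used = bool(tokens) and tokens[-1] == CLS_TOKEN
        tokens_without_cls = tokens[:-1] if cls_token_used else tokens

        # one vector per real token -> shape (token-length x feature-dimension)
        features = np.array([vector_for(t) for t in tokens_without_cls])

        if cls_token_used and return_sequence:
            # the __CLS__ position gets the mean of all token vectors appended
            cls_vector = np.mean(features, axis=0, keepdims=True)
            features = np.concatenate([features, cls_vector])

        if not return_sequence:
            # collapse to a single utterance vector -> shape (1 x feature-dimension)
            features = np.mean(features, axis=0, keepdims=True)

        return features

    # toy 3-dimensional embeddings, purely for demonstration
    embeddings = {"hey": np.array([1.0, 0.0, 0.0]), "there": np.array([0.0, 1.0, 0.0])}

    def lookup(token: str) -> np.ndarray:
        return embeddings.get(token, np.zeros(3))

    print(featurize_tokens(["hey", "there", CLS_TOKEN], lookup, True).shape)   # (3, 3)
    print(featurize_tokens(["hey", "there", CLS_TOKEN], lookup, False).shape)  # (1, 3)

Using the mean of the token vectors for the ``__CLS__`` position (instead of the zero vector it started out as) gives downstream classifiers an utterance-level summary at that position, which is the point of the spacy_featurizer and mitie_featurizer changes above.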
From 8fdb9cfb0c94100b75c9b76bd4962adf7a9432c1 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 11 Dec 2019 13:17:50 +0100 Subject: [PATCH 217/239] review comment --- docs/nlu/components.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index e6ebd1ae552e..60e392adf1c0 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -82,8 +82,8 @@ SpacyNLP # between these two words, therefore setting this to `true`. case_sensitive: false -Featurizers ------------ +Text Featurizers +---------------- Featurizers are divided into two different categories: sparse featurizers and dense featurizers. Sparse featurizers are featurizers that return feature vectors with a lot of missing values, e.g. zeros. From 4c631e6f9dc8f4e84b18619eea84073df3e2ea81 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 11 Dec 2019 13:51:28 +0100 Subject: [PATCH 218/239] remove MESSAGE_ from nlu constants --- rasa/core/actions/action.py | 4 +- .../embedding_intent_classifier.py | 40 ++++++-------- .../classifiers/mitie_intent_classifier.py | 6 +-- .../classifiers/sklearn_intent_classifier.py | 14 ++--- rasa/nlu/components.py | 4 +- rasa/nlu/constants.py | 47 +++++++---------- rasa/nlu/extractors/crf_entity_extractor.py | 52 ++++++++----------- .../nlu/extractors/duckling_http_extractor.py | 8 +-- rasa/nlu/extractors/entity_synonyms.py | 10 ++-- rasa/nlu/extractors/mitie_entity_extractor.py | 20 +++---- rasa/nlu/extractors/spacy_entity_extractor.py | 8 +-- .../dense_featurizer/convert_featurizer.py | 17 +++--- .../dense_featurizer/mitie_featurizer.py | 22 +++----- .../dense_featurizer/spacy_featurizer.py | 20 ++++--- rasa/nlu/featurizers/featurzier.py | 12 ++--- .../count_vectors_featurizer.py | 33 +++++------- .../sparse_featurizer/regex_featurizer.py | 20 ++++--- .../selectors/embedding_response_selector.py | 20 +++---- rasa/nlu/test.py | 4 +- rasa/nlu/tokenizers/jieba_tokenizer.py | 14 +++-- rasa/nlu/tokenizers/mitie_tokenizer.py | 14 ++--- rasa/nlu/tokenizers/spacy_tokenizer.py | 21 +++----- rasa/nlu/tokenizers/tokenizer.py | 10 ++-- rasa/nlu/tokenizers/whitespace_tokenizer.py | 14 ++--- rasa/nlu/training_data/formats/markdown.py | 6 +-- rasa/nlu/training_data/formats/rasa.py | 6 +-- rasa/nlu/training_data/message.py | 30 +++++------ rasa/nlu/training_data/training_data.py | 15 ++---- rasa/nlu/utils/spacy_utils.py | 18 ++----- .../test_embedding_intent_classifier.py | 34 ++++-------- .../featurizers/test_convert_featurizer.py | 10 ++-- tests/nlu/featurizers/test_featurizer.py | 14 ++--- .../nlu/featurizers/test_regex_featurizer.py | 18 +++---- 33 files changed, 231 insertions(+), 354 deletions(-) diff --git a/rasa/core/actions/action.py b/rasa/core/actions/action.py index cfac96eff227..3e00ff24b73b 100644 --- a/rasa/core/actions/action.py +++ b/rasa/core/actions/action.py @@ -19,7 +19,7 @@ from rasa.nlu.constants import ( DEFAULT_OPEN_UTTERANCE_TYPE, OPEN_UTTERANCE_PREDICTION_KEY, - MESSAGE_SELECTOR_PROPERTY_NAME, + RESPONSE_SELECTOR_PROPERTY_NAME, ) from rasa.core.events import ( @@ -197,7 +197,7 @@ async def run( """Query the appropriate response and create a bot utterance with that.""" response_selector_properties = tracker.latest_message.parse_data[ - MESSAGE_SELECTOR_PROPERTY_NAME + RESPONSE_SELECTOR_PROPERTY_NAME ] if self.intent_name_from_action() in response_selector_properties: diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 
594d05e95f3e..5ae2c2e194ca 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -13,10 +13,10 @@ from rasa.utils import train_utils from rasa.utils.train_utils import SessionDataType from rasa.nlu.constants import ( - MESSAGE_INTENT_ATTRIBUTE, - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + INTENT_ATTRIBUTE, + TEXT_ATTRIBUTE, + SPARSE_FEATURE_NAMES, + DENSE_FEATURE_NAMES, ) import tensorflow as tf @@ -278,10 +278,8 @@ def _check_labels_features_exist( for label_example in labels_example: if ( - label_example.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) - is None - and label_example.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) - is None + label_example.get(SPARSE_FEATURE_NAMES[attribute]) is None + and label_example.get(DENSE_FEATURE_NAMES[attribute]) is None ): return False return True @@ -293,13 +291,11 @@ def _extract_and_add_features( sparse_features = None dense_features = None - if message.get(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute]) is not None: - sparse_features = message.get( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute] - ) + if message.get(SPARSE_FEATURE_NAMES[attribute]) is not None: + sparse_features = message.get(SPARSE_FEATURE_NAMES[attribute]) - if message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) is not None: - dense_features = message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute]) + if message.get(DENSE_FEATURE_NAMES[attribute]) is not None: + dense_features = message.get(DENSE_FEATURE_NAMES[attribute]) if sparse_features is not None and dense_features is not None: if sparse_features.shape[0] != dense_features.shape[0]: @@ -319,9 +315,7 @@ def _extract_labels_precomputed_features( dense_features = [] for e in label_examples: - _sparse, _dense = self._extract_and_add_features( - e, MESSAGE_INTENT_ATTRIBUTE - ) + _sparse, _dense = self._extract_and_add_features(e, INTENT_ATTRIBUTE) if _sparse is not None: sparse_features.append(_sparse) if _dense is not None: @@ -413,15 +407,13 @@ def _create_session_data( label_ids = [] for e in training_data: - _sparse, _dense = self._extract_and_add_features(e, MESSAGE_TEXT_ATTRIBUTE) + _sparse, _dense = self._extract_and_add_features(e, TEXT_ATTRIBUTE) if _sparse is not None: X_sparse.append(_sparse) if _dense is not None: X_dense.append(_dense) - _sparse, _dense = self._extract_and_add_features( - e, MESSAGE_INTENT_ATTRIBUTE - ) + _sparse, _dense = self._extract_and_add_features(e, INTENT_ATTRIBUTE) if _sparse is not None: Y_sparse.append(_sparse) if _dense is not None: @@ -675,19 +667,19 @@ def preprocess_train_data(self, training_data: "TrainingData"): """ label_id_dict = self._create_label_id_dict( - training_data, attribute=MESSAGE_INTENT_ATTRIBUTE + training_data, attribute=INTENT_ATTRIBUTE ) self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} self._label_data = self._create_label_data( - training_data, label_id_dict, attribute=MESSAGE_INTENT_ATTRIBUTE + training_data, label_id_dict, attribute=INTENT_ATTRIBUTE ) session_data = self._create_session_data( training_data.intent_examples, label_id_dict, - label_attribute=MESSAGE_INTENT_ATTRIBUTE, + label_attribute=INTENT_ATTRIBUTE, ) self.check_input_dimension_consistency(session_data) diff --git a/rasa/nlu/classifiers/mitie_intent_classifier.py b/rasa/nlu/classifiers/mitie_intent_classifier.py index 656ef8a61687..4438e50e2c35 100644 --- a/rasa/nlu/classifiers/mitie_intent_classifier.py +++ 
b/rasa/nlu/classifiers/mitie_intent_classifier.py @@ -5,7 +5,7 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Metadata -from rasa.nlu.constants import MESSAGE_TOKENS_NAMES, MESSAGE_TEXT_ATTRIBUTE +from rasa.nlu.constants import MESSAGE_TOKENS_NAMES, TEXT_ATTRIBUTE from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: @@ -17,7 +17,7 @@ class MitieIntentClassifier(Component): provides = ["intent"] requires = [ - MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], + MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], "mitie_feature_extractor", "mitie_file", ] @@ -85,7 +85,7 @@ def process(self, message: Message, **kwargs: Any) -> None: def _tokens_of_message(message) -> List[Text]: return [ token.text - for token in message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) + for token in message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], []) ] @classmethod diff --git a/rasa/nlu/classifiers/sklearn_intent_classifier.py b/rasa/nlu/classifiers/sklearn_intent_classifier.py index 23f13f1ea91c..bfdadd90c21b 100644 --- a/rasa/nlu/classifiers/sklearn_intent_classifier.py +++ b/rasa/nlu/classifiers/sklearn_intent_classifier.py @@ -12,11 +12,7 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import ( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - MESSAGE_TEXT_ATTRIBUTE, -) +from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT_ATTRIBUTE logger = logging.getLogger(__name__) @@ -29,7 +25,7 @@ class SklearnIntentClassifier(Component): provides = ["intent", "intent_ranking"] - requires = [MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]] defaults = { # C parameter of the svm - cross validation will select the best value @@ -102,9 +98,7 @@ def train( X = np.stack( [ sequence_to_sentence_features( - example.get( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) + example.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) ) for example in training_data.intent_examples ] @@ -156,7 +150,7 @@ def process(self, message: Message, **kwargs: Any) -> None: intent_ranking = [] else: X = sequence_to_sentence_features( - message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) ).reshape(1, -1) intent_ids, probabilities = self.predict(X) intents = self.transform_labels_num2str(np.ravel(intent_ids)) diff --git a/rasa/nlu/components.py b/rasa/nlu/components.py index 4b261a52e46e..d5c3786887b2 100644 --- a/rasa/nlu/components.py +++ b/rasa/nlu/components.py @@ -5,7 +5,7 @@ from rasa.nlu.config import RasaNLUModelConfig, override_defaults from rasa.nlu.training_data import TrainingData, Message -from rasa.nlu.constants import MESSAGE_RESPONSE_ATTRIBUTE +from rasa.nlu.constants import RESPONSE_ATTRIBUTE if typing.TYPE_CHECKING: from rasa.nlu.model import Metadata @@ -90,7 +90,7 @@ def validate_required_components_from_data( response_selector_exists = False for component in pipeline: # check if a response selector is part of NLU pipeline - if MESSAGE_RESPONSE_ATTRIBUTE in component.provides: + if RESPONSE_ATTRIBUTE in component.provides: response_selector_exists = True if len(data.response_examples) and not response_selector_exists: diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index 08cc2925a827..c821df509a9e 100644 --- 
a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -1,47 +1,40 @@ -MESSAGE_TEXT_ATTRIBUTE = "text" +TEXT_ATTRIBUTE = "text" -MESSAGE_RESPONSE_KEY_ATTRIBUTE = "response_key" +RESPONSE_KEY_ATTRIBUTE = "response_key" -MESSAGE_INTENT_ATTRIBUTE = "intent" +INTENT_ATTRIBUTE = "intent" -MESSAGE_RESPONSE_ATTRIBUTE = "response" +RESPONSE_ATTRIBUTE = "response" -MESSAGE_ENTITIES_ATTRIBUTE = "entities" +ENTITIES_ATTRIBUTE = "entities" CLS_TOKEN = "__CLS__" -MESSAGE_ATTRIBUTES = [ - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, - MESSAGE_RESPONSE_ATTRIBUTE, -] +MESSAGE_ATTRIBUTES = [TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, RESPONSE_ATTRIBUTE] MESSAGE_TOKENS_NAMES = { - MESSAGE_TEXT_ATTRIBUTE: "tokens", - MESSAGE_INTENT_ATTRIBUTE: "intent_tokens", - MESSAGE_RESPONSE_ATTRIBUTE: "response_tokens", + TEXT_ATTRIBUTE: "tokens", + INTENT_ATTRIBUTE: "intent_tokens", + RESPONSE_ATTRIBUTE: "response_tokens", } -MESSAGE_VECTOR_SPARSE_FEATURE_NAMES = { - MESSAGE_TEXT_ATTRIBUTE: "text_sparse_features", - MESSAGE_INTENT_ATTRIBUTE: "intent_sparse_features", - MESSAGE_RESPONSE_ATTRIBUTE: "response_sparse_features", +SPARSE_FEATURE_NAMES = { + TEXT_ATTRIBUTE: "text_sparse_features", + INTENT_ATTRIBUTE: "intent_sparse_features", + RESPONSE_ATTRIBUTE: "response_sparse_features", } -MESSAGE_VECTOR_DENSE_FEATURE_NAMES = { - MESSAGE_TEXT_ATTRIBUTE: "text_dense_features", - MESSAGE_INTENT_ATTRIBUTE: "intent_dense_features", - MESSAGE_RESPONSE_ATTRIBUTE: "response_dense_features", +DENSE_FEATURE_NAMES = { + TEXT_ATTRIBUTE: "text_dense_features", + INTENT_ATTRIBUTE: "intent_dense_features", + RESPONSE_ATTRIBUTE: "response_dense_features", } -MESSAGE_SPACY_FEATURES_NAMES = { - MESSAGE_TEXT_ATTRIBUTE: "spacy_doc", - MESSAGE_RESPONSE_ATTRIBUTE: "response_spacy_doc", -} +SPACY_DOCS = {TEXT_ATTRIBUTE: "spacy_doc", RESPONSE_ATTRIBUTE: "response_spacy_doc"} -SPACY_FEATURIZABLE_ATTRIBUTES = [MESSAGE_TEXT_ATTRIBUTE, MESSAGE_RESPONSE_ATTRIBUTE] +SPACY_FEATURIZABLE_ATTRIBUTES = [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE] -MESSAGE_SELECTOR_PROPERTY_NAME = "response_selector" +RESPONSE_SELECTOR_PROPERTY_NAME = "response_selector" DEFAULT_OPEN_UTTERANCE_TYPE = "default" OPEN_UTTERANCE_PREDICTION_KEY = "response" OPEN_UTTERANCE_RANKING_KEY = "ranking" diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 7b700da390a5..101d3bde4c69 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -12,10 +12,10 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( MESSAGE_TOKENS_NAMES, - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, - MESSAGE_SPACY_FEATURES_NAMES, - MESSAGE_ENTITIES_ATTRIBUTE, + TEXT_ATTRIBUTE, + DENSE_FEATURE_NAMES, + SPACY_DOCS, + ENTITIES_ATTRIBUTE, ) from rasa.constants import DOCS_BASE_URL @@ -41,9 +41,9 @@ class CRFToken(NamedTuple): class CRFEntityExtractor(EntityExtractor): - provides = [MESSAGE_ENTITIES_ATTRIBUTE] + provides = [ENTITIES_ATTRIBUTE] - requires = [MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]] defaults = { # BILOU_flag determines whether to use BILOU tagging or not. 
@@ -173,11 +173,7 @@ def _create_dataset(self, examples: List[Message]) -> List[List[CRFToken]]: return dataset def _check_spacy_doc(self, message) -> None: - if ( - self.pos_features - and message.get(MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE]) - is None - ): + if self.pos_features and message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) is None: raise InvalidConfigError( "Could not find `spacy_doc` attribute for " "message {}\n" @@ -193,8 +189,8 @@ def process(self, message: Message, **kwargs: Any) -> None: extracted = self.add_extractor_name(self.extract_entities(message)) message.set( - MESSAGE_ENTITIES_ATTRIBUTE, - message.get(MESSAGE_ENTITIES_ATTRIBUTE, []) + extracted, + ENTITIES_ATTRIBUTE, + message.get(ENTITIES_ATTRIBUTE, []) + extracted, add_to_output=True, ) @@ -203,9 +199,7 @@ def _convert_example(example: Message) -> List[Tuple[int, int, Text]]: def convert_entity(entity): return entity["start"], entity["end"], entity["entity"] - return [ - convert_entity(ent) for ent in example.get(MESSAGE_ENTITIES_ATTRIBUTE, []) - ] + return [convert_entity(ent) for ent in example.get(ENTITIES_ATTRIBUTE, [])] def extract_entities(self, message: Message) -> List[Dict[Text, Any]]: """Take a sentence and return entities in json format""" @@ -340,9 +334,9 @@ def _from_crf_to_json( ) -> List[Dict[Text, Any]]: if self.pos_features: - tokens = message.get(MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) else: - tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + tokens = message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) if len(tokens) != len(entities): raise Exception( @@ -503,13 +497,11 @@ def _from_json_to_crf( if self.pos_features: from spacy.gold import GoldParse # pytype: disable=import-error - doc_or_tokens = message.get( - MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) + doc_or_tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) gold = GoldParse(doc_or_tokens, entities=entity_offsets) ents = [l[5] for l in gold.orig_annot] else: - doc_or_tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + doc_or_tokens = message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets) # collect badly annotated examples @@ -573,8 +565,8 @@ def _bilou_tags_from_offsets(tokens, entities, missing: Text = "O") -> List[Text @staticmethod def __pattern_of_token(message, i): - if message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) is not None: - return message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE])[i].get( + if message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) is not None: + return message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE])[i].get( "pattern", {} ) else: @@ -589,18 +581,16 @@ def __tag_of_token(token): @staticmethod def __get_dense_features(message: Message) -> Optional[List[Any]]: - features = message.get( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] - ) + features = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]) if features is None: return None - tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], []) + tokens = message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], []) if len(tokens) != len(features): warnings.warn( f"Number of features ({len(features)}) for attribute " - f"'{MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]}' " + f"'{DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]}' " f"does not match number of tokens ({len(tokens)}). 
Set " f"'return_sequence' to true in the corresponding featurizer in order " f"to make use of the features in 'CRFEntityExtractor'." @@ -625,9 +615,9 @@ def _from_text_to_crf( crf_format = [] if self.pos_features: - tokens = message.get(MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) else: - tokens = message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + tokens = message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) text_dense_features = self.__get_dense_features(message) diff --git a/rasa/nlu/extractors/duckling_http_extractor.py b/rasa/nlu/extractors/duckling_http_extractor.py index 74ac1cf997c9..e9c329acc159 100644 --- a/rasa/nlu/extractors/duckling_http_extractor.py +++ b/rasa/nlu/extractors/duckling_http_extractor.py @@ -6,7 +6,7 @@ import requests from typing import Any, List, Optional, Text, Dict -from rasa.nlu.constants import MESSAGE_ENTITIES_ATTRIBUTE +from rasa.nlu.constants import ENTITIES_ATTRIBUTE from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata @@ -52,7 +52,7 @@ def convert_duckling_format_to_rasa( class DucklingHTTPExtractor(EntityExtractor): """Searches for structured entites, e.g. dates, using a duckling server.""" - provides = [MESSAGE_ENTITIES_ATTRIBUTE] + provides = [ENTITIES_ATTRIBUTE] defaults = { # by default all dimensions recognized by duckling are returned @@ -187,8 +187,8 @@ def process(self, message: Message, **kwargs: Any) -> None: extracted = self.add_extractor_name(extracted) message.set( - MESSAGE_ENTITIES_ATTRIBUTE, - message.get(MESSAGE_ENTITIES_ATTRIBUTE, []) + extracted, + ENTITIES_ATTRIBUTE, + message.get(ENTITIES_ATTRIBUTE, []) + extracted, add_to_output=True, ) diff --git a/rasa/nlu/extractors/entity_synonyms.py b/rasa/nlu/extractors/entity_synonyms.py index b0e598737663..500027711621 100644 --- a/rasa/nlu/extractors/entity_synonyms.py +++ b/rasa/nlu/extractors/entity_synonyms.py @@ -2,7 +2,7 @@ import warnings from typing import Any, Dict, Optional, Text -from rasa.nlu.constants import MESSAGE_ENTITIES_ATTRIBUTE +from rasa.nlu.constants import ENTITIES_ATTRIBUTE from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata @@ -13,7 +13,7 @@ class EntitySynonymMapper(EntityExtractor): - provides = [MESSAGE_ENTITIES_ATTRIBUTE] + provides = [ENTITIES_ATTRIBUTE] def __init__( self, @@ -33,15 +33,15 @@ def train( self.add_entities_if_synonyms(key, value) for example in training_data.entity_examples: - for entity in example.get(MESSAGE_ENTITIES_ATTRIBUTE, []): + for entity in example.get(ENTITIES_ATTRIBUTE, []): entity_val = example.text[entity["start"] : entity["end"]] self.add_entities_if_synonyms(entity_val, str(entity.get("value"))) def process(self, message: Message, **kwargs: Any) -> None: - updated_entities = message.get(MESSAGE_ENTITIES_ATTRIBUTE, [])[:] + updated_entities = message.get(ENTITIES_ATTRIBUTE, [])[:] self.replace_synonyms(updated_entities) - message.set(MESSAGE_ENTITIES_ATTRIBUTE, updated_entities, add_to_output=True) + message.set(ENTITIES_ATTRIBUTE, updated_entities, add_to_output=True) def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]]: diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py index 2a60c7139888..4869d7748c8b 100644 --- a/rasa/nlu/extractors/mitie_entity_extractor.py +++ b/rasa/nlu/extractors/mitie_entity_extractor.py @@ -4,11 +4,7 
@@ import typing from typing import Any, Dict, List, Optional, Text -from rasa.nlu.constants import ( - MESSAGE_ENTITIES_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, - MESSAGE_TEXT_ATTRIBUTE, -) +from rasa.nlu.constants import ENTITIES_ATTRIBUTE, MESSAGE_TOKENS_NAMES, TEXT_ATTRIBUTE from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata @@ -23,10 +19,10 @@ class MitieEntityExtractor(EntityExtractor): - provides = [MESSAGE_ENTITIES_ATTRIBUTE] + provides = [ENTITIES_ATTRIBUTE] requires = [ - MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], + MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], "mitie_feature_extractor", "mitie_file", ] @@ -101,9 +97,9 @@ def _prepare_mitie_sample(self, training_example) -> Any: import mitie text = training_example.text - tokens = training_example.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]) + tokens = training_example.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) sample = mitie.ner_training_instance([t.text for t in tokens]) - for ent in training_example.get(MESSAGE_ENTITIES_ATTRIBUTE, []): + for ent in training_example.get(ENTITIES_ATTRIBUTE, []): try: # if the token is not aligned an exception will be raised start, end = MitieEntityExtractor.find_entity(ent, text, tokens) @@ -134,13 +130,13 @@ def process(self, message: Message, **kwargs: Any) -> None: ents = self.extract_entities( message.text, - message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]), + message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]), mitie_feature_extractor, ) extracted = self.add_extractor_name(ents) message.set( - MESSAGE_ENTITIES_ATTRIBUTE, - message.get(MESSAGE_ENTITIES_ATTRIBUTE, []) + extracted, + ENTITIES_ATTRIBUTE, + message.get(ENTITIES_ATTRIBUTE, []) + extracted, add_to_output=True, ) diff --git a/rasa/nlu/extractors/spacy_entity_extractor.py b/rasa/nlu/extractors/spacy_entity_extractor.py index 34f030f0bf3f..8c0bf9e79322 100644 --- a/rasa/nlu/extractors/spacy_entity_extractor.py +++ b/rasa/nlu/extractors/spacy_entity_extractor.py @@ -1,7 +1,7 @@ import typing from typing import Any, Dict, List, Text, Optional -from rasa.nlu.constants import MESSAGE_ENTITIES_ATTRIBUTE +from rasa.nlu.constants import ENTITIES_ATTRIBUTE from rasa.nlu.extractors import EntityExtractor from rasa.nlu.training_data import Message @@ -11,7 +11,7 @@ class SpacyEntityExtractor(EntityExtractor): - provides = [MESSAGE_ENTITIES_ATTRIBUTE] + provides = [ENTITIES_ATTRIBUTE] requires = ["spacy_nlp"] @@ -36,8 +36,8 @@ def process(self, message: Message, **kwargs: Any) -> None: all_extracted, dimensions ) message.set( - MESSAGE_ENTITIES_ATTRIBUTE, - message.get(MESSAGE_ENTITIES_ATTRIBUTE, []) + extracted, + ENTITIES_ATTRIBUTE, + message.get(ENTITIES_ATTRIBUTE, []) + extracted, add_to_output=True, ) diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 9a2ea3d928de..bfadb4455024 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -4,8 +4,8 @@ from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + TEXT_ATTRIBUTE, + DENSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, ) import numpy as np @@ -17,8 +17,7 @@ class ConveRTFeaturizer(Featurizer): provides = [ - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] - for attribute in 
SPACY_FEATURIZABLE_ATTRIBUTES + DENSE_FEATURE_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] defaults = { @@ -75,7 +74,7 @@ def required_packages(cls) -> List[Text]: return ["tensorflow_text", "tensorflow_hub"] def _compute_features( - self, batch_examples: List[Message], attribute: Text = MESSAGE_TEXT_ATTRIBUTE + self, batch_examples: List[Message], attribute: Text = TEXT_ATTRIBUTE ) -> np.ndarray: # Get text for attribute of each example @@ -122,11 +121,11 @@ def train( for index, ex in enumerate(batch_examples): ex.set( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], + DENSE_FEATURE_NAMES[attribute], self._combine_with_existing_dense_features( ex, np.expand_dims(batch_features[index], axis=0), - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], + DENSE_FEATURE_NAMES[attribute], ), ) @@ -136,10 +135,10 @@ def process(self, message: Message, **kwargs: Any) -> None: feats = self._compute_features([message])[0] message.set( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], self._combine_with_existing_dense_features( message, np.expand_dims(feats, axis=0), - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], ), ) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 0f185734ade6..b0fc219a4531 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -11,20 +11,17 @@ import mitie from rasa.nlu.constants import ( - MESSAGE_TEXT_ATTRIBUTE, + TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + DENSE_FEATURE_NAMES, CLS_TOKEN, ) class MitieFeaturizer(Featurizer): - provides = [ - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] - for attribute in MESSAGE_ATTRIBUTES - ] + provides = [DENSE_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] requires = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + [ "mitie_feature_extractor" @@ -75,9 +72,9 @@ def process_training_example( attribute_tokens, mitie_feature_extractor ) example.set( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], + DENSE_FEATURE_NAMES[attribute], self._combine_with_existing_dense_features( - example, features, MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] + example, features, DENSE_FEATURE_NAMES[attribute] ), ) @@ -85,15 +82,12 @@ def process(self, message: Message, **kwargs: Any) -> None: mitie_feature_extractor = self._mitie_feature_extractor(**kwargs) features = self.features_for_tokens( - message.get(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]), - mitie_feature_extractor, + message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]), mitie_feature_extractor ) message.set( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], self._combine_with_existing_dense_features( - message, - features, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + message, features, DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE] ), ) diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 7a5228996848..195e047cc569 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -10,9 +10,9 @@ from spacy.tokens import Doc from rasa.nlu.constants import ( - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_SPACY_FEATURES_NAMES, - 
MESSAGE_VECTOR_DENSE_FEATURE_NAMES, + TEXT_ATTRIBUTE, + SPACY_DOCS, + DENSE_FEATURE_NAMES, SPACY_FEATURIZABLE_ATTRIBUTES, MESSAGE_TOKENS_NAMES, CLS_TOKEN, @@ -22,13 +22,11 @@ class SpacyFeaturizer(Featurizer): provides = [ - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES + DENSE_FEATURE_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] requires = [ - MESSAGE_SPACY_FEATURES_NAMES[attribute] - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES + SPACY_DOCS[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] + [MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES] defaults = { @@ -63,13 +61,13 @@ def train( def get_doc(self, message, attribute): - return message.get(MESSAGE_SPACY_FEATURES_NAMES[attribute]) + return message.get(SPACY_DOCS[attribute]) def process(self, message: Message, **kwargs: Any) -> None: self._set_spacy_features(message) - def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): + def _set_spacy_features(self, message, attribute=TEXT_ATTRIBUTE): """Adds the spacy word vectors to the messages features.""" message_attribute_doc = self.get_doc(message, attribute) @@ -85,6 +83,6 @@ def _set_spacy_features(self, message, attribute=MESSAGE_TEXT_ATTRIBUTE): features = np.concatenate([features, cls_token_vec]) features = self._combine_with_existing_dense_features( - message, features, MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute] + message, features, DENSE_FEATURE_NAMES[attribute] ) - message.set(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[attribute], features) + message.set(DENSE_FEATURE_NAMES[attribute], features) diff --git a/rasa/nlu/featurizers/featurzier.py b/rasa/nlu/featurizers/featurzier.py index cbc77e715993..334d6fb61fc1 100644 --- a/rasa/nlu/featurizers/featurzier.py +++ b/rasa/nlu/featurizers/featurzier.py @@ -3,11 +3,7 @@ from typing import Any, Text, List, Union, Optional, Dict from rasa.nlu.training_data import Message from rasa.nlu.components import Component -from rasa.nlu.constants import ( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, - MESSAGE_TEXT_ATTRIBUTE, -) +from rasa.nlu.constants import SPARSE_FEATURE_NAMES, DENSE_FEATURE_NAMES, TEXT_ATTRIBUTE def sequence_to_sentence_features( @@ -38,7 +34,7 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: def _combine_with_existing_dense_features( message: Message, additional_features: Any, - feature_name: Text = MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE], + feature_name: Text = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], ) -> Any: if message.get(feature_name) is not None: @@ -61,9 +57,7 @@ def _combine_with_existing_dense_features( def _combine_with_existing_sparse_features( message: Message, additional_features: Any, - feature_name: Text = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ - MESSAGE_TEXT_ATTRIBUTE - ], + feature_name: Text = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE], ) -> Any: if message.get(feature_name) is not None: from scipy.sparse import hstack diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index a64e3f0ddfcc..07f0d5b1f38e 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -11,11 +11,11 @@ from rasa.nlu.model import Metadata from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - 
MESSAGE_TEXT_ATTRIBUTE, + TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - MESSAGE_INTENT_ATTRIBUTE, + SPARSE_FEATURE_NAMES, + INTENT_ATTRIBUTE, SPACY_FEATURIZABLE_ATTRIBUTES, ) @@ -33,10 +33,7 @@ class CountVectorsFeaturizer(Featurizer): from https://arxiv.org/abs/1810.07150. """ - provides = [ - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute] - for attribute in MESSAGE_ATTRIBUTES - ] + provides = [SPARSE_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] requires = [ MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES @@ -231,11 +228,11 @@ def _get_message_tokens_by_attribute( return message.get(attribute).split() def _process_tokens( - self, tokens: List[Text], attribute: Text = MESSAGE_TEXT_ATTRIBUTE + self, tokens: List[Text], attribute: Text = TEXT_ATTRIBUTE ) -> List[Text]: """Apply processing and cleaning steps to text""" - if attribute == MESSAGE_INTENT_ATTRIBUTE: + if attribute == INTENT_ATTRIBUTE: # Don't do any processing for intent attribute. Treat them as whole labels return tokens @@ -272,7 +269,7 @@ def _replace_with_oov_token( return tokens def _get_processed_message_tokens_by_attribute( - self, message: "Message", attribute: Text = MESSAGE_TEXT_ATTRIBUTE + self, message: "Message", attribute: Text = TEXT_ATTRIBUTE ) -> List[Text]: """Get processed text of attribute of a message""" @@ -357,7 +354,7 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): combined_cleaned_texts += attribute_texts[attribute] try: - self.vectorizers[MESSAGE_TEXT_ATTRIBUTE].fit(combined_cleaned_texts) + self.vectorizers[TEXT_ATTRIBUTE].fit(combined_cleaned_texts) except ValueError: logger.warning( "Unable to train a shared CountVectorizer. Leaving an untrained CountVectorizer" @@ -436,11 +433,9 @@ def _set_attribute_features( for i, example in enumerate(training_data.training_examples): # create bag for each example example.set( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + SPARSE_FEATURE_NAMES[attribute], self._combine_with_existing_sparse_features( - example, - attribute_features[i], - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + example, attribute_features[i], SPARSE_FEATURE_NAMES[attribute] ), ) @@ -495,7 +490,7 @@ def process(self, message: Message, **kwargs: Any) -> None: ) return - attribute = MESSAGE_TEXT_ATTRIBUTE + attribute = TEXT_ATTRIBUTE message_tokens = self._get_processed_message_tokens_by_attribute( message, attribute ) @@ -504,11 +499,11 @@ def process(self, message: Message, **kwargs: Any) -> None: features = self._create_sequence(attribute, [message_tokens]) message.set( - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + SPARSE_FEATURE_NAMES[attribute], self._combine_with_existing_sparse_features( message, features[0], # 0 -> batch dimension - feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + feature_name=SPARSE_FEATURE_NAMES[attribute], ), ) @@ -545,7 +540,7 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]] if self.use_shared_vocab: # Only persist vocabulary from one attribute. Can be loaded and distributed to all attributes. 
- vocab = attribute_vocabularies[MESSAGE_TEXT_ATTRIBUTE] + vocab = attribute_vocabularies[TEXT_ATTRIBUTE] else: vocab = attribute_vocabularies diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 2b37f282e14d..682b504b2c2f 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -13,9 +13,9 @@ import rasa.utils.io from rasa.nlu.constants import ( MESSAGE_TOKENS_NAMES, - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, + TEXT_ATTRIBUTE, + RESPONSE_ATTRIBUTE, + SPARSE_FEATURE_NAMES, ) logger = logging.getLogger(__name__) @@ -26,9 +26,9 @@ class RegexFeaturizer(Featurizer): - provides = [MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + provides = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]] - requires = [MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]] defaults = { # if True return a sequence of features (return vector has size @@ -60,21 +60,19 @@ def train( self._add_lookup_table_regexes(training_data.lookup_tables) for example in training_data.training_examples: - for attribute in [MESSAGE_TEXT_ATTRIBUTE, MESSAGE_RESPONSE_ATTRIBUTE]: + for attribute in [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE]: self._text_features_with_regex(example, attribute) def process(self, message: Message, **kwargs: Any) -> None: - self._text_features_with_regex(message, MESSAGE_TEXT_ATTRIBUTE) + self._text_features_with_regex(message, TEXT_ATTRIBUTE) def _text_features_with_regex(self, message: Message, attribute: Text) -> None: if self.known_patterns: extras = self._features_for_patterns(message, attribute) features = self._combine_with_existing_sparse_features( - message, - extras, - feature_name=MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], + message, extras, feature_name=SPARSE_FEATURE_NAMES[attribute] ) - message.set(MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[attribute], features) + message.set(SPARSE_FEATURE_NAMES[attribute], features) def _add_lookup_table_regexes( self, lookup_tables: List[Dict[Text, Union[Text, List]]] diff --git a/rasa/nlu/selectors/embedding_response_selector.py b/rasa/nlu/selectors/embedding_response_selector.py index 9e1bf79d54d9..1c8fbac28b1c 100644 --- a/rasa/nlu/selectors/embedding_response_selector.py +++ b/rasa/nlu/selectors/embedding_response_selector.py @@ -4,10 +4,10 @@ from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - MESSAGE_SELECTOR_PROPERTY_NAME, + RESPONSE_ATTRIBUTE, + TEXT_ATTRIBUTE, + SPARSE_FEATURE_NAMES, + RESPONSE_SELECTOR_PROPERTY_NAME, DEFAULT_OPEN_UTTERANCE_TYPE, ) @@ -43,7 +43,7 @@ class ResponseSelector(EmbeddingIntentClassifier): provides = ["response", "response_ranking"] - requires = [MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE]] + requires = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]] # default properties (DOC MARKER - don't remove) defaults = { @@ -124,10 +124,10 @@ def _set_message_property( message: "Message", prediction_dict: Dict[Text, Any], selector_key: Text ): - message_selector_properties = message.get(MESSAGE_SELECTOR_PROPERTY_NAME, {}) + message_selector_properties = message.get(RESPONSE_SELECTOR_PROPERTY_NAME, {}) message_selector_properties[selector_key] = prediction_dict message.set( - MESSAGE_SELECTOR_PROPERTY_NAME, 
+ RESPONSE_SELECTOR_PROPERTY_NAME, message_selector_properties, add_to_output=True, ) @@ -139,18 +139,18 @@ def preprocess_train_data(self, training_data): training_data = training_data.filter_by_intent(self.retrieval_intent) label_id_dict = self._create_label_id_dict( - training_data, attribute=MESSAGE_RESPONSE_ATTRIBUTE + training_data, attribute=RESPONSE_ATTRIBUTE ) self.inverted_label_dict = {v: k for k, v in label_id_dict.items()} self._label_data = self._create_label_data( - training_data, label_id_dict, attribute=MESSAGE_RESPONSE_ATTRIBUTE + training_data, label_id_dict, attribute=RESPONSE_ATTRIBUTE ) session_data = self._create_session_data( training_data.intent_examples, label_id_dict, - label_attribute=MESSAGE_RESPONSE_ATTRIBUTE, + label_attribute=RESPONSE_ATTRIBUTE, ) self.check_input_dimension_consistency(session_data) diff --git a/rasa/nlu/test.py b/rasa/nlu/test.py index c3d7026a6c10..f6c51df48454 100644 --- a/rasa/nlu/test.py +++ b/rasa/nlu/test.py @@ -22,7 +22,7 @@ from rasa.constants import TEST_DATA_FILE, TRAIN_DATA_FILE from rasa.nlu.constants import ( DEFAULT_OPEN_UTTERANCE_TYPE, - MESSAGE_SELECTOR_PROPERTY_NAME, + RESPONSE_SELECTOR_PROPERTY_NAME, OPEN_UTTERANCE_PREDICTION_KEY, ) from rasa.model import get_model @@ -903,7 +903,7 @@ def get_eval_data( # including all examples here. Empty response examples are filtered at the time of metric calculation intent_target = example.get("intent", "") - selector_properties = result.get(MESSAGE_SELECTOR_PROPERTY_NAME, {}) + selector_properties = result.get(RESPONSE_SELECTOR_PROPERTY_NAME, {}) if intent_target in available_response_selector_types: response_prediction_key = intent_target diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index afb5aeadc5f3..708fc9f2b7d1 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -11,8 +11,8 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_INTENT_ATTRIBUTE, - MESSAGE_TEXT_ATTRIBUTE, + INTENT_ATTRIBUTE, + TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, ) @@ -96,20 +96,18 @@ def train( def process(self, message: Message, **kwargs: Any) -> None: message.set( - MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], - self.tokenize(message.text, MESSAGE_TEXT_ATTRIBUTE), + MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], + self.tokenize(message.text, TEXT_ATTRIBUTE), ) def preprocess_text(self, text: Text, attribute: Text) -> Text: - if attribute == MESSAGE_INTENT_ATTRIBUTE and self.intent_tokenization_flag: + if attribute == INTENT_ATTRIBUTE and self.intent_tokenization_flag: return " ".join(text.split(self.intent_split_symbol)) else: return text - def tokenize( - self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE - ) -> List[Token]: + def tokenize(self, text: Text, attribute: Text = TEXT_ATTRIBUTE) -> List[Token]: import jieba text = self.preprocess_text(text, attribute) diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 5bbfda74e502..188000572e49 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -4,11 +4,7 @@ from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import ( - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, - MESSAGE_ATTRIBUTES, -) +from rasa.nlu.constants import TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES from rasa.utils.io import DEFAULT_ENCODING 
@@ -41,9 +37,7 @@ def train( def process(self, message: Message, **kwargs: Any) -> None: - message.set( - MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], self.tokenize(message.text) - ) + message.set(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(message.text)) def _token_from_offset( self, text: bytes, offset: int, encoded_sentence: bytes @@ -53,9 +47,7 @@ def _token_from_offset( self._byte_to_char_offset(encoded_sentence, offset), ) - def tokenize( - self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE - ) -> List[Token]: + def tokenize(self, text: Text, attribute: Text = TEXT_ATTRIBUTE) -> List[Token]: import mitie encoded_sentence = text.encode(DEFAULT_ENCODING) diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 19a61bde6070..372dace62f9b 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -6,9 +6,9 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_TEXT_ATTRIBUTE, + TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, - MESSAGE_SPACY_FEATURES_NAMES, + SPACY_DOCS, SPACY_FEATURIZABLE_ATTRIBUTES, ) @@ -22,10 +22,7 @@ class SpacyTokenizer(Tokenizer): MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] - requires = [ - MESSAGE_SPACY_FEATURES_NAMES[attribute] - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES - ] + requires = [SPACY_DOCS[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES] defaults = { # add __CLS__ token to the end of the list of tokens @@ -49,19 +46,15 @@ def train( ) def get_doc(self, message: Message, attribute: Text) -> "Doc": - return message.get(MESSAGE_SPACY_FEATURES_NAMES[attribute]) + return message.get(SPACY_DOCS[attribute]) def process(self, message: Message, **kwargs: Any) -> None: message.set( - MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], - self.tokenize( - self.get_doc(message, MESSAGE_TEXT_ATTRIBUTE), MESSAGE_TEXT_ATTRIBUTE - ), + MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], + self.tokenize(self.get_doc(message, TEXT_ATTRIBUTE), TEXT_ATTRIBUTE), ) - def tokenize( - self, doc: "Doc", attribute: Text = MESSAGE_TEXT_ATTRIBUTE - ) -> List[Token]: + def tokenize(self, doc: "Doc", attribute: Text = TEXT_ATTRIBUTE) -> List[Token]: tokens = [Token(t.text, t.idx, lemma=t.lemma_) for t in doc] self.add_cls_token(tokens, attribute) return tokens diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index e0981572d4f4..fc63dfebc207 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -3,11 +3,7 @@ from typing import Text, List, Optional, Dict, Any from rasa.nlu.components import Component -from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_TEXT_ATTRIBUTE, - CLS_TOKEN, -) +from rasa.nlu.constants import RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE, CLS_TOKEN logger = logging.getLogger(__name__) @@ -66,10 +62,10 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: ) def add_cls_token( - self, tokens: List[Token], attribute: Text = MESSAGE_TEXT_ATTRIBUTE + self, tokens: List[Token], attribute: Text = TEXT_ATTRIBUTE ) -> List[Token]: if ( - attribute in [MESSAGE_RESPONSE_ATTRIBUTE, MESSAGE_TEXT_ATTRIBUTE] + attribute in [RESPONSE_ATTRIBUTE, TEXT_ATTRIBUTE] and self.use_cls_token and tokens ): diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index e819588c28ff..e968c549fa2d 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ 
b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -5,8 +5,8 @@ from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_INTENT_ATTRIBUTE, - MESSAGE_TEXT_ATTRIBUTE, + INTENT_ATTRIBUTE, + TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES, ) @@ -52,18 +52,14 @@ def train( def process(self, message: Message, **kwargs: Any) -> None: - message.set( - MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], self.tokenize(message.text) - ) + message.set(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(message.text)) - def tokenize( - self, text: Text, attribute: Text = MESSAGE_TEXT_ATTRIBUTE - ) -> List[Token]: + def tokenize(self, text: Text, attribute: Text = TEXT_ATTRIBUTE) -> List[Token]: if not self.case_sensitive: text = text.lower() - if attribute != MESSAGE_INTENT_ATTRIBUTE: + if attribute != INTENT_ATTRIBUTE: # remove 'not a word character' if words = re.sub( # there is a space or an end of a string after it diff --git a/rasa/nlu/training_data/formats/markdown.py b/rasa/nlu/training_data/formats/markdown.py index b3d7fec31ba1..e62f5743e7a2 100644 --- a/rasa/nlu/training_data/formats/markdown.py +++ b/rasa/nlu/training_data/formats/markdown.py @@ -12,8 +12,8 @@ ) from rasa.nlu.utils import build_entity from rasa.nlu.constants import ( - MESSAGE_INTENT_ATTRIBUTE, - MESSAGE_RESPONSE_KEY_ATTRIBUTE, + INTENT_ATTRIBUTE, + RESPONSE_KEY_ATTRIBUTE, RESPONSE_IDENTIFIER_DELIMITER, ) @@ -218,7 +218,7 @@ def _generate_training_examples_md(self, training_data: "TrainingData") -> Text: # Sort by intent while keeping basic intent order for example in [e.as_dict_nlu() for e in training_data.training_examples]: - intent = example[MESSAGE_INTENT_ATTRIBUTE] + intent = example[INTENT_ATTRIBUTE] training_examples.setdefault(intent, []) training_examples[intent].append(example) diff --git a/rasa/nlu/training_data/formats/rasa.py b/rasa/nlu/training_data/formats/rasa.py index fc73cbdecb93..3a6dd286bbb0 100644 --- a/rasa/nlu/training_data/formats/rasa.py +++ b/rasa/nlu/training_data/formats/rasa.py @@ -13,9 +13,9 @@ from rasa.nlu.training_data.util import transform_entity_synonyms from rasa.nlu.utils import json_to_string from rasa.nlu.constants import ( - MESSAGE_INTENT_ATTRIBUTE, - MESSAGE_RESPONSE_KEY_ATTRIBUTE, - MESSAGE_RESPONSE_ATTRIBUTE, + INTENT_ATTRIBUTE, + RESPONSE_KEY_ATTRIBUTE, + RESPONSE_ATTRIBUTE, RESPONSE_IDENTIFIER_DELIMITER, ) diff --git a/rasa/nlu/training_data/message.py b/rasa/nlu/training_data/message.py index ab84af948938..7a661a748923 100644 --- a/rasa/nlu/training_data/message.py +++ b/rasa/nlu/training_data/message.py @@ -1,11 +1,11 @@ from typing import Any, Optional, Tuple, Text from rasa.nlu.constants import ( - MESSAGE_ENTITIES_ATTRIBUTE, - MESSAGE_INTENT_ATTRIBUTE, - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_RESPONSE_KEY_ATTRIBUTE, - MESSAGE_TEXT_ATTRIBUTE, + ENTITIES_ATTRIBUTE, + INTENT_ATTRIBUTE, + RESPONSE_ATTRIBUTE, + RESPONSE_KEY_ATTRIBUTE, + TEXT_ATTRIBUTE, RESPONSE_IDENTIFIER_DELIMITER, ) from rasa.nlu.utils import ordered @@ -30,7 +30,7 @@ def set(self, prop, info, add_to_output=False) -> None: self.output_properties.add(prop) def get(self, prop, default=None) -> Any: - if prop == MESSAGE_TEXT_ATTRIBUTE: + if prop == TEXT_ATTRIBUTE: return self.text return self.data.get(prop, default) @@ -38,10 +38,10 @@ def as_dict_nlu(self) -> dict: """Get dict representation of message as it would appear in training data""" d = self.as_dict() - if d.get(MESSAGE_INTENT_ATTRIBUTE, None): - 
d[MESSAGE_INTENT_ATTRIBUTE] = self.get_combined_intent_response_key() - d.pop(MESSAGE_RESPONSE_KEY_ATTRIBUTE, None) - d.pop(MESSAGE_RESPONSE_ATTRIBUTE, None) + if d.get(INTENT_ATTRIBUTE, None): + d[INTENT_ATTRIBUTE] = self.get_combined_intent_response_key() + d.pop(RESPONSE_KEY_ATTRIBUTE, None) + d.pop(RESPONSE_ATTRIBUTE, None) return d def as_dict(self, only_output_properties=False) -> dict: @@ -73,18 +73,18 @@ def build(cls, text, intent=None, entities=None) -> "Message": data = {} if intent: split_intent, response_key = cls.separate_intent_response_key(intent) - data[MESSAGE_INTENT_ATTRIBUTE] = split_intent + data[INTENT_ATTRIBUTE] = split_intent if response_key: - data[MESSAGE_RESPONSE_KEY_ATTRIBUTE] = response_key + data[RESPONSE_KEY_ATTRIBUTE] = response_key if entities: - data[MESSAGE_ENTITIES_ATTRIBUTE] = entities + data[ENTITIES_ATTRIBUTE] = entities return cls(text, data) def get_combined_intent_response_key(self) -> Text: """Get intent as it appears in training data""" - intent = self.get(MESSAGE_INTENT_ATTRIBUTE) - response_key = self.get(MESSAGE_RESPONSE_KEY_ATTRIBUTE) + intent = self.get(INTENT_ATTRIBUTE) + response_key = self.get(RESPONSE_KEY_ATTRIBUTE) response_key_suffix = ( f"{RESPONSE_IDENTIFIER_DELIMITER}{response_key}" if response_key else "" ) diff --git a/rasa/nlu/training_data/training_data.py b/rasa/nlu/training_data/training_data.py index 6ac11824f8d9..1cf1d818d346 100644 --- a/rasa/nlu/training_data/training_data.py +++ b/rasa/nlu/training_data/training_data.py @@ -9,10 +9,7 @@ import rasa.nlu.utils import rasa.utils.common as rasa_utils -from rasa.nlu.constants import ( - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_RESPONSE_KEY_ATTRIBUTE, -) +from rasa.nlu.constants import RESPONSE_ATTRIBUTE, RESPONSE_KEY_ATTRIBUTE from rasa.nlu.training_data.message import Message from rasa.nlu.training_data.util import check_duplicate_synonym from rasa.nlu.utils import list_to_str @@ -180,7 +177,7 @@ def sort_regex_features(self) -> None: def fill_response_phrases(self) -> None: """Set response phrase for all examples by looking up NLG stories""" for example in self.training_examples: - response_key = example.get(MESSAGE_RESPONSE_KEY_ATTRIBUTE) + response_key = example.get(RESPONSE_KEY_ATTRIBUTE) # if response_key is None, that means the corresponding intent is not a retrieval intent # and hence no response text needs to be fetched. # If response_key is set, fetch the corresponding response text @@ -190,7 +187,7 @@ def fill_response_phrases(self) -> None: assistant_utterances = self.nlg_stories.get(story_lookup_intent, []) if assistant_utterances: # selecting only first assistant utterance for now - example.set(MESSAGE_RESPONSE_ATTRIBUTE, assistant_utterances[0]) + example.set(RESPONSE_ATTRIBUTE, assistant_utterances[0]) else: raise ValueError( "No response phrases found for {}. 
Check training data " @@ -387,11 +384,9 @@ def build_nlg_stories_from_examples(examples) -> Dict[Text, list]: nlg_stories = {} for ex in examples: - if ex.get(MESSAGE_RESPONSE_KEY_ATTRIBUTE) and ex.get( - MESSAGE_RESPONSE_ATTRIBUTE - ): + if ex.get(RESPONSE_KEY_ATTRIBUTE) and ex.get(RESPONSE_ATTRIBUTE): nlg_stories[ex.get_combined_intent_response_key()] = [ - ex.get(MESSAGE_RESPONSE_ATTRIBUTE) + ex.get(RESPONSE_ATTRIBUTE) ] return nlg_stories diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py index 2077fb1e3c96..35e36fe2d721 100644 --- a/rasa/nlu/utils/spacy_utils.py +++ b/rasa/nlu/utils/spacy_utils.py @@ -14,17 +14,12 @@ from spacy.tokens.doc import Doc # pytype: disable=import-error from rasa.nlu.model import Metadata -from rasa.nlu.constants import ( - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_SPACY_FEATURES_NAMES, - SPACY_FEATURIZABLE_ATTRIBUTES, -) +from rasa.nlu.constants import TEXT_ATTRIBUTE, SPACY_DOCS, SPACY_FEATURIZABLE_ATTRIBUTES class SpacyNLP(Component): provides = ["spacy_nlp"] + [ - MESSAGE_SPACY_FEATURES_NAMES[attribute] - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES + SPACY_DOCS[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES ] defaults = { @@ -230,16 +225,11 @@ def train( if len(example_attribute_doc): # If length is 0, that means the initial text feature was None and was replaced by '' # in preprocess method - example.set( - MESSAGE_SPACY_FEATURES_NAMES[attribute], example_attribute_doc - ) + example.set(SPACY_DOCS[attribute], example_attribute_doc) def process(self, message: Message, **kwargs: Any) -> None: - message.set( - MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE], - self.doc_for_text(message.text), - ) + message.set(SPACY_DOCS[TEXT_ATTRIBUTE], self.doc_for_text(message.text)) @classmethod def load( diff --git a/tests/nlu/classifiers/test_embedding_intent_classifier.py b/tests/nlu/classifiers/test_embedding_intent_classifier.py index 44691cbf0fa7..435bcfa02ae5 100644 --- a/tests/nlu/classifiers/test_embedding_intent_classifier.py +++ b/tests/nlu/classifiers/test_embedding_intent_classifier.py @@ -3,10 +3,10 @@ import scipy.sparse from rasa.nlu.constants import ( - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, - MESSAGE_INTENT_ATTRIBUTE, + TEXT_ATTRIBUTE, + SPARSE_FEATURE_NAMES, + DENSE_FEATURE_NAMES, + INTENT_ATTRIBUTE, ) from rasa.nlu.classifiers.embedding_intent_classifier import EmbeddingIntentClassifier from rasa.nlu.training_data import Message @@ -71,23 +71,15 @@ def test_get_num_of_features(): Message( "test a", data={ - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ - MESSAGE_TEXT_ATTRIBUTE - ]: np.zeros(1), - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[ - MESSAGE_TEXT_ATTRIBUTE - ]: np.zeros(1), + SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1), + DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1), }, ), Message( "test b", data={ - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ - MESSAGE_TEXT_ATTRIBUTE - ]: np.zeros(1), - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[ - MESSAGE_TEXT_ATTRIBUTE - ]: np.zeros(1), + SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1), + DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE]: np.zeros(1), }, ), ], @@ -98,12 +90,8 @@ def test_get_num_of_features(): Message( "test a", data={ - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[ - MESSAGE_INTENT_ATTRIBUTE - ]: np.zeros(1), - MESSAGE_VECTOR_DENSE_FEATURE_NAMES[ - MESSAGE_INTENT_ATTRIBUTE - ]: np.zeros(1), + SPARSE_FEATURE_NAMES[INTENT_ATTRIBUTE]: np.zeros(1), + DENSE_FEATURE_NAMES[INTENT_ATTRIBUTE]: np.zeros(1), }, ) ], @@ -112,7 +100,7 @@ def 
test_get_num_of_features(): ], ) def test_check_labels_features_exist(messages, expected): - attribute = MESSAGE_TEXT_ATTRIBUTE + attribute = TEXT_ATTRIBUTE assert ( EmbeddingIntentClassifier._check_labels_features_exist(messages, attribute) diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py index e46881449f72..e61272deddbb 100644 --- a/tests/nlu/featurizers/test_convert_featurizer.py +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -1,10 +1,6 @@ import numpy as np -from rasa.nlu.constants import ( - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, - MESSAGE_TOKENS_NAMES, -) +from rasa.nlu.constants import TEXT_ATTRIBUTE, DENSE_FEATURE_NAMES, MESSAGE_TOKENS_NAMES from rasa.nlu.training_data import Message from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.config import RasaNLUModelConfig @@ -21,11 +17,11 @@ def test_convert_featurizer(mitie_feature_extractor, default_config): sentence = "Hey how are you today ?" message = Message(sentence) tokens = WhitespaceTokenizer().tokenize(sentence) - message.set(MESSAGE_TOKENS_NAMES[MESSAGE_TEXT_ATTRIBUTE], tokens) + message.set(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], tokens) featurizer.process(message) - vecs = message.get(MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE])[0] + vecs = message.get(DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE])[0] expected = np.array([1.0251294, -0.04053932, -0.7018805, -0.82054937, -0.75054353]) diff --git a/tests/nlu/featurizers/test_featurizer.py b/tests/nlu/featurizers/test_featurizer.py index e1ce8bc5297b..2d7d6436cccc 100644 --- a/tests/nlu/featurizers/test_featurizer.py +++ b/tests/nlu/featurizers/test_featurizer.py @@ -3,18 +3,14 @@ import scipy.sparse from rasa.nlu.featurizers.featurzier import Featurizer, sequence_to_sentence_features -from rasa.nlu.constants import ( - MESSAGE_VECTOR_DENSE_FEATURE_NAMES, - MESSAGE_VECTOR_SPARSE_FEATURE_NAMES, - MESSAGE_TEXT_ATTRIBUTE, -) +from rasa.nlu.constants import DENSE_FEATURE_NAMES, SPARSE_FEATURE_NAMES, TEXT_ATTRIBUTE from rasa.nlu.training_data import Message def test_combine_with_existing_dense_features(): featurizer = Featurizer({"return_sequence": False}) - attribute = MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + attribute = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE] existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] new_features = [[1, 0], [0, 1]] @@ -32,7 +28,7 @@ def test_combine_with_existing_dense_features(): def test_combine_with_existing_dense_features_shape_mismatch(): featurizer = Featurizer({"return_sequence": False}) - attribute = MESSAGE_VECTOR_DENSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + attribute = DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE] existing_features = [[1, 0, 2, 3], [2, 0, 0, 1]] new_features = [[0, 1]] @@ -49,7 +45,7 @@ def test_combine_with_existing_dense_features_shape_mismatch(): def test_combine_with_existing_sparse_features(): featurizer = Featurizer({"return_sequence": False}) - attribute = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + attribute = SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE] existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) new_features = scipy.sparse.csr_matrix([[1, 0], [0, 1]]) @@ -69,7 +65,7 @@ def test_combine_with_existing_sparse_features(): def test_combine_with_existing_sparse_features_shape_mismatch(): featurizer = Featurizer({"return_sequence": False}) - attribute = MESSAGE_VECTOR_SPARSE_FEATURE_NAMES[MESSAGE_TEXT_ATTRIBUTE] + attribute = 
SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE] existing_features = scipy.sparse.csr_matrix([[1, 0, 2, 3], [2, 0, 0, 1]]) new_features = scipy.sparse.csr_matrix([[0, 1]]) diff --git a/tests/nlu/featurizers/test_regex_featurizer.py b/tests/nlu/featurizers/test_regex_featurizer.py index 958a7032b549..cf252a09a49a 100644 --- a/tests/nlu/featurizers/test_regex_featurizer.py +++ b/tests/nlu/featurizers/test_regex_featurizer.py @@ -1,11 +1,7 @@ import numpy as np import pytest -from rasa.nlu.constants import ( - MESSAGE_TEXT_ATTRIBUTE, - MESSAGE_RESPONSE_ATTRIBUTE, - MESSAGE_SPACY_FEATURES_NAMES, -) +from rasa.nlu.constants import TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE, SPACY_DOCS from rasa.nlu.tokenizers.spacy_tokenizer import SpacyTokenizer from rasa.nlu.training_data import Message @@ -59,13 +55,11 @@ def test_regex_featurizer(sentence, expected, labeled_tokens, spacy_nlp): # adds tokens to the message tokenizer = SpacyTokenizer({"use_cls_token": False}) - message = Message(sentence, data={MESSAGE_RESPONSE_ATTRIBUTE: sentence}) - message.set( - MESSAGE_SPACY_FEATURES_NAMES[MESSAGE_TEXT_ATTRIBUTE], spacy_nlp(sentence) - ) + message = Message(sentence, data={RESPONSE_ATTRIBUTE: sentence}) + message.set(SPACY_DOCS[TEXT_ATTRIBUTE], spacy_nlp(sentence)) tokenizer.process(message) - result = ftr._features_for_patterns(message, MESSAGE_TEXT_ATTRIBUTE) + result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE) assert np.allclose(result.toarray(), expected, atol=1e-10) # the tokenizer should have added tokens @@ -113,7 +107,7 @@ def test_lookup_tables(sentence, expected, labeled_tokens, spacy_nlp): message.set("spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) - result = ftr._features_for_patterns(message, MESSAGE_TEXT_ATTRIBUTE) + result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE) assert np.allclose(result.toarray(), expected, atol=1e-10) # the tokenizer should have added tokens @@ -150,5 +144,5 @@ def test_regex_featurizer_no_sequence(sentence, expected, spacy_nlp): message.set("spacy_doc", spacy_nlp(sentence)) tokenizer.process(message) - result = ftr._features_for_patterns(message, MESSAGE_TEXT_ATTRIBUTE) + result = ftr._features_for_patterns(message, TEXT_ATTRIBUTE) assert np.allclose(result.toarray()[0], expected, atol=1e-10) From 50d54e3ec7eb2f003c3972e3fdbf4d5a8eabf961 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Wed, 11 Dec 2019 13:53:44 +0100 Subject: [PATCH 219/239] rename spacy_featurizable_attributes to dense_featurizable_attributes --- .../nlu/classifiers/mitie_intent_classifier.py | 13 +++---------- rasa/nlu/constants.py | 4 ++-- rasa/nlu/extractors/crf_entity_extractor.py | 18 ++++++++---------- rasa/nlu/extractors/mitie_entity_extractor.py | 12 ++++-------- .../dense_featurizer/convert_featurizer.py | 6 +++--- .../dense_featurizer/mitie_featurizer.py | 8 ++++---- .../dense_featurizer/spacy_featurizer.py | 14 +++++++------- .../count_vectors_featurizer.py | 16 +++++++--------- .../sparse_featurizer/regex_featurizer.py | 6 +++--- rasa/nlu/tokenizers/jieba_tokenizer.py | 9 ++++----- rasa/nlu/tokenizers/mitie_tokenizer.py | 8 ++++---- rasa/nlu/tokenizers/spacy_tokenizer.py | 17 +++++++---------- rasa/nlu/tokenizers/whitespace_tokenizer.py | 8 ++++---- rasa/nlu/utils/spacy_utils.py | 8 ++++---- .../nlu/featurizers/test_convert_featurizer.py | 4 ++-- 15 files changed, 66 insertions(+), 85 deletions(-) diff --git a/rasa/nlu/classifiers/mitie_intent_classifier.py b/rasa/nlu/classifiers/mitie_intent_classifier.py index 4438e50e2c35..7a95e8e26f8c 100644 --- 
a/rasa/nlu/classifiers/mitie_intent_classifier.py +++ b/rasa/nlu/classifiers/mitie_intent_classifier.py @@ -5,7 +5,7 @@ from rasa.nlu.components import Component from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.model import Metadata -from rasa.nlu.constants import MESSAGE_TOKENS_NAMES, TEXT_ATTRIBUTE +from rasa.nlu.constants import TOKENS_NAMES, TEXT_ATTRIBUTE from rasa.nlu.training_data import Message, TrainingData if typing.TYPE_CHECKING: @@ -16,11 +16,7 @@ class MitieIntentClassifier(Component): provides = ["intent"] - requires = [ - MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], - "mitie_feature_extractor", - "mitie_file", - ] + requires = [TOKENS_NAMES[TEXT_ATTRIBUTE], "mitie_feature_extractor", "mitie_file"] def __init__( self, component_config: Optional[Dict[Text, Any]] = None, clf=None @@ -83,10 +79,7 @@ def process(self, message: Message, **kwargs: Any) -> None: @staticmethod def _tokens_of_message(message) -> List[Text]: - return [ - token.text - for token in message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], []) - ] + return [token.text for token in message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], [])] @classmethod def load( diff --git a/rasa/nlu/constants.py b/rasa/nlu/constants.py index c821df509a9e..6fb2f023fe18 100644 --- a/rasa/nlu/constants.py +++ b/rasa/nlu/constants.py @@ -12,7 +12,7 @@ MESSAGE_ATTRIBUTES = [TEXT_ATTRIBUTE, INTENT_ATTRIBUTE, RESPONSE_ATTRIBUTE] -MESSAGE_TOKENS_NAMES = { +TOKENS_NAMES = { TEXT_ATTRIBUTE: "tokens", INTENT_ATTRIBUTE: "intent_tokens", RESPONSE_ATTRIBUTE: "response_tokens", @@ -32,7 +32,7 @@ SPACY_DOCS = {TEXT_ATTRIBUTE: "spacy_doc", RESPONSE_ATTRIBUTE: "response_spacy_doc"} -SPACY_FEATURIZABLE_ATTRIBUTES = [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE] +DENSE_FEATURIZABLE_ATTRIBUTES = [TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE] RESPONSE_SELECTOR_PROPERTY_NAME = "response_selector" DEFAULT_OPEN_UTTERANCE_TYPE = "default" diff --git a/rasa/nlu/extractors/crf_entity_extractor.py b/rasa/nlu/extractors/crf_entity_extractor.py index 101d3bde4c69..99854c01d3da 100644 --- a/rasa/nlu/extractors/crf_entity_extractor.py +++ b/rasa/nlu/extractors/crf_entity_extractor.py @@ -11,7 +11,7 @@ from rasa.nlu.tokenizers.tokenizer import Token from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( - MESSAGE_TOKENS_NAMES, + TOKENS_NAMES, TEXT_ATTRIBUTE, DENSE_FEATURE_NAMES, SPACY_DOCS, @@ -43,7 +43,7 @@ class CRFEntityExtractor(EntityExtractor): provides = [ENTITIES_ATTRIBUTE] - requires = [MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]] + requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]] defaults = { # BILOU_flag determines whether to use BILOU tagging or not. 
@@ -336,7 +336,7 @@ def _from_crf_to_json( if self.pos_features: tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) else: - tokens = message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) + tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) if len(tokens) != len(entities): raise Exception( @@ -501,7 +501,7 @@ def _from_json_to_crf( gold = GoldParse(doc_or_tokens, entities=entity_offsets) ents = [l[5] for l in gold.orig_annot] else: - doc_or_tokens = message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) + doc_or_tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) ents = self._bilou_tags_from_offsets(doc_or_tokens, entity_offsets) # collect badly annotated examples @@ -565,10 +565,8 @@ def _bilou_tags_from_offsets(tokens, entities, missing: Text = "O") -> List[Text @staticmethod def __pattern_of_token(message, i): - if message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) is not None: - return message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE])[i].get( - "pattern", {} - ) + if message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) is not None: + return message.get(TOKENS_NAMES[TEXT_ATTRIBUTE])[i].get("pattern", {}) else: return {} @@ -586,7 +584,7 @@ def __get_dense_features(message: Message) -> Optional[List[Any]]: if features is None: return None - tokens = message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], []) + tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], []) if len(tokens) != len(features): warnings.warn( f"Number of features ({len(features)}) for attribute " @@ -617,7 +615,7 @@ def _from_text_to_crf( if self.pos_features: tokens = message.get(SPACY_DOCS[TEXT_ATTRIBUTE]) else: - tokens = message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) + tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) text_dense_features = self.__get_dense_features(message) diff --git a/rasa/nlu/extractors/mitie_entity_extractor.py b/rasa/nlu/extractors/mitie_entity_extractor.py index 4869d7748c8b..671b312e499f 100644 --- a/rasa/nlu/extractors/mitie_entity_extractor.py +++ b/rasa/nlu/extractors/mitie_entity_extractor.py @@ -4,7 +4,7 @@ import typing from typing import Any, Dict, List, Optional, Text -from rasa.nlu.constants import ENTITIES_ATTRIBUTE, MESSAGE_TOKENS_NAMES, TEXT_ATTRIBUTE +from rasa.nlu.constants import ENTITIES_ATTRIBUTE, TOKENS_NAMES, TEXT_ATTRIBUTE from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.extractors import EntityExtractor from rasa.nlu.model import Metadata @@ -21,11 +21,7 @@ class MitieEntityExtractor(EntityExtractor): provides = [ENTITIES_ATTRIBUTE] - requires = [ - MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], - "mitie_feature_extractor", - "mitie_file", - ] + requires = [TOKENS_NAMES[TEXT_ATTRIBUTE], "mitie_feature_extractor", "mitie_file"] def __init__(self, component_config: Optional[Dict[Text, Any]] = None, ner=None): """Construct a new intent classifier using the sklearn framework.""" @@ -97,7 +93,7 @@ def _prepare_mitie_sample(self, training_example) -> Any: import mitie text = training_example.text - tokens = training_example.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]) + tokens = training_example.get(TOKENS_NAMES[TEXT_ATTRIBUTE]) sample = mitie.ner_training_instance([t.text for t in tokens]) for ent in training_example.get(ENTITIES_ATTRIBUTE, []): try: @@ -130,7 +126,7 @@ def process(self, message: Message, **kwargs: Any) -> None: ents = self.extract_entities( message.text, - message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]), + message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]), mitie_feature_extractor, ) extracted = self.add_extractor_name(ents) diff --git 
a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index bfadb4455024..e760e7bcd13c 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -6,7 +6,7 @@ from rasa.nlu.constants import ( TEXT_ATTRIBUTE, DENSE_FEATURE_NAMES, - SPACY_FEATURIZABLE_ATTRIBUTES, + DENSE_FEATURIZABLE_ATTRIBUTES, ) import numpy as np import tensorflow as tf @@ -17,7 +17,7 @@ class ConveRTFeaturizer(Featurizer): provides = [ - DENSE_FEATURE_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES + DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES ] defaults = { @@ -99,7 +99,7 @@ def train( batch_size = 64 - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES: + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: non_empty_examples = list( filter(lambda x: x.get(attribute), training_data.training_examples) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index b0fc219a4531..9347b9296a34 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -12,7 +12,7 @@ from rasa.nlu.constants import ( TEXT_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, + TOKENS_NAMES, MESSAGE_ATTRIBUTES, DENSE_FEATURE_NAMES, CLS_TOKEN, @@ -23,7 +23,7 @@ class MitieFeaturizer(Featurizer): provides = [DENSE_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - requires = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + [ + requires = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + [ "mitie_feature_extractor" ] @@ -50,7 +50,7 @@ def ndim(self, feature_extractor: "mitie.total_word_feature_extractor"): def get_tokens_by_attribute(self, example, attribute) -> Any: - return example.get(MESSAGE_TOKENS_NAMES[attribute]) + return example.get(TOKENS_NAMES[attribute]) def train( self, training_data: TrainingData, config: RasaNLUModelConfig, **kwargs: Any @@ -82,7 +82,7 @@ def process(self, message: Message, **kwargs: Any) -> None: mitie_feature_extractor = self._mitie_feature_extractor(**kwargs) features = self.features_for_tokens( - message.get(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]), mitie_feature_extractor + message.get(TOKENS_NAMES[TEXT_ATTRIBUTE]), mitie_feature_extractor ) message.set( DENSE_FEATURE_NAMES[TEXT_ATTRIBUTE], diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 195e047cc569..3578f585ca71 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -13,8 +13,8 @@ TEXT_ATTRIBUTE, SPACY_DOCS, DENSE_FEATURE_NAMES, - SPACY_FEATURIZABLE_ATTRIBUTES, - MESSAGE_TOKENS_NAMES, + DENSE_FEATURIZABLE_ATTRIBUTES, + TOKENS_NAMES, CLS_TOKEN, ) @@ -22,12 +22,12 @@ class SpacyFeaturizer(Featurizer): provides = [ - DENSE_FEATURE_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES + DENSE_FEATURE_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES ] requires = [ - SPACY_DOCS[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES - ] + [MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES] + SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES + ] + [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES] defaults = { # if True return a 
sequence of features (return vector has size @@ -56,7 +56,7 @@ def train( ) -> None: for example in training_data.intent_examples: - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES: + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: self._set_spacy_features(example, attribute) def get_doc(self, message, attribute): @@ -71,7 +71,7 @@ def _set_spacy_features(self, message, attribute=TEXT_ATTRIBUTE): """Adds the spacy word vectors to the messages features.""" message_attribute_doc = self.get_doc(message, attribute) - tokens = message.get(MESSAGE_TOKENS_NAMES[attribute]) + tokens = message.get(TOKENS_NAMES[attribute]) cls_token_used = tokens[-1].text == CLS_TOKEN if tokens else False if message_attribute_doc is not None: diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 07f0d5b1f38e..53097c2f47da 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -12,11 +12,11 @@ from rasa.nlu.training_data import Message, TrainingData from rasa.nlu.constants import ( TEXT_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, + TOKENS_NAMES, MESSAGE_ATTRIBUTES, SPARSE_FEATURE_NAMES, INTENT_ATTRIBUTE, - SPACY_FEATURIZABLE_ATTRIBUTES, + DENSE_FEATURIZABLE_ATTRIBUTES, ) logger = logging.getLogger(__name__) @@ -35,9 +35,7 @@ class CountVectorsFeaturizer(Featurizer): provides = [SPARSE_FEATURE_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] - requires = [ - MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES - ] + requires = [TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES] defaults = { # whether to use a shared vocab @@ -190,7 +188,7 @@ def _attributes_for(analyzer): # intents should be featurized only by word level count vectorizer return ( - MESSAGE_ATTRIBUTES if analyzer == "word" else SPACY_FEATURIZABLE_ATTRIBUTES + MESSAGE_ATTRIBUTES if analyzer == "word" else DENSE_FEATURIZABLE_ATTRIBUTES ) def __init__( @@ -222,8 +220,8 @@ def _get_message_tokens_by_attribute( message: "Message", attribute: Text ) -> List[Text]: """Get text tokens of an attribute of a message""" - if message.get(MESSAGE_TOKENS_NAMES[attribute]): - return [t.lemma for t in message.get(MESSAGE_TOKENS_NAMES[attribute])] + if message.get(TOKENS_NAMES[attribute]): + return [t.lemma for t in message.get(TOKENS_NAMES[attribute])] return message.get(attribute).split() @@ -314,7 +312,7 @@ def _get_all_attributes_processed_tokens( self._get_processed_message_tokens_by_attribute(example, attribute) for example in training_data.training_examples ] - if attribute in SPACY_FEATURIZABLE_ATTRIBUTES: + if attribute in DENSE_FEATURIZABLE_ATTRIBUTES: # check for oov tokens only in text based attributes self._check_OOV_present(all_tokens) processed_attribute_tokens[attribute] = all_tokens diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 682b504b2c2f..6964e7e4fe2a 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -12,7 +12,7 @@ from rasa.nlu.training_data import Message, TrainingData import rasa.utils.io from rasa.nlu.constants import ( - MESSAGE_TOKENS_NAMES, + TOKENS_NAMES, TEXT_ATTRIBUTE, RESPONSE_ATTRIBUTE, SPARSE_FEATURE_NAMES, @@ -28,7 +28,7 @@ class RegexFeaturizer(Featurizer): provides = [SPARSE_FEATURE_NAMES[TEXT_ATTRIBUTE]] - requires 
= [MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE]] + requires = [TOKENS_NAMES[TEXT_ATTRIBUTE]] defaults = { # if True return a sequence of features (return vector has size @@ -92,7 +92,7 @@ def _features_for_patterns( regexes did match. Furthermore, if the message is tokenized, the function will mark all tokens with a dict relating the name of the regex to whether it was matched.""" - tokens = message.get(MESSAGE_TOKENS_NAMES[attribute], []) + tokens = message.get(TOKENS_NAMES[attribute], []) if self.return_sequence: seq_length = len(tokens) diff --git a/rasa/nlu/tokenizers/jieba_tokenizer.py b/rasa/nlu/tokenizers/jieba_tokenizer.py index 708fc9f2b7d1..73e6e0541ab3 100644 --- a/rasa/nlu/tokenizers/jieba_tokenizer.py +++ b/rasa/nlu/tokenizers/jieba_tokenizer.py @@ -13,7 +13,7 @@ from rasa.nlu.constants import ( INTENT_ATTRIBUTE, TEXT_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, + TOKENS_NAMES, MESSAGE_ATTRIBUTES, ) @@ -26,7 +26,7 @@ class JiebaTokenizer(Tokenizer): - provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] language_list = ["zh"] @@ -89,15 +89,14 @@ def train( if example.get(attribute) is not None: example.set( - MESSAGE_TOKENS_NAMES[attribute], + TOKENS_NAMES[attribute], self.tokenize(example.get(attribute), attribute), ) def process(self, message: Message, **kwargs: Any) -> None: message.set( - MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], - self.tokenize(message.text, TEXT_ATTRIBUTE), + TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(message.text, TEXT_ATTRIBUTE) ) def preprocess_text(self, text: Text, attribute: Text) -> Text: diff --git a/rasa/nlu/tokenizers/mitie_tokenizer.py b/rasa/nlu/tokenizers/mitie_tokenizer.py index 188000572e49..d775bddc0449 100644 --- a/rasa/nlu/tokenizers/mitie_tokenizer.py +++ b/rasa/nlu/tokenizers/mitie_tokenizer.py @@ -4,13 +4,13 @@ from rasa.nlu.tokenizers.tokenizer import Token, Tokenizer from rasa.nlu.training_data import Message, TrainingData -from rasa.nlu.constants import TEXT_ATTRIBUTE, MESSAGE_TOKENS_NAMES, MESSAGE_ATTRIBUTES +from rasa.nlu.constants import TEXT_ATTRIBUTE, TOKENS_NAMES, MESSAGE_ATTRIBUTES from rasa.utils.io import DEFAULT_ENCODING class MitieTokenizer(Tokenizer): - provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] defaults = { # add __CLS__ token to the end of the list of tokens @@ -31,13 +31,13 @@ def train( if example.get(attribute) is not None: example.set( - MESSAGE_TOKENS_NAMES[attribute], + TOKENS_NAMES[attribute], self.tokenize(example.get(attribute), attribute), ) def process(self, message: Message, **kwargs: Any) -> None: - message.set(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(message.text)) + message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(message.text)) def _token_from_offset( self, text: bytes, offset: int, encoded_sentence: bytes diff --git a/rasa/nlu/tokenizers/spacy_tokenizer.py b/rasa/nlu/tokenizers/spacy_tokenizer.py index 372dace62f9b..ed14d65f8e38 100644 --- a/rasa/nlu/tokenizers/spacy_tokenizer.py +++ b/rasa/nlu/tokenizers/spacy_tokenizer.py @@ -7,9 +7,9 @@ from rasa.nlu.constants import ( TEXT_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, + TOKENS_NAMES, SPACY_DOCS, - SPACY_FEATURIZABLE_ATTRIBUTES, + DENSE_FEATURIZABLE_ATTRIBUTES, ) if typing.TYPE_CHECKING: @@ -18,11 +18,9 @@ class SpacyTokenizer(Tokenizer): - provides = [ - MESSAGE_TOKENS_NAMES[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES - ] + provides = 
[TOKENS_NAMES[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES] - requires = [SPACY_DOCS[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES] + requires = [SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES] defaults = { # add __CLS__ token to the end of the list of tokens @@ -35,14 +33,13 @@ def train( for example in training_data.training_examples: - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES: + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: attribute_doc = self.get_doc(example, attribute) if attribute_doc is not None: example.set( - MESSAGE_TOKENS_NAMES[attribute], - self.tokenize(attribute_doc, attribute), + TOKENS_NAMES[attribute], self.tokenize(attribute_doc, attribute) ) def get_doc(self, message: Message, attribute: Text) -> "Doc": @@ -50,7 +47,7 @@ def get_doc(self, message: Message, attribute: Text) -> "Doc": def process(self, message: Message, **kwargs: Any) -> None: message.set( - MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], + TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(self.get_doc(message, TEXT_ATTRIBUTE), TEXT_ATTRIBUTE), ) diff --git a/rasa/nlu/tokenizers/whitespace_tokenizer.py b/rasa/nlu/tokenizers/whitespace_tokenizer.py index e968c549fa2d..4b2f49b15e73 100644 --- a/rasa/nlu/tokenizers/whitespace_tokenizer.py +++ b/rasa/nlu/tokenizers/whitespace_tokenizer.py @@ -7,14 +7,14 @@ from rasa.nlu.constants import ( INTENT_ATTRIBUTE, TEXT_ATTRIBUTE, - MESSAGE_TOKENS_NAMES, + TOKENS_NAMES, MESSAGE_ATTRIBUTES, ) class WhitespaceTokenizer(Tokenizer): - provides = [MESSAGE_TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] + provides = [TOKENS_NAMES[attribute] for attribute in MESSAGE_ATTRIBUTES] defaults = { # Flag to check whether to split intents @@ -46,13 +46,13 @@ def train( for attribute in MESSAGE_ATTRIBUTES: if example.get(attribute) is not None: example.set( - MESSAGE_TOKENS_NAMES[attribute], + TOKENS_NAMES[attribute], self.tokenize(example.get(attribute), attribute), ) def process(self, message: Message, **kwargs: Any) -> None: - message.set(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(message.text)) + message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], self.tokenize(message.text)) def tokenize(self, text: Text, attribute: Text = TEXT_ATTRIBUTE) -> List[Token]: diff --git a/rasa/nlu/utils/spacy_utils.py b/rasa/nlu/utils/spacy_utils.py index 35e36fe2d721..f06ff5f7459f 100644 --- a/rasa/nlu/utils/spacy_utils.py +++ b/rasa/nlu/utils/spacy_utils.py @@ -14,12 +14,12 @@ from spacy.tokens.doc import Doc # pytype: disable=import-error from rasa.nlu.model import Metadata -from rasa.nlu.constants import TEXT_ATTRIBUTE, SPACY_DOCS, SPACY_FEATURIZABLE_ATTRIBUTES +from rasa.nlu.constants import TEXT_ATTRIBUTE, SPACY_DOCS, DENSE_FEATURIZABLE_ATTRIBUTES class SpacyNLP(Component): provides = ["spacy_nlp"] + [ - SPACY_DOCS[attribute] for attribute in SPACY_FEATURIZABLE_ATTRIBUTES + SPACY_DOCS[attribute] for attribute in DENSE_FEATURIZABLE_ATTRIBUTES ] defaults = { @@ -186,7 +186,7 @@ def docs_for_training_data( self, training_data: TrainingData ) -> Dict[Text, List[Any]]: attribute_docs = {} - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES: + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: texts = [self.get_text(e, attribute) for e in training_data.intent_examples] # Index and freeze indices of the training samples for preserving the order # after processing the data. 
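As a concrete illustration of this rename, a custom component that reads tokens and spaCy docs would now use the shorter constant names. This is a minimal sketch assuming only what the diffs in this patch show; the helper function itself is illustrative and not part of the patch:

    from rasa.nlu.constants import TOKENS_NAMES, SPACY_DOCS, TEXT_ATTRIBUTE
    from rasa.nlu.training_data import Message

    def tokens_and_doc(message: Message):
        # The stored message keys ("tokens", "spacy_doc") are unchanged by this
        # patch series; only the constant names used to look them up are renamed.
        tokens = message.get(TOKENS_NAMES[TEXT_ATTRIBUTE], [])
        doc = message.get(SPACY_DOCS[TEXT_ATTRIBUTE])
        return tokens, doc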
@@ -218,7 +218,7 @@ def train( attribute_docs = self.docs_for_training_data(training_data) - for attribute in SPACY_FEATURIZABLE_ATTRIBUTES: + for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: for idx, example in enumerate(training_data.training_examples): example_attribute_doc = attribute_docs[attribute][idx] diff --git a/tests/nlu/featurizers/test_convert_featurizer.py b/tests/nlu/featurizers/test_convert_featurizer.py index e61272deddbb..66e25c4f9d4d 100644 --- a/tests/nlu/featurizers/test_convert_featurizer.py +++ b/tests/nlu/featurizers/test_convert_featurizer.py @@ -1,6 +1,6 @@ import numpy as np -from rasa.nlu.constants import TEXT_ATTRIBUTE, DENSE_FEATURE_NAMES, MESSAGE_TOKENS_NAMES +from rasa.nlu.constants import TEXT_ATTRIBUTE, DENSE_FEATURE_NAMES, TOKENS_NAMES from rasa.nlu.training_data import Message from rasa.nlu.tokenizers.whitespace_tokenizer import WhitespaceTokenizer from rasa.nlu.config import RasaNLUModelConfig @@ -17,7 +17,7 @@ def test_convert_featurizer(mitie_feature_extractor, default_config): sentence = "Hey how are you today ?" message = Message(sentence) tokens = WhitespaceTokenizer().tokenize(sentence) - message.set(MESSAGE_TOKENS_NAMES[TEXT_ATTRIBUTE], tokens) + message.set(TOKENS_NAMES[TEXT_ATTRIBUTE], tokens) featurizer.process(message) From f9b4f82c4d28778d9e79ff5b608688a6b2179d1e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 09:42:47 +0100 Subject: [PATCH 220/239] update changelog entry --- changelog/4935.feature.rst | 16 +++++++++------- changelog/4935.removal.rst | 8 ++++++++ changelog/4957.removal.rst | 5 +++++ .../sparse_featurizer/ngram_featurizer.py | 9 ++++----- 4 files changed, 26 insertions(+), 12 deletions(-) create mode 100644 changelog/4935.removal.rst create mode 100644 changelog/4957.removal.rst diff --git a/changelog/4935.feature.rst b/changelog/4935.feature.rst index 662c2b74e44d..e95640de7f00 100644 --- a/changelog/4935.feature.rst +++ b/changelog/4935.feature.rst @@ -1,15 +1,17 @@ +Preparation for an upcoming change in the ``EmbeddingIntentClassifier``: + Add option ``use_cls_token`` to all tokenizers. If it is set to ``True``, the token ``__CLS__`` will be added to -the end of the list of tokens. +the end of the list of tokens. Default is set to ``False``. No need to change the default value for now. Add option ``return_sequence`` to all featurizers. By default all featurizers return a matrix of size (1 x feature-dimension). If the option ``return_sequence`` is set to ``True``, the corresponding featurizer will return -a matrix of size (token-length x feature-dimension). - -Split featurizers into sparse and dense featurizers. - -Deprecate ``NGramFeaturizer``. Please use ``CountVectorsFeaturizer`` instead. +a matrix of size (token-length x feature-dimension). See https://rasa.com/docs/rasa/nlu/components/#featurizers. +Default value is set to ``False``. However, you might want to set it to ``True`` if you want to use custom features +in the ``CRFEntityExtractor``. +See https://rasa.com/docs/rasa/nlu/entity-extraction/#passing-custom-features-to-crfentityextractor. To use custom features in the ``CRFEntityExtractor`` use ``text_dense_features`` instead of ``ner_features``. If ``text_dense_features`` are present in the feature set, the ``CRFEntityExtractor`` will automatically make use of them. Just make sure to add a dense featurizer in front of the ``CRFEntityExtractor`` in your pipeline and set the -flag ``return_sequence`` to ``True`` for that featurizer. 
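The pipeline layout described in this changelog entry could look roughly like the sketch below. Component and option names are taken from this patch series; it illustrates the ordering rather than the documented configuration, and it assumes SpacyNLP and SpacyTokenizer run earlier in the pipeline:

    from rasa.nlu.featurizers.dense_featurizer.spacy_featurizer import SpacyFeaturizer
    from rasa.nlu.extractors.crf_entity_extractor import CRFEntityExtractor

    # A dense featurizer placed in front of the entity extractor; with
    # return_sequence enabled it produces one feature vector per token
    # (token-length x feature-dimension) instead of a single summary vector.
    featurizer = SpacyFeaturizer({"return_sequence": True})

    # The extractor uses "text_dense_features" automatically when they are present.
    extractor = CRFEntityExtractor()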
\ No newline at end of file +flag ``return_sequence`` to ``True`` for that featurizer. +See https://rasa.com/docs/rasa/nlu/entity-extraction/#passing-custom-features-to-crfentityextractor. \ No newline at end of file diff --git a/changelog/4935.removal.rst b/changelog/4935.removal.rst new file mode 100644 index 000000000000..dd1b98565419 --- /dev/null +++ b/changelog/4935.removal.rst @@ -0,0 +1,8 @@ +Remove ``NGramFeaturizer``. Please use ``CountVectorsFeaturizer`` instead. +The following settings match the previous ``NGramFeaturizer``: +- name: 'CountVectorsFeaturizer' + analyzer: 'char_wb' + min_ngram: 3 + max_ngram: 17 + max_features: 10 + min_df: 5 \ No newline at end of file diff --git a/changelog/4957.removal.rst b/changelog/4957.removal.rst new file mode 100644 index 000000000000..3ae9e979f438 --- /dev/null +++ b/changelog/4957.removal.rst @@ -0,0 +1,5 @@ +To use custom features in the ``CRFEntityExtractor`` use ``text_dense_features`` instead of ``ner_features``. If +``text_dense_features`` are present in the feature set, the ``CRFEntityExtractor`` will automatically make use of +them. Just make sure to add a dense featurizer in front of the ``CRFEntityExtractor`` in your pipeline and set the +flag ``return_sequence`` to ``True`` for that featurizer. +See https://rasa.com/docs/rasa/nlu/entity-extraction/#passing-custom-features-to-crfentityextractor. \ No newline at end of file diff --git a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py index 1cdb220042ee..8a1a438e42b5 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py @@ -12,16 +12,15 @@ class NGramFeaturizer(Featurizer): def __init__(self, component_config: Optional[Dict[Text, Any]] = None): super(NGramFeaturizer, self).__init__(component_config) - warnings.warn( - "DEPRECATION warning: Using `NGramFeaturizer` is deprecated. " + raise NotImplementedError( + "REMOVAL warning: You cannot use `NGramFeaturizer` anymore. " "Please use `CountVectorsFeaturizer` instead. The following settings" - "should match the previous `NGramFeaturizer`:" + "match the previous `NGramFeaturizer`:" "" "- name: 'CountVectorsFeaturizer'" " analyzer: 'char_wb'" " min_ngram: 3" " max_ngram: 17" " max_features: 10" - " min_df: 5", - FutureWarning, + " min_df: 5" ) From 832755e7d789d96b9701c5e7e9a13b2592f9b282 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 09:55:47 +0100 Subject: [PATCH 221/239] update docs around convert featurizer --- docs/nlu/components.rst | 4 ++-- .../featurizers/dense_featurizer/convert_featurizer.py | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 60e392adf1c0..9a8207826b20 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -161,8 +161,8 @@ ConveRTFeaturizer be used if your training data is in english language. .. note:: - To use ``ConveRTFeaturizer`` you should install ``tensorflow_text==1.15.1`` and ``tensorflow_hub==0.6.0``. - Otherwise, you can also do a pip install of Rasa with ``pip install rasa[convert]`` + To use ``ConveRTFeaturizer`` you need to install additional tensorflow libraries (``tensorflow_text`` and + ``tensorflow_hub``). You should do a pip install of Rasa with ``pip install rasa[convert]`` to install those. .. 
warning:: If you set the option ``"return_sequence"`` to ``True``, Rasa will raise an error informing you that this diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index e760e7bcd13c..4b4b9999727e 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -1,4 +1,5 @@ import logging +import warnings from rasa.nlu.featurizers.featurzier import Featurizer from typing import Any, Dict, List, Optional, Text, Tuple from rasa.nlu.config import RasaNLUModelConfig @@ -97,6 +98,14 @@ def train( **kwargs: Any, ) -> None: + if config is not None and config.language != "en": + warnings.warn( + f"Since ``ConveRT`` model is trained only on an english " + f"corpus of conversations, this featurizer should only be " + f"used if your training data is in english language. " + f"However, you are training in '{config.language}'." + ) + batch_size = 64 for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: From 4e4cef6926444297e8c32c54fdfaecfb06312eb1 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 10:02:22 +0100 Subject: [PATCH 222/239] add description to public methods in embedding intent classifier --- .../embedding_intent_classifier.py | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 5ae2c2e194ca..0fe8a3ce2a8c 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -215,7 +215,7 @@ def __init__( all_labels_embed: Optional["tf.Tensor"] = None, batch_tuple_sizes: Optional[Dict] = None, ) -> None: - """Declare instant variables with default values""" + """Declare instance variables with default values""" super().__init__(component_config) @@ -381,7 +381,7 @@ def _create_label_data( return label_data - def use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: + def _use_default_label_features(self, label_ids: np.ndarray) -> List[np.ndarray]: return [ np.array( [ @@ -441,7 +441,9 @@ def _create_session_data( "intent_features" not in session_data or not session_data["intent_features"] ): # no label features are present, get default features from _label_data - session_data["intent_features"] = self.use_default_label_features(label_ids) + session_data["intent_features"] = self._use_default_label_features( + label_ids + ) self._add_mask_to_session_data(session_data, "text_mask", "text_features") self._add_mask_to_session_data(session_data, "intent_mask", "intent_features") @@ -502,13 +504,12 @@ def _create_tf_embed_fnn( layer_name_suffix=embed_name, ) - def combine_sparse_dense_features( + def _combine_sparse_dense_features( self, features: List[Union[tf.Tensor, tf.SparseTensor]], mask: tf.Tensor, name: Text, ) -> tf.Tensor: - dense_features = [] dense_dim = self.dense_dim @@ -544,13 +545,13 @@ def _build_tf_train_graph( batch_data, _ = train_utils.batch_to_session_data(self.batch_in, session_data) label_data, _ = train_utils.batch_to_session_data(label_batch, self._label_data) - a = self.combine_sparse_dense_features( + a = self._combine_sparse_dense_features( batch_data["text_features"], batch_data["text_mask"][0], "text" ) - b = self.combine_sparse_dense_features( + b = self._combine_sparse_dense_features( batch_data["intent_features"], batch_data["intent_mask"][0], "intent" ) - all_bs = 
self.combine_sparse_dense_features( + all_bs = self._combine_sparse_dense_features( label_data["intent_features"], label_data["intent_mask"][0], "intent" ) @@ -603,10 +604,10 @@ def _build_tf_pred_graph(self, session_data: "SessionDataType") -> "tf.Tensor": self.batch_in, session_data ) - a = self.combine_sparse_dense_features( + a = self._combine_sparse_dense_features( batch_data["text_features"], batch_data["text_mask"][0], "text" ) - b = self.combine_sparse_dense_features( + b = self._combine_sparse_dense_features( batch_data["intent_features"], batch_data["intent_mask"][0], "intent" ) @@ -647,6 +648,8 @@ def _get_num_of_features(session_data: "SessionDataType", key: Text) -> int: return num_features def check_input_dimension_consistency(self, session_data: "SessionDataType"): + """Check if text features and intent features have the same dimension.""" + if self.share_hidden_layers: num_text_features = self._get_num_of_features(session_data, "text_features") num_intent_features = self._get_num_of_features( @@ -696,7 +699,7 @@ def train( cfg: Optional["RasaNLUModelConfig"] = None, **kwargs: Any, ) -> None: - """Train the embedding label classifier on a data set.""" + """Train the embedding intent classifier on a data set.""" logger.debug("Started training embedding classifier.") @@ -797,6 +800,7 @@ def _calculate_message_sim( def predict_label( self, message: "Message" ) -> Tuple[Dict[Text, Any], List[Dict[Text, Any]]]: + """Predicts the intent of the provided message.""" label = {"name": None, "confidence": 0.0} label_ranking = [] @@ -903,6 +907,7 @@ def load( cached_component: Optional["EmbeddingIntentClassifier"] = None, **kwargs: Any, ) -> "EmbeddingIntentClassifier": + """Loads the trained model from the provided directory.""" if model_dir and meta.get("file"): file_name = meta.get("file") From bb231b1054d6c4e0ad0dafc4c6c74706c0456d13 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 10:07:27 +0100 Subject: [PATCH 223/239] update train utils --- rasa/utils/train_utils.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index e939b7433595..b7a8226cdff7 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -81,19 +81,9 @@ def train_val_split( counts = np.array([label_counts[label] for label in label_ids]) - multi_values = [] - [ - multi_values.append(v[counts > 1]) - for values in session_data.values() - for v in values - ] + multi_values = [v[counts > 1] for values in session_data.values() for v in values] - solo_values = [] - [ - solo_values.append(v[counts == 1]) - for values in session_data.values() - for v in values - ] + solo_values = [v[counts == 1] for values in session_data.values() for v in values] output_values = train_test_split( *multi_values, @@ -114,6 +104,8 @@ def check_train_test_sizes( label_counts: Dict[Any, int], session_data: SessionDataType, ): + """Check whether the evaluation data set is too large or too small.""" + num_examples = get_number_of_examples(session_data) if evaluate_on_num_examples >= num_examples - len(label_counts): @@ -131,7 +123,10 @@ def check_train_test_sizes( def convert_train_test_split( output_values: List[Any], session_data: SessionDataType, solo_values: List[Any] -): +) -> Tuple[SessionDataType, SessionDataType]: + """Convert the output of sklearn.model_selection.train_test_split into train and + eval session data.""" + session_data_train = defaultdict(list) session_data_val = defaultdict(list) From 
aa3bf9d2ba89571d64b6b0a91a5902749793c9d8 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 10:12:07 +0100 Subject: [PATCH 224/239] update changelog entry --- changelog/4935.feature.rst | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/changelog/4935.feature.rst b/changelog/4935.feature.rst index e95640de7f00..33c5ee7cd50e 100644 --- a/changelog/4935.feature.rst +++ b/changelog/4935.feature.rst @@ -10,8 +10,4 @@ Default value is set to ``False``. However, you might want to set it to ``True`` in the ``CRFEntityExtractor``. See https://rasa.com/docs/rasa/nlu/entity-extraction/#passing-custom-features-to-crfentityextractor. -To use custom features in the ``CRFEntityExtractor`` use ``text_dense_features`` instead of ``ner_features``. If -``text_dense_features`` are present in the feature set, the ``CRFEntityExtractor`` will automatically make use of -them. Just make sure to add a dense featurizer in front of the ``CRFEntityExtractor`` in your pipeline and set the -flag ``return_sequence`` to ``True`` for that featurizer. -See https://rasa.com/docs/rasa/nlu/entity-extraction/#passing-custom-features-to-crfentityextractor. \ No newline at end of file +These changes are model breaking! \ No newline at end of file From 1125e115783cdad0782b597b09215ec2370b05ef Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 10:18:28 +0100 Subject: [PATCH 225/239] Update nlu component documentation. --- docs/nlu/components.rst | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 9a8207826b20..71cb8a5a331a 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -85,7 +85,7 @@ SpacyNLP Text Featurizers ---------------- -Featurizers are divided into two different categories: sparse featurizers and dense featurizers. +Text featurizers are divided into two different categories: sparse featurizers and dense featurizers. Sparse featurizers are featurizers that return feature vectors with a lot of missing values, e.g. zeros. As those feature vectors would normally take up a lot of memory, we store them as sparse features. Sparse features only store the values that are non zero and their positions in the vector. @@ -186,6 +186,7 @@ RegexFeaturizer :Requires: nothing :Type: Sparse featurizer :Description: + Creates features for entity extraction and intent classification. During training, the regex intent featurizer creates a list of `regular expressions` defined in the training data format. For each regex, a feature will be set marking whether this expression was found in the input, which will later @@ -207,6 +208,7 @@ CountVectorsFeaturizer :Requires: nothing :Type: Sparse featurizer :Description: + Creates features for intent classification and response selection. Creates bag-of-words representation of user message and label features using `sklearn's CountVectorizer `_. All tokens which consist only of digits (e.g. 123 and 99 but not a123d) will be assigned to the same feature. 
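
The changelog and documentation entries above introduce two new options, ``use_cls_token`` on the tokenizers and ``return_sequence`` on the featurizers, together with the ``text_dense_features`` mechanism for passing custom features to the ``CRFEntityExtractor``. The sketch below shows how a pipeline might combine them. The component names are the ones used throughout these patches, and the option names are taken from the changelog text; treat the exact spelling and placement as illustrative rather than as a released configuration schema.

.. code-block:: yaml

    language: "en"

    pipeline:
      - name: "SpacyNLP"
      - name: "SpacyTokenizer"
        # defaults to False; the changelog advises keeping the default for now
        use_cls_token: False
      - name: "SpacyFeaturizer"
        # dense featurizer placed in front of CRFEntityExtractor; returning a
        # sequence makes its output usable as per-token text_dense_features
        return_sequence: True
      - name: "CRFEntityExtractor"

With ``return_sequence`` set to ``True`` the featurizer returns a matrix of size (token-length x feature-dimension) instead of a single entry for the complete utterance, which is what the ``CRFEntityExtractor`` needs in order to make use of ``text_dense_features`` automatically.
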
From 9628eb25d03769d2a7d6d85a27e4d3e7070bdb6f Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 10:49:28 +0100 Subject: [PATCH 226/239] fix spelling mistakes --- docs/nlu/components.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 71cb8a5a331a..3557f4c12ff9 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -89,14 +89,14 @@ Text featurizers are divided into two different categories: sparse featurizers a Sparse featurizers are featurizers that return feature vectors with a lot of missing values, e.g. zeros. As those feature vectors would normally take up a lot of memory, we store them as sparse features. Sparse features only store the values that are non zero and their positions in the vector. -Thus, we save a lot of memroy and are able to train on larger dataset. +Thus, we save a lot of memroy and are able to train on larger datasets. By default all featurizers will return a matrix of length (1 x feature-dimension). All featurizer (except the ``ConveRTFeaturizer``) have the option to return a sequence instead. In case the flag ``"return_sequence"`` is set to ``True`` the returned matrix of a featurizer will have the size (token-length x feature-dimension). So, the returned matrix will have an entry for every token. -Otherwise, the matrix will just have on entry for the complete utterance. +Otherwise, the matrix will just have one entry for the complete utterance. MitieFeaturizer From e4529c01aebaff53026786f8c13f39efd201108e Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 13:42:53 +0100 Subject: [PATCH 227/239] refactoring count vectors featurizer --- .../count_vectors_featurizer.py | 160 +++++++----------- 1 file changed, 65 insertions(+), 95 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 53097c2f47da..5a853a431cdd 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -336,15 +336,18 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): """Construct the vectorizers and train them with a shared vocab""" self.vectorizers = self._create_shared_vocab_vectorizers( - self.token_pattern, - self.strip_accents, - self.lowercase, - self.stop_words, - (self.min_ngram, self.max_ngram), - self.max_df, - self.min_df, - self.max_features, - self.analyzer, + { + "token_pattern": self.token_pattern, + "strip_accents": self.strip_accents, + "lowercase": self.lowercase, + "stop_words": self.stop_words, + "min_ngram": self.min_ngram, + "max_ngram": self.max_ngram, + "max_df": self.max_df, + "min_df": self.min_df, + "max_features": self.max_features, + "analyzer": self.analyzer, + } ) combined_cleaned_texts = [] @@ -355,7 +358,8 @@ def _train_with_shared_vocab(self, attribute_texts: Dict[Text, List[Text]]): self.vectorizers[TEXT_ATTRIBUTE].fit(combined_cleaned_texts) except ValueError: logger.warning( - "Unable to train a shared CountVectorizer. Leaving an untrained CountVectorizer" + "Unable to train a shared CountVectorizer. 
" + "Leaving an untrained CountVectorizer" ) @staticmethod @@ -366,15 +370,18 @@ def _train_with_independent_vocab(self, attribute_texts: Dict[Text, List[Text]]) """Construct the vectorizers and train them with an independent vocab""" self.vectorizers = self._create_independent_vocab_vectorizers( - self.token_pattern, - self.strip_accents, - self.lowercase, - self.stop_words, - (self.min_ngram, self.max_ngram), - self.max_df, - self.min_df, - self.max_features, - self.analyzer, + { + "token_pattern": self.token_pattern, + "strip_accents": self.strip_accents, + "lowercase": self.lowercase, + "stop_words": self.stop_words, + "min_ngram": self.min_ngram, + "max_ngram": self.max_ngram, + "max_df": self.max_df, + "min_df": self.min_df, + "max_features": self.max_features, + "analyzer": self.analyzer, + } ) for attribute in self._attributes: @@ -537,7 +544,8 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]] featurizer_file = os.path.join(model_dir, file_name) if self.use_shared_vocab: - # Only persist vocabulary from one attribute. Can be loaded and distributed to all attributes. + # Only persist vocabulary from one attribute. Can be loaded and + # distributed to all attributes. vocab = attribute_vocabularies[TEXT_ATTRIBUTE] else: vocab = attribute_vocabularies @@ -548,72 +556,52 @@ def persist(self, file_name: Text, model_dir: Text) -> Optional[Dict[Text, Any]] @classmethod def _create_shared_vocab_vectorizers( - cls, - token_pattern, - strip_accents, - lowercase, - stop_words, - ngram_range, - max_df, - min_df, - max_features, - analyzer, - vocabulary=None, + cls, parameters: Dict[Text, Any], vocabulary: Optional[Any] = None ) -> Dict[Text, "CountVectorizer"]: """Create vectorizers for all attributes with shared vocabulary""" shared_vectorizer = CountVectorizer( - token_pattern=token_pattern, - strip_accents=strip_accents, - lowercase=lowercase, - stop_words=stop_words, - ngram_range=ngram_range, - max_df=max_df, - min_df=min_df, - max_features=max_features, - analyzer=analyzer, + token_pattern=parameters["token_pattern"], + strip_accents=parameters["strip_accents"], + lowercase=parameters["lowercase"], + stop_words=parameters["stop_words"], + ngram_range=(parameters["min_ngram"], parameters["max_ngram"]), + max_df=parameters["max_df"], + min_df=parameters["min_df"], + max_features=parameters["max_features"], + analyzer=parameters["analyzer"], vocabulary=vocabulary, ) attribute_vectorizers = {} - for attribute in cls._attributes_for(analyzer): + for attribute in cls._attributes_for(parameters["analyzer"]): attribute_vectorizers[attribute] = shared_vectorizer return attribute_vectorizers @classmethod def _create_independent_vocab_vectorizers( - cls, - token_pattern, - strip_accents, - lowercase, - stop_words, - ngram_range, - max_df, - min_df, - max_features, - analyzer, - vocabulary=None, + cls, parameters: Dict[Text, Any], vocabulary: Optional[Any] = None ) -> Dict[Text, "CountVectorizer"]: """Create vectorizers for all attributes with independent vocabulary""" attribute_vectorizers = {} - for attribute in cls._attributes_for(analyzer): + for attribute in cls._attributes_for(parameters["analyzer"]): attribute_vocabulary = vocabulary[attribute] if vocabulary else None attribute_vectorizer = CountVectorizer( - token_pattern=token_pattern, - strip_accents=strip_accents, - lowercase=lowercase, - stop_words=stop_words, - ngram_range=ngram_range, - max_df=max_df, - min_df=min_df, - max_features=max_features, - analyzer=analyzer, + 
token_pattern=parameters["token_pattern"], + strip_accents=parameters["strip_accents"], + lowercase=parameters["lowercase"], + stop_words=parameters["stop_words"], + ngram_range=(parameters["min_ngram"], parameters["max_ngram"]), + max_df=parameters["max_df"], + min_df=parameters["min_df"], + max_features=parameters["max_features"], + analyzer=parameters["analyzer"], vocabulary=attribute_vocabulary, ) attribute_vectorizers[attribute] = attribute_vectorizer @@ -633,38 +621,20 @@ def load( file_name = meta.get("file") featurizer_file = os.path.join(model_dir, file_name) - if os.path.exists(featurizer_file): - vocabulary = utils.json_unpickle(featurizer_file) - - share_vocabulary = meta["use_shared_vocab"] - - if share_vocabulary: - vectorizers = cls._create_shared_vocab_vectorizers( - token_pattern=meta["token_pattern"], - strip_accents=meta["strip_accents"], - lowercase=meta["lowercase"], - stop_words=meta["stop_words"], - ngram_range=(meta["min_ngram"], meta["max_ngram"]), - max_df=meta["max_df"], - min_df=meta["min_df"], - max_features=meta["max_features"], - analyzer=meta["analyzer"], - vocabulary=vocabulary, - ) - else: - vectorizers = cls._create_independent_vocab_vectorizers( - token_pattern=meta["token_pattern"], - strip_accents=meta["strip_accents"], - lowercase=meta["lowercase"], - stop_words=meta["stop_words"], - ngram_range=(meta["min_ngram"], meta["max_ngram"]), - max_df=meta["max_df"], - min_df=meta["min_df"], - max_features=meta["max_features"], - analyzer=meta["analyzer"], - vocabulary=vocabulary, - ) + if not os.path.exists(featurizer_file): + return cls(meta) - return cls(meta, vectorizers) + vocabulary = utils.json_unpickle(featurizer_file) + + share_vocabulary = meta["use_shared_vocab"] + + if share_vocabulary: + vectorizers = cls._create_shared_vocab_vectorizers( + meta, vocabulary=vocabulary + ) else: - return cls(meta) + vectorizers = cls._create_independent_vocab_vectorizers( + meta, vocabulary=vocabulary + ) + + return cls(meta, vectorizers) From a366b77802c32028ca3d17865a8e226d4d7bdd48 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 14:40:16 +0100 Subject: [PATCH 228/239] compute default intent features as dense features --- rasa/nlu/classifiers/embedding_intent_classifier.py | 6 ++---- tests/nlu/classifiers/test_embedding_intent_classifier.py | 6 ++---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index 0fe8a3ce2a8c..f18382288bbc 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -335,10 +335,8 @@ def _compute_default_label_features( return [ np.array( [ - scipy.sparse.coo_matrix( - ([1], ([0], [idx])), shape=(1, len(labels_example)) - ) - for idx in range(len(labels_example)) + np.expand_dims(a, 0) + for a in np.eye(len(labels_example), dtype=np.float32) ] ) ] diff --git a/tests/nlu/classifiers/test_embedding_intent_classifier.py b/tests/nlu/classifiers/test_embedding_intent_classifier.py index 435bcfa02ae5..1c2c1edc237f 100644 --- a/tests/nlu/classifiers/test_embedding_intent_classifier.py +++ b/tests/nlu/classifiers/test_embedding_intent_classifier.py @@ -24,11 +24,9 @@ def test_compute_default_label_features(): output = output[0] - assert output.size == len(label_features) for i, o in enumerate(output): - assert isinstance(o, scipy.sparse.coo_matrix) - assert o.data[0] == 1 - assert o.col[0] == i + assert isinstance(o, np.ndarray) + assert 
o[0][i] == 1 assert o.shape == (1, len(label_features)) From 47095d1bf75ac2f244e76b6c3733cda8adb63fcb Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Thu, 12 Dec 2019 14:48:01 +0100 Subject: [PATCH 229/239] use different dense dim default value for intents --- rasa/nlu/classifiers/embedding_intent_classifier.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/nlu/classifiers/embedding_intent_classifier.py b/rasa/nlu/classifiers/embedding_intent_classifier.py index f18382288bbc..ccaa6cf4ba26 100644 --- a/rasa/nlu/classifiers/embedding_intent_classifier.py +++ b/rasa/nlu/classifiers/embedding_intent_classifier.py @@ -79,7 +79,7 @@ class EmbeddingIntentClassifier(Component): "random_seed": None, # embedding parameters # default dense dimension used if no dense features are present - "dense_dim": 512, + "dense_dim": {"text": 512, "intent": 20}, # dimension size of embedding vectors "embed_dim": 20, # the type of the similarity @@ -510,7 +510,7 @@ def _combine_sparse_dense_features( ) -> tf.Tensor: dense_features = [] - dense_dim = self.dense_dim + dense_dim = self.dense_dim[name] # if dense features are present use the feature dimension of the dense features for f in features: if not isinstance(f, tf.SparseTensor): From 2df3b367d1f784ab554504dfdfebdbc46d83adc0 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 16 Dec 2019 11:14:49 +0100 Subject: [PATCH 230/239] update model version --- rasa/constants.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/constants.py b/rasa/constants.py index a270ab695e18..7b88ca8ae8ba 100644 --- a/rasa/constants.py +++ b/rasa/constants.py @@ -33,7 +33,7 @@ CONFIG_MANDATORY_KEYS_NLU = ["language", "pipeline"] CONFIG_MANDATORY_KEYS = CONFIG_MANDATORY_KEYS_CORE + CONFIG_MANDATORY_KEYS_NLU -MINIMUM_COMPATIBLE_VERSION = "1.3.0a2" +MINIMUM_COMPATIBLE_VERSION = "1.6.0a1" GLOBAL_USER_CONFIG_PATH = os.path.expanduser("~/.config/rasa/global.yml") From ec2cb581c47d4db6590005175dd2a58d42a75204 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 16 Dec 2019 12:43:24 +0100 Subject: [PATCH 231/239] update changelog --- changelog/4935.feature.rst | 4 +++- changelog/4935.removal.rst | 18 +++++++++++------- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/changelog/4935.feature.rst b/changelog/4935.feature.rst index 33c5ee7cd50e..13a62e090661 100644 --- a/changelog/4935.feature.rst +++ b/changelog/4935.feature.rst @@ -10,4 +10,6 @@ Default value is set to ``False``. However, you might want to set it to ``True`` in the ``CRFEntityExtractor``. See https://rasa.com/docs/rasa/nlu/entity-extraction/#passing-custom-features-to-crfentityextractor. -These changes are model breaking! \ No newline at end of file +.. warning:: + + These changes break model compatibility. You will need to retrain your old models! \ No newline at end of file diff --git a/changelog/4935.removal.rst b/changelog/4935.removal.rst index dd1b98565419..b84cffc4666d 100644 --- a/changelog/4935.removal.rst +++ b/changelog/4935.removal.rst @@ -1,8 +1,12 @@ -Remove ``NGramFeaturizer``. Please use ``CountVectorsFeaturizer`` instead. +Removed ``ner_features`` as a feature name from ``CRFEntityExtractor``, use ``text_dense_features`` instead. If + The following settings match the previous ``NGramFeaturizer``: -- name: 'CountVectorsFeaturizer' - analyzer: 'char_wb' - min_ngram: 3 - max_ngram: 17 - max_features: 10 - min_df: 5 \ No newline at end of file + +.. 
code-block:: yaml + + - name: 'CountVectorsFeaturizer' + analyzer: 'char_wb' + min_ngram: 3 + max_ngram: 17 + max_features: 10 + min_df: 5 \ No newline at end of file From 2f148f35a3058874c6dc42859bc05435389630ee Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 16 Dec 2019 12:43:37 +0100 Subject: [PATCH 232/239] increase version to 1.6.0a2 --- rasa/constants.py | 2 +- rasa/version.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/rasa/constants.py b/rasa/constants.py index 7b88ca8ae8ba..5b4f28dbd103 100644 --- a/rasa/constants.py +++ b/rasa/constants.py @@ -33,7 +33,7 @@ CONFIG_MANDATORY_KEYS_NLU = ["language", "pipeline"] CONFIG_MANDATORY_KEYS = CONFIG_MANDATORY_KEYS_CORE + CONFIG_MANDATORY_KEYS_NLU -MINIMUM_COMPATIBLE_VERSION = "1.6.0a1" +MINIMUM_COMPATIBLE_VERSION = "1.6.0a2" GLOBAL_USER_CONFIG_PATH = os.path.expanduser("~/.config/rasa/global.yml") diff --git a/rasa/version.py b/rasa/version.py index 12171a743b46..a4a26281c38e 100644 --- a/rasa/version.py +++ b/rasa/version.py @@ -1,3 +1,3 @@ # this file will automatically be changed, # do not add anything but the version number here! -__version__ = "1.6.0a1" +__version__ = "1.6.0a2" From 8ba153aa8c5a2182c19f59942d2116e6ca55801a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 16 Dec 2019 12:50:59 +0100 Subject: [PATCH 233/239] update documentation --- docs/nlu/components.rst | 4 +++- docs/nlu/entity-extraction.rst | 2 ++ rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py | 6 ------ 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/docs/nlu/components.rst b/docs/nlu/components.rst index 3557f4c12ff9..7ebebcd615e9 100644 --- a/docs/nlu/components.rst +++ b/docs/nlu/components.rst @@ -93,10 +93,12 @@ Thus, we save a lot of memroy and are able to train on larger datasets. By default all featurizers will return a matrix of length (1 x feature-dimension). All featurizer (except the ``ConveRTFeaturizer``) have the option to return a sequence instead. -In case the flag ``"return_sequence"`` is set to ``True`` the returned matrix of a featurizer will have the size +In case the flag ``"return_sequence"`` is set to ``True``, the returned matrix of the featurizer will have the size (token-length x feature-dimension). So, the returned matrix will have an entry for every token. Otherwise, the matrix will just have one entry for the complete utterance. +If you want to use custom features for your ``CRFEntityExtractor``, you should set ``"return_sequence"`` to ``True``. +For more details, take a look :ref:`entity-extraction-custom-features`. MitieFeaturizer diff --git a/docs/nlu/entity-extraction.rst b/docs/nlu/entity-extraction.rst index c76305d5e0e1..966a333178ef 100644 --- a/docs/nlu/entity-extraction.rst +++ b/docs/nlu/entity-extraction.rst @@ -151,6 +151,8 @@ If you just want to match regular expressions exactly, you can do this in your c as a postprocessing step after receiving the response from Rasa NLU. +.. _entity-extraction-custom-features: + Passing Custom Features to ``CRFEntityExtractor`` ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py index 4b4b9999727e..f1bbaf19b748 100644 --- a/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/convert_featurizer.py @@ -64,12 +64,6 @@ def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: f"'return_sequence' set to False." 
) - logger.debug( - f"ConveRTFeaturizer always returns a feature vector of size " - f"(1 x feature-dimensions). If you use any other featurizer with " - f"'return_sequence' equal to True, training will fail." - ) - @classmethod def required_packages(cls) -> List[Text]: return ["tensorflow_text", "tensorflow_hub"] From a79916c18302e6c4b1616a24f2bfbcc6ab63bd22 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 16 Dec 2019 12:57:11 +0100 Subject: [PATCH 234/239] review comments --- .../count_vectors_featurizer.py | 32 ++++++++++--------- .../sparse_featurizer/regex_featurizer.py | 6 ++-- rasa/nlu/tokenizers/tokenizer.py | 10 +++--- rasa/utils/train_utils.py | 4 +-- 4 files changed, 27 insertions(+), 25 deletions(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 5a853a431cdd..80a8fe72156a 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -285,21 +285,23 @@ def _get_processed_message_tokens_by_attribute( # noinspection PyPep8Naming def _check_OOV_present(self, all_tokens: List[List[Text]]): """Check if an OOV word is present""" - if self.OOV_token and not self.OOV_words and all_tokens: - for tokens in all_tokens: - for text in tokens: - if self.OOV_token in text or ( - self.lowercase and self.OOV_token in text.lower() - ): - return - - if any(text for tokens in all_tokens for text in tokens): - # if there is some text in tokens, warn if there is no oov token - logger.warning( - f"OOV_token='{self.OOV_token}' was given, but it is not present " - "in the training data. All unseen words " - "will be ignored during prediction." - ) + if not self.OOV_token or self.OOV_words or not all_tokens: + return + + for tokens in all_tokens: + for text in tokens: + if self.OOV_token in text or ( + self.lowercase and self.OOV_token in text.lower() + ): + return + + if any(text for tokens in all_tokens for text in tokens): + # if there is some text in tokens, warn if there is no oov token + logger.warning( + f"OOV_token='{self.OOV_token}' was given, but it is not present " + "in the training data. All unseen words " + "will be ignored during prediction." 
+ ) def _get_all_attributes_processed_tokens( self, training_data: "TrainingData" diff --git a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py index 1b7a94e9b691..c3c6cd668fce 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/regex_featurizer.py @@ -42,9 +42,9 @@ class RegexFeaturizer(Featurizer): def __init__( self, - component_config: Dict[Text, Any] = None, - known_patterns: List[Dict[Text, Text]] = None, - lookup_tables: List[Dict[Text, Union[Text, List]]] = None, + component_config: Optional[Dict[Text, Any]] = None, + known_patterns: Optional[List[Dict[Text, Text]]] = None, + lookup_tables: Optional[List[Dict[Text, Union[Text, List]]]] = None, ) -> None: super().__init__(component_config) diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index fc63dfebc207..620f791d8e9a 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -15,20 +15,20 @@ def __init__( offset: int, data: Optional[Dict[Text, Any]] = None, lemma: Optional[Text] = None, - ): + ) -> None: self.offset = offset self.text = text self.end = offset + len(text) self.data = data if data else {} self.lemma = lemma or text - def set(self, prop: Text, info: Any): + def set(self, prop: Text, info: Any) -> None: self.data[prop] = info - def get(self, prop: Text, default: Optional[Any] = None): + def get(self, prop: Text, default: Optional[Any] = None) -> Any: return self.data.get(prop, default) - def __eq__(self, other): + def __eq__(self, other) -> bool: if not isinstance(other, Token): return NotImplemented return (self.offset, self.end, self.text, self.lemma) == ( @@ -38,7 +38,7 @@ def __eq__(self, other): other.lemma, ) - def __lt__(self, other): + def __lt__(self, other) -> bool: if not isinstance(other, Token): return NotImplemented return (self.offset, self.end, self.text, self.lemma) < ( diff --git a/rasa/utils/train_utils.py b/rasa/utils/train_utils.py index b7a8226cdff7..b61c6ad6a2e3 100644 --- a/rasa/utils/train_utils.py +++ b/rasa/utils/train_utils.py @@ -276,7 +276,7 @@ def balance_session_data( return final_session_data -def get_number_of_examples(session_data: SessionDataType): +def get_number_of_examples(session_data: SessionDataType) -> int: """Obtain number of examples in session data. Raise a ValueError if number of examples differ for different data in session data. 
@@ -325,7 +325,7 @@ def prepare_batch( session_data: SessionDataType, start: Optional[int] = None, end: Optional[int] = None, - tuple_sizes: Dict[Text, int] = None, + tuple_sizes: Optional[Dict[Text, int]] = None, ) -> Tuple[Optional[np.ndarray]]: """Slices session data into batch using given start and end value.""" From 5ef7b806a8d95af524bbb011bdf45889dd525dd6 Mon Sep 17 00:00:00 2001 From: Tanja Date: Mon, 16 Dec 2019 12:58:25 +0100 Subject: [PATCH 235/239] Update rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py Co-Authored-By: Tom Bocklisch --- rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py index 8a1a438e42b5..3d4414fe488d 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/ngram_featurizer.py @@ -9,7 +9,7 @@ class NGramFeaturizer(Featurizer): - def __init__(self, component_config: Optional[Dict[Text, Any]] = None): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: super(NGramFeaturizer, self).__init__(component_config) raise NotImplementedError( From bb44fd6596499bf5bb1843b160c4d26c8f8e2c1a Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 16 Dec 2019 13:04:00 +0100 Subject: [PATCH 236/239] add missing types --- .../featurizers/dense_featurizer/mitie_featurizer.py | 9 ++++----- .../featurizers/dense_featurizer/spacy_featurizer.py | 6 +++--- .../sparse_featurizer/count_vectors_featurizer.py | 11 +++++++---- 3 files changed, 14 insertions(+), 12 deletions(-) diff --git a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py index 9347b9296a34..79993427f167 100644 --- a/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/mitie_featurizer.py @@ -1,6 +1,6 @@ import numpy as np import typing -from typing import Any, List, Text, Dict +from typing import Any, List, Text, Dict, Optional from rasa.nlu.config import RasaNLUModelConfig from rasa.nlu.featurizers.featurzier import Featurizer @@ -34,7 +34,7 @@ class MitieFeaturizer(Featurizer): "return_sequence": False } - def __init__(self, component_config: Dict[Text, Any] = None): + def __init__(self, component_config: Optional[Dict[Text, Any]] = None) -> None: super().__init__(component_config) @@ -44,12 +44,11 @@ def __init__(self, component_config: Dict[Text, Any] = None): def required_packages(cls) -> List[Text]: return ["mitie", "numpy"] - def ndim(self, feature_extractor: "mitie.total_word_feature_extractor"): + def ndim(self, feature_extractor: "mitie.total_word_feature_extractor") -> int: return feature_extractor.num_dimensions - def get_tokens_by_attribute(self, example, attribute) -> Any: - + def get_tokens_by_attribute(self, example: Message, attribute: Text) -> Any: return example.get(TOKENS_NAMES[attribute]) def train( diff --git a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py index 3578f585ca71..8ede3e4e73ff 100644 --- a/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py +++ b/rasa/nlu/featurizers/dense_featurizer/spacy_featurizer.py @@ -36,7 +36,7 @@ class SpacyFeaturizer(Featurizer): "return_sequence": False } - def __init__(self, component_config: Dict[Text, Any] = None): + def __init__(self, component_config: Optional[Dict[Text, Any]] 
= None) -> None: super().__init__(component_config) @@ -59,7 +59,7 @@ def train( for attribute in DENSE_FEATURIZABLE_ATTRIBUTES: self._set_spacy_features(example, attribute) - def get_doc(self, message, attribute): + def get_doc(self, message: Message, attribute: Text) -> Any: return message.get(SPACY_DOCS[attribute]) @@ -67,7 +67,7 @@ def process(self, message: Message, **kwargs: Any) -> None: self._set_spacy_features(message) - def _set_spacy_features(self, message, attribute=TEXT_ATTRIBUTE): + def _set_spacy_features(self, message: Message, attribute: Text = TEXT_ATTRIBUTE): """Adds the spacy word vectors to the messages features.""" message_attribute_doc = self.get_doc(message, attribute) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 80a8fe72156a..cfdb0275e149 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -183,7 +183,7 @@ def _check_analyzer(self) -> None: ) @staticmethod - def _attributes_for(analyzer): + def _attributes_for(analyzer: Text) -> List[Text]: """Create a list of attributes that should be featurized.""" # intents should be featurized only by word level count vectorizer @@ -447,7 +447,10 @@ def _set_attribute_features( ) def train( - self, training_data: TrainingData, cfg: RasaNLUModelConfig = None, **kwargs: Any + self, + training_data: TrainingData, + cfg: Optional[RasaNLUModelConfig] = None, + **kwargs: Any, ) -> None: """Train the featurizer. @@ -614,8 +617,8 @@ def _create_independent_vocab_vectorizers( def load( cls, meta: Dict[Text, Any], - model_dir: Text = None, - model_metadata: Metadata = None, + model_dir: Optional[Text] = None, + model_metadata: Optional[Metadata] = None, cached_component: Optional["CountVectorsFeaturizer"] = None, **kwargs: Any, ) -> "CountVectorsFeaturizer": From 3032fc497ad1210db67f1b6c8a1028cebd1e98c9 Mon Sep 17 00:00:00 2001 From: Tanja Date: Mon, 16 Dec 2019 13:05:54 +0100 Subject: [PATCH 237/239] Update rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py Co-Authored-By: Tom Bocklisch --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index cfdb0275e149..696a0727d7e1 100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -283,7 +283,7 @@ def _get_processed_message_tokens_by_attribute( return tokens # noinspection PyPep8Naming - def _check_OOV_present(self, all_tokens: List[List[Text]]): + def _check_OOV_present(self, all_tokens: List[List[Text]]) -> None: """Check if an OOV word is present""" if not self.OOV_token or self.OOV_words or not all_tokens: return From e1eade1bdb0ab60af497a555658d6b3d0d8a7119 Mon Sep 17 00:00:00 2001 From: Tanja Date: Mon, 16 Dec 2019 13:06:07 +0100 Subject: [PATCH 238/239] Update rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py Co-Authored-By: Tom Bocklisch --- .../featurizers/sparse_featurizer/count_vectors_featurizer.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py index 696a0727d7e1..cdaab57e4f20 
100644 --- a/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py +++ b/rasa/nlu/featurizers/sparse_featurizer/count_vectors_featurizer.py @@ -435,7 +435,7 @@ def _get_featurized_attribute( def _set_attribute_features( self, attribute: Text, attribute_features: List, training_data: "TrainingData" - ): + ) -> None: """Set computed features of the attribute to corresponding message objects""" for i, example in enumerate(training_data.training_examples): # create bag for each example From ad308278b6ae7af08d1188f8d3b52b7af2f37d01 Mon Sep 17 00:00:00 2001 From: Tanja Bergmann Date: Mon, 16 Dec 2019 14:44:42 +0100 Subject: [PATCH 239/239] fix types --- rasa/nlu/tokenizers/tokenizer.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/rasa/nlu/tokenizers/tokenizer.py b/rasa/nlu/tokenizers/tokenizer.py index 620f791d8e9a..e5505a531ccd 100644 --- a/rasa/nlu/tokenizers/tokenizer.py +++ b/rasa/nlu/tokenizers/tokenizer.py @@ -28,9 +28,9 @@ def set(self, prop: Text, info: Any) -> None: def get(self, prop: Text, default: Optional[Any] = None) -> Any: return self.data.get(prop, default) - def __eq__(self, other) -> bool: + def __eq__(self, other): if not isinstance(other, Token): - return NotImplemented + return NotImplementedError return (self.offset, self.end, self.text, self.lemma) == ( other.offset, other.end, @@ -38,9 +38,9 @@ def __eq__(self, other) -> bool: other.lemma, ) - def __lt__(self, other) -> bool: + def __lt__(self, other): if not isinstance(other, Token): - return NotImplemented + return NotImplementedError return (self.offset, self.end, self.text, self.lemma) < ( other.offset, other.end,