nlu_sparse.py
import logging
from typing import Any, Dict, List, Text, Type

from joblib import dump, load
from sklearn.feature_extraction.text import TfidfVectorizer

from rasa.engine.graph import ExecutionContext, GraphComponent
from rasa.engine.recipes.default_recipe import DefaultV1Recipe
from rasa.engine.storage.resource import Resource
from rasa.engine.storage.storage import ModelStorage
from rasa.nlu.constants import (
    DENSE_FEATURIZABLE_ATTRIBUTES,
    FEATURIZER_CLASS_ALIAS,
)
from rasa.nlu.featurizers.sparse_featurizer.sparse_featurizer import SparseFeaturizer
from rasa.nlu.tokenizers.tokenizer import Tokenizer
from rasa.shared.nlu.constants import (
    TEXT,
    TEXT_TOKENS,
    FEATURE_TYPE_SENTENCE,
    FEATURE_TYPE_SEQUENCE,
)
from rasa.shared.nlu.training_data.features import Features
from rasa.shared.nlu.training_data.message import Message
from rasa.shared.nlu.training_data.training_data import TrainingData

logger = logging.getLogger(__name__)


@DefaultV1Recipe.register(
    DefaultV1Recipe.ComponentType.MESSAGE_FEATURIZER, is_trainable=True
)
class TfIdfFeaturizer(SparseFeaturizer, GraphComponent):
    """Sparse featurizer that wraps scikit-learn's TfidfVectorizer."""

    @classmethod
    def required_components(cls) -> List[Type]:
        """Components that should be included in the pipeline before this component."""
        return [Tokenizer]

    @staticmethod
    def required_packages() -> List[Text]:
        """Any extra python dependencies required for this component to run."""
        return ["sklearn"]

    @staticmethod
    def get_default_config() -> Dict[Text, Any]:
        """Returns the component's default config."""
        return {
            **SparseFeaturizer.get_default_config(),
            # `analyzer` and the ngram range are passed straight through to
            # sklearn's TfidfVectorizer in `__init__`.
            "analyzer": "word",
            "min_ngram": 1,
            "max_ngram": 1,
        }
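
    # The defaults above can be overridden from `config.yml`. Example entry
    # (an assumption for illustration: this file lives at the project root as
    # `nlu_sparse.py`, so Rasa can import the component by module path):
    #
    #   pipeline:
    #     - name: WhitespaceTokenizer
    #     - name: nlu_sparse.TfIdfFeaturizer
    #       analyzer: char_wb
    #       min_ngram: 1
    #       max_ngram: 3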

    def __init__(
        self,
        config: Dict[Text, Any],
        name: Text,
        model_storage: ModelStorage,
        resource: Resource,
    ) -> None:
        """Constructs a new tf/idf vectorizer using the sklearn framework."""
        super().__init__(name, config)
        # Initialize the tfidf sklearn component.
        self.tfm = TfidfVectorizer(
            analyzer=config["analyzer"],
            ngram_range=(config["min_ngram"], config["max_ngram"]),
        )
        # We need to use these later when saving the trained component.
        self._model_storage = model_storage
        self._resource = resource

    def train(self, training_data: TrainingData) -> Resource:
        """Trains the component from training data."""
        texts = [e.get(TEXT) for e in training_data.training_examples if e.get(TEXT)]
        self.tfm.fit(texts)
        # Persist the fitted vectorizer so that `load` can restore it later.
        self.persist()
        return self._resource

    @classmethod
    def create(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> GraphComponent:
        """Creates a new untrained component (see parent class for full docstring)."""
        return cls(config, execution_context.node_name, model_storage, resource)

    def _set_features(self, message: Message, attribute: Text = TEXT) -> None:
        """Sets the features on a single message. Utility method."""
        tokens = message.get(TEXT_TOKENS)

        # If the message doesn't have tokens, we can't create features.
        if not tokens:
            return None

        # Make a distinction between sentence and sequence features: the
        # sentence feature is one vector for the whole text, the sequence
        # feature is one vector per token.
        text_vector = self.tfm.transform([message.get(TEXT)])
        word_vectors = self.tfm.transform([t.text for t in tokens])

        final_sequence_features = Features(
            word_vectors,
            FEATURE_TYPE_SEQUENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sequence_features)
        final_sentence_features = Features(
            text_vector,
            FEATURE_TYPE_SENTENCE,
            attribute,
            self._config[FEATURIZER_CLASS_ALIAS],
        )
        message.add_features(final_sentence_features)
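
    # Shape intuition (illustrative, assuming a fitted vocabulary of size V):
    # a message with 3 tokens yields a (3, V) sparse sequence matrix and a
    # (1, V) sparse sentence matrix for the same text.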

    def process(self, messages: List[Message]) -> List[Message]:
        """Processes incoming messages and computes and sets features."""
        for message in messages:
            for attribute in DENSE_FEATURIZABLE_ATTRIBUTES:
                self._set_features(message, attribute)
        return messages

    def process_training_data(self, training_data: TrainingData) -> TrainingData:
        """Processes the training examples in the given training data in-place."""
        self.process(training_data.training_examples)
        return training_data

    def persist(self) -> None:
        """Persists the trained vectorizer to the model storage."""
        # `ModelStorage` decides the actual on-disk location; the component
        # only sees `model_dir` inside the `write_to` context manager.
        with self._model_storage.write_to(self._resource) as model_dir:
            dump(self.tfm, model_dir / "tfidfvectorizer.joblib")

    @classmethod
    def load(
        cls,
        config: Dict[Text, Any],
        model_storage: ModelStorage,
        resource: Resource,
        execution_context: ExecutionContext,
    ) -> GraphComponent:
        """Loads trained component from disk."""
        # Create the component up front so there is always a valid (if
        # untrained) instance to return; the original code only assigned
        # `component` inside `try`, which raised a `NameError` on failure.
        component = cls(config, execution_context.node_name, model_storage, resource)
        try:
            with model_storage.read_from(resource) as model_dir:
                component.tfm = load(model_dir / "tfidfvectorizer.joblib")
        except (ValueError, FileNotFoundError):
            logger.debug(
                f"Couldn't load metadata for component '{cls.__name__}' as the "
                f"persisted model data couldn't be loaded."
            )
        return component

    @classmethod
    def validate_config(cls, config: Dict[Text, Any]) -> None:
        """Validates that the component is configured properly."""
        pass
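

if __name__ == "__main__":
    # Minimal standalone sketch (illustrative only; this is not how Rasa
    # invokes the component): demonstrate the sentence vs. sequence shapes
    # the featurizer produces, using plain scikit-learn, which is all this
    # demo depends on.
    vectorizer = TfidfVectorizer(analyzer="word", ngram_range=(1, 1))
    vectorizer.fit(["hello there", "how are you", "rasa is great"])
    sentence_vector = vectorizer.transform(["hello rasa"])  # shape: (1, vocab_size)
    sequence_vectors = vectorizer.transform(["hello", "rasa"])  # shape: (2, vocab_size)
    print(sentence_vector.shape, sequence_vectors.shape)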