- prediction and updating proof of concept with torch
- ranking now works as independent scoring of candidates, which prevents the deadlocks that pairwise comparison of more than two candidates can produce, and also reduces complexity a lot (see the sketch below). Drawback: it is harder to decide on labels for updating, since we only have relative ratings between candidates.
jbingel committed Apr 27, 2019
1 parent 5fb4620 commit e126eba
Showing 4 changed files with 169 additions and 86 deletions.
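
The gist of the change: instead of learning pairwise preferences between candidates, the ranker now scores each candidate independently and sorts once. A minimal sketch of the idea, not the committed code; `featurizer` and `model` stand in for the `LexiFeaturizer` and `LexiScorerNet` introduced in the diff, and a lower score is assumed to mean a simpler candidate:

```python
def rank_pointwise(candidates, sentence, wb, we, featurizer, model):
    """Score each candidate independently in context, then sort once.

    Pairwise comparison needs O(n^2) comparisons and can deadlock in
    preference cycles (a > b, b > c, c > a); pointwise scoring always
    yields a total order.
    """
    scored = []
    for cand in candidates:
        # Substitute the candidate into the sentence before featurizing.
        modified = sentence[:wb] + cand + sentence[we:]
        x = featurizer.featurize(modified, wb, wb + len(cand))
        scored.append((cand, float(model(x))))
    # Lower score = simpler, matching the labels used for updating (0 = simple).
    return [cand for cand, score in sorted(scored, key=lambda t: t[1])]
```
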
10 changes: 7 additions & 3 deletions lexi/core/endpoints.py
@@ -218,7 +218,7 @@ def update_ranker(ranker, user_id, feedback, overall_rating=0):
logger.debug("Word in modified context: {} {} {} {}".format(
modified_sentence, w, original_start_offset,
original_start_offset+len(w)))
featurized_words[w] = ranker.featurizer.transform_wic(
featurized_words[w] = ranker.featurizer.featurize(
modified_sentence, original_start_offset,
original_start_offset+len(w))

@@ -227,11 +227,15 @@ def update_ranker(ranker, user_id, feedback, overall_rating=0):
difficult_words = [w for w in choices if not w == simple_word]

# add feature vectors to update batch
update_batch.append((featurized_words[simple_word], 0))
for difficult in difficult_words:
update_batch.append((featurized_words[simple_word],
featurized_words[difficult]))
update_batch.append((featurized_words[difficult], 1))
# update_batch.append((featurized_words[simple_word],
# featurized_words[difficult]))

if update_batch:
update_batch = list(zip(*update_batch))
# print(help(ranker))
ranker.update(update_batch)
ranker.save(user_id)
else:
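
How the labels come about despite the commit message's caveat: the word the user ends up with is taken as simple (label 0) and every rejected alternative as difficult (label 1). The `list(zip(*update_batch))` idiom above then transposes the list of (features, label) pairs into the parallel feature/label sequences that `ranker.update` consumes. A toy illustration with made-up two-dimensional feature vectors:

```python
update_batch = [([0.1, 0.9], 0),  # chosen word        -> simple (label 0)
                ([0.7, 0.2], 1),  # rejected candidate -> difficult (label 1)
                ([0.8, 0.4], 1)]  # rejected candidate -> difficult (label 1)

x, y = zip(*update_batch)  # transpose pairs into parallel sequences
assert x == ([0.1, 0.9], [0.7, 0.2], [0.8, 0.4])
assert y == (0, 1, 1)
```
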
11 changes: 0 additions & 11 deletions lexi/core/featurize/featurizers.py
@@ -31,17 +31,6 @@ def map_inv(self, ids):
return out


class LexiFeaturizer(metaclass=ABCMeta):

@abstractmethod
def save(self, path):
raise NotImplementedError

@abstractmethod
def load(self, path):
raise NotImplementedError


class Featurizer:

def __init__(self, features=None):
220 changes: 154 additions & 66 deletions lexi/core/simplification/lexical.py
@@ -2,17 +2,19 @@
import pickle
import os
import jsonpickle
import torch

from lexi.config import LEXICAL_MODEL_PATH_TEMPLATE, RANKER_MODEL_PATH_TEMPLATE
from lexi.core.simplification import SimplificationPipeline
from lexi.core.simplification.util import make_synonyms_dict, \
parse_embeddings
from lexi.core.featurize.featurizers import LexicalFeaturizer, LexiFeaturizer
from lexi.core.featurize.featurizers import LexicalFeaturizer
from lexi.core.util import util
from abc import ABCMeta, abstractmethod
import keras
from keras.layers import Input, Dense
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import MinMaxScaler

logger = logging.getLogger('lexi')

@@ -118,13 +120,14 @@ def simplify_text(self, text, startOffset=0, endOffset=None, cwi=None,

# STEP 4: RANKING
if ranker:
ranking = ranker.rank(candidates)
ranking = ranker.rank(candidates, sent, wb, we)
elif self.ranker:
ranking = self.ranker.rank(candidates)
ranking = self.ranker.rank(candidates, sent, wb, we)
else:
ranking = candidates
offset2simplification[global_word_offset_start] = \
(sent[wb:we], ranking, sent, wb, we)

return offset2simplification


Expand Down Expand Up @@ -206,31 +209,31 @@ def load(self, path):
self.model = keras.models.load_model(path)


class LexiCWIFeaturizer(DictVectorizer):

def __init__(self):
super().__init__()

def dimensions(self):
return len(self.get_feature_names())
# return 3

def transform_wic(self, sentence, startOffset, endOffset):
featuredict = dict()
featuredict["word_length"] = endOffset - startOffset
featuredict["sentence_length"] = len(sentence)
self.transform(featuredict)

def save(self, path):
json = jsonpickle.encode(self)
with open(path, "w") as jsonfile:
jsonfile.write(json)

@staticmethod
def staticload(path):
with open(path) as jsonfile:
json = jsonfile.read()
return jsonpickle.decode(json)
# class LexiFeaturizer(DictVectorizer):
#
# def __init__(self):
# super().__init__()
#
# def dimensions(self):
# return len(self.get_feature_names())
# # return 3
#
# def featurize(self, sentence, startOffset, endOffset):
# featuredict = dict()
# featuredict["word_length"] = endOffset - startOffset
# featuredict["sentence_length"] = len(sentence)
# self.transform(featuredict)
#
# def save(self, path):
# json = jsonpickle.encode(self)
# with open(path, "w") as jsonfile:
# jsonfile.write(json)
#
# @staticmethod
# def staticload(path):
# with open(path) as jsonfile:
# json = jsonfile.read()
# return jsonpickle.decode(json)


class LexiCWI(LexiPersonalizedPipelineStep):
@@ -239,46 +242,64 @@ def __init__(self, userId, featurizer=None):
# self.model = self.build_model()
super().__init__(userId)
self.featurizer = featurizer if featurizer is not None else \
LexiCWIFeaturizer()
LexiFeaturizer()
self.model = self.build_model()
self.optimizer = torch.optim.Adam(self.model.parameters())

def build_model(self, ):
n_input = self.featurizer.dimensions()
i = Input(shape=(n_input,))
o = Dense([2])
model = keras.models.Model(Input(n_input), )
return model
def build_model(self):
return LexiScorerNet(self.featurizer.dimensions(), [10, 10])

def fresh_train(self, cwi_data):
x, y = cwi_data
self.model.fit(x, y)
self.model.fit(x, y, self.optimizer)

def update(self, cwi_data):
x, y = cwi_data
self.model.fit(x, y) # TODO updating like this is problematic if we
self.model.fit(x, y, self.optimizer) # TODO updating like this is problematic if we
# want learning rate decay or other things that rely on previous
# iterations, those are not saved in the model or optimizer...

def identify_targets(self, sent, token_offsets):
return token_offsets # TODO implement, use is_complex
return [(wb, we) for wb, we in token_offsets if
self.is_complex(sent, wb, we)]

def is_complex(self, sent, startOffset, endOffset):
return endOffset-startOffset > 7 # TODO implement properly
x = self.featurizer.featurize(sent, startOffset, endOffset)
logger.debug(x)
cwi_score = self.model(x)
return cwi_score > 0


class LexiRankingFeaturizer(DictVectorizer):
class LexiFeaturizer(DictVectorizer):

def __init__(self):
super().__init__()
super().__init__(sparse=False)
self.scaler = MinMaxScaler()

def dimensions(self):
return len(self.get_feature_names())
# return 3

def transform_wic(self, sentence, startOffset, endOffset):
if hasattr(self, "feature_names_"):
return len(self.get_feature_names())
else:
logger.warning("Asking for vectorizer dimensionality, "
"but vectorizer has not been fit yet. Returning 0.")
return 0

def to_dict(self, sentence, startOffset, endOffset):
featuredict = dict()
featuredict["word_length"] = endOffset - startOffset
featuredict["sentence_length"] = len(sentence)
self.transform(featuredict)
return featuredict

def fit(self, words_in_context):
wic_dicts = [self.to_dict(*wic) for wic in words_in_context]
vecs = super().fit_transform(wic_dicts)
self.scaler.fit(vecs)

def featurize(self, sentence, startOffset, endOffset, scale=True):
vecs = self.transform(self.to_dict(sentence, startOffset, endOffset))
if scale:
vecs = self.scaler.transform(vecs)
return vecs

def save(self, path):
json = jsonpickle.encode(self)
@@ -294,36 +315,103 @@ def staticload(path):

class LexiRanker(LexiPersonalizedPipelineStep):

def __init__(self, userId):
self.userId = userId
self.featurizer = LexiRankingFeaturizer()
logger.debug("Featurizer: {}".format(self.featurizer))
logger.debug("Has transform? {}".format(hasattr(self.featurizer, "transform")))
self.model = self.build_model()
def __init__(self, userId, featurizer=None):
super().__init__(userId)
self.featurizer = featurizer or LexiFeaturizer()
self.model = self.build_model()
self.optimizer = torch.optim.Adam(self.model.parameters())

def build_model(self):
return LexiScorerNet(self.featurizer.dimensions(), [10, 10])

def fresh_train(self, data):
x, y = data
self.model.fit(x, y, self.optimizer)

def update(self, cwi_data):
x, y = cwi_data
x = torch.Tensor(x)
y = torch.Tensor(y)
self.model.fit(x, y, self.optimizer) # TODO updating like this is
# problematic if we want learning rate decay or other things that rely
# on previous iterations, those are not saved in the model or optimizer...

def set_featurizer(self, featurizer):
self.featurizer = featurizer

def build_model(self):
pass

def rank(self, candidates, sentence=None, index=None):
return sorted(candidates, key=lambda x: len(x))
def rank(self, candidates, sentence=None, wb=0, we=0):
scored_candidates = []
for candidate in candidates:
modified_sentence = sentence[:wb] + candidate + sentence[we:]
x = self.featurizer.featurize(modified_sentence, wb,
wb + len(candidate))
score = self.model.forward(x)
scored_candidates.append((candidate, score))
logger.debug("Sorted candidates: {}".format(scored_candidates))
return [candidate for candidate, score in sorted(scored_candidates,
key=lambda x: x[1])]

def save(self, userId):
with open(RANKER_MODEL_PATH_TEMPLATE.format(userId), 'wb') as pf:
# pickle.dump((self.fe, self.model), pf, pickle.HIGHEST_PROTOCOL)
pickle.dump(self, pf, pickle.HIGHEST_PROTOCOL)
json = jsonpickle.encode(self)
with open(RANKER_MODEL_PATH_TEMPLATE.format(userId), 'w') as jsonfile:
jsonfile.write(json)

def load(self, path):
pass
@staticmethod
def staticload(path):
with open(path) as jsonfile:
json = jsonfile.read()
return jsonpickle.decode(json)

def fresh_train(self, x, y):
pass
def train(self, data, batch_size=64, lr=1e-3,
epochs=30, dev=None, clip=None, early_stopping=None,
l2=1e-5, lr_schedule=None):

loss = 0
optimizer = torch.optim.Adam(self.model.parameters(), lr=lr,
weight_decay=l2)
for input1, input2 in data:
pass # TODO


class LexiScorerNet(torch.nn.Module):
def __init__(self, input_size, hidden_sizes):
super(LexiScorerNet, self).__init__()
self.input = torch.nn.Linear(input_size, hidden_sizes[0])
# ModuleList (rather than a plain Python list) registers the hidden
# layers' parameters with the module, so the optimizer can see them
self.hidden_layers = torch.nn.ModuleList(
[torch.nn.Linear(hidden_sizes[i], hidden_sizes[i+1])
for i in range(len(hidden_sizes)-1)])
self.out = torch.nn.Linear(hidden_sizes[-1], 1)

def forward(self, x):
x = torch.Tensor(x)
h = torch.relu(self.input(x))
for layer in self.hidden_layers:
h = torch.relu(layer(h))
return self.out(h)

def fit(self, x, y, optimizer, epochs=1):
for _ in range(epochs):
self.train()
optimizer.zero_grad()  # reset gradients; otherwise they accumulate across steps
pred = self.forward(x)
loss = torch.sqrt(torch.mean((y - pred) ** 2))  # RMSE; a plain mean of (y - pred) is unbounded below
loss.backward()
optimizer.step()


class RankerNet(torch.nn.Module):
def __init__(self, input_size, hidden_sizes):
super(RankerNet, self).__init__()
self.input = torch.nn.Linear(input_size, hidden_sizes[0])
self.out = torch.nn.Linear(hidden_sizes[0] * 2, 1)

def forward(self, input1, input2):
l = self.input(torch.Tensor(input1))
r = self.input(torch.Tensor(input2))
combined = torch.cat((l.view(-1), r.view(-1)))
return self.out(combined)

def update(self, x, y):
pass


class DummyLexicalSimplificationPipeline(SimplificationPipeline):
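
The TODO comments in both `update` methods flag the same limitation: only the model is persisted per user, so optimizer state (Adam's moment estimates, a learning-rate schedule's step count) is lost between updates. A sketch of one way to address it, assuming a per-user checkpoint path is available alongside the existing `RANKER_MODEL_PATH_TEMPLATE`, is to save both state dicts together:

```python
import torch

def save_checkpoint(model, optimizer, path):
    # Persist model weights and optimizer state (e.g. Adam moments) together.
    torch.save({"model": model.state_dict(),
                "optimizer": optimizer.state_dict()}, path)

def load_checkpoint(model, optimizer, path):
    # Restore both, so subsequent updates continue from the previous state.
    checkpoint = torch.load(path)
    model.load_state_dict(checkpoint["model"])
    optimizer.load_state_dict(checkpoint["optimizer"])
```
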
14 changes: 8 additions & 6 deletions lexi/server/run_lexi_server.py
@@ -17,7 +17,7 @@
CWI_MODEL_PATH_TEMPLATE, MODELS_DIR, RESOURCES
from lexi.core.endpoints import update_ranker
from lexi.core.simplification.lexical import LexicalSimplificationPipeline, \
LexiCWI, LexiRanker, LexiGenerator, LexiRankingFeaturizer
LexiCWI, LexiRanker, LexiGenerator, LexiFeaturizer
from lexi.server.util import statuscodes
from lexi.server.util.html import process_html
from lexi.server.util.communication import make_response
@@ -126,10 +126,12 @@
# RANKER_MODEL_PATH_TEMPLATE.format("default"))
# default_cwi = load_pickled_model(
# CWI_MODEL_PATH_TEMPLATE.format("default"))
default_ranker = LexiRanker("default")
# default_ranker.set_featurizer()

featurizer = LexiFeaturizer.staticload("default_featurizer.json")

default_ranker = LexiRanker("default", featurizer=featurizer)
logger.debug("Default Ranker Featurizer: {}".format(default_ranker.featurizer))
default_cwi = LexiCWI("default") # TODO pretrain offline and load
default_cwi = LexiCWI("default", featurizer=featurizer) # TODO pretrain offline and load
personalized_rankers = {"default": default_ranker}
personalized_cwi = {"default": default_cwi}
logger.debug("Default ranker: {} ({})".format(default_ranker,
@@ -306,7 +308,7 @@ def get_personalized_ranker(user_id):
# retrieve model
# model_path = db_connection.get_model_path(user_id)
# ranker = LexiRanker.load(model_path)
ranker = LexiRanker(user_id)
ranker = LexiRanker(user_id, featurizer=featurizer)
# featurizer = ... # retrieve
# ranker.set_featurizer(featurizer)
# ranker.featurizer = LexiRankingFeaturizer()
@@ -324,7 +326,7 @@ def get_personalized_cwi(user_id):
try:
# retrieve model
model_path = db_connection.get_model_path(user_id)
cwi = LexiCWI(user_id)
cwi = LexiCWI(user_id, featurizer=featurizer)
except:
logger.warning("Could not load personalized model. "
"Loading default cwi.")
