Skip to content

Commit

Permalink
word character offsets in simplification object, needed for featurization e.g. when updating
Browse files Browse the repository at this point in the history
  • Loading branch information
jbingel committed Apr 9, 2019
1 parent 4814492 commit b73cf80
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 60 deletions.
4 changes: 1 addition & 3 deletions lexi/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
RESOURCES_FULL = {
"da": {
"embeddings":
#[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_
# filtered.bin",
[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow.bin",
RESOURCES_DIR+"/da/embeddings/da.bin"],
"lm":
Expand All @@ -45,5 +43,5 @@
"ranking_training_dataset":
RESOURCES_DIR + "/da/simplification/clean_danish_ls_dataset.txt",
"synonyms":
RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
[RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"]}
}
99 changes: 46 additions & 53 deletions lexi/core/endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,9 @@ def get_local_hyperlink_balance(tags):
html_out += "".join(offset2html[i])
if i in offset2simplification and not open_hyperlinks_count > 0:
# checking for hyperlinks because we don't want to simplify those
original, replacements, \
sentence, word_index = offset2simplification[i]
original, replacements, sentence, \
word_offset_start, word_offset_end = \
offset2simplification[i]
# in future, possibly get more alternatives, and possibly return
# in some other order
replacements = [util.escape(r) for r in replacements]
Expand All @@ -147,18 +148,17 @@ def get_local_hyperlink_balance(tags):
"</span>"\
.format(elemId, displaying_original, original)
html_out += span_out
# TODO allow more than two alternatives as `choices' in future
# (https://github.com/jbingel/lexi-backend/issues/2)
simplifications.update(
{elemId: {
"request_id": requestId,
"original": original,
"simple": replacements, # legacy for frontend version <= 0.2
"simple": replacements, # legacy for frontend v. <= 0.2
"choices": choices,
"bad_feedback": False,
"selection": 0,
"sentence": sentence,
"word_index": word_index,
"word_offset_start": word_offset_start,
"word_offset_end": word_offset_end
}
})
i += len(original)-1
Expand All @@ -170,31 +170,6 @@ def get_local_hyperlink_balance(tags):
return html_out, simplifications


# TODO adapt to new structure
def update_classifier(classifier, feedback):
    """
    Featurize user simplification feedback and update the classifier.

    Every feedback item yields two training examples: the original
    word/phrase and its simplified counterpart. The label 1 marks the
    variant the user judged simpler, 0 the other one.

    :param classifier: model exposing ``featurize_update`` (incremental)
        and ``featurize_train`` (from-scratch) methods
    :param feedback: mapping of item ids to dicts with keys
        ``"original"``, ``"simple"`` and ``"is_simplified"``
    :return: None
    """
    examples, labels = [], []
    for item in feedback.values():
        original_is_simpler = item["is_simplified"]  # boolean
        examples.append(item["original"])
        labels.append(int(original_is_simpler))      # 1 iff original_is_simpler
        examples.append(item["simple"])
        labels.append(int(not original_is_simpler))  # inverse label
    try:
        classifier.featurize_update(examples, labels)
    except AttributeError:
        # likely because the featurizer hasn't been trained yet
        classifier.featurize_train(examples, labels)


# TODO adapt to new structure
def update_ranker(ranker, user_id, feedback, overall_rating=0):
"""
Collects feedback and updates ranker
Expand All @@ -204,38 +179,56 @@ def update_ranker(ranker, user_id, feedback, overall_rating=0):
:param overall_rating: a 1-to-5 scale rating of the overall performance
:return:
"""
# ranker online training expects formatted text as input (instead of
# structured data)
# TODO really gotta improve interface, make ranker take structured data
textblock = ""
update_batch = []
featurized_words = {}

# iterate over feedback items (user choices for simplified words)
for _, simplification in feedback.items():

# catch some cases in which we don't want to do anything
if simplification["bad_feedback"]:
continue
selection = simplification["selection"]
choices = simplification.get("choices")
if not choices:
logger.warning("No `choices` field in the "
"simplifications: {}".format(simplification))
return None
continue
selection = simplification["selection"]
logger.debug(simplification)
if selection == 0: # nothing selected
continue
else:
simple_index = selection % len(choices)
simple_word = choices[simple_index]
difficult_words = [w for w in choices if not w == simple_word]
for difficult in difficult_words:
textblock += "{}\t{}\t{}\t{}\t{}\n".format(
simplification["sentence"].replace("\n", " "),
choices,
str(simplification["word_index"]),
"1:" + simple_word,
"2:" + difficult
)
textblock = textblock.strip() # remove last newline
if textblock:
logger.debug("Updating with the following textblock:\n\n"+textblock)
ranker.onlineTrainRegressionModel(textblock)

# if all good, collect batch
# featurize words in context
original_sentence = simplification.get("sentence")
original_start_offset = simplification.get("word_offset_start")
original_end_offset = simplification.get("word_offset_end")
for w in choices:
if w not in featurized_words:
# construct modified sentence
modified_sentence = "{}{}{}".format(
original_sentence[:original_start_offset],
w,
original_sentence[original_end_offset:])
# featurize word in modified context
logger.debug("{} {} {} {}".format(modified_sentence, w,
original_start_offset,
original_start_offset+len(w)))
featurized_words[w] = ranker.featurizer.transform(
modified_sentence, original_start_offset,
original_start_offset+len(w), w)

simple_index = selection % len(choices)
simple_word = choices[simple_index]
difficult_words = [w for w in choices if not w == simple_word]

# add feature vectors to update batch
for difficult in difficult_words:
update_batch.append((featurized_words[simple_word],
featurized_words[difficult]))

if update_batch:
ranker.update(update_batch)
ranker.save(user_id)
else:
logger.info("Didn't get any useable feedback.")
10 changes: 10 additions & 0 deletions lexi/core/simplification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,14 @@ class SimplificationPipeline(metaclass=ABCMeta):
@abstractmethod
def simplify_text(self, txt, startOffset=0, endOffset=None,
                  cwi=None, ranker=None):
    """
    Simplify (part of) a text. Abstract; concrete pipelines must
    implement this.

    :param txt: the input text to simplify
    :param startOffset: character offset at which simplification starts
    :param endOffset: character offset at which simplification ends
        (presumably None means "until the end of the text" — confirm
        against implementations)
    :param cwi: optional complex-word-identification component
    :param ranker: optional component ranking candidate replacements
    :return: original, replacements, sentence, sentence_offset_start,
        sentence_offset_end
    """
    raise NotImplementedError
6 changes: 3 additions & 3 deletions lexi/core/simplification/lexical.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def simplify_text(self, text, startOffset=0, endOffset=None, cwi=None,
sent = text[sb:se]
token_offsets = util.span_tokenize_words(sent)

for i, (wb, we) in enumerate(token_offsets):
for wb, we in token_offsets:
global_word_offset_start = sb + wb
global_word_offset_end = sb + we
if global_word_offset_start < startOffset or \
Expand Down Expand Up @@ -123,7 +123,7 @@ def simplify_text(self, text, startOffset=0, endOffset=None, cwi=None,
else:
ranking = candidates
offset2simplification[global_word_offset_start] = \
(sent[wb:we], ranking, sent, i)
(sent[wb:we], ranking, sent, wb, we)
return offset2simplification


Expand Down Expand Up @@ -272,7 +272,7 @@ class LexiRanker(LexiPersonalizedPipelineStep):

def __init__(self, userId, featurizer=None):
self.userId = userId
self.featurizer = featurizer if featurizer else LexiRankingFeaturizer()
self.featurizer = featurizer or LexiRankingFeaturizer()
self.model = self.build_model()
super().__init__(userId)

Expand Down
4 changes: 3 additions & 1 deletion lexi/server/run_lexi_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
CWI_MODEL_PATH_TEMPLATE, MODELS_DIR, RESOURCES
from lexi.core.endpoints import update_ranker
from lexi.core.simplification.lexical import LexicalSimplificationPipeline, \
LexiCWI, LexiRanker, LexiGenerator
LexiCWI, LexiRanker, LexiGenerator, LexiRankingFeaturizer
from lexi.server.util import statuscodes
from lexi.server.util.html import process_html
from lexi.server.util.communication import make_response
Expand Down Expand Up @@ -127,6 +127,8 @@
# default_cwi = load_pickled_model(
# CWI_MODEL_PATH_TEMPLATE.format("default"))
default_ranker = LexiRanker("default")
default_ranker.featurizer = LexiRankingFeaturizer()
logger.debug("Default Ranker Featurizer: {}".format(default_ranker.featurizer))
default_cwi = LexiCWI("default") # TODO pretrain offline and load
personalized_rankers = {"default": default_ranker}
personalized_cwi = {"default": default_cwi}
Expand Down

0 comments on commit b73cf80

Please sign in to comment.