Commit 4814492

towards 0.3, new structure for simplification pipeline, move away from Pickle
jbingel committed Apr 9, 2019
1 parent f862662 commit 4814492
Showing 28 changed files with 487 additions and 11,410 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -5,4 +5,5 @@ models/*
*.pyc
trash/
.idea
lexi.cfg
lexi.cfg
lexi/res/*
5 changes: 5 additions & 0 deletions README.md
@@ -2,6 +2,11 @@

## Changelog


### Version 0.3
+ no more pickling!
+ POS-based synonym selection

### Version 0.2.5
+ more general database error handling

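The "POS-based synonym selection" entry in the changelog means that substitution candidates are filtered by part of speech before they are ranked. A minimal, hypothetical sketch of that idea (function and tagger names are assumptions, not code from this commit):

```python
# Hypothetical sketch of POS-based synonym selection; not code from this commit.
def select_synonyms_by_pos(target_word, target_pos, candidates, pos_tagger):
    """Keep only candidates whose POS tag matches the target word's tag."""
    selected = []
    for candidate in candidates:
        # Tag the candidate in isolation; a real system would tag it in the
        # context of the sentence being simplified.
        if pos_tagger(candidate) == target_pos and candidate != target_word:
            selected.append(candidate)
    return selected
```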
10 changes: 7 additions & 3 deletions lexi/config.py
@@ -5,13 +5,17 @@
LOG_DIR = os.path.join(LEXI_BASE, "logs")
MODELS_DIR = os.path.join(LEXI_BASE, "models")
RANKER_MODELS_DIR = os.path.join(MODELS_DIR, "rankers")
CWI_MODELS_DIR = os.path.join(MODELS_DIR, "cwi")
RESOURCES_DIR = os.path.join(LEXI_BASE, "res")
STANFORDNLP = os.path.join(RESOURCES_DIR, "stanfordnlp_resources")

RANKER_MODEL_PATH_TEMPLATE = os.path.join(RANKER_MODELS_DIR, "{}.pickle")
CWI_MODEL_PATH_TEMPLATE = os.path.join(CWI_MODELS_DIR, "{}.pickle")

LEXICAL_MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}-lexical.pickle")
MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}.pickle")

RESOURCES = {
RESOURCES_FULL = {
"da": {
"embeddings":
#[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_
@@ -25,10 +29,10 @@
"ranking_training_dataset":
RESOURCES_DIR+"/da/simplification/clean_danish_ls_dataset.txt",
"synonyms":
RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
[RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"]}
}

RESOURCES_TEST = {
RESOURCES = {
"da": {
"embeddings":
[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_"
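For orientation, the per-user model path templates above are meant to be filled in with a user or model identifier, and the "synonyms" resource is now a list of files rather than a single path. A small, hypothetical usage sketch (the user id "alice" is made up):

```python
# Hypothetical usage of the templates defined in lexi/config.py above.
from lexi.config import (RANKER_MODEL_PATH_TEMPLATE, CWI_MODEL_PATH_TEMPLATE,
                         RESOURCES)

ranker_path = RANKER_MODEL_PATH_TEMPLATE.format("alice")  # .../models/rankers/alice.pickle
cwi_path = CWI_MODEL_PATH_TEMPLATE.format("alice")        # .../models/cwi/alice.pickle

# "synonyms" is a list now, so consumers should iterate over it.
for synonym_file in RESOURCES["da"]["synonyms"]:
    print(synonym_file)
```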
22 changes: 13 additions & 9 deletions lexi/core/endpoints.py
@@ -35,7 +35,7 @@ def process_html_structured(classifier, html, ranker, parId):
spanId = 0
if not html.strip():
return html
output_sents = classifier.predict_text(html, ranker)
output_sents = classifier.simplify_text(html, ranker)
for original, simple in zip(*output_sents):
simple_parsed = parser.parse_sent(simple)
logger.debug([simple_parsed, simple.replace('\n', ''), parser])
@@ -64,17 +64,18 @@ def process_html_structured(classifier, html, ranker, parId):
return " ".join(html_out), simplifications


def process_html_lexical(classifier, html, startOffset, endOffset, ranker,
def process_html_lexical(pipeline, html, startOffset, endOffset, cwi, ranker,
requestId=0, min_similarity=0.7,
blacklist=None):
"""
Transforms HTML source, enriching simplified words with core markup by
separating markup from text and sending pure text to simplification class.
:param classifier: Simplification classifier instance
:param pipeline: Simplification pipeline instance
:param html: Input HTML source
:param startOffset: offset after which simplifications are solicited
:param endOffset: offset until which simplifications are solicited
:param cwi: personalized CWI module
:param ranker: personalized ranker
:param requestId: Request identifier to disambiguate core simplification
targets across multiple calls to this method
@@ -115,8 +116,8 @@ def get_local_hyperlink_balance(tags):
# output is a sequence of tokens including whitespaces, id2simplification
# is a dict mapping token IDs to simplifications, if applicable
offset2html, pure_text = util.filter_html(html)
offset2simplification = classifier.predict_text(
pure_text, startOffset, endOffset, ranker,
offset2simplification = pipeline.simplify_text(
pure_text, startOffset, endOffset, cwi=cwi, ranker=ranker,
min_similarity=min_similarity, blacklist=blacklist)
logger.debug("Simplifying text between character offsets {} "
"and {}: {}".format(startOffset, endOffset, pure_text))
@@ -130,11 +131,12 @@ def get_local_hyperlink_balance(tags):
html_out += "".join(offset2html[i])
if i in offset2simplification and not open_hyperlinks_count > 0:
# checking for hyperlinks because we don't want to simplify those
original, simple, sentence, word_index = offset2simplification[i]
original, replacements, \
sentence, word_index = offset2simplification[i]
# in future, possibly get more alternatives, and possibly return
# in some other order
choices = [original, simple]
simple = util.escape(simple)
replacements = [util.escape(r) for r in replacements]
choices = [original] + replacements
spanId += 1
elemId = "lexi_{}_{}".format(requestId, spanId)
displaying_original = "true" if choices[0] == original else "false"
@@ -151,7 +153,7 @@ def get_local_hyperlink_balance(tags):
{elemId: {
"request_id": requestId,
"original": original,
"simple": simple, # legacy for frontend version <= 0.2
"simple": replacements, # legacy for frontend version <= 0.2
"choices": choices,
"bad_feedback": False,
"selection": 0,
@@ -168,6 +170,7 @@ def get_local_hyperlink_balance(tags):
return html_out, simplifications


# TODO adapt to new structure
def update_classifier(classifier, feedback):
"""
Featurizes simplification feedback from user and updates classifier
@@ -191,6 +194,7 @@ def update_classifier(classifier, feedback):
classifier.featurize_train(xs, ys)


# TODO adapt to new structure
def update_ranker(ranker, user_id, feedback, overall_rating=0):
"""
Collects feedback and updates ranker
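The reshaped offset2simplification value returned by simplify_text — a dict mapping a character offset to (original, replacements, sentence, word_index) — is what drives the markup generation above. A condensed, hypothetical illustration of that consumption step (the real endpoint additionally tracks hyperlinks, HTML markup and display state):

```python
# Condensed, hypothetical sketch of how the endpoint turns the pipeline output
# into per-span choice lists; not the full logic of process_html_lexical.
def build_choices(offset2simplification, requestId=0):
    simplifications = {}
    for spanId, (offset, value) in enumerate(
            sorted(offset2simplification.items()), start=1):
        original, replacements, sentence, word_index = value
        choices = [original] + replacements  # original is displayed first
        elemId = "lexi_{}_{}".format(requestId, spanId)
        simplifications[elemId] = {
            "request_id": requestId,
            "original": original,
            "choices": choices,
            "selection": 0,
        }
    return simplifications
```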
49 changes: 26 additions & 23 deletions lexi/core/featurize/feat_util.py
@@ -2,14 +2,17 @@

import networkx as nx
import numpy as np
import spacy
# import stanfordnlp
from networkx.algorithms.traversal.depth_first_search import dfs_edges

from lexi.config import STANFORDNLP
from lexi.core.featurize.util import resources

COMMA = ","
VERB = "V"
nlp = spacy.load('en')
# nlp = stanfordnlp.Pipeline(
# processors='tokenize,mwt,pos',
# lang='da', models_dir=STANFORDNLP,
# tokenize_pretokenized=True))


class EtymWN:
@@ -121,26 +124,26 @@ def has_ancestor_in_lang(lang, word_etym):
return True
return False


def read_sentences_plain(raw_data):
doc = nlp(raw_data)
words_seen = 0
for s in doc.sents:
sent = defaultdict(list)
for i, w in enumerate(s):
sent["idx"].append(i+1)
sent["form"].append(w.text)
sent["lemma"].append(w.lemma_)
sent["pos"].append(w.pos_)
ne = w.ent_type_ if w.ent_type_ else "O"
sent["ne"].append(ne)
# target = w.head.i - words_seen if w.dep_.lower() != "root" else -1
target = w.head.i - words_seen
sent["head"].append(target+1)
sent["deprel"].append(w.dep_)
sent["label"].append("?")
words_seen += len(s)
yield sent
#
# def read_sentences_plain(raw_data):
# doc = nlp(raw_data)
# words_seen = 0
# for s in doc.sentences:
# sent = defaultdict(list)
# for i, w in enumerate(s):
# sent["idx"].append(i+1)
# sent["form"].append(w.text)
# sent["lemma"].append(w.lemma_)
# sent["pos"].append(w.pos_)
# ne = w.ent_type_ if w.ent_type_ else "O"
# sent["ne"].append(ne)
# # target = w.head.i - words_seen if w.dep_.lower() != "root" else -1
# target = w.head.i - words_seen
# sent["head"].append(target+1)
# sent["deprel"].append(w.dep_)
# sent["label"].append("?")
# words_seen += len(s)
# yield sent


def read_sentences(data):
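If the stanfordnlp-based pipeline that this commit leaves commented out were enabled, loading and running it could look roughly like the sketch below (hypothetical; it assumes the Danish models have already been downloaded into the STANFORDNLP resources directory, e.g. via stanfordnlp.download('da')):

```python
# Hypothetical sketch of enabling the stanfordnlp pipeline commented out above.
import stanfordnlp
from lexi.config import STANFORDNLP

nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos',
                           lang='da', models_dir=STANFORDNLP)

doc = nlp("Dette er en prøvesætning.")
for sentence in doc.sentences:
    for word in sentence.words:
        print(word.text, word.upos)
```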
12 changes: 12 additions & 0 deletions lexi/core/featurize/featurizers.py
@@ -3,6 +3,7 @@

from lexi.core.featurize import extract_lexical_feats, feat_util
from lexi.core.featurize.extract_sentence_feats import TreeNode
from abc import ABCMeta, abstractmethod


class LabelMapper:
@@ -30,6 +31,17 @@ def map_inv(self, ids):
return out


class LexiFeaturizer(metaclass=ABCMeta):

@abstractmethod
def save(self, path):
raise NotImplementedError

@abstractmethod
def load(self, path):
raise NotImplementedError


class Featurizer:

def __init__(self, features=None):
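The new LexiFeaturizer base class only fixes the persistence interface; in line with the "no more pickling" goal, a concrete featurizer could serialize to JSON instead. A minimal, hypothetical subclass (not part of this commit):

```python
import json

from lexi.core.featurize.featurizers import LexiFeaturizer


class JsonLexiFeaturizer(LexiFeaturizer):
    """Hypothetical featurizer persisting its configuration as JSON."""

    def __init__(self, features=None):
        self.features = features or []

    def save(self, path):
        with open(path, "w") as f:
            json.dump({"features": self.features}, f)

    def load(self, path):
        with open(path) as f:
            self.features = json.load(f)["features"]
```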
1 change: 1 addition & 0 deletions lexi/core/featurize/functions.py
@@ -0,0 +1 @@
# # # Feature Functions
44 changes: 3 additions & 41 deletions lexi/core/simplification/__init__.py
@@ -1,47 +1,9 @@
from abc import ABCMeta, abstractmethod
from sacremoses import MosesDetokenizer

detokenizer = MosesDetokenizer()


class Classifier(metaclass=ABCMeta):
# @abstractmethod
# def fresh_train(self, x, y):
# pass

@abstractmethod
def predict(self, x, ranker=None):
raise NotImplementedError
class SimplificationPipeline(metaclass=ABCMeta):

@abstractmethod
def predict_text(self, txt, startOffset=0, endOffset=None, ranker=None):
def simplify_text(self, txt, startOffset=0, endOffset=None,
cwi=None, ranker=None):
raise NotImplementedError

@abstractmethod
def update(self, x, y):
raise NotImplementedError

@abstractmethod
def save(self):
raise NotImplementedError

@abstractmethod
def load(self, model_id):
raise NotImplementedError

@abstractmethod
def load_default_init(self):
raise NotImplementedError

@abstractmethod
def check_featurizer_set(self):
raise NotImplementedError


# Classifier.register(DummyLexicalClassifier)
# Classifier.register(PystructClassifier)
# Classifier.register(LexensteinSimplifier)
# PystructClassifier.register(ChainCRFClassifier)
# PystructClassifier.register(EdgeCRFClassifier)
# Classifier.register(AveragedPerceptron)
# Classifier.register(OnlineStructuredPerceptron)
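The slimmed-down SimplificationPipeline interface now only requires simplify_text, with the personalized CWI component and ranker passed in per call. A hypothetical no-op implementation illustrating the contract (for illustration only, not part of this commit):

```python
from lexi.core.simplification import SimplificationPipeline


class EchoPipeline(SimplificationPipeline):
    """Hypothetical pipeline that finds nothing to simplify."""

    def simplify_text(self, txt, startOffset=0, endOffset=None,
                      cwi=None, ranker=None):
        # A real pipeline would return a dict mapping character offsets to
        # (original, replacements, sentence, word_index) tuples, as consumed
        # by lexi.core.endpoints.process_html_lexical.
        return {}
```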
