Commit 4814492

towards 0.3, new structure for simplification pipeline, move away from Pickle
jbingel committed Apr 9, 2019
1 parent f862662 commit 4814492
Showing 28 changed files with 487 additions and 11,410 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -5,4 +5,5 @@ models/*
*.pyc
trash/
.idea
lexi.cfg
lexi.cfg
lexi/res/*
5 changes: 5 additions & 0 deletions README.md
@@ -2,6 +2,11 @@

## Changelog


### Version 0.3
+ no more pickling!
+ POS-based synonym selection

### Version 0.2.5
+ more general database error handling

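The "POS-based synonym selection" entry in the changelog means that substitution candidates are filtered by part of speech before they are ranked. A minimal, hypothetical sketch of that idea (function and tagger names are assumptions, not code from this commit):

```python
# Hypothetical sketch of POS-based synonym selection; not code from this commit.
def select_synonyms_by_pos(target_word, target_pos, candidates, pos_tagger):
    """Keep only candidates whose POS tag matches the target word's tag."""
    selected = []
    for candidate in candidates:
        # Tag the candidate in isolation; a real system would tag it in the
        # context of the sentence being simplified.
        if pos_tagger(candidate) == target_pos and candidate != target_word:
            selected.append(candidate)
    return selected
```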
10 changes: 7 additions & 3 deletions lexi/config.py
@@ -5,13 +5,17 @@
LOG_DIR = os.path.join(LEXI_BASE, "logs")
MODELS_DIR = os.path.join(LEXI_BASE, "models")
RANKER_MODELS_DIR = os.path.join(MODELS_DIR, "rankers")
CWI_MODELS_DIR = os.path.join(MODELS_DIR, "cwi")
RESOURCES_DIR = os.path.join(LEXI_BASE, "res")
STANFORDNLP = os.path.join(RESOURCES_DIR, "stanfordnlp_resources")

RANKER_MODEL_PATH_TEMPLATE = os.path.join(RANKER_MODELS_DIR, "{}.pickle")
CWI_MODEL_PATH_TEMPLATE = os.path.join(CWI_MODELS_DIR, "{}.pickle")

LEXICAL_MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}-lexical.pickle")
MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}.pickle")

RESOURCES = {
RESOURCES_FULL = {
"da": {
"embeddings":
#[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_
@@ -25,10 +29,10 @@
"ranking_training_dataset":
RESOURCES_DIR+"/da/simplification/clean_danish_ls_dataset.txt",
"synonyms":
RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
[RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"]}
}

RESOURCES_TEST = {
RESOURCES = {
"da": {
"embeddings":
[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_"
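For orientation, the per-user model path templates above are meant to be filled in with a user or model identifier, and the "synonyms" resource is now a list of files rather than a single path. A small, hypothetical usage sketch (the user id "alice" is made up):

```python
# Hypothetical usage of the templates defined in lexi/config.py above.
from lexi.config import (RANKER_MODEL_PATH_TEMPLATE, CWI_MODEL_PATH_TEMPLATE,
                         RESOURCES)

ranker_path = RANKER_MODEL_PATH_TEMPLATE.format("alice")  # .../models/rankers/alice.pickle
cwi_path = CWI_MODEL_PATH_TEMPLATE.format("alice")        # .../models/cwi/alice.pickle

# "synonyms" is a list now, so consumers should iterate over it.
for synonym_file in RESOURCES["da"]["synonyms"]:
    print(synonym_file)
```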
22 changes: 13 additions & 9 deletions lexi/core/endpoints.py
@@ -35,7 +35,7 @@ def process_html_structured(classifier, html, ranker, parId):
spanId = 0
if not html.strip():
return html
output_sents = classifier.predict_text(html, ranker)
output_sents = classifier.simplify_text(html, ranker)
for original, simple in zip(*output_sents):
simple_parsed = parser.parse_sent(simple)
logger.debug([simple_parsed, simple.replace('\n', ''), parser])
@@ -64,17 +64,18 @@ def process_html_structured(classifier, html, ranker, parId):
return " ".join(html_out), simplifications


def process_html_lexical(classifier, html, startOffset, endOffset, ranker,
def process_html_lexical(pipeline, html, startOffset, endOffset, cwi, ranker,
requestId=0, min_similarity=0.7,
blacklist=None):
"""
Transforms HTML source, enriching simplified words with core markup by
separating markup from text and sending pure text to simplification class.
:param classifier: Simplification classifier instance
:param pipeline: Simplification pipeline instance
:param html: Input HTML source
:param startOffset: offset after which simplifications are solicited
:param endOffset: offset until which simplifications are solicited
:param cwi: personalized CWI module
:param ranker: personalized ranker
:param requestId: Request identifier to disambiguate core simplification
targets across multiple calls to this method
@@ -115,8 +116,8 @@ def get_local_hyperlink_balance(tags):
# output is a sequence of tokens including whitespaces, id2simplification
# is a dict mapping token IDs to simplifications, if applicable
offset2html, pure_text = util.filter_html(html)
offset2simplification = classifier.predict_text(
pure_text, startOffset, endOffset, ranker,
offset2simplification = pipeline.simplify_text(
pure_text, startOffset, endOffset, cwi=cwi, ranker=ranker,
min_similarity=min_similarity, blacklist=blacklist)
logger.debug("Simplifying text between character offsets {} "
"and {}: {}".format(startOffset, endOffset, pure_text))
@@ -130,11 +131,12 @@ def get_local_hyperlink_balance(tags):
html_out += "".join(offset2html[i])
if i in offset2simplification and not open_hyperlinks_count > 0:
# checking for hyperlinks because we don't want to simplify those
original, simple, sentence, word_index = offset2simplification[i]
original, replacements, \
sentence, word_index = offset2simplification[i]
# in future, possibly get more alternatives, and possibly return
# in some other order
choices = [original, simple]
simple = util.escape(simple)
replacements = [util.escape(r) for r in replacements]
choices = [original] + replacements
spanId += 1
elemId = "lexi_{}_{}".format(requestId, spanId)
displaying_original = "true" if choices[0] == original else "false"
@@ -151,7 +153,7 @@ def get_local_hyperlink_balance(tags):
{elemId: {
"request_id": requestId,
"original": original,
"simple": simple, # legacy for frontend version <= 0.2
"simple": replacements, # legacy for frontend version <= 0.2
"choices": choices,
"bad_feedback": False,
"selection": 0,
@@ -168,6 +170,7 @@ def get_local_hyperlink_balance(tags):
return html_out, simplifications


# TODO adapt to new structure
def update_classifier(classifier, feedback):
"""
Featurizes simplification feedback from user and updates classifier
@@ -191,6 +194,7 @@ def update_classifier(classifier, feedback):
classifier.featurize_train(xs, ys)


# TODO adapt to new structure
def update_ranker(ranker, user_id, feedback, overall_rating=0):
"""
Collects feedback and updates ranker
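The reshaped offset2simplification value returned by simplify_text — a dict mapping a character offset to (original, replacements, sentence, word_index) — is what drives the markup generation above. A condensed, hypothetical illustration of that consumption step (the real endpoint additionally tracks hyperlinks, HTML markup and display state):

```python
# Condensed, hypothetical sketch of how the endpoint turns the pipeline output
# into per-span choice lists; not the full logic of process_html_lexical.
def build_choices(offset2simplification, requestId=0):
    simplifications = {}
    for spanId, (offset, value) in enumerate(
            sorted(offset2simplification.items()), start=1):
        original, replacements, sentence, word_index = value
        choices = [original] + replacements  # original is displayed first
        elemId = "lexi_{}_{}".format(requestId, spanId)
        simplifications[elemId] = {
            "request_id": requestId,
            "original": original,
            "choices": choices,
            "selection": 0,
        }
    return simplifications
```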
49 changes: 26 additions & 23 deletions lexi/core/featurize/feat_util.py
@@ -2,14 +2,17 @@

import networkx as nx
import numpy as np
import spacy
# import stanfordnlp
from networkx.algorithms.traversal.depth_first_search import dfs_edges

from lexi.config import STANFORDNLP
from lexi.core.featurize.util import resources

COMMA = ","
VERB = "V"
nlp = spacy.load('en')
# nlp = stanfordnlp.Pipeline(
# processors='tokenize,mwt,pos',
# lang='da', models_dir=STANFORDNLP,
# tokenize_pretokenized=True))


class EtymWN:
@@ -121,26 +124,26 @@ def has_ancestor_in_lang(lang, word_etym):
return True
return False


def read_sentences_plain(raw_data):
doc = nlp(raw_data)
words_seen = 0
for s in doc.sents:
sent = defaultdict(list)
for i, w in enumerate(s):
sent["idx"].append(i+1)
sent["form"].append(w.text)
sent["lemma"].append(w.lemma_)
sent["pos"].append(w.pos_)
ne = w.ent_type_ if w.ent_type_ else "O"
sent["ne"].append(ne)
# target = w.head.i - words_seen if w.dep_.lower() != "root" else -1
target = w.head.i - words_seen
sent["head"].append(target+1)
sent["deprel"].append(w.dep_)
sent["label"].append("?")
words_seen += len(s)
yield sent
#
# def read_sentences_plain(raw_data):
# doc = nlp(raw_data)
# words_seen = 0
# for s in doc.sentences:
# sent = defaultdict(list)
# for i, w in enumerate(s):
# sent["idx"].append(i+1)
# sent["form"].append(w.text)
# sent["lemma"].append(w.lemma_)
# sent["pos"].append(w.pos_)
# ne = w.ent_type_ if w.ent_type_ else "O"
# sent["ne"].append(ne)
# # target = w.head.i - words_seen if w.dep_.lower() != "root" else -1
# target = w.head.i - words_seen
# sent["head"].append(target+1)
# sent["deprel"].append(w.dep_)
# sent["label"].append("?")
# words_seen += len(s)
# yield sent


def read_sentences(data):
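If the stanfordnlp-based pipeline that this commit leaves commented out were enabled, loading and running it could look roughly like the sketch below (hypothetical; it assumes the Danish models have already been downloaded into the STANFORDNLP resources directory, e.g. via stanfordnlp.download('da')):

```python
# Hypothetical sketch of enabling the stanfordnlp pipeline commented out above.
import stanfordnlp
from lexi.config import STANFORDNLP

nlp = stanfordnlp.Pipeline(processors='tokenize,mwt,pos',
                           lang='da', models_dir=STANFORDNLP)

doc = nlp("Dette er en prøvesætning.")
for sentence in doc.sentences:
    for word in sentence.words:
        print(word.text, word.upos)
```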
12 changes: 12 additions & 0 deletions lexi/core/featurize/featurizers.py
@@ -3,6 +3,7 @@

from lexi.core.featurize import extract_lexical_feats, feat_util
from lexi.core.featurize.extract_sentence_feats import TreeNode
from abc import ABCMeta, abstractmethod


class LabelMapper:
@@ -30,6 +31,17 @@ def map_inv(self, ids):
return out


class LexiFeaturizer(metaclass=ABCMeta):

@abstractmethod
def save(self, path):
raise NotImplementedError

@abstractmethod
def load(self, path):
raise NotImplementedError


class Featurizer:

def __init__(self, features=None):
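The new LexiFeaturizer base class only fixes the persistence interface; in line with the "no more pickling" goal, a concrete featurizer could serialize to JSON instead. A minimal, hypothetical subclass (not part of this commit):

```python
import json

from lexi.core.featurize.featurizers import LexiFeaturizer


class JsonLexiFeaturizer(LexiFeaturizer):
    """Hypothetical featurizer persisting its configuration as JSON."""

    def __init__(self, features=None):
        self.features = features or []

    def save(self, path):
        with open(path, "w") as f:
            json.dump({"features": self.features}, f)

    def load(self, path):
        with open(path) as f:
            self.features = json.load(f)["features"]
```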
1 change: 1 addition & 0 deletions lexi/core/featurize/functions.py
@@ -0,0 +1 @@
# # # Feature Functions
44 changes: 3 additions & 41 deletions lexi/core/simplification/__init__.py
@@ -1,47 +1,9 @@
from abc import ABCMeta, abstractmethod
from sacremoses import MosesDetokenizer

detokenizer = MosesDetokenizer()


class Classifier(metaclass=ABCMeta):
# @abstractmethod
# def fresh_train(self, x, y):
# pass

@abstractmethod
def predict(self, x, ranker=None):
raise NotImplementedError
class SimplificationPipeline(metaclass=ABCMeta):

@abstractmethod
def predict_text(self, txt, startOffset=0, endOffset=None, ranker=None):
def simplify_text(self, txt, startOffset=0, endOffset=None,
cwi=None, ranker=None):
raise NotImplementedError

@abstractmethod
def update(self, x, y):
raise NotImplementedError

@abstractmethod
def save(self):
raise NotImplementedError

@abstractmethod
def load(self, model_id):
raise NotImplementedError

@abstractmethod
def load_default_init(self):
raise NotImplementedError

@abstractmethod
def check_featurizer_set(self):
raise NotImplementedError


# Classifier.register(DummyLexicalClassifier)
# Classifier.register(PystructClassifier)
# Classifier.register(LexensteinSimplifier)
# PystructClassifier.register(ChainCRFClassifier)
# PystructClassifier.register(EdgeCRFClassifier)
# Classifier.register(AveragedPerceptron)
# Classifier.register(OnlineStructuredPerceptron)
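The slimmed-down SimplificationPipeline interface now only requires simplify_text, with the personalized CWI component and ranker passed in per call. A hypothetical no-op implementation illustrating the contract (for illustration only, not part of this commit):

```python
from lexi.core.simplification import SimplificationPipeline


class EchoPipeline(SimplificationPipeline):
    """Hypothetical pipeline that finds nothing to simplify."""

    def simplify_text(self, txt, startOffset=0, endOffset=None,
                      cwi=None, ranker=None):
        # A real pipeline would return a dict mapping character offsets to
        # (original, replacements, sentence, word_index) tuples, as consumed
        # by lexi.core.endpoints.process_html_lexical.
        return {}
```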
