Skip to content

Commit

Permalink
word character offsets in simplification object, needed for featurization e.g. when updating
Browse files Browse the repository at this point in the history
  • Loading branch information
jbingel committed Apr 9, 2019
1 parent 4814492 commit b73cf80
Show file tree
Hide file tree
Showing 5 changed files with 63 additions and 60 deletions.
4 changes: 1 addition & 3 deletions lexi/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,6 @@
RESOURCES_FULL = {
"da": {
"embeddings":
#[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_
# filtered.bin",
[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow.bin",
RESOURCES_DIR+"/da/embeddings/da.bin"],
"lm":
Expand All @@ -45,5 +43,5 @@
"ranking_training_dataset":
RESOURCES_DIR + "/da/simplification/clean_danish_ls_dataset.txt",
"synonyms":
RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
[RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"]}
}
99 changes: 46 additions & 53 deletions lexi/core/endpoints.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,9 @@ def get_local_hyperlink_balance(tags):
html_out += "".join(offset2html[i])
if i in offset2simplification and not open_hyperlinks_count > 0:
# checking for hyperlinks because we don't want to simplify those
original, replacements, \
sentence, word_index = offset2simplification[i]
original, replacements, sentence, \
word_offset_start, word_offset_end = \
offset2simplification[i]
# in future, possibly get more alternatives, and possibly return
# in some other order
replacements = [util.escape(r) for r in replacements]
Expand All @@ -147,18 +148,17 @@ def get_local_hyperlink_balance(tags):
"</span>"\
.format(elemId, displaying_original, original)
html_out += span_out
# TODO allow more than two alternatives as `choices' in future
# (https://github.com/jbingel/lexi-backend/issues/2)
simplifications.update(
{elemId: {
"request_id": requestId,
"original": original,
"simple": replacements, # legacy for frontend version <= 0.2
"simple": replacements, # legacy for frontend v. <= 0.2
"choices": choices,
"bad_feedback": False,
"selection": 0,
"sentence": sentence,
"word_index": word_index,
"word_offset_start": word_offset_start,
"word_offset_end": word_offset_end
}
})
i += len(original)-1
Expand All @@ -170,31 +170,6 @@ def get_local_hyperlink_balance(tags):
return html_out, simplifications


# TODO adapt to new structure
def update_classifier(classifier, feedback):
    """
    Featurize user simplification feedback and update the classifier.

    Every feedback item yields two training examples: the original
    word/phrase and its simplified counterpart. The label 1 marks the
    variant the user judged simpler, 0 the other one.

    :param classifier: model exposing ``featurize_update`` (incremental)
        and ``featurize_train`` (from-scratch) methods
    :param feedback: mapping of item ids to dicts with keys
        ``"original"``, ``"simple"`` and ``"is_simplified"``
    :return: None
    """
    examples, labels = [], []
    for item in feedback.values():
        original_is_simpler = item["is_simplified"]  # boolean
        examples.append(item["original"])
        labels.append(int(original_is_simpler))      # 1 iff original_is_simpler
        examples.append(item["simple"])
        labels.append(int(not original_is_simpler))  # inverse label
    try:
        classifier.featurize_update(examples, labels)
    except AttributeError:
        # likely because the featurizer hasn't been trained yet
        classifier.featurize_train(examples, labels)


# TODO adapt to new structure
def update_ranker(ranker, user_id, feedback, overall_rating=0):
"""
Collects feedback and updates ranker
Expand All @@ -204,38 +179,56 @@ def update_ranker(ranker, user_id, feedback, overall_rating=0):
:param overall_rating: a 1-to-5 scale rating of the overall performance
:return:
"""
# ranker online training expects formatted text as input (instead of
# structured data)
# TODO really gotta improve interface, make ranker take structured data
textblock = ""
update_batch = []
featurized_words = {}

# iterate over feedback items (user choices for simplified words)
for _, simplification in feedback.items():

# catch some cases in which we don't want to do anything
if simplification["bad_feedback"]:
continue
selection = simplification["selection"]
choices = simplification.get("choices")
if not choices:
logger.warning("No `choices` field in the "
"simplifications: {}".format(simplification))
return None
continue
selection = simplification["selection"]
logger.debug(simplification)
if selection == 0: # nothing selected
continue
else:
simple_index = selection % len(choices)
simple_word = choices[simple_index]
difficult_words = [w for w in choices if not w == simple_word]
for difficult in difficult_words:
textblock += "{}\t{}\t{}\t{}\t{}\n".format(
simplification["sentence"].replace("\n", " "),
choices,
str(simplification["word_index"]),
"1:" + simple_word,
"2:" + difficult
)
textblock = textblock.strip() # remove last newline
if textblock:
logger.debug("Updating with the following textblock:\n\n"+textblock)
ranker.onlineTrainRegressionModel(textblock)

# if all good, collect batch
# featurize words in context
original_sentence = simplification.get("sentence")
original_start_offset = simplification.get("word_offset_start")
original_end_offset = simplification.get("word_offset_end")
for w in choices:
if w not in featurized_words:
# construct modified sentence
modified_sentence = "{}{}{}".format(
original_sentence[:original_start_offset],
w,
original_sentence[original_end_offset:])
# featurize word in modified context
logger.debug("{} {} {} {}".format(modified_sentence, w,
original_start_offset,
original_start_offset+len(w)))
featurized_words[w] = ranker.featurizer.transform(
modified_sentence, original_start_offset,
original_start_offset+len(w), w)

simple_index = selection % len(choices)
simple_word = choices[simple_index]
difficult_words = [w for w in choices if not w == simple_word]

# add feature vectors to update batch
for difficult in difficult_words:
update_batch.append((featurized_words[simple_word],
featurized_words[difficult]))

if update_batch:
ranker.update(update_batch)
ranker.save(user_id)
else:
logger.info("Didn't get any useable feedback.")
10 changes: 10 additions & 0 deletions lexi/core/simplification/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,14 @@ class SimplificationPipeline(metaclass=ABCMeta):
@abstractmethod
def simplify_text(self, txt, startOffset=0, endOffset=None,
                  cwi=None, ranker=None):
    """
    Simplify (part of) a text. Abstract; concrete pipelines must
    implement this.

    :param txt: the input text to simplify
    :param startOffset: character offset at which simplification starts
    :param endOffset: character offset at which simplification ends
        (presumably None means "until the end of the text" — confirm
        against implementations)
    :param cwi: optional complex-word-identification component
    :param ranker: optional component ranking candidate replacements
    :return: original, replacements, sentence, sentence_offset_start,
        sentence_offset_end
    """
    raise NotImplementedError
6 changes: 3 additions & 3 deletions lexi/core/simplification/lexical.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def simplify_text(self, text, startOffset=0, endOffset=None, cwi=None,
sent = text[sb:se]
token_offsets = util.span_tokenize_words(sent)

for i, (wb, we) in enumerate(token_offsets):
for wb, we in token_offsets:
global_word_offset_start = sb + wb
global_word_offset_end = sb + we
if global_word_offset_start < startOffset or \
Expand Down Expand Up @@ -123,7 +123,7 @@ def simplify_text(self, text, startOffset=0, endOffset=None, cwi=None,
else:
ranking = candidates
offset2simplification[global_word_offset_start] = \
(sent[wb:we], ranking, sent, i)
(sent[wb:we], ranking, sent, wb, we)
return offset2simplification


Expand Down Expand Up @@ -272,7 +272,7 @@ class LexiRanker(LexiPersonalizedPipelineStep):

def __init__(self, userId, featurizer=None):
self.userId = userId
self.featurizer = featurizer if featurizer else LexiRankingFeaturizer()
self.featurizer = featurizer or LexiRankingFeaturizer()
self.model = self.build_model()
super().__init__(userId)

Expand Down
4 changes: 3 additions & 1 deletion lexi/server/run_lexi_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
CWI_MODEL_PATH_TEMPLATE, MODELS_DIR, RESOURCES
from lexi.core.endpoints import update_ranker
from lexi.core.simplification.lexical import LexicalSimplificationPipeline, \
LexiCWI, LexiRanker, LexiGenerator
LexiCWI, LexiRanker, LexiGenerator, LexiRankingFeaturizer
from lexi.server.util import statuscodes
from lexi.server.util.html import process_html
from lexi.server.util.communication import make_response
Expand Down Expand Up @@ -127,6 +127,8 @@
# default_cwi = load_pickled_model(
# CWI_MODEL_PATH_TEMPLATE.format("default"))
default_ranker = LexiRanker("default")
default_ranker.featurizer = LexiRankingFeaturizer()
logger.debug("Default Ranker Featurizer: {}".format(default_ranker.featurizer))
default_cwi = LexiCWI("default") # TODO pretrain offline and load
personalized_rankers = {"default": default_ranker}
personalized_cwi = {"default": default_cwi}
Expand Down

0 comments on commit b73cf80

Please sign in to comment.