From b73cf80b8cdd8b26691e47a68353a1e7249b47a1 Mon Sep 17 00:00:00 2001
From: Joachim Bingel
Date: Tue, 9 Apr 2019 23:02:46 +0200
Subject: [PATCH] word character offsets in simplification object, needed for featurization e.g. when updating

---
 lexi/config.py                       |  4 +-
 lexi/core/endpoints.py               | 99 +++++++++++++---------------
 lexi/core/simplification/__init__.py | 10 +++
 lexi/core/simplification/lexical.py  |  6 +-
 lexi/server/run_lexi_server.py       |  4 +-
 5 files changed, 63 insertions(+), 60 deletions(-)

diff --git a/lexi/config.py b/lexi/config.py
index e2d1446..b6df2a8 100644
--- a/lexi/config.py
+++ b/lexi/config.py
@@ -18,8 +18,6 @@
 RESOURCES_FULL = {
     "da": {
         "embeddings":
-            #[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_
-            # filtered.bin",
             [RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow.bin",
              RESOURCES_DIR+"/da/embeddings/da.bin"],
         "lm":
@@ -45,5 +43,5 @@
         "ranking_training_dataset":
             RESOURCES_DIR + "/da/simplification/clean_danish_ls_dataset.txt",
         "synonyms":
-            RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
+            [RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"]}
 }
diff --git a/lexi/core/endpoints.py b/lexi/core/endpoints.py
index f933974..c050969 100644
--- a/lexi/core/endpoints.py
+++ b/lexi/core/endpoints.py
@@ -131,8 +131,9 @@ def get_local_hyperlink_balance(tags):
         html_out += "".join(offset2html[i])
         if i in offset2simplification and not open_hyperlinks_count > 0:
             # checking for hyperlinks because we don't want to simplify those
-            original, replacements, \
-                sentence, word_index = offset2simplification[i]
+            original, replacements, sentence, \
+                word_offset_start, word_offset_end = \
+                offset2simplification[i]
             # in future, possibly get more alternatives, and possibly return
             # in some other order
             replacements = [util.escape(r) for r in replacements]
@@ -147,18 +148,17 @@
                 ""\
                 .format(elemId, displaying_original, original)
             html_out += span_out
-            # TODO allow more than two alternatives as `choices' in future
-            # (https://github.com/jbingel/lexi-backend/issues/2)
             simplifications.update(
                 {elemId: {
                     "request_id": requestId,
                     "original": original,
-                    "simple": replacements,  # legacy for frontend version <= 0.2
+                    "simple": replacements,  # legacy for frontend v. <= 0.2
                     "choices": choices,
                     "bad_feedback": False,
                     "selection": 0,
                     "sentence": sentence,
-                    "word_index": word_index,
+                    "word_offset_start": word_offset_start,
+                    "word_offset_end": word_offset_end
                 }
                 })
             i += len(original)-1
@@ -170,31 +170,6 @@
     return html_out, simplifications


-# TODO adapt to new structure
-def update_classifier(classifier, feedback):
-    """
-    Featurizes simplification feedback from user and updates classifier
-    accordingly
-    :param classifier:
-    :param feedback:
-    :return:
-    """
-    xs, ys = [], []
-    for item in feedback.values():
-        original = item["original"]
-        simple = item["simple"]
-        original_is_simpler = item["is_simplified"]  # boolean
-        xs.append(original)
-        ys.append(int(original_is_simpler))  # 1 iff original_is_simpler
-        xs.append(simple)
-        ys.append(int(not original_is_simpler))  # inverse
-    try:
-        classifier.featurize_update(xs, ys)
-    except AttributeError:  # likely because featurizer hasn't been trained
-        classifier.featurize_train(xs, ys)
-
-
-# TODO adapt to new structure
 def update_ranker(ranker, user_id, feedback, overall_rating=0):
     """
     Collects feedback and updates ranker
@@ -204,38 +179,56 @@
     :param overall_rating: a 1-to-5 scale rating of the overall performance
     :return:
     """
-    # ranker online training expects formatted text as input (instead of
-    # structured data)
-    # TODO really gotta improve interface, make ranker take structured data
-    textblock = ""
+    update_batch = []
+    featurized_words = {}
+
+    # iterate over feedback items (user choices for simplified words)
     for _, simplification in feedback.items():
+
+        # catch some cases in which we don't want to do anything
         if simplification["bad_feedback"]:
             continue
-        selection = simplification["selection"]
         choices = simplification.get("choices")
         if not choices:
             logger.warning("No `choices` field in the "
                            "simplifications: {}".format(simplification))
-            return None
+            continue
+        selection = simplification["selection"]
         logger.debug(simplification)
         if selection == 0:  # nothing selected
             continue
-        else:
-            simple_index = selection % len(choices)
-            simple_word = choices[simple_index]
-            difficult_words = [w for w in choices if not w == simple_word]
-            for difficult in difficult_words:
-                textblock += "{}\t{}\t{}\t{}\t{}\n".format(
-                    simplification["sentence"].replace("\n", " "),
-                    choices,
-                    str(simplification["word_index"]),
-                    "1:" + simple_word,
-                    "2:" + difficult
-                )
-    textblock = textblock.strip()  # remove last newline
-    if textblock:
-        logger.debug("Updating with the following textblock:\n\n"+textblock)
-        ranker.onlineTrainRegressionModel(textblock)
+
+        # if all good, collect batch
+        # featurize words in context
+        original_sentence = simplification.get("sentence")
+        original_start_offset = simplification.get("word_offset_start")
+        original_end_offset = simplification.get("word_offset_end")
+        for w in choices:
+            if w not in featurized_words:
+                # construct modified sentence
+                modified_sentence = "{}{}{}".format(
+                    original_sentence[:original_start_offset],
+                    w,
+                    original_sentence[original_end_offset:])
+                # featurize word in modified context
+                logger.debug("{} {} {} {}".format(modified_sentence, w,
+                                                  original_start_offset,
+                                                  original_start_offset+len(w)))
+                featurized_words[w] = ranker.featurizer.transform(
+                    modified_sentence, original_start_offset,
+                    original_start_offset+len(w), w)
+
+        simple_index = selection % len(choices)
+        simple_word = choices[simple_index]
+        difficult_words = [w for w in choices if not w == simple_word]
+
+        # add feature vectors to update batch
+        for difficult in difficult_words:
+            update_batch.append((featurized_words[simple_word],
+                                 featurized_words[difficult]))
+
+    if update_batch:
+        ranker.update(update_batch)
         ranker.save(user_id)
     else:
         logger.info("Didn't get any useable feedback.")
diff --git a/lexi/core/simplification/__init__.py b/lexi/core/simplification/__init__.py
index 19f7d9f..25dff90 100644
--- a/lexi/core/simplification/__init__.py
+++ b/lexi/core/simplification/__init__.py
@@ -6,4 +6,14 @@ class SimplificationPipeline(metaclass=ABCMeta):
     @abstractmethod
     def simplify_text(self, txt, startOffset=0, endOffset=None, cwi=None,
                       ranker=None):
+        """
+
+        :param txt:
+        :param startOffset:
+        :param endOffset:
+        :param cwi:
+        :param ranker:
+        :return: original, replacements, sentence, sentence_offset_start,
+            sentence_offset_end
+        """
         raise NotImplementedError
diff --git a/lexi/core/simplification/lexical.py b/lexi/core/simplification/lexical.py
index af99b1a..d75672c 100644
--- a/lexi/core/simplification/lexical.py
+++ b/lexi/core/simplification/lexical.py
@@ -80,7 +80,7 @@ def simplify_text(self, text, startOffset=0, endOffset=None, cwi=None,

             sent = text[sb:se]
             token_offsets = util.span_tokenize_words(sent)
-            for i, (wb, we) in enumerate(token_offsets):
+            for wb, we in token_offsets:
                 global_word_offset_start = sb + wb
                 global_word_offset_end = sb + we
                 if global_word_offset_start < startOffset or \
@@ -123,7 +123,7 @@ def simplify_text(self, text, startOffset=0, endOffset=None, cwi=None,
                 else:
                     ranking = candidates
                 offset2simplification[global_word_offset_start] = \
-                    (sent[wb:we], ranking, sent, i)
+                    (sent[wb:we], ranking, sent, wb, we)
         return offset2simplification


@@ -272,7 +272,7 @@ class LexiRanker(LexiPersonalizedPipelineStep):

     def __init__(self, userId, featurizer=None):
         self.userId = userId
-        self.featurizer = featurizer if featurizer else LexiRankingFeaturizer()
+        self.featurizer = featurizer or LexiRankingFeaturizer()
         self.model = self.build_model()
         super().__init__(userId)

diff --git a/lexi/server/run_lexi_server.py b/lexi/server/run_lexi_server.py
index a79f76d..e10c381 100644
--- a/lexi/server/run_lexi_server.py
+++ b/lexi/server/run_lexi_server.py
@@ -17,7 +17,7 @@
     CWI_MODEL_PATH_TEMPLATE, MODELS_DIR, RESOURCES
 from lexi.core.endpoints import update_ranker
 from lexi.core.simplification.lexical import LexicalSimplificationPipeline, \
-    LexiCWI, LexiRanker, LexiGenerator
+    LexiCWI, LexiRanker, LexiGenerator, LexiRankingFeaturizer
 from lexi.server.util import statuscodes
 from lexi.server.util.html import process_html
 from lexi.server.util.communication import make_response
@@ -127,6 +127,8 @@
 # default_cwi = load_pickled_model(
 #     CWI_MODEL_PATH_TEMPLATE.format("default"))
 default_ranker = LexiRanker("default")
+default_ranker.featurizer = LexiRankingFeaturizer()
+logger.debug("Default Ranker Featurizer: {}".format(default_ranker.featurizer))
 default_cwi = LexiCWI("default")  # TODO pretrain offline and load
 personalized_rankers = {"default": default_ranker}
 personalized_cwi = {"default": default_cwi}
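
Note (illustration, not part of the patch): the feedback items now carry character offsets instead of a token index, so update_ranker() can rebuild each candidate sentence by plain string slicing before featurizing it. The sketch below shows that offset arithmetic on a made-up feedback item; only the field names come from the diff above, while the sentence, choices and values are invented for the example.

# Hypothetical feedback item; only the keys are taken from the patch.
feedback_item = {
    "original": "cumbersome",
    "choices": ["cumbersome", "awkward", "heavy"],
    "selection": 2,                        # update_ranker() maps this to choices[2 % 3]
    "bad_feedback": False,
    "sentence": "The cumbersome process took hours.",
    "word_offset_start": 4,                # character offsets into "sentence"
    "word_offset_end": 14,
}

start = feedback_item["word_offset_start"]
end = feedback_item["word_offset_end"]
sentence = feedback_item["sentence"]

# Same substitution update_ranker() performs before calling the featurizer:
for w in feedback_item["choices"]:
    modified = sentence[:start] + w + sentence[end:]
    # the candidate always spans [start, start + len(w)) in the modified sentence
    print(modified, start, start + len(w))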