diff --git a/lexi/config.py b/lexi/config.py index b6df2a8..b2a5880 100644 --- a/lexi/config.py +++ b/lexi/config.py @@ -4,13 +4,16 @@ LEXI_BASE = os.path.join(SOURCE_BASE, "..") LOG_DIR = os.path.join(LEXI_BASE, "logs") MODELS_DIR = os.path.join(LEXI_BASE, "models") -RANKER_MODELS_DIR = os.path.join(MODELS_DIR, "rankers") -CWI_MODELS_DIR = os.path.join(MODELS_DIR, "cwi") +RANKER_DIR = os.path.join(MODELS_DIR, "rankers") +CWI_DIR = os.path.join(MODELS_DIR, "cwi") +SCORERS_DIR = os.path.join(MODELS_DIR, "scorers") RESOURCES_DIR = os.path.join(LEXI_BASE, "res") STANFORDNLP = os.path.join(RESOURCES_DIR, "stanfordnlp_resources") -RANKER_MODEL_PATH_TEMPLATE = os.path.join(RANKER_MODELS_DIR, "{}.pickle") -CWI_MODEL_PATH_TEMPLATE = os.path.join(CWI_MODELS_DIR, "{}.pickle") +RANKER_PATH_TEMPLATE = os.path.join(RANKER_DIR, "{}.json") +CWI_PATH_TEMPLATE = os.path.join(CWI_DIR, "{}.json") +SCORER_PATH_TEMPLATE = os.path.join(SCORERS_DIR, "{}.json") +SCORER_MODEL_PATH_TEMPLATE = os.path.join(SCORERS_DIR, "{}.pt") LEXICAL_MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}-lexical.pickle") MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}.pickle") diff --git a/lexi/core/endpoints.py b/lexi/core/endpoints.py index 39e294c..2bb2aec 100644 --- a/lexi/core/endpoints.py +++ b/lexi/core/endpoints.py @@ -181,10 +181,9 @@ def update_ranker(ranker, user_id, feedback, overall_rating=0): :return: """ update_batch = [] - featurized_words = {} logger.debug("Updating ranker: {}".format(ranker)) - logger.debug("Ranker has featurizer: {}".format(ranker.featurizer)) + logger.debug("Ranker has featurizer: {}".format(ranker.scorer.featurizer)) # iterate over feedback items (user choices for simplified words) for _, simplification in feedback.items(): @@ -207,35 +206,27 @@ def update_ranker(ranker, user_id, feedback, overall_rating=0): original_sentence = simplification.get("sentence") original_start_offset = simplification.get("word_offset_start") original_end_offset = simplification.get("word_offset_end") - for w in choices: - if w not in featurized_words: - # construct modified sentence - modified_sentence = "{}{}{}".format( - original_sentence[:original_start_offset], - w, - original_sentence[original_end_offset:]) - # featurize word in modified context - logger.debug("Word in modified context: {} {} {} {}".format( - modified_sentence, w, original_start_offset, - original_start_offset+len(w))) - featurized_words[w] = ranker.featurizer.featurize( - modified_sentence, original_start_offset, - original_start_offset+len(w)) simple_index = selection % len(choices) simple_word = choices[simple_index] - difficult_words = [w for w in choices if not w == simple_word] - # add feature vectors to update batch - update_batch.append((featurized_words[simple_word], 0)) - for difficult in difficult_words: - update_batch.append((featurized_words[difficult], 1)) - # update_batch.append((featurized_words[simple_word], - # featurized_words[difficult])) + for w in choices: + # construct modified sentence + modified_sentence = "{}{}{}".format( + original_sentence[:original_start_offset], + w, + original_sentence[original_end_offset:]) + + # put modified sentences into update batch (list of tuples + # (items, label), where items are 3-tuples + # (modified_sent, start_offset, end_offset)) + update_batch.append(( + (modified_sentence, # item + original_start_offset, + original_start_offset + len(w)), + int(w != simple_word))) # label: 0 if w simple, 1 if difficult if update_batch: - update_batch = list(zip(*update_batch)) - # 
print(help(ranker)) ranker.update(update_batch) ranker.save(user_id) else: diff --git a/lexi/core/featurize/featurizers.py b/lexi/core/featurize/featurizers.py index 84af44b..0da73e1 100644 --- a/lexi/core/featurize/featurizers.py +++ b/lexi/core/featurize/featurizers.py @@ -1,254 +1,67 @@ -import numpy as np +import logging +import jsonpickle from sklearn.feature_extraction import DictVectorizer +from sklearn.preprocessing import MinMaxScaler -from lexi.core.featurize import extract_lexical_feats, feat_util -from lexi.core.featurize.extract_sentence_feats import TreeNode -from abc import ABCMeta, abstractmethod - -class LabelMapper: - def __init__(self): - self.label2id = dict() - self.id2label = {} - - def map_batch(self, labels): - out = [] - for label in labels: - out.append(self.map(label)) - return out - - def map(self, label): - if label not in self.label2id: - newId = len(self.label2id) - self.label2id[label] = newId - self.id2label[newId] = label - return self.label2id[label] - - def map_inv(self, ids): - out = [] - for _id in ids: - out.append(self.id2label.get(_id, "?")) - return out - - -class Featurizer: - - def __init__(self, features=None): - self.mapper = LabelMapper() - self.features = features - - def fit_transform(self, data): - return self.transform(data, fit=True) - - def transform(self, data, fit=False): - raise NotImplementedError - - def transform_plain(self, data): - raise NotImplementedError - - def map_labels(self, data): - return self.mapper.map_batch(data) - - def map_inv(self, ids): - return self.mapper.map_inv(ids) - - -class LexicalFeaturizer(Featurizer): - - def __init__(self, features=None): - super().__init__(features) - self.vectorizer = DictVectorizer() - - def featurize_word(self, w): - word = extract_lexical_feats.Word(w) - return word.featurize_by_type(self.features) - - def transform(self, data, fit=False): - feats = [] - labels = [] - for word in data: - feats.append(self.featurize_word(word)) - # labels.append(label) - if fit: - feats = self.vectorizer.fit_transform(feats) - else: - feats = self.vectorizer.transform(feats) - - # labels = np.array(labels) - return feats, labels - - def transform_plain(self, data): - return self.transform(data, fit=False) +logger = logging.getLogger('lexi') -class PystructEdgeFeaturizer(Featurizer): +class LexicalFeaturizer(DictVectorizer): - def __init__(self, features=None): - super().__init__(features) - self.node_vectorizer = DictVectorizer() - self.edge_vectorizer = DictVectorizer() - # self.nlp = spacy.load('en') - print("Loaded natural language processor.") - - def prettyprintweights(self, linearmodel): - for name, value in zip(self.node_vectorizer.feature_names_, linearmodel.coef_[0]): - print("\t".join([name, str(value)])) - - def featurize_sentence(self, s): - nodefeats = [] - edges = [] - edgefeats = [] - labels = [] - for l, i in zip(s["label"], s["idx"]): - i -= 1 - w = TreeNode(s, i, s["form"][i], s["lemma"][i], s["pos"][i], - s["ne"][i], s["head"], s["deprel"], l) - nodefeats.append(w.featurize_by_type(self.features)) - head = int(s["head"][i]) - tgt = head if head > 0 else i+1 - edges.append((tgt-1, i)) - edgefeats.append(w.featurize_by_type(["dependency"])) - labels.append(l) - return nodefeats, edges, edgefeats, labels - - def fit_transform(self, data): - return self.transform(data, fit=True) - - def transform(self, data, fit=False): - labels = [] - X = [] - y = [] - sentence_lengths = [] - Xnodefeats = [] - Xedges = [] - Xedgefeats = [] - print("Collecting features...") - # for s in 
feat_util.read_sentences_plain(data): - for s in feat_util.read_sentences(data): - nodefeats, edges, edgefeats, nodelabels = self.featurize_sentence(s) - sentence_lengths.append(len(nodefeats)) - Xnodefeats.extend(nodefeats) - Xedges.extend(edges) - Xedgefeats.extend(edgefeats) - labels.extend(nodelabels) - - if fit: - Xnodefeats = self.node_vectorizer.fit_transform(Xnodefeats).toarray() - Xedgefeats = self.edge_vectorizer.fit_transform(Xedgefeats).toarray() - else: - Xnodefeats = self.node_vectorizer.transform(Xnodefeats).toarray() - Xedgefeats = self.edge_vectorizer.transform(Xedgefeats).toarray() - i = 0 - for sl in sentence_lengths: - X.append((Xnodefeats[i:i+sl], np.array(Xedges[i:i+sl]), Xedgefeats[i:i+sl])) - y.append(np.array(self.mapper.map_batch(labels[i:i + sl]))) - i = i+sl - - for i in range(len(X)): - if not len(X[i][0]) == len(y[i]): - print("unequal {}: {} vs {}".format(i, len(X[i][0]), len(y[i]))) - return X, y - - def transform_plain(self, data): - X = [] - parses = [] - sentence_lengths = [] - Xnodefeats = [] - Xedges = [] - Xedgefeats = [] - print("Collecting features...") - for s in feat_util.read_sentences_plain(data): - nodefeats, edges, edgefeats, _ = self.featurize_sentence(s) - sentence_lengths.append(len(nodefeats)) - Xnodefeats.extend(nodefeats) - Xedges.extend(edges) - Xedgefeats.extend(edgefeats) - parses.append(s) - - Xnodefeats = self.node_vectorizer.transform(Xnodefeats).toarray() - Xedgefeats = self.edge_vectorizer.transform(Xedgefeats).toarray() - i = 0 - for sl in sentence_lengths: - X.append((Xnodefeats[i:i+sl], np.array(Xedges[i:i+sl]), Xedgefeats[i:i+sl])) - i = i+sl - return X, parses - - -class PystructChainFeaturizer(Featurizer): - - def __init__(self, features=None): - super().__init__(features) - self.node_vectorizer = DictVectorizer() - self.edge_vectorizer = DictVectorizer() - # self.nlp = spacy.load('en') - print("Loaded natural language processor.") - - def prettyprintweights(self, linearmodel): - for name, value in zip(self.node_vectorizer.feature_names_, linearmodel.coef_[0]): - print("\t".join([name, str(value)])) - - def featurize_sentence(self, s): - nodefeats = [] - labels = [] - for l, i in zip(s["label"], s["idx"]): - i -= 1 - w = TreeNode(s, i, s["form"][i], s["lemma"][i], s["pos"][i], - s["ne"][i], s["head"], s["deprel"], l) - nodefeats.append(w.featurize_by_type(self.features)) - labels.append(l) - return nodefeats, labels - - def fit_transform(self, data): - return self.transform(data, fit=True) - - def transform(self, data, fit=False): - labels = [] - X = [] - y = [] - sentence_lengths = [] - Xnodefeats = [] - Xedges = [] - Xedgefeats = [] - print("Collecting features...") - # for s in feat_util.read_sentences_plain(data): - for s in feat_util.read_sentences(data): - nodefeats, nodelabels = self.featurize_sentence(s) - sentence_lengths.append(len(nodefeats)) - Xnodefeats.extend(nodefeats) - labels.extend(nodelabels) - - if fit: - Xnodefeats = self.node_vectorizer.fit_transform(Xnodefeats).toarray() - Xedgefeats = self.edge_vectorizer.fit_transform(Xedgefeats).toarray() + def __init__(self): + super().__init__(sparse=False) + self.scaler = MinMaxScaler() + self.cache = {} + + def __getstate__(self): + state = self.__dict__.copy() + if 'cache' in state: + del state['cache'] + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.cache = {} + + def dimensions(self): + if hasattr(self, "feature_names_"): + return len(self.get_feature_names()) else: - Xnodefeats = 
self.node_vectorizer.transform(Xnodefeats).toarray() - Xedgefeats = self.edge_vectorizer.transform(Xedgefeats).toarray() - i = 0 - for sl in sentence_lengths: - X.append((Xnodefeats[i:i+sl], np.array(Xedges[i:i+sl]), Xedgefeats[i:i+sl])) - y.append(np.array(self.mapper.map_batch(labels[i:i + sl]))) - i = i+sl - - for i in range(len(X)): - if not len(X[i][0]) == len(y[i]): - print("unequal {}: {} vs {}".format(i, len(X[i][0]), len(y[i]))) - return X, y - - def transform_plain(self, data): - X = [] - parses = [] - sentence_lengths = [] - Xnodefeats = [] - print("Collecting features...") - for s in feat_util.read_sentences_plain(data): - nodefeats, _ = self.featurize_sentence(s) - sentence_lengths.append(len(nodefeats)) - Xnodefeats.extend(nodefeats) - parses.append(s) - - Xnodefeats = self.node_vectorizer.transform(Xnodefeats).toarray() - i = 0 - for sl in sentence_lengths: - X.append(Xnodefeats[i:i+sl]) - i = i+sl - return X, parses + logger.warning("Asking for vectorizer dimensionality, " + "but vectorizer has not been fit yet. Returning 0.") + return 0 + + def to_dict(self, sentence, start_offset, end_offset): + featuredict = dict() + featuredict["word_length"] = end_offset - start_offset + featuredict["sentence_length"] = len(sentence) + return featuredict + + def fit(self, words_in_context): + wic_dicts = [self.to_dict(*wic) for wic in words_in_context] + vecs = super().fit_transform(wic_dicts) + self.scaler.fit(vecs) + + def featurize(self, sentence, start_offset, end_offset, scale=True): + cached = self.cache.get((sentence, start_offset, end_offset)) + if cached is not None: + return cached + x = self.transform(self.to_dict(sentence, start_offset, end_offset)) + if scale: + x = self.scaler.transform(x) + self.cache[(sentence, start_offset, end_offset)] = x + return x + + def save(self, path): + json = jsonpickle.encode(self) + with open(path, "w") as jsonfile: + jsonfile.write(json) + + @staticmethod + def staticload(path): + with open(path) as jsonfile: + json = jsonfile.read() + featurizer = jsonpickle.decode(json) + featurizer.cache = {} + return featurizer diff --git a/lexi/core/simplification/lexical.py b/lexi/core/simplification/lexical.py index 95c18af..5848793 100644 --- a/lexi/core/simplification/lexical.py +++ b/lexi/core/simplification/lexical.py @@ -1,20 +1,16 @@ import logging import pickle -import os import jsonpickle import torch -from lexi.config import LEXICAL_MODEL_PATH_TEMPLATE, RANKER_MODEL_PATH_TEMPLATE +from lexi.config import LEXICAL_MODEL_PATH_TEMPLATE, RANKER_PATH_TEMPLATE, \ + SCORER_PATH_TEMPLATE, SCORER_MODEL_PATH_TEMPLATE, CWI_PATH_TEMPLATE from lexi.core.simplification import SimplificationPipeline from lexi.core.simplification.util import make_synonyms_dict, \ parse_embeddings from lexi.core.featurize.featurizers import LexicalFeaturizer from lexi.core.util import util from abc import ABCMeta, abstractmethod -import keras -from keras.layers import Input, Dense -from sklearn.feature_extraction import DictVectorizer -from sklearn.preprocessing import MinMaxScaler logger = logging.getLogger('lexi') @@ -187,165 +183,92 @@ def select(self, sentence, startOffset, endOffset, candidates): class LexiPersonalizedPipelineStep(metaclass=ABCMeta): - def __init__(self, userId=None): + def __init__(self, userId=None, scorer=None): self.userId = userId - self.model = None + self.scorer = scorer + self.scorer_path = None - @abstractmethod - def fresh_train(self, data): - raise NotImplementedError + def set_scorer(self, scorer): + self.scorer = scorer + self.scorer_path = 
scorer.get_path() + + def set_userId(self, userId): + self.userId = userId @abstractmethod def update(self, data): raise NotImplementedError - def save(self, models_path): - path_prefix = os.path.join(models_path, self.userId) - self.model.save(path_prefix+".model.h5") - if hasattr(self, "featurizer") and self.featurizer: - self.featurizer.save(path_prefix+".featurizer") - - def load(self, path): - self.model = keras.models.load_model(path) - - -# class LexiFeaturizer(DictVectorizer): -# -# def __init__(self): -# super().__init__() -# -# def dimensions(self): -# return len(self.get_feature_names()) -# # return 3 -# -# def featurize(self, sentence, startOffset, endOffset): -# featuredict = dict() -# featuredict["word_length"] = endOffset - startOffset -# featuredict["sentence_length"] = len(sentence) -# self.transform(featuredict) -# -# def save(self, path): -# json = jsonpickle.encode(self) -# with open(path, "w") as jsonfile: -# jsonfile.write(json) -# -# @staticmethod -# def staticload(path): -# with open(path) as jsonfile: -# json = jsonfile.read() -# return jsonpickle.decode(json) - - -class LexiCWI(LexiPersonalizedPipelineStep): + def __getstate__(self): + """ + Needed to save pipeline steps using jsonpickle, since this module cannot + handle torch models -- we use torch's model saving functionality + instead. This is the method used by jsonpickle to get the state of the + object when serializing. + :return: + """ + state = self.__dict__.copy() + del state['scorer'] + return state - def __init__(self, userId, featurizer=None): - # self.model = self.build_model() - super().__init__(userId) - self.featurizer = featurizer if featurizer is not None else \ - LexiFeaturizer() - self.model = self.build_model() - self.optimizer = torch.optim.Adam(self.model.parameters()) + def __setstate__(self, state): + self.__dict__.update(state) - def build_model(self): - return LexiScorerNet(self.featurizer.dimensions(), [10, 10]) - def fresh_train(self, cwi_data): - x, y = cwi_data - self.model.fit(x, y, self.optimizer) +class LexiCWI(LexiPersonalizedPipelineStep): - def update(self, cwi_data): - x, y = cwi_data - self.model.fit(x, y, self.optimizer) # TODO updating like this is problematic if we - # want learning rate decay or other things that rely on previous - # iterations, those are not saved in the model or optimizer... + def __init__(self, userId, scorer=None): + super().__init__(userId, scorer) + self.cwi_threshold = 0.67 def identify_targets(self, sent, token_offsets): return [(wb, we) for wb, we in token_offsets if self.is_complex(sent, wb, we)] def is_complex(self, sent, startOffset, endOffset): - x = self.featurizer.featurize(sent, startOffset, endOffset) - logger.debug(x) - cwi_score = self.model(x) - return cwi_score > 0 + cwi_score = self.scorer.score(sent, startOffset, endOffset) + return cwi_score > self.cwi_threshold + def set_cwi_threshold(self, threshold): + self.cwi_threshold = threshold -class LexiFeaturizer(DictVectorizer): - - def __init__(self): - super().__init__(sparse=False) - self.scaler = MinMaxScaler() + def update(self, data): + if self.scorer: + self.scorer.update(data) - def dimensions(self): - if hasattr(self, "feature_names_"): - return len(self.get_feature_names()) - else: - logger.warning("Asking for vectorizer dimensionality, " - "but vectorizer has not been fit yet. 
Returning 0.")
-            return 0
-
-    def to_dict(self, sentence, startOffset, endOffset):
-        featuredict = dict()
-        featuredict["word_length"] = endOffset - startOffset
-        featuredict["sentence_length"] = len(sentence)
-        return featuredict
-
-    def fit(self, words_in_context):
-        wic_dicts = [self.to_dict(*wic) for wic in words_in_context]
-        vecs = super().fit_transform(wic_dicts)
-        self.scaler.fit(vecs)
-
-    def featurize(self, sentence, startOffset, endOffset, scale=True):
-        vecs = self.transform(self.to_dict(sentence, startOffset, endOffset))
-        if scale:
-            vecs = self.scaler.transform(vecs)
-        return vecs
-
-    def save(self, path):
+    def save(self, userId):
         json = jsonpickle.encode(self)
-        with open(path, "w") as jsonfile:
+        with open(CWI_PATH_TEMPLATE.format(userId), 'w') as jsonfile:
             jsonfile.write(json)
 
     @staticmethod
     def staticload(path):
         with open(path) as jsonfile:
             json = jsonfile.read()
-        return jsonpickle.decode(json)
+        cwi = jsonpickle.decode(json)
+        if hasattr(cwi, "scorer_path") and cwi.scorer_path is not None:
+            cwi.set_scorer(LexiScorer.staticload(cwi.scorer_path))
+        else:
+            logger.warning("CWI file does not provide link to a scorer. Set "
+                           "manually with cwi.set_scorer()!")
+        return cwi
 
 
 class LexiRanker(LexiPersonalizedPipelineStep):
 
-    def __init__(self, userId, featurizer=None):
-        super().__init__(userId)
-        self.featurizer = featurizer or LexiFeaturizer()
-        self.model = self.build_model()
-        self.optimizer = torch.optim.Adam(self.model.parameters())
-
-    def build_model(self):
-        return LexiScorerNet(self.featurizer.dimensions(), [10, 10])
-
-    def fresh_train(self, data):
-        x, y = data
-        self.model.fit(x, y, self.optimizer)
+    def __init__(self, userId, scorer=None):
+        super().__init__(userId, scorer)
 
-    def update(self, cwi_data):
-        x, y = cwi_data
-        x = torch.Tensor(x)
-        y = torch.Tensor(y)
-        self.model.fit(x, y, self.optimizer)  # TODO updating like this is
-        # problematic if we want learning rate decay or other things that rely
-        # on previous iterations, those are not saved in the model or optimizer...
-
-    def set_featurizer(self, featurizer):
-        self.featurizer = featurizer
+    def update(self, data):
+        if self.scorer is not None:
+            self.scorer.update(data)
 
     def rank(self, candidates, sentence=None, wb=0, we=0):
         scored_candidates = []
         for candidate in candidates:
             modified_sentence = sentence[:wb] + candidate + sentence[we:]
-            x = self.featurizer.featurize(modified_sentence, wb,
-                                          wb + len(candidate))
-            score = self.model.forward(x)
+            score = self.scorer.score(modified_sentence, wb,
+                                      wb + len(candidate))
             scored_candidates.append((candidate, score))
         logger.debug("Sorted candidates: {}".format(scored_candidates))
         return [candidate for candidate, score in sorted(scored_candidates,
@@ -353,24 +276,95 @@ def rank(self, candidates, sentence=None, wb=0, we=0):
 
     def save(self, userId):
         json = jsonpickle.encode(self)
-        with open(RANKER_MODEL_PATH_TEMPLATE.format(userId), 'w') as jsonfile:
+        with open(RANKER_PATH_TEMPLATE.format(userId), 'w') as jsonfile:
             jsonfile.write(json)
+        self.scorer.save()
 
     @staticmethod
     def staticload(path):
         with open(path) as jsonfile:
             json = jsonfile.read()
-        return jsonpickle.decode(json)
+        ranker = jsonpickle.decode(json)
+        if hasattr(ranker, "scorer_path") and ranker.scorer_path is not None:
+            ranker.set_scorer(LexiScorer.staticload(ranker.scorer_path))
+        else:
+            logger.warning("Ranker file does not provide link to a scorer. Set "
Set " + "manually with ranker.set_scorer()!") + return ranker + + +class LexiScorer: + def __init__(self, userId, featurizer, hidden_dims): + self.userId = userId + self.path = SCORER_PATH_TEMPLATE.format(userId) + self.featurizer = featurizer + self.hidden_dims = hidden_dims + self.model = self.build_model() + self.model_path = SCORER_MODEL_PATH_TEMPLATE.format(self.userId) + self.optimizer = torch.optim.Adam(self.model.parameters()) + self.update_steps = 0 + self.cache = {} + + def __getstate__(self): + state = self.__dict__.copy() + del state['model'], state['cache'] + return state + + def __setstate__(self, state): + self.__dict__.update(state) + self.cache = {} + self.model = self.build_model() + + def get_path(self): + return SCORER_PATH_TEMPLATE.format(self.userId) + + def set_userId(self, userId): + self.userId = userId + self.path = SCORER_PATH_TEMPLATE.format(userId) - def train(self, data, batch_size=64, lr=1e-3, - epochs=30, dev=None, clip=None, early_stopping=None, - l2=1e-5, lr_schedule=None): + def build_model(self): + return LexiScorerNet(self.featurizer.dimensions(), self.hidden_dims) + + def train_model(self, x, y): + self.model.fit(torch.Tensor(x), torch.Tensor(y), self.optimizer) + + def update(self, data): + # TODO do this in one batch (or several batches of more than 1 item...) + for (sentence, start_offset, end_offset), label in data: + x = self.featurizer.featurize(sentence, start_offset, end_offset) + self.model.fit(x, label, self.optimizer) + self.update_steps += 1 + + def score(self, sent, start_offset, end_offset): + cached = self.cache.get((sent, start_offset, end_offset)) + if cached is not None: + return cached + self.model.eval() + x = self.featurizer.featurize(sent, start_offset, end_offset) + score = float(self.model.forward(x)) + self.cache[(sent, start_offset, end_offset)] = score + return score + + def save(self): + # save state of this object, except model (excluded in __getstate__()) + with open(self.get_path(), 'w') as f: + json = jsonpickle.encode(self) + f.write(json) + # save model + torch.save({ + 'model_state_dict': self.model.state_dict() + }, self.model_path) - loss = 0 - optimizer = torch.optim.Adam(self.model.parameters(), lr=lr, - weight_decay=l2) - for input1, input2 in data: - pass # TODO + @staticmethod + def staticload(path): + with open(path) as jsonfile: + json = jsonfile.read() + scorer = jsonpickle.decode(json) + scorer.cache = {} + scorer.model = scorer.build_model() + checkpoint = torch.load(scorer.model_path) + scorer.model.load_state_dict(checkpoint['model_state_dict']) + return scorer class LexiScorerNet(torch.nn.Module): @@ -389,31 +383,18 @@ def forward(self, x): h = torch.relu(layer(h)) return self.out(h) - def fit(self, x, y, optimizer, epochs=1): + def fit(self, x, y, optimizer, epochs=100): for _ in range(epochs): self.train() # optimizer.zero_grad() pred = self.forward(x) # loss = torch.sqrt(torch.mean((y - pred) ** 2)) loss = torch.mean((y - pred)) + print(loss) loss.backward() optimizer.step() -class RankerNet(torch.nn.Module): - def __init__(self, input_size, hidden_sizes): - super(RankerNet, self).__init__() - self.input = torch.nn.Linear(input_size, hidden_sizes[0]) - self.out = torch.nn.Linear(hidden_sizes[0] * 2, 1) - - def forward(self, input1, input2): - l = self.input(torch.Tensor(input1)) - r = self.input(torch.Tensor(input2)) - combined = torch.cat((l.view(-1), r.view(-1))) - return self.out(combined) - - - class DummyLexicalSimplificationPipeline(SimplificationPipeline): def __init__(self, 
userId="anonymous"): self.model = None diff --git a/lexi/server/run_lexi_server.py b/lexi/server/run_lexi_server.py index 80a04b7..e8ac7ad 100644 --- a/lexi/server/run_lexi_server.py +++ b/lexi/server/run_lexi_server.py @@ -13,11 +13,13 @@ DatabaseConnectionError from werkzeug.exceptions import HTTPException -from lexi.config import LEXI_BASE, LOG_DIR, RANKER_MODEL_PATH_TEMPLATE, \ - CWI_MODEL_PATH_TEMPLATE, MODELS_DIR, RESOURCES +from lexi.config import LEXI_BASE, LOG_DIR, RANKER_PATH_TEMPLATE, \ + CWI_PATH_TEMPLATE, MODELS_DIR, RESOURCES, SCORER_PATH_TEMPLATE,\ + SCORER_MODEL_PATH_TEMPLATE from lexi.core.endpoints import update_ranker from lexi.core.simplification.lexical import LexicalSimplificationPipeline, \ - LexiCWI, LexiRanker, LexiGenerator, LexiFeaturizer, LexiFeaturizer + LexiCWI, LexiRanker, LexiGenerator, LexiScorer +from lexi.core.featurize.featurizers import LexicalFeaturizer from lexi.server.util import statuscodes from lexi.server.util.html import process_html from lexi.server.util.communication import make_response @@ -122,18 +124,23 @@ generator = LexiGenerator(synonyms_files=RESOURCES["da"]["synonyms"], embedding_files=RESOURCES["da"]["embeddings"]) simplification_pipeline.setGenerator(generator) -# default_ranker = load_pickled_model( -# RANKER_MODEL_PATH_TEMPLATE.format("default")) -# default_cwi = load_pickled_model( -# CWI_MODEL_PATH_TEMPLATE.format("default")) -featurizer = LexiFeaturizer.staticload("default_featurizer.json") +default_scorer = LexiScorer.staticload(SCORER_PATH_TEMPLATE.format("default")) +logger.debug("SCORER PATH: {}".format(default_scorer.path)) +default_ranker = LexiRanker.staticload(RANKER_PATH_TEMPLATE.format("default")) +default_ranker.set_scorer(default_scorer) +default_cwi = LexiCWI.staticload(CWI_PATH_TEMPLATE.format("default")) +default_cwi.set_scorer(default_scorer) + +# default_ranker = LexiRanker("default", scorer=default_scorer) +# default_cwi = LexiCWI("default", scorer=default_scorer) # TODO pretrain offline and load +# default_ranker.save("default") +# default_cwi.save("default") -default_ranker = LexiRanker("default", featurizer=featurizer) -logger.debug("Default Ranker Featurizer: {}".format(default_ranker.featurizer)) -default_cwi = LexiCWI("default", featurizer=featurizer) # TODO pretrain offline and load personalized_rankers = {"default": default_ranker} personalized_cwi = {"default": default_cwi} +personalized_scorers = {"default": default_scorer} + logger.debug("Default ranker: {} ({})".format(default_ranker, type(default_ranker))) logger.info("Base simplifier loaded.") @@ -167,8 +174,9 @@ def process(): .format(email, website_url)) logger.debug("Simplification request: {}".format(request.json)) user_id = db_connection.get_user(email) - if not user_id: + if not user_id or user_id is None: user_id = 1 # default user. 
TODO issue warning here to user + logger.info("User ID: {}".format(user_id)) request_id = db_connection.insert_session(user_id, website_url, frontend_version=frontend_version, language=language) @@ -212,8 +220,8 @@ def register_user(): # get fields from request email = request.json["email"].lower() # pw_hash = request.json["pw_hash"] - year_of_birth = request.json["year_of_birth"] - education = request.json["education"] + year_of_birth = request.json.get("year_of_birth", 1900) + education = request.json.get("education", "N/A") # get maximum user ID logger.info("New user: {}".format([email, year_of_birth, education])) user = db_connection.get_user(email) @@ -223,7 +231,7 @@ def register_user(): return make_response(statuscodes.EMAIL_ADDRESS_REGISTERED, msg) else: new_user_id = db_connection.insert_user(email) - model_path = RANKER_MODEL_PATH_TEMPLATE.format(new_user_id) + model_path = RANKER_PATH_TEMPLATE.format(new_user_id) db_connection.insert_model(new_user_id, year_of_birth, education, model_path, "ranker") return make_response(statuscodes.OK, "Registration successful") @@ -238,7 +246,7 @@ def login_user(): email = request.json["email"].lower() logger.info("Received login request for user {}".format(email)) user_id = db_connection.get_user(email) - if not user_id: + if not user_id or user_id is None: msg = "Email address {} not found".format(email) logger.info(msg) return make_response(statuscodes.EMAIL_ADDRESS_NOT_FOUND, msg) @@ -252,6 +260,10 @@ def login_user(): def get_feedback(): email = request.json["email"].lower() user_id = db_connection.get_user(email) + if not user_id or user_id is None: + msg = "User ID not available for email address {}".format(email) + logger.error(msg) + return make_response(statuscodes.EMAIL_ADDRESS_NOT_FOUND, msg) simplifications = request.json.get("simplifications", None) feedback_text = request.json.get("feedback_text", "N/A") website = request.json.get("url", "N/A") @@ -269,7 +281,6 @@ def get_feedback(): logger.debug("Getting ranker for user: {}".format(user_id)) ranker = get_personalized_ranker(user_id) logger.debug("Ranker: {}".format(ranker)) - logger.debug(" -- Featurizer: {} ({})".format(ranker.featurizer, hasattr(ranker, "featurizer"))) update_ranker(ranker, user_id, simplifications, rating) return make_response(statuscodes.OK, "Feedback successful") else: @@ -305,14 +316,14 @@ def get_personalized_ranker(user_id): ranker = personalized_rankers[user_id] else: logger.info("Gotta load ranker or use default...") - # retrieve model - # model_path = db_connection.get_model_path(user_id) - # ranker = LexiRanker.load(model_path) - ranker = LexiRanker(user_id, featurizer=featurizer) - # featurizer = ... # retrieve - # ranker.set_featurizer(featurizer) - # ranker.featurizer = LexiRankingFeaturizer() - ranker.userId = user_id + try: + # retrieve model + path = RANKER_PATH_TEMPLATE.format(user_id) + ranker = LexiRanker.staticload(path) + except: + ranker = copy.copy(personalized_rankers["default"]) + ranker.set_userId(user_id) + ranker.set_scorer(get_personalized_scorer(user_id)) personalized_rankers[user_id] = ranker return ranker @@ -325,18 +336,42 @@ def get_personalized_cwi(user_id): logger.info("Gotta load cwi or use default...") try: # retrieve model - model_path = db_connection.get_model_path(user_id) - cwi = LexiCWI(user_id, featurizer=featurizer) + path = CWI_PATH_TEMPLATE.format(user_id) + cwi = LexiCWI.staticload(path) except: logger.warning("Could not load personalized model. 
" "Loading default cwi.") cwi = copy.copy(personalized_cwi["default"]) logger.debug(cwi) - cwi.userId = user_id + cwi.set_userId(user_id) + cwi.set_scorer(get_personalized_scorer(user_id)) personalized_cwi[user_id] = cwi return cwi +def get_personalized_scorer(user_id): + logger.debug("Getting personalized scorer for user {}. In memory? {}".format(user_id, user_id in personalized_scorers)) + if user_id in personalized_scorers: + logger.info("Using personalized scorer, still in memory.") + scorer = personalized_scorers[user_id] + logger.debug("Scorer Featurizer: {}".format(hasattr(scorer, "featurizer"))) + else: + logger.info("Gotta load scorer or use default...") + try: + # retrieve model + path = SCORER_PATH_TEMPLATE.format(user_id) + scorer = LexiScorer.staticload(path) + except: + logger.warning("Could not load personalized model. " + "Loading default scorer.") + path = SCORER_PATH_TEMPLATE.format("default") + scorer = LexiScorer.staticload(path) + scorer.set_userId(user_id) + scorer.model_path = SCORER_MODEL_PATH_TEMPLATE.format(user_id) + personalized_scorers[user_id] = scorer + return scorer + + if __name__ == "__main__": app.run(threaded=True) logger.debug("Rules: " + str([rule for rule in app.url_map.iter_rules()])) diff --git a/scripts/convert_ls_data.py b/scripts/convert_ls_data.py new file mode 100644 index 0000000..5f76c9a --- /dev/null +++ b/scripts/convert_ls_data.py @@ -0,0 +1,28 @@ +import sys + + +def create_modified_sent(tokens, idx, w, lbl): + + tokens_new = tokens + tokens[idx] = w + sent = " ".join(tokens_new) + startOffset = 0 + for i in range(idx): + startOffset += len(tokens_new[i]) + 1 + endOffset = startOffset + len(w) + + return "{}\t{}\t{}\t{}".format(sent, startOffset, endOffset, lbl) + + +infile = open(sys.argv[1]) +outfile = open(sys.argv[2], "w") + +for line in infile: + line = line.strip() + if line: + sent, w, idx, alt1, alt2 = line.split("\t") + toks = sent.split() + out = create_modified_sent(toks, int(idx), alt1.split(":")[1], 0) + outfile.write(out+"\n") + out = create_modified_sent(toks, int(idx), alt2.split(":")[1], 1) + outfile.write(out + "\n") diff --git a/scripts/train_default_scorer.py b/scripts/train_default_scorer.py new file mode 100644 index 0000000..1455405 --- /dev/null +++ b/scripts/train_default_scorer.py @@ -0,0 +1,27 @@ +from lexi.core.simplification.lexical import LexiScorer +from lexi.core.featurize.featurizers import LexicalFeaturizer + +lf = LexicalFeaturizer() + + +items, y = [], [] + +for line in open("res/danish_ls_data.tsv"): + line = line.strip().split("\t") + if line: + items.append((line[0], int(line[1]), int(line[2]))) + y.append(int(line[-1])) + +print(items[:4]) +lf.fit(items) +lf.save("default_featurizer.json") + +x = [lf.featurize(*item) for item in items] + +print(x) +print(y) + +ls = LexiScorer("default", lf, [20, 20]) + +ls.train_model(x, y) +ls.save()