From 48144923696b78c3ea818cea822d42af1e164a99 Mon Sep 17 00:00:00 2001 From: Joachim Bingel Date: Tue, 9 Apr 2019 21:07:45 +0200 Subject: [PATCH] towards 0.3, new structure for simplification pipeline, move away from Pickle --- .gitignore | 3 +- README.md | 5 + lexi/config.py | 10 +- lexi/core/endpoints.py | 22 +- lexi/core/featurize/feat_util.py | 49 +- lexi/core/featurize/featurizers.py | 12 + lexi/core/featurize/functions.py | 1 + lexi/core/simplification/__init__.py | 44 +- lexi/core/simplification/lexical.py | 435 +-- lexi/core/simplification/structured.py | 16 +- lexi/core/simplification/util.py | 61 + lexi/lib/__init__.py | 0 lexi/lib/lexenstein/__init__.py | 0 lexi/lib/lexenstein/evaluators.py | 572 ---- lexi/lib/lexenstein/features.py | 3547 ------------------------ lexi/lib/lexenstein/generators.py | 2129 -------------- lexi/lib/lexenstein/identifiers.py | 395 --- lexi/lib/lexenstein/morphadorner.py | 175 -- lexi/lib/lexenstein/rankers.py | 1450 ---------- lexi/lib/lexenstein/selectors.py | 1569 ----------- lexi/lib/lexenstein/spelling.py | 74 - lexi/lib/lexenstein/util.py | 383 --- lexi/lib/lib.py | 795 ------ lexi/server/run_lexi_server.py | 90 +- lexi/server/util/database.py | 8 +- lexi/server/util/html.py | 8 +- requirements.txt | 3 +- scripts/train_default_classifier.py | 41 +- 28 files changed, 487 insertions(+), 11410 deletions(-) create mode 100644 lexi/core/featurize/functions.py create mode 100644 lexi/core/simplification/util.py delete mode 100644 lexi/lib/__init__.py delete mode 100755 lexi/lib/lexenstein/__init__.py delete mode 100755 lexi/lib/lexenstein/evaluators.py delete mode 100755 lexi/lib/lexenstein/features.py delete mode 100755 lexi/lib/lexenstein/generators.py delete mode 100755 lexi/lib/lexenstein/identifiers.py delete mode 100755 lexi/lib/lexenstein/morphadorner.py delete mode 100755 lexi/lib/lexenstein/rankers.py delete mode 100755 lexi/lib/lexenstein/selectors.py delete mode 100755 lexi/lib/lexenstein/spelling.py delete mode 100755 lexi/lib/lexenstein/util.py delete mode 100644 lexi/lib/lib.py diff --git a/.gitignore b/.gitignore index 96af683..c019046 100644 --- a/.gitignore +++ b/.gitignore @@ -5,4 +5,5 @@ models/* *.pyc trash/ .idea -lexi.cfg \ No newline at end of file +lexi.cfg +lexi/res/* diff --git a/README.md b/README.md index 604134d..f51ceef 100644 --- a/README.md +++ b/README.md @@ -2,6 +2,11 @@ ## Changelog + +### Version 0.3 ++ no more pickling! 
++ POS-based synonym selection + ### Version 0.2.5 + more general database error handling diff --git a/lexi/config.py b/lexi/config.py index 7fbb12b..e2d1446 100644 --- a/lexi/config.py +++ b/lexi/config.py @@ -5,13 +5,17 @@ LOG_DIR = os.path.join(LEXI_BASE, "logs") MODELS_DIR = os.path.join(LEXI_BASE, "models") RANKER_MODELS_DIR = os.path.join(MODELS_DIR, "rankers") +CWI_MODELS_DIR = os.path.join(MODELS_DIR, "cwi") RESOURCES_DIR = os.path.join(LEXI_BASE, "res") +STANFORDNLP = os.path.join(RESOURCES_DIR, "stanfordnlp_resources") RANKER_MODEL_PATH_TEMPLATE = os.path.join(RANKER_MODELS_DIR, "{}.pickle") +CWI_MODEL_PATH_TEMPLATE = os.path.join(CWI_MODELS_DIR, "{}.pickle") + LEXICAL_MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}-lexical.pickle") MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}.pickle") -RESOURCES = { +RESOURCES_FULL = { "da": { "embeddings": #[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_ @@ -25,10 +29,10 @@ "ranking_training_dataset": RESOURCES_DIR+"/da/simplification/clean_danish_ls_dataset.txt", "synonyms": - RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"} + [RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"]} } -RESOURCES_TEST = { +RESOURCES = { "da": { "embeddings": [RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_" diff --git a/lexi/core/endpoints.py b/lexi/core/endpoints.py index 66be11c..f933974 100644 --- a/lexi/core/endpoints.py +++ b/lexi/core/endpoints.py @@ -35,7 +35,7 @@ def process_html_structured(classifier, html, ranker, parId): spanId = 0 if not html.strip(): return html - output_sents = classifier.predict_text(html, ranker) + output_sents = classifier.simplify_text(html, ranker) for original, simple in zip(*output_sents): simple_parsed = parser.parse_sent(simple) logger.debug([simple_parsed, simple.replace('\n', ''), parser]) @@ -64,17 +64,18 @@ def process_html_structured(classifier, html, ranker, parId): return " ".join(html_out), simplifications -def process_html_lexical(classifier, html, startOffset, endOffset, ranker, +def process_html_lexical(pipeline, html, startOffset, endOffset, cwi, ranker, requestId=0, min_similarity=0.7, blacklist=None): """ Transforms HMTL source, enriching simplified words with core markup by separating markup from text and sending pure text to simplification class. 
- :param classifier: Simplification classifier instance + :param pipeline: Simplification pipeline instance :param html: Input HTML source :param startOffset: offset after which simplifications are solicited :param endOffset: offset until which simplifications are solicited + :param cwi: personalized CWI module :param ranker: personalized ranker :param requestId: Request identifier to disambiguate core simplification targets across multiple calls to this method @@ -115,8 +116,8 @@ def get_local_hyperlink_balance(tags): # output is a sequence of tokens including whitespaces, id2simplification # is a dict mapping token IDs to simplifications, if applicable offset2html, pure_text = util.filter_html(html) - offset2simplification = classifier.predict_text( - pure_text, startOffset, endOffset, ranker, + offset2simplification = pipeline.simplify_text( + pure_text, startOffset, endOffset, cwi=cwi, ranker=ranker, min_similarity=min_similarity, blacklist=blacklist) logger.debug("Simplifying text between character offsets {} " "and {}: {}".format(startOffset, endOffset, pure_text)) @@ -130,11 +131,12 @@ def get_local_hyperlink_balance(tags): html_out += "".join(offset2html[i]) if i in offset2simplification and not open_hyperlinks_count > 0: # checking for hyperlinks because we don't want to simplify those - original, simple, sentence, word_index = offset2simplification[i] + original, replacements, \ + sentence, word_index = offset2simplification[i] # in future, possibly get more alternatives, and possibly return # in some other order - choices = [original, simple] - simple = util.escape(simple) + replacements = [util.escape(r) for r in replacements] + choices = [original] + replacements spanId += 1 elemId = "lexi_{}_{}".format(requestId, spanId) displaying_original = "true" if choices[0] == original else "false" @@ -151,7 +153,7 @@ def get_local_hyperlink_balance(tags): {elemId: { "request_id": requestId, "original": original, - "simple": simple, # legacy for frontend version <= 0.2 + "simple": replacements, # legacy for frontend version <= 0.2 "choices": choices, "bad_feedback": False, "selection": 0, @@ -168,6 +170,7 @@ def get_local_hyperlink_balance(tags): return html_out, simplifications +# TODO adapt to new structure def update_classifier(classifier, feedback): """ Featurizes simplification feedback from user and updates classifier @@ -191,6 +194,7 @@ def update_classifier(classifier, feedback): classifier.featurize_train(xs, ys) +# TODO adapt to new structure def update_ranker(ranker, user_id, feedback, overall_rating=0): """ Collects feedback and updates ranker diff --git a/lexi/core/featurize/feat_util.py b/lexi/core/featurize/feat_util.py index f762344..4fe6d97 100644 --- a/lexi/core/featurize/feat_util.py +++ b/lexi/core/featurize/feat_util.py @@ -2,14 +2,17 @@ import networkx as nx import numpy as np -import spacy +# import stanfordnlp from networkx.algorithms.traversal.depth_first_search import dfs_edges - +from lexi.config import STANFORDNLP from lexi.core.featurize.util import resources COMMA = "," VERB = "V" -nlp = spacy.load('en') +# nlp = stanfordnlp.Pipeline(nlp = stanfordnlp.Pipeline( +# processors='tokenize,mwt,pos', +# lang='da', models_dir=STANFORDNLP, +# tokenize_pretokenized=True)) class EtymWN: @@ -121,26 +124,26 @@ def has_ancestor_in_lang(lang, word_etym): return True return False - -def read_sentences_plain(raw_data): - doc = nlp(raw_data) - words_seen = 0 - for s in doc.sents: - sent = defaultdict(list) - for i, w in enumerate(s): - sent["idx"].append(i+1) - 
sent["form"].append(w.text) - sent["lemma"].append(w.lemma_) - sent["pos"].append(w.pos_) - ne = w.ent_type_ if w.ent_type_ else "O" - sent["ne"].append(ne) - # target = w.head.i - words_seen if w.dep_.lower() != "root" else -1 - target = w.head.i - words_seen - sent["head"].append(target+1) - sent["deprel"].append(w.dep_) - sent["label"].append("?") - words_seen += len(s) - yield sent +# +# def read_sentences_plain(raw_data): +# doc = nlp(raw_data) +# words_seen = 0 +# for s in doc.sentences: +# sent = defaultdict(list) +# for i, w in enumerate(s): +# sent["idx"].append(i+1) +# sent["form"].append(w.text) +# sent["lemma"].append(w.lemma_) +# sent["pos"].append(w.pos_) +# ne = w.ent_type_ if w.ent_type_ else "O" +# sent["ne"].append(ne) +# # target = w.head.i - words_seen if w.dep_.lower() != "root" else -1 +# target = w.head.i - words_seen +# sent["head"].append(target+1) +# sent["deprel"].append(w.dep_) +# sent["label"].append("?") +# words_seen += len(s) +# yield sent def read_sentences(data): diff --git a/lexi/core/featurize/featurizers.py b/lexi/core/featurize/featurizers.py index ed04eea..16a0052 100644 --- a/lexi/core/featurize/featurizers.py +++ b/lexi/core/featurize/featurizers.py @@ -3,6 +3,7 @@ from lexi.core.featurize import extract_lexical_feats, feat_util from lexi.core.featurize.extract_sentence_feats import TreeNode +from abc import ABCMeta, abstractmethod class LabelMapper: @@ -30,6 +31,17 @@ def map_inv(self, ids): return out +class LexiFeaturizer(metaclass=ABCMeta): + + @abstractmethod + def save(self, path): + raise NotImplementedError + + @abstractmethod + def load(self, path): + raise NotImplementedError + + class Featurizer: def __init__(self, features=None): diff --git a/lexi/core/featurize/functions.py b/lexi/core/featurize/functions.py new file mode 100644 index 0000000..8a4e560 --- /dev/null +++ b/lexi/core/featurize/functions.py @@ -0,0 +1 @@ +# # # Feature Functions diff --git a/lexi/core/simplification/__init__.py b/lexi/core/simplification/__init__.py index 303f994..19f7d9f 100644 --- a/lexi/core/simplification/__init__.py +++ b/lexi/core/simplification/__init__.py @@ -1,47 +1,9 @@ from abc import ABCMeta, abstractmethod -from sacremoses import MosesDetokenizer -detokenizer = MosesDetokenizer() - -class Classifier(metaclass=ABCMeta): - # @abstractmethod - # def fresh_train(self, x, y): - # pass - - @abstractmethod - def predict(self, x, ranker=None): - raise NotImplementedError +class SimplificationPipeline(metaclass=ABCMeta): @abstractmethod - def predict_text(self, txt, startOffset=0, endOffset=None, ranker=None): + def simplify_text(self, txt, startOffset=0, endOffset=None, + cwi=None, ranker=None): raise NotImplementedError - - @abstractmethod - def update(self, x, y): - raise NotImplementedError - - @abstractmethod - def save(self): - raise NotImplementedError - - @abstractmethod - def load(self, model_id): - raise NotImplementedError - - @abstractmethod - def load_default_init(self): - raise NotImplementedError - - @abstractmethod - def check_featurizer_set(self): - raise NotImplementedError - - -# Classifier.register(DummyLexicalClassifier) -# Classifier.register(PystructClassifier) -# Classifier.register(LexensteinSimplifier) -# PystructClassifier.register(ChainCRFClassifier) -# PystructClassifier.register(EdgeCRFClassifier) -# Classifier.register(AveragedPerceptron) -# Classifier.register(OnlineStructuredPerceptron) diff --git a/lexi/core/simplification/lexical.py b/lexi/core/simplification/lexical.py index 4e9586c..af99b1a 100644 --- 
a/lexi/core/simplification/lexical.py +++ b/lexi/core/simplification/lexical.py @@ -1,115 +1,69 @@ import logging import pickle +import os -from lexi.config import RESOURCES, LEXICAL_MODEL_PATH_TEMPLATE -from lexi.core.simplification import Classifier -from lexi.core.featurize.featurizers import LexicalFeaturizer +from lexi.config import LEXICAL_MODEL_PATH_TEMPLATE, RANKER_MODEL_PATH_TEMPLATE +from lexi.core.simplification import SimplificationPipeline +from lexi.core.simplification.util import make_synonyms_dict, \ + parse_embeddings +from lexi.core.featurize.featurizers import LexicalFeaturizer, LexiFeaturizer from lexi.core.util import util -from lexi.lib.lexenstein.features import FeatureEstimator -from lexi.lib.lib import LexensteinGenerator, BoundaryRanker, BoundarySelector,\ - OnlineRegressionRanker, SynonymDBGenerator - +from abc import ABCMeta, abstractmethod +import keras +from keras.layers import Input, Dense +from sklearn.feature_extraction import DictVectorizer logger = logging.getLogger('lexi') -class LexensteinSimplifier(Classifier): +class LexicalSimplificationPipeline(SimplificationPipeline): def __init__(self, userId, language="da"): self.language = language self.userId = userId + self.cwi = None self.generator = None self.selector = None self.ranker = None - # self.fresh_train(resources) - - def generateCandidates(self, sent, target, index, min_similarity=0.6): - # Produce candidates: - subs = self.generator.getSubstitutionsSingle( - sent, target, index, min_similarity=min_similarity) - # Create input data instance: - fulldata = [sent, target, index] - for sub in subs[target]: - fulldata.append('0:'+sub) - fulldata = [fulldata] - - # Return requested structures: - return fulldata - - def selectCandidates(self, data): - # # If there are not enough candidates to be selected, select none: - # if len(data[0]) < 5: - # selected = [[]] - # else: - selected = self.selector.selectCandidates( - data, 0.65, proportion_type='percentage') - - # Produce resulting data: - fulldata = [data[0][0], data[0][1], data[0][2]] - for sub in selected[0]: - fulldata.append('0:'+sub) - fulldata = [fulldata] - - # Return desired objects: - return fulldata - - def rankCandidates(self, data, ranker=None): - # Rank selected candidates: - if ranker: - ranks = ranker.getRankings(data) - elif self.ranker: - ranks = self.ranker.getRankings(data) - else: - raise AttributeError("No ranker provided to lexical simplifier.") - # TODO just return unranked/randomly ranked data? 
- return ranks - - def get_replacement(self, sent, word, index, ranker=None, - min_similarity=0.6): - candidates = self.generateCandidates(sent, word, index, - min_similarity=min_similarity) - logger.debug("Candidates {}".format(candidates)) - candidates = self.selectCandidates(candidates) - logger.debug("Candidates (selected) {}".format(candidates)) - candidates = self.rankCandidates(candidates, ranker) - logger.debug("Candidates (ranked) {}".format(candidates)) - replacement = "" - if candidates and len(candidates[0]) > 0: - try: - replacement = candidates[0][0].decode('utf8') - except (UnicodeDecodeError, AttributeError): - replacement = candidates[0][0] - # heuristics: if target and candidate are too similar, exclude (probably - # just morphological variation) - if replacement and util.relative_levenshtein(word, replacement) < 0.2: - return "" - return replacement - - def predict_text(self, text, startOffset=0, endOffset=None, - ranker=None, min_similarity=0.6, blacklist=None): - """ - Receives pure text, without HTML markup, as input and returns - simplifications for character offsets. - :param text: the input string - :param startOffset: offset after which simplifications are solicited - :param endOffset: offset until which simplifications are solicited. If - None, this will be set to the entire text length - :param ranker: a personalized ranker - :param min_similarity: minimum similarity for generator, if available - :param blacklist: list of words not to be simplified - :return: a dictionary mapping character offset anchors to - simplifications, which are 4-tuples (original_word, simplified_word, - sentence, original_word_index) - """ - if not blacklist: - blacklist = [] - def to_be_simplified(_word): - return len(_word) > 4 and _word not in blacklist + def generateCandidates(self, sent, startOffset, endOffset, + min_similarity=0.6): + if self.generator is not None: + return self.generator.getSubstitutions( + sent[startOffset:endOffset], min_similarity=min_similarity) + return [] + + def selectCandidates(self, sent, startOffset, endOffset, candidates): + if self.selector is not None: + return self.selector.select(sent, startOffset, endOffset, + candidates) + return candidates # fallback if selector not set + + def setCwi(self, cwi): + self.cwi = cwi - if not endOffset: - endOffset = len(text) + def setRanker(self, ranker): + self.ranker = ranker + def setGenerator(self, generator): + self.generator = generator + + def setSelector(self, selector): + self.selector = selector + + def simplify_text(self, text, startOffset=0, endOffset=None, cwi=None, + ranker=None, min_similarity=0.6, blacklist=None): + """ + Full lexical simplification pipeline. 
+ :param text: + :param startOffset: + :param endOffset: + :param cwi: + :param ranker: + :param min_similarity: + :param blacklist: + :return: + """ startOffset = max(0, startOffset) endOffset = min(len(text), endOffset) @@ -122,101 +76,228 @@ def to_be_simplified(_word): # after the selection if se < startOffset or sb > endOffset: continue + sent = text[sb:se] - word_offsets = util.span_tokenize_words(sent) - for i, (wb, we) in enumerate(word_offsets): - # make sure we're within start/end offset + token_offsets = util.span_tokenize_words(sent) + + for i, (wb, we) in enumerate(token_offsets): global_word_offset_start = sb + wb global_word_offset_end = sb + we - if global_word_offset_start >= startOffset and \ - global_word_offset_end <= endOffset: - word = sent[wb:we] - logger.debug("Trying to simplify: {}".format(word)) - if to_be_simplified(word): - try: - replacement = self.get_replacement(sent, word, - str(i), ranker, - min_similarity) - except (IndexError, ValueError): - replacement = "" - if replacement: - - # This is where the output is generated - offset2simplification[global_word_offset_start] = \ - (word, replacement, sent, i) - else: - logger.debug("Found no simplification " - "for: {}".format(word)) - else: - logger.debug("Some rule prevents simplification " - "for: {}".format(word)) + if global_word_offset_start < startOffset or \ + global_word_offset_end > endOffset: + continue + + # STEP 1: TARGET IDENTIFICATION + complex_word = True # default case, e.g. for when no CWI module + # provided for single-word requests + if cwi: + complex_word = cwi.is_complex(sent, wb, we) + elif self.cwi: + complex_word = self.cwi.is_complex(sent, wb, we) + if not complex_word: + continue + + logger.debug("Identified targets: {}".format(sent[wb:we])) + + # STEP 2: CANDIDATE GENERATION + candidates = self.generateCandidates( + sent, wb, we, min_similarity=min_similarity) + if not candidates: + logger.debug("No candidate replacements found " + "for '{}'.".format(sent[wb:we])) + continue + logger.debug("Candidate replacements: {}.".format(candidates)) + + # STEP 3: CANDIDATE SELECTION + candidates = self.selectCandidates(sent, wb, we, candidates) + if not candidates: + logger.debug("No valid replacements in context.") + continue + logger.debug("Filtered replacements: {}.".format(candidates)) + + # STEP 4: RANKING + if ranker: + ranking = ranker.rank(candidates) + elif self.ranker: + ranking = self.ranker.rank(candidates) + else: + ranking = candidates + offset2simplification[global_word_offset_start] = \ + (sent[wb:we], ranking, sent, i) return offset2simplification - def load_default_init(self): - self.load("default") - def predict(self, x, ranker=None): +class LexiGenerator: + + def __init__(self, language="da", synonyms_files=(), embedding_files=()): + self.language = language + self.thesaura = [make_synonyms_dict(sf) for sf in synonyms_files] + self.w2v_model = parse_embeddings(embedding_files) + + def getSubstitutions(self, word, sources=("thesaurus", "embeddings"), + min_similarity=0.0, eager_return=True): + """ + Get substitutions from different types of sources (e.g. thesaura, + embeddings). Using `eager_return`, this method can return substitutions + as soon as one of the sources provides substitutions, such that e.g. + low-quality substitutions from embeddings do not dilute gold synonyms + from a thesaurus. + :param word: the target word to replace + :param sources: which types of sources to use for mining substitutions. + Valid options are `thesaurus` and `embeddings`. 
+ :param min_similarity: For embedding substitions, defines the cosine + similarity theshold for a candidate to be considered a synonym + :param eager_return: if True, return found substitutions as soon as + one of the sources provides candidates + :return: + """ + subs = set() + for src in sources: + if src == "thesaurus": + subs = self.getSubstitutionsThesaurus(word) + elif src == "embeddings": + subs = self.getSubstitutionsEmbeddings(word, min_similarity) + if subs and eager_return: + return subs + return subs + + def getSubstitutionsEmbeddings(self, word, min_similarity=0.6): + return set([w for w, score in + self.w2v_model.most_similar(word, min_similarity)]) + + def getSubstitutionsThesaurus(self, word): + substitutions = set() + for t in self.thesaura: + substitutions.update(t.get(word, [])) + return substitutions + + +class LexiSelector: + + def __init__(self, language="da"): + self.language = language + + def select(self, sentence, startOffset, endOffset, candidates): + return candidates # TODO implement properly + + +class LexiPersonalizedPipelineStep(metaclass=ABCMeta): + + def __init__(self, userId=None): + self.userId = userId + self.model = None + self.featurizer = None + + @abstractmethod + def fresh_train(self, data): raise NotImplementedError - def update(self, x, y): + @abstractmethod + def update(self, data): raise NotImplementedError - def save(self): - with open(LEXICAL_MODEL_PATH_TEMPLATE.format(self.userId), 'wb') as pf: - pickle.dump((self.language, self.userId, self.generator, - self.selector, self.ranker), pf, - pickle.HIGHEST_PROTOCOL) + def save(self, models_path): + path_prefix = os.path.join(models_path, self.userId) + self.model.save(path_prefix+".model.h5") + self.featurizer.save(path_prefix+".featurizer") - def load(self, userId=None): - if not userId: - userId = self.userId - with open(LEXICAL_MODEL_PATH_TEMPLATE.format(userId), 'rb') as pf: - unpickled = pickle.load(pf) - logger.debug(unpickled) - (self.language, self.userId, self.generator, - self.selector, self.ranker) = unpickled - return self + def load(self, path): + self.model = keras.models.load_model(path) - def fresh_train(self, resources=None): - if not resources: - try: - resources = RESOURCES[self.language] - except KeyError: - logger.error("Couldn't find resources for language " - "ID {}".format(self.language)) - # General purpose - w2vpm = resources['embeddings'] - # Generator - # gg = LexensteinGenerator(w2vpm) - # gg = SynonymDBGenerator(w2vpm, resources['synonyms']) - gg = LexensteinGenerator(w2vpm) - - # Selector - fe = FeatureEstimator() - fe.resources[w2vpm[0]] = gg.model - fe.addCollocationalFeature(resources['lm'], 2, 2, 'Complexity') - fe.addWordVectorSimilarityFeature(w2vpm[0], 'Simplicity') - br = BoundaryRanker(fe) - bs = BoundarySelector(br) - bs.trainSelectorWithCrossValidation(resources['ubr'], 1, 5, 0.25, - k='all') - # Ranker - fe = FeatureEstimator() - fe.addLengthFeature('Complexity') - fe.addCollocationalFeature(resources['lm'], 2, 2, 'Simplicity') - orr = OnlineRegressionRanker(fe, None, training_dataset=resources[ - 'ranking_training_dataset']) - # Return LexicalSimplifier object - self.generator = gg - self.selector = bs - self.ranker = orr - return self - def check_featurizer_set(self): - return True +class LexiCWIFeaturizer(LexiFeaturizer, DictVectorizer): + + def __init__(self): + super().__init__(self) + + def save(self, path): + pass # TODO + + def load(self, path): + pass # TODO + + def dimensions(self): + # return len(self.get_feature_names()) + return 3 + + 
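# ---------------------------------------------------------------------------
# Rough usage sketch (hypothetical, not part of the patched files): how the
# pipeline pieces introduced above are meant to be composed, using the
# setters on LexicalSimplificationPipeline together with the LexiCWI and
# LexiRanker steps defined below. The resource paths are placeholders; the
# real ones are configured via RESOURCES in lexi/config.py.
from lexi.core.simplification.lexical import (
    LexicalSimplificationPipeline, LexiGenerator, LexiSelector, LexiCWI,
    LexiRanker)

pipeline = LexicalSimplificationPipeline(userId="default", language="da")
pipeline.setGenerator(LexiGenerator(
    language="da",
    synonyms_files=["res/da/synonyms/da_synonyms_combined.csv"],   # placeholder path
    embedding_files=["res/da/embeddings/danish_vectors.bin"]))     # placeholder path
pipeline.setSelector(LexiSelector(language="da"))
pipeline.setCwi(LexiCWI("default"))
pipeline.setRanker(LexiRanker("default"))

text = "Han beundrede den ekstraordinære bygning."
# Returns {char_offset: (original_word, ranked_replacements, sentence, token_index)}
simplifications = pipeline.simplify_text(text, startOffset=0,
                                         endOffset=len(text))
# ---------------------------------------------------------------------------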
+class LexiCWI(LexiPersonalizedPipelineStep): + + def __init__(self, userId, featurizer=None): + self.featurizer = featurizer if featurizer else LexiCWIFeaturizer() + # self.model = self.build_model() + super().__init__(userId) + + def build_model(self, ): + n_input = self.featurizer.dimensions() + i = Input(shape=(n_input,)) + o = Dense([2]) + model = keras.models.Model(Input(n_input), ) + return model + + def fresh_train(self, cwi_data): + x, y = cwi_data + self.model.fit(x, y) + + def update(self, cwi_data): + x, y = cwi_data + self.model.fit(x, y) # TODO updating like this is problematic if we + # want learning rate decay or other things that rely on previous + # iterations, those are not saved in the model or optimizer... + + def identify_targets(self, sent, token_offsets): + return token_offsets # TODO implement, use is_complex + + def is_complex(self, sent, startOffset, endOffset): + return endOffset-startOffset > 7 # TODO implement properly + + +class LexiRankingFeaturizer(LexiFeaturizer, DictVectorizer): + + def __init__(self): + super().__init__(self) + + def save(self, path): + pass # TODO + + def load(self, path): + pass # TODO + + def dimensions(self): + # return len(self.get_feature_names()) + return 3 + + +class LexiRanker(LexiPersonalizedPipelineStep): + + def __init__(self, userId, featurizer=None): + self.userId = userId + self.featurizer = featurizer if featurizer else LexiRankingFeaturizer() + self.model = self.build_model() + super().__init__(userId) + + def build_model(self): + pass + + def rank(self, candidates, sentence=None, index=None): + return sorted(candidates, key=lambda x: len(x)) + + def save(self, userId): + with open(RANKER_MODEL_PATH_TEMPLATE.format(userId), 'wb') as pf: + # pickle.dump((self.fe, self.model), pf, pickle.HIGHEST_PROTOCOL) + pickle.dump(self, pf, pickle.HIGHEST_PROTOCOL) + + def load(self, path): + pass + + def fresh_train(self, x, y): + pass + + def update(self, x, y): + pass -class DummyLexicalClassifier(Classifier): +class DummyLexicalSimplificationPipeline(SimplificationPipeline): def __init__(self, userId="anonymous"): self.model = None self.featurizer = None @@ -267,9 +348,13 @@ def load_default_init(self): with open(LEXICAL_MODEL_PATH_TEMPLATE.format("default"), 'rb') as pf: self.model, self.featurizer = pickle.load(pf) - def predict_text(self, txt, ranker=None): + def simplify_text(self, txt, startOffset=0, endOffset=None, + cwi=None, ranker=None): """ :param txt: + :param startOffset: + :param endOffset: + :param cwi: :param ranker: :return: tokenized text (incl. 
word-final whitespaces) and id2simplifications dict diff --git a/lexi/core/simplification/structured.py b/lexi/core/simplification/structured.py index 3384386..be953fe 100644 --- a/lexi/core/simplification/structured.py +++ b/lexi/core/simplification/structured.py @@ -14,13 +14,13 @@ from lexi.core.featurize.featurizers import PystructChainFeaturizer, \ PystructEdgeFeaturizer from lexi.config import MODEL_PATH_TEMPLATE -from lexi.core.simplification import Classifier +from lexi.core.simplification import SimplificationPipeline from lexi.core.simplification import detokenizer logger = logging.getLogger('lexi') -class PystructClassifier(Classifier, metaclass=ABCMeta): +class PystructSimplificationPipeline(SimplificationPipeline, metaclass=ABCMeta): def __init__(self, userId="anonymous"): self.model = None self.learner = None @@ -84,7 +84,7 @@ def load_default_init(self): with open(MODEL_PATH_TEMPLATE.format("default"), 'rb') as pf: self.learner, self.model, self.featurizer = pickle.load(pf) - def predict_text(self, input_txt): + def simplify_text(self, input_txt): original = [] simplified = [] X, parses = self.featurizer.transform_plain(input_txt) @@ -123,7 +123,7 @@ def predict_text(self, input_txt): return original, simplified -class ChainCRFClassifier(PystructClassifier): +class ChainCRFClassifier(PystructSimplificationPipeline): def fresh_train(self, x, y, iterations=10): self.model = ChainCRF(inference_method="max-product") @@ -146,7 +146,7 @@ def check_featurizer_set(self): # self.fresh_train(x, y, iterations=iterations) -class EdgeCRFClassifier(PystructClassifier): +class EdgeCRFClassifier(PystructSimplificationPipeline): def fresh_train(self, x, y, iterations=10, decay_rate=1): self.model = EdgeFeatureGraphCRF(inference_method="max-product") @@ -169,7 +169,7 @@ def check_featurizer_set(self): # self.fresh_train(x, y, iterations=iterations) -class AveragedPerceptron(Classifier): +class AveragedPerceptron(SimplificationPipeline): """ An averaged perceptron, as implemented by Matthew Honnibal. 
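# ---------------------------------------------------------------------------
# The renames above follow the new abstract base in
# lexi/core/simplification/__init__.py: pipelines now implement
# simplify_text() rather than predict_text(). A minimal, hypothetical
# subclass (toy behaviour, for illustration only) might look like this:
from lexi.core.simplification import SimplificationPipeline


class UppercasingPipeline(SimplificationPipeline):
    """Toy pipeline that just upper-cases the requested span."""

    def simplify_text(self, txt, startOffset=0, endOffset=None,
                      cwi=None, ranker=None):
        endOffset = len(txt) if endOffset is None else endOffset
        span = txt[startOffset:endOffset]
        return {startOffset: (span, [span.upper()], txt, 0)}
# ---------------------------------------------------------------------------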
@@ -246,7 +246,7 @@ def load(self, path): return None -class OnlineStructuredPerceptron(Classifier): +class OnlineStructuredPerceptron(SimplificationPipeline): """Implements a first order CRF""" def __init__(self, @@ -397,7 +397,7 @@ def compute_scores(self, sequence): def predict(self, x): return self.viterbi_decode(x)[0] - def predict_text(self, txt): + def simplify_text(self, txt): # TODO pass diff --git a/lexi/core/simplification/util.py b/lexi/core/simplification/util.py new file mode 100644 index 0000000..eccc96c --- /dev/null +++ b/lexi/core/simplification/util.py @@ -0,0 +1,61 @@ +import gensim +import numpy as np + + +class W2VModelEnsemble: + + def __init__(self, models): + self.models = models + + def most_similar(self, target, min_similarity=0.5, topn=10): + + all_similar_words = set() + for model in self.models: + if target in model: + all_similar_words.update([w for w, sim in + model.most_similar(target, topn=topn) + if sim > min_similarity]) + candidate_mean_scores = [] + for w in all_similar_words: + mean_score = np.mean([model.similarity(target, w) + for model in self.models + if w in model and target in model]) + candidate_mean_scores.append((w, mean_score)) + + # sort + most_similar = sorted(candidate_mean_scores, key=lambda x: x[1], + reverse=True) + # select top n + return most_similar[:topn] + + def similarity(self, w1, w2): + return np.mean([model.similarity(w1, w2) for model in self.models]) + + +def make_synonyms_dict(synonyms_file): + """ + + :param synonyms_file: + :return: + """ + from collections import defaultdict + words2synonyms = defaultdict(set) + for line in open(synonyms_file): + tgt, syns = line.strip().split("\t", 1) + words2synonyms[tgt].update(syns.split(";")) + return words2synonyms + + +def parse_embeddings(embeddings_files): + individual_models = [] + for model_file in embeddings_files: + try: + _model = gensim.models.KeyedVectors.load_word2vec_format( + model_file, binary=True, unicode_errors='ignore') + except UnicodeDecodeError: + try: + _model = gensim.models.KeyedVectors.load(model_file) + except: + continue + individual_models.append(_model) + return W2VModelEnsemble(individual_models) diff --git a/lexi/lib/__init__.py b/lexi/lib/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/lexi/lib/lexenstein/__init__.py b/lexi/lib/lexenstein/__init__.py deleted file mode 100755 index e69de29..0000000 diff --git a/lexi/lib/lexenstein/evaluators.py b/lexi/lib/lexenstein/evaluators.py deleted file mode 100755 index e54c7dc..0000000 --- a/lexi/lib/lexenstein/evaluators.py +++ /dev/null @@ -1,572 +0,0 @@ -from scipy.stats import * - -class IdentifierEvaluator: - - def evaluateIdentifier(self, cwictor_corpus, predicted_labels): - """ - Performs an intrinsic evaluation of a Complex Word Identification approach. - - @param cwictor_corpus: Path to a training corpus in CWICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param predicted_labels: A vector containing the predicted binary labels of each instance in the CWICTOR corpus. - @return: Accuracy, Precision, Recall and the F-score between Accuracy and Recall for the substitutions provided as input with respect to the gold-standard in the VICTOR corpus. - For more information on how the metrics are calculated, please refer to the LEXenstein Manual. 
- """ - - gold = [int(line.strip().split('\t')[3]) for line in open(cwictor_corpus)] - - #Initialize variables: - accuracyc = 0.0 - accuracyt = 0.0 - precisionc = 0.0 - precisiont = 0.0 - recallc = 0.0 - recallt = 0.0 - - #Calculate measures: - for i in range(0, len(gold)): - gold_label = gold[i] - predicted_label = predicted_labels[i] - if gold_label==predicted_label: - accuracyc += 1 - if gold_label==1: - recallc += 1 - precisionc += 1 - if gold_label==1: - recallt += 1 - if predicted_label==1: - precisiont += 1 - accuracyt += 1 - - try: - accuracy = accuracyc / accuracyt - except ZeroDivisionError: - accuracy = 0 - try: - precision = precisionc / precisiont - except ZeroDivisionError: - precision = 0 - try: - recall = recallc / recallt - except ZeroDivisionError: - recall = 0 - fmean = 0 - gmean = 0 - - try: - fmean = 2 * (precision * recall) / (precision + recall) - gmean = 2 * (accuracy * recall) / (accuracy + recall) - except ZeroDivisionError: - fmean = 0 - gmean = 0 - - #Return measures: - return accuracy, precision, recall, fmean, gmean - -class GeneratorEvaluator: - - def evaluateGenerator(self, victor_corpus, substitutions): - """ - Performs an intrinsic evaluation of a Substitution Generation approach. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param substitutions: A dictionary that assigns target complex words to sets of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - @return: Values for Potential, Precision, Recall and F-measure for the substitutions provided as input with respect to the gold-standard in the VICTOR corpus. - For more information on how the metrics are calculated, please refer to the LEXenstein Manual. - """ - - #Initialize variables: - potentialc = 0 - potentialt = 0 - precisionc = 0 - precisiont = 0 - recallt = 0 - - #Calculate measures: - f = open(victor_corpus) - for line in f: - data = line.strip().split('\t') - target = data[1].strip() - items = data[3:len(data)] - candidates = set([item.strip().split(':')[1].strip() for item in items]) - if target in substitutions: - overlap = candidates.intersection(set(substitutions[target])) - precisionc += len(overlap) - if len(overlap)>0: - potentialc += 1 - precisiont += len(substitutions[target]) - potentialt += 1 - recallt += len(candidates) - f.close() - - potential = float(potentialc)/float(potentialt) - precision = float(precisionc)/float(precisiont) - recall = float(precisionc)/float(recallt) - fmean = 0.0 - if precision==0.0 and recall==0.0: - fmean = 0.0 - else: - fmean = 2*(precision*recall)/(precision+recall) - - #Return measures: - return potential, precision, recall, fmean - -class SelectorEvaluator: - - def evaluateSelector(self, victor_corpus, substitutions): - """ - Performs an intrinsic evaluation of a Substitution Selection approach. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param substitutions: A vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. - @return: Values for Potential, Recall, Precision and F-measure for the substitutions provided as input with respect to the gold-standard in the VICTOR corpus. - For more information on how the metrics are calculated, please refer to the LEXenstein Manual. 
- """ - - #Initialize variables: - potentialc = 0 - potentialt = 0 - precisionc = 0 - precisiont = 0 - recallt = 0 - - #Calculate measures: - f = open(victor_corpus) - index = -1 - for line in f: - index += 1 - - data = line.strip().split('\t') - target = data[1].strip() - items = data[3:len(data)] - candidates = set([item.strip().split(':')[1].strip() for item in items]) - - selected = substitutions[index] - if len(selected)>0: - overlap = candidates.intersection(set(selected)) - precisionc += len(overlap) - if len(overlap)>0: - potentialc += 1 - potentialt += 1 - precisiont += len(selected) - recallt += len(candidates) - f.close() - - potential = float(potentialc)/float(potentialt) - precision = float(precisionc)/float(precisiont) - recall = float(precisionc)/float(recallt) - fmean = 0.0 - if precision==0.0 and recall==0.0: - fmean = 0.0 - else: - fmean = 2*(precision*recall)/(precision+recall) - - #Return measures: - return potential, precision, recall, fmean - -class RankerEvaluator: - - def evaluateRanker(self, victor_corpus, rankings): - """ - Performs an intrinsic evaluation of a Substitution Ranking approach. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param rankings: A vector of size N, containing a set of ranked substitutions for each instance in the VICTOR corpus. - @return: Values for TRank-at-1/2/3, Recall-at-1/2/3, Spearman and Pearson correlation for the substitutions provided as input with respect to the gold-standard in the VICTOR corpus. - For more information on how the metrics are calculated, please refer to the LEXenstein Manual. - """ - - #Initialize variables: - total1 = 0 - total2 = 0 - total3 = 0 - corrects1 = 0 - corrects2 = 0 - corrects3 = 0 - recall1 = 0 - recall2 = 0 - recall3 = 0 - trecall1 = 0 - trecall2 = 0 - trecall3 = 0 - - #Read data: - index = -1 - f = open(victor_corpus) - all_gold = [] - all_ranks = [] - for data in f: - index += 1 - line = data.strip().split('\t') - gold_rankings = {} - for subst in line[3:len(line)]: - subst_data = subst.strip().split(':') - word = subst_data[1].strip() - ranking = int(subst_data[0].strip()) - gold_rankings[word] = ranking - ranked_candidates = rankings[index] - - for i in range(0, len(ranked_candidates)): - word = ranked_candidates[i] - all_gold.append(gold_rankings[word]) - all_ranks.append(i) - - first = gold_rankings[ranked_candidates[0]] - - #Get recall sets: - set1, set2, set3 = self.getRecallSets(line[3:len(line)]) - rankedset1 = set([]) - rankedset2 = set([]) - rankedset3 = set([]) - - #Calculate TRank 1: - if first==1: - rankedset1 = set([ranked_candidates[0]]) - corrects1 += 1 - recall1 += len(rankedset1.intersection(set1)) - trecall1 += len(set1) - total1 += 1 - - #Calculate TRank 2: - if len(list(gold_rankings.keys()))>2: - rankedset2 = rankedset1.union(set([ranked_candidates[1]])) - recall2 += len(rankedset2.intersection(set2)) - trecall2 += len(set2) - if first<=2: - corrects2 += 1 - total2 += 1 - - #Calculate TRank 3: - if len(list(gold_rankings.keys()))>3: - rankedset3 = rankedset2.union(set([ranked_candidates[2]])) - recall3 += len(rankedset3.intersection(set3)) - trecall3 += len(set3) - if first<=3: - corrects3 += 1 - total3 += 1 - - S, p = spearmanr(all_ranks, all_gold) - P = pearsonr(all_ranks, all_gold) - - return float(corrects1)/float(total1), float(corrects2)/float(total2), float(corrects3)/float(total3), float(recall1)/float(trecall1), float(recall2)/float(trecall2), 
float(recall3)/float(trecall3), S, P[0] - - def getRecallSets(self, substs): - result1 = set([]) - result2 = set([]) - result3 = set([]) - for subst in substs: - datasubst = subst.strip().split(':') - word = datasubst[1].strip() - index = datasubst[0].strip() - if index=="1": - result1.add(word) - result2.add(word) - result3.add(word) - elif index=="2": - result2.add(word) - result3.add(word) - elif index=="3": - result3.add(word) - return result1, result2, result3 - -class PipelineEvaluator: - - def evaluatePipeline(self, victor_corpus, rankings): - """ - Performs a round-trip evaluation of a Substitution Generation, Selection and Ranking approach combined. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param rankings: A list of ranked candidates for each instance in the VICTOR corpus, from simplest to most complex. - One should produce candidates with a Substitution Generation approach, select them for a given VICTOR corpus with a Substitution Selection approach, then rank them with a Substitution Ranking approach. - @return: Values for Precision, Accuracy and Changed Proportion for the substitutions provided as input with respect to the gold-standard in the VICTOR corpus. - For more information on how the metrics are calculated, please refer to the LEXenstein Manual. - """ - - #Initialize counting variables: - total = 0 - totalc = 0 - accurate = 0 - precise = 0 - - #Read victor corpus: - f = open(victor_corpus) - for i in range(0, len(rankings)): - #Get gold candidates: - data = f.readline().strip().split('\t') - target = data[1].strip() - data = data[3:len(data)] - gold_subs = set([item.strip().split(':')[1].strip() for item in data]) - - #Get highest ranked candidate: - first = rankings[i][0] - - #Check if it is in gold candidates: - total += 1 - if first!=target: - totalc += 1 - if first in gold_subs: - accurate += 1 - precise += 1 - else: - precise += 1 - - #Return metrics: - return float(precise)/float(total), float(accurate)/float(total), float(totalc)/float(total) - -class PLUMBErr: - - def __init__(self, dataset, complex): - """ - Creates a PLUMBErr error categorizer. - This class implements the strategy introduced in: - Paetzold, G. H.; Specia, L. PLUMBErr: An Automatic Error Identification Framework for Lexical Simplification. Proceedings of the 1st QATS. 2016. - One can download BenchLS (dataset) and NNSVocab (complex) from http://ghpaetzold.github.io/data/PLUMBErr.zip - - @param dataset: Path to a data in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param complex: Path to a file containing complex words. - Each line of the file must contain a single word. - """ - self.data = [line.strip().split('\t') for line in open(dataset)] - self.complex = set([line.strip() for line in open(complex)]) - - def cumulativeAnalysis(self, identified, selected, ranked): - """ - Performs the cumulative error identification analysis of a simplifier. - In a cumulative analysis, the errors made during Complex Word Identification are carried onto the following steps of the pipeline. - - @param identified: A vector containing one binary value (0 for simple, 1 for complex) for each word in the dataset. - To produce the vector, one can run a Complex Word Identification approach from LEXenstein over the dataset. - @param selected: A vector containing the candidates selected for each instance in the dataset. 
- To produce the vector, one can pair a Substitution Generation and a Substitution Selection approach from LEXenstein. - @param ranked: A vector containing the selected candidates ranked in order of simplicity. - To produce the vector, one can run a Substitution Ranking approach from LEXenstein over the selected candidates provided. - """ - - #Initialize report: - report = [] - - #Create CWI gold-standard: - gold = [] - for line in self.data: - if line[1] in self.complex: - gold.append(1) - else: - gold.append(0) - - #Find errors of type 2: - error2a = 0 - error2b = 0 - for i in range(0, len(gold)): - errors = set([]) - g = gold[i] - p = identified[i] - if p==0 and g==1: - error2a += 1 - errors.add('2A') - elif p==1 and g==0: - error2b += 1 - errors.add('2B') - report.append(errors) - - #Find errors of type 3: - error3a = 0 - error3b = 0 - - goldcands = [] - simplecands = [] - for line in self.data: - cs = set([cand.strip().split(':')[1].strip() for cand in line[3:]]) - goldcands.append(cs) - simplecands.append(cs.difference(self.complex)) - - cands = [] - for vec in selected: - cands.append(set(vec)) - - control = [] - for i in range(0, len(self.data)): - gold_label = gold[i] - pred_label = identified[i] - ac = goldcands[i] - sc = simplecands[i] - cs = cands[i] - if gold_label==0: - sc = set([]) - else: - if pred_label==0: - cs = set([]) - ainter = ac.intersection(cs) - sinter = sc.intersection(cs) - - if gold_label==1: - if len(ainter)==0: - error3a += 1 - report[i].add('3A') - control.append('Error') - elif len(sinter)==0: - error3b += 1 - report[i].add('3B') - control.append('Error') - else: - control.append('Ok') - else: - control.append('Ignore') - - #Find errors of type 4 and 5: - error4 = 0 - error5 = 0 - noerror = 0 - for i in range(0, len(self.data)): - gold_label = gold[i] - pred_label = identified[i] - ac = goldcands[i] - sc = simplecands[i] - cs = ranked[i] - if gold_label==0: - sc = set([]) - else: - if pred_label==0: - cs = set([]) - - sub = '' - if len(cs)>0: - sub = cs[0] - - if control[i]=='Ok': - if sub not in ac: - error4 += 1 - report[i].add('4') - elif sub not in sc: - error5 += 1 - report[i].add('5') - else: - noerror += 1 - report[i].add('1') - - #Create error count map: - counts = {} - counts['2A'] = error2a - counts['2B'] = error2b - counts['3A'] = error3a - counts['3B'] = error3b - counts['4'] = error4 - counts['5'] = error5 - counts['1'] = noerror - - return report, counts - - def nonCumulativeAnalysis(self, identified, selected, ranked): - """ - Performs the non-cumulative error identification analysis of a simplifier. - In a non-cumulative analysis, the errors made during Complex Word Identification are not carried onto the following steps of the pipeline. - - @param identified: A vector containing one binary value (0 for simple, 1 for complex) for each word in the dataset. - To produce the vector, one can run a Complex Word Identification approach from LEXenstein over the dataset. - @param selected: A vector containing the candidates selected for each instance in the dataset. - To produce the vector, one can pair a Substitution Generation and a Substitution Selection approach from LEXenstein. - @param ranked: A vector containing the selected candidates ranked in order of simplicity. - To produce the vector, one can run a Substitution Ranking approach from LEXenstein over the selected candidates provided. - @return: A report vector containing the errors made in each instance of the dataset, as well as a map containing total error counts for the entire dataset. 
- """ - - #Initialize report: - report = [] - - #Create CWI gold-standard: - gold = [] - for line in self.data: - if line[1] in self.complex: - gold.append(1) - else: - gold.append(0) - - #Find errors of type 2: - error2a = 0 - error2b = 0 - for i in range(0, len(gold)): - errors = set([]) - g = gold[i] - p = identified[i] - if p==0 and g==1: - error2a += 1 - errors.add('2A') - elif p==1 and g==0: - error2b += 1 - errors.add('2B') - report.append(errors) - - #Find errors of type 3: - error3a = 0 - error3b = 0 - - goldcands = [] - simplecands = [] - for line in self.data: - cs = set([cand.strip().split(':')[1].strip() for cand in line[3:]]) - goldcands.append(cs) - simplecands.append(cs.difference(self.complex)) - - cands = [] - for vec in selected: - cands.append(set(vec)) - - for i in range(0, len(self.data)): - gold_label = gold[i] - pred_label = identified[i] - ac = goldcands[i] - sc = simplecands[i] - cs = cands[i] - ainter = ac.intersection(cs) - sinter = sc.intersection(cs) - - if gold_label==1: - if len(ainter)==0: - error3a += 1 - report[i].add('3A') - elif len(sinter)==0: - error3b += 1 - report[i].add('3B') - - #Find errors of type 4 and 5: - error4 = 0 - error5 = 0 - noerror = 0 - for i in range(0, len(self.data)): - gold_label = gold[i] - pred_label = identified[i] - ac = goldcands[i] - sc = simplecands[i] - cs = ranked[i] - - sub = '' - if len(cs)>0: - sub = cs[0] - - if gold_label==1: - if sub not in ac: - error4 += 1 - report[i].add('4') - elif sub not in sc: - error5 += 1 - report[i].add('5') - else: - noerror += 1 - report[i].add('1') - - #Create error count map: - counts = {} - counts['2A'] = error2a - counts['2B'] = error2b - counts['3A'] = error3a - counts['3B'] = error3b - counts['4'] = error4 - counts['5'] = error5 - counts['1'] = noerror - - return report, counts diff --git a/lexi/lib/lexenstein/features.py b/lexi/lib/lexenstein/features.py deleted file mode 100755 index a7363bc..0000000 --- a/lexi/lib/lexenstein/features.py +++ /dev/null @@ -1,3547 +0,0 @@ -from .util import getGeneralisedPOS, dependencyParseSentences -from nltk.corpus import wordnet as wn -import kenlm -import math -import gensim -from nltk.tag.stanford import StanfordPOSTagger -from nltk.parse.stanford import StanfordParser -import os -import pickle -from sklearn.preprocessing import normalize -import numpy -import shelve -import urllib.request, urllib.error, urllib.parse -import json -import re -import logging - -logger = logging.getLogger('lexi') - -class FeatureEstimator: - - def __init__(self, norm=False): - """ - Creates an instance of the FeatureEstimator class. - - @param norm: Boolean variable that determines whether or not feature values should be normalized. - """ - #List of features to be calculated: - self.features = [] - #List of identifiers of features to be calculated: - self.identifiers = [] - #Normalization parameter: - self.norm = norm - #Persistent resource list: - self.resources = {} - #One-run resource list: - self.temp_resources = {} - - def calculateFeatures(self, corpus, format='victor', input='file'): - """ - Calculate the selected features over the candidates of a VICTOR or CWICTOR corpus. - - @param corpus: Path to a corpus in the VICTOR or CWICTOR format. - For more information about the input's format, refer to the LEXenstein Manual. - @param format: Input format. - Values available: victor, cwictor. - @param input: Type of input provided. - Values available: file, text. 
- @return: Returns a MxN matrix, where M is the number of substitutions of all instances in the VICTOR corpus, and N the number of selected features. - """ - data = [] - if format.strip().lower()=='victor': - if input=='file': - data = [line.strip().split('\t') for line in open(corpus)] - elif input=='text': - data = [line.strip().split('\t') for line in corpus.split('\n')] - else: - logger.debug('Unrecognized format: must be file or text.') - elif format.strip().lower()=='cwictor': - if input=='file': - f = open(corpus) - for line in f: - line_data = line.strip().split('\t') - data.append([line_data[0].strip(), line_data[1].strip(), line_data[2].strip(), '0:'+line_data[1].strip()]) - elif input=='text': - for line in corpus.split('\n'): - line_data = line.strip().split('\t') - data.append([line_data[0].strip(), line_data[1].strip(), line_data[2].strip(), '0:'+line_data[1].strip()]) - else: - logger.debug('Unrecognized format: must be file or text.') - else: - logger.debug('Unknown input format during feature estimation!') - return [] - - values = [] - for feature in self.features: - values.append(feature[0].__call__(data, feature[1])) - - result = [] - index = 0 - for line in data: - for i in range(3, len(line)): - vector = self.generateVector(values, index) - result.append(vector) - index += 1 - - #Normalize if required: - if self.norm: - result = normalize(result, axis=0) - - #Clear one-run resources: - self.temp_resources = {} - - return result - - def calculateInstanceFeatures(self, sent, target, head, candidate): - """ - Calculate the selected features over an instance of a VICTOR corpus. - - @param sent: Sentence containing a target complex word. - @param target: Target complex sentence to be simplified. - @param head: Position of target complex word in sentence. - @param candidate: Candidate substitution. - @return: Returns a vector containing the feature values of VICTOR instance. 
- """ - - data = [[sent, target, head, '0:'+candidate]] - - values = [] - for feature in self.features: - values.append(feature[0].__call__(data, feature[1])) - vector = self.generateVector(values, 0) - return vector - - def generateVector(self, feature_vector, index): - result = [] - for feature in feature_vector: - if not isinstance(feature[index], list): - result.append(feature[index]) - else: - result.extend(feature[index]) - return result - - def targetPOSTagProbability(self, data, args): - model = self.resources[args[0]] - tagger = self.resources[args[1]] - result = [] - - #Get tagged sentences: - tagged_sents = None - if 'tagged_sents' in self.temp_resources: - tagged_sents = self.temp_resources['tagged_sents'] - else: - sentences = [l[0].strip().split(' ') for l in data] - tagged_sents = tagger.tag_sents(sentences) - self.temp_resources['tagged_sents'] = tagged_sents - - for i in range(0, len(data)): - line = data[i] - target = line[1].strip().lower() - head = int(line[2].strip()) - target_pos = tagged_sents[i][head][1] - - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - probability = model[words].prob(target_pos) - result.append(probability) - return result - - def minimumWordVectorSimilarityFeature(self, data, args): - model = self.resources[args[0]] - result = [] - for line in data: - target = line[1].strip().lower().replace(' ', '_') - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - similarities = [] - cand_size = 0 - for word in words.split(' '): - cand_size += 1 - try: - similarities.append(model.similarity(target, word)) - except KeyError: - try: - similarities.append(model.similarity(target, word.lower())) - except KeyError: - pass - if len(similarities)>0: - similarity = numpy.min(similarities) - result.append(similarity) - else: - result.append(0.0) - return result - - def maximumWordVectorSimilarityFeature(self, data, args): - model = self.resources[args[0]] - result = [] - for line in data: - target = line[1].strip().lower().replace(' ', '_') - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - similarities = [] - cand_size = 0 - for word in words.split(' '): - cand_size += 1 - try: - similarities.append(model.similarity(target, word)) - except KeyError: - try: - similarities.append(model.similarity(target, word.lower())) - except KeyError: - pass - if len(similarities)>0: - similarity = numpy.max(similarities) - result.append(similarity) - else: - result.append(0.0) - return result - - def averageWordVectorSimilarityFeature(self, data, args): - model = self.resources[args[0]] - result = [] - for line in data: - target = line[1].strip().lower().replace(' ', '_') - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - similarities = [] - cand_size = 0 - for word in words.split(' '): - cand_size += 1 - try: - similarities.append(model.similarity(target, word)) - except KeyError: - try: - similarities.append(model.similarity(target, word.lower())) - except KeyError: - pass - if len(similarities)>0: - similarity = numpy.mean(similarities) - result.append(similarity) - else: - result.append(0.0) - return result - - def wordVectorSimilarityFeature(self, data, args): - model = self.resources[args[0]] - result = [] - for line in data: - target = line[1].strip().lower() - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - similarity = 0.0 - cand_size = 0 - for word in words.split(' '): - cand_size += 1 - try: - similarity += 
model.similarity(target, word) - except KeyError: - try: - similarity += model.similarity(target, word.lower()) - except KeyError: - pass - similarity /= cand_size - result.append(similarity) - return result - - def taggedWordVectorSimilarityFeature(self, data, args): - result = [] - - model = self.resources[args[0]] - tagger = self.resources[args[1]] - pos_type = args[2] - - #Get tagged sentences: - tagged_sents = None - if 'tagged_sents' in self.temp_resources: - tagged_sents = self.temp_resources['tagged_sents'] - else: - sentences = [l[0].strip().split(' ') for l in data] - tagged_sents = tagger.tag_sents(sentences) - self.temp_resources['tagged_sents'] = tagged_sents - - #Transform them to the right format: - if pos_type=='paetzold': - transformed = [] - for sent in tagged_sents: - tokens = [] - for token in sent: - tokens.append((token[0], getGeneralisedPOS(token[1]))) - transformed.append(tokens) - tagged_sents = transformed - - for i in range(0, len(data)): - line = data[i] - target = line[1].strip().lower() - head = int(line[2].strip()) - target_pos = tagged_sents[i][head][1] - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - similarity = 0.0 - cand_size = 0 - for word in words.split(' '): - cand_size += 1 - try: - similarity += model.similarity(target+'|||'+target_pos, word+'|||'+target_pos) - except KeyError: - try: - similarity += model.similarity(target+'|||'+target_pos, word.lower()+'|||'+target_pos) - except KeyError: - pass - similarity /= cand_size - result.append(similarity) - return result - - def wordVectorValuesFeature(self, data, args): - model = self.resources[args[0]] - size = args[1] - result = [] - for line in data: - target = line[1].strip().lower() - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - word_vector = numpy.zeros(size) - for word in words.split(' '): - try: - word_vector = numpy.add(word_vector, model[words]) - except KeyError: - pass - result.append(word_vector) - for i in range(0, len(result)): - result[i] = result[i].tolist() - return result - - def translationProbabilityFeature(self, data, args): - probabilities = self.resources[args[0]] - result = [] - for line in data: - target = line[1].strip() - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - prob = -9999999999 - for word in words.split(' '): - if target+'\t'+word in probabilities: - p = probabilities[target+'\t'+word] - if p>prob: - prob = p - result.append(prob) - return result - - def lexiconFeature(self, data, args): - path = args[0] - result = [] - basics = self.resources[path] - for line in data: - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - basicCount = 0 - for word in words.split(' '): - if word.strip() in basics: - basicCount += 1 - if basicCount==len(words.split(' ')): - result.append(1.0) - else: - result.append(0.0) - return result - - def lengthFeature(self, data, args): - result = [] - for line in data: - for subst in line[3:len(line)]: - word = subst.strip().split(':')[1].strip() - result.append(len(word)) - return result - - def numberOfTokens(self, data, args): - result = [] - for line in data: - for subst in line[3:len(line)]: - word = subst.strip().split(':')[1].strip().split(' ') - result.append(len(word)) - return result - - def syllableFeature(self, data, args): - mat = args[0] - #Create the input for the Java application: - input = [] - for line in data: - for subst in line[3:len(line)]: - word = subst.strip().split(':')[1].strip() - 
input.append(word) - - #Run the syllable splitter: - outr = mat.splitSyllables(input) - - #Decode output: - out = [] - for o in outr: - out.append(o.decode("latin1").replace(' ', '-')) - - #Calculate number of syllables - result = [] - for instance in out: - if len(instance.strip())>0: - result.append(len(instance.split('-'))) - return result - - def collocationalFeature(self, data, args): - lm = args[0] - spanl = args[1] - spanr = args[2] - result = [] - model = self.resources[lm] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - spanlv = list(range(0, spanl+1)) - spanrv = list(range(0, spanr+1)) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - values = [] - for span1 in spanlv: - for span2 in spanrv: - ngram, bosv, eosv = self.getNgram(word, sent, head, span1, span2) - # aux = model.score(ngram, bos=bosv, eos=eosv) - aux = model.score(ngram) - values.append(aux) - result.append(values) - return result - - def frequencyCollocationalFeature(self, data, args): - ngrams = args[0] - spanl = args[1] - spanr = args[2] - result = [] - counts = self.resources[ngrams] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - spanlv = list(range(0, spanl+1)) - spanrv = list(range(0, spanr+1)) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - values = [] - for span1 in spanlv: - for span2 in spanrv: - ngram, bosv, eosv = self.getNgram(word, sent, head, span1, span2) - if ngram in counts: - values.append(counts[ngram]) - else: - values.append(0.0) - result.append(values) - return result - - def taggedFrequencyCollocationalFeature(self, data, args): - counts = self.resources[args[0]] - spanl = args[1] - spanr = args[2] - tagger = self.resources[args[3]] - pos_type = args[4] - - #Get tagged sentences: - tagged_sents = None - if 'tagged_sents' in self.temp_resources: - tagged_sents = self.temp_resources['tagged_sents'] - else: - sentences = [l[0].strip().split(' ') for l in data] - tagged_sents = tagger.tag_sents(sentences) - self.temp_resources['tagged_sents'] = tagged_sents - - #Transform them to the right format: - if pos_type=='paetzold': - transformed = [] - for sent in tagged_sents: - tokens = [] - for token in sent: - tokens.append((token[0], getGeneralisedPOS(token[1]))) - transformed.append(tokens) - tagged_sents = transformed - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = [''] + [tokendata[1] for tokendata in tagged_sents[i]] + [''] - target = line[1] - head = int(line[2])+1 - spanlv = list(range(0, spanl+1)) - spanrv = list(range(0, spanr+1)) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - values = [] - for span1 in spanlv: - for span2 in spanrv: - ngram, bosv, eosv = self.getNgram(word, sent, head, span1, span2) - if ngram in counts: - values.append(counts[ngram]) - else: - values.append(0.0) - result.append(values) - return result - - def binaryTaggedFrequencyCollocationalFeature(self, data, args): - counts = self.resources[args[0]] - spanl = args[1] - spanr = args[2] - tagger = self.resources[args[3]] - pos_type = args[4] - - #Get tagged sentences: - tagged_sents = None - if 'tagged_sents' in self.temp_resources: - tagged_sents = self.temp_resources['tagged_sents'] - else: - sentences = [l[0].strip().split(' ') for l in data] - tagged_sents = tagger.tag_sents(sentences) - self.temp_resources['tagged_sents'] = tagged_sents - - #Transform them to the right format: - if pos_type=='paetzold': - transformed = 
[] - for sent in tagged_sents: - tokens = [] - for token in sent: - tokens.append((token[0], getGeneralisedPOS(token[1]))) - transformed.append(tokens) - tagged_sents = transformed - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = [''] + [tokendata[1] for tokendata in tagged_sents[i]] + [''] - target = line[1] - head = int(line[2])+1 - spanlv = list(range(0, spanl+1)) - spanrv = list(range(0, spanr+1)) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - values = [] - for span1 in spanlv: - for span2 in spanrv: - ngram, bosv, eosv = self.getNgram(word, sent, head, span1, span2) - if ngram in counts: - values.append(1.0) - else: - values.append(0.0) - result.append(values) - return result - - def popCollocationalFeature(self, data, args): - lm = args[0] - spanl = args[1] - spanr = args[2] - result = [] - model = self.resources[lm] - for line in data: - sent = line[0] - target = line[1] - head = int(line[2]) - spanlv = list(range(0, spanl+1)) - spanrv = list(range(0, spanr+1)) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - values = [] - for span1 in spanlv: - for span2 in spanrv: - ngrams = self.getPopNgrams(word, sent, head, span1, span2) - maxscore = -999999 - for ngram in ngrams: - # aux = model.score(ngram[0], bos=ngram[1], eos=ngram[2]) - aux = model.score(ngram[0]) - if aux>maxscore: - maxscore = aux - values.append(maxscore) - result.append(values) - return result - - def ngramProbabilityFeature(self, data, args): - lm = args[0] - spanl = args[1] - spanr = args[2] - result = [] - model = self.resources[lm] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - ngram, bosv, eosv = self.getNgram(word, sent, head, spanl, spanr) - # prob = model.score(ngram, bos=bosv, eos=eosv) - prob = model.score(ngram) - result.append(prob) - return result - - def averageTokenProbabilityFeature(self, data, args): - lm = args[0] - result = [] - model = self.resources[lm] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - candidate = subst.split(':')[1].strip().split(' ') - probabilities = [] - for token in candidate: - ngram, bosv, eosv = self.getNgram(token, sent, head, 0, 0) - prob = model.score(ngram, bos=bosv, eos=eosv) - probabilities.append(prob) - result.append(numpy.mean(probabilities)) - return result - - def maximumTokenProbabilityFeature(self, data, args): - lm = args[0] - result = [] - model = self.resources[lm] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - candidate = subst.split(':')[1].strip().split(' ') - probabilities = [] - for token in candidate: - ngram, bosv, eosv = self.getNgram(token, sent, head, 0, 0) - # prob = model.score(ngram, bos=bosv, eos=eosv) - prob = model.score(ngram) - probabilities.append(prob) - result.append(numpy.max(probabilities)) - return result - - def minimumTokenProbabilityFeature(self, data, args): - lm = args[0] - result = [] - model = self.resources[lm] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - candidate = subst.split(':')[1].strip().split(' ') - probabilities = [] - for token in candidate: - ngram, bosv, eosv = self.getNgram(token, sent, head, 0, 0) - # prob = model.score(ngram, bos=bosv, eos=eosv) - prob = 
model.score(ngram) - probabilities.append(prob) - result.append(numpy.min(probabilities)) - return result - - def ngramFrequencyFeature(self, data, args): - ngrams = args[0] - spanl = args[1] - spanr = args[2] - result = [] - counts = self.resources[ngrams] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - ngram, bosv, eosv = self.getNgram(word, sent, head, spanl, spanr) - if ngram in counts: - result.append(counts[ngram]) - else: - result.append(0.0) - return result - - def binaryNgramFrequencyFeature(self, data, args): - ngrams = args[0] - spanl = args[1] - spanr = args[2] - result = [] - counts = self.resources[ngrams] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - ngram, bosv, eosv = self.getNgram(word, sent, head, spanl, spanr) - if ngram in counts: - result.append(1.0) - else: - result.append(0.0) - return result - - def popNgramProbabilityFeature(self, data, args): - lm = args[0] - spanl = args[1] - spanr = args[2] - result = [] - model = self.resources[lm] - for line in data: - sent = line[0] - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - ngrams = self.getPopNgrams(word, sent, head, spanl, spanl) - maxscore = -999999 - for ngram in ngrams: - # aux = model.score(ngram[0], bos=ngram[1], eos=ngram[2]) - aux = model.score(ngram[0]) - if aux>maxscore: - maxscore = aux - result.append(maxscore) - return result - - def popNgramFrequencyFeature(self, data, args): - ngrams = args[0] - spanl = args[1] - spanr = args[2] - result = [] - counts = self.resources[ngrams] - for line in data: - sent = line[0].strip() - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - ngrams = self.getPopNgrams(word, sent, head, spanl, spanl) - maxscore = -999999 - for ngram in ngrams: - aux = 0.0 - if ngram[0] in counts: - aux = counts[ngram[0]] - - if aux>maxscore: - maxscore = aux - result.append(maxscore) - - return result - - def getNgram(self, cand, tokens, head, configl, configr): - if configl==0 and configr==0: - return cand, False, False - else: - result = '' - bosv = False - if max(0, head-configl)==0: - bosv = True - eosv = False - if min(len(tokens), head+configr+1)==len(tokens): - eosv = True - for i in range(max(0, head-configl), head): - result += tokens[i] + ' ' - result += cand + ' ' - for i in range(head+1, min(len(tokens), head+configr+1)): - result += tokens[i] + ' ' - return result.strip(), bosv, eosv - - def getPopNgrams(self, cand, sent, head, configl, configr): - if configl==0 and configr==0: - bos = False - eos = False - if head==0: - bos = True - if head==len(sent.split(' '))-1: - eos = True - return [(cand, bos, eos)] - else: - result = set([]) - contexts = self.getPopContexts(sent, head) - for context in contexts: - ctokens = context[0] - chead = context[1] - bosv = False - if max(0, chead-configl)==0: - bosv = True - eosv = False - ngram = '' - if min(len(ctokens), chead+configr+1)==len(ctokens): - eosv = True - for i in range(max(0, chead-configl), chead): - ngram += ctokens[i] + ' ' - ngram += cand + ' ' - for i in range(chead+1, min(len(ctokens), chead+configr+1)): - ngram += ctokens[i] + ' ' - result.add((ngram.strip(), bosv, eosv)) - return result - - def getPopContexts(self, sent, head): - tokens = 
sent.strip().split(' ') - result = [] - check = 0 - if head>0: - check += 1 - tokens1 = list(tokens) - tokens1.pop(head-1) - result.append((tokens1, head-1)) - if head0: - resultsy.append(1.0) - else: - resultsy.append(0.0) - return resultsy - - def hypernymCount(self, data, args): - resulthe = [] - for line in data: - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - hypernyms = set([]) - for word in words.split(' '): - senses = None - try: - senses = wn.synsets(word) - except UnicodeDecodeError: - senses = [] - for sense in senses: - hypernyms.update(sense.hypernyms()) - resulthe.append(len(hypernyms)) - return resulthe - - def isHypernym(self, data, args): - resultsy = [] - for line in data: - target = line[1].strip() - tgthypernyms = set([]) - try: - tgtsenses = wn.synsets(target) - for sense in tgtsenses: - tgthypernyms.update(sense.hypernyms()) - except Exception: - tgthypernyms = tgthypernyms - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - senses = set([]) - for word in words.split(' '): - try: - senses.update(wn.synsets(word)) - except UnicodeDecodeError: - senses = senses - if len(tgthypernyms)==0 or len(senses.intersection(tgthypernyms))>0: - resultsy.append(1.0) - else: - resultsy.append(0.0) - return resultsy - - def hyponymCount(self, data, args): - resultho = [] - for line in data: - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - hyponyms = set([]) - for word in words.split(' '): - senses = None - try: - senses = wn.synsets(word) - except UnicodeDecodeError: - senses = [] - for sense in senses: - hyponyms.update(sense.hyponyms()) - resultho.append(len(hyponyms)) - return resultho - - def isHyponym(self, data, args): - resultsy = [] - for line in data: - target = line[1].strip() - tgthyponyms = set([]) - try: - tgtsenses = wn.synsets(target) - for sense in tgtsenses: - tgthyponyms.update(sense.hyponyms()) - except Exception: - tgthyponyms = tgthyponyms - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - senses = set([]) - for word in words.split(' '): - try: - senses.update(wn.synsets(word)) - except UnicodeDecodeError: - senses = senses - if len(tgthyponyms)==0 or len(senses.intersection(tgthyponyms))>0: - resultsy.append(1.0) - else: - resultsy.append(0.0) - return resultsy - - def minDepth(self, data, args): - resultmi = [] - for line in data: - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - mindepth = 9999999 - for word in words.split(' '): - senses = None - try: - senses = wn.synsets(word) - except UnicodeDecodeError: - senses = [] - for sense in senses: - auxmin = sense.min_depth() - if auxminmaxdepth: - maxdepth = auxmax - resultma.append(maxdepth) - return resultma - - def averageDepth(self, data, args): - resultma = [] - for line in data: - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - avgdepth = 0 - total = 0 - for word in words.split(' '): - senses = None - try: - senses = wn.synsets(word) - except UnicodeDecodeError: - senses = [] - for sense in senses: - auxmax = sense.max_depth() - avgdepth += auxmax - total += len(senses) - try: - avgdepth /= total - except Exception: - avgdepth = 0 - resultma.append(avgdepth) - return resultma - - def subjectDependencyProbabilityFeature(self, data, args): - model = self.resources[args[0]] - parser = self.resources[args[1]] - - #Get parsed sentences: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = 
self.temp_resources['dep_maps'] - else: - sentences = [l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - dep_map = dep_maps[i] - insts = set([]) - if head in dep_map: - for object in dep_map[head]: - for dep_link in dep_map[head][object]: - insts.add((dep_link, sent[object])) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - total = 0.0 - if len(insts)>0: - for inst in insts: - ngram = inst[0] + ' ' + word + ' ' + inst[1] - # prob = math.exp(model.score(ngram, bos=False, eos=False)) - prob = math.exp(model.score(ngram)) - total += prob - total /= float(len(insts)) - else: - total = 1.0 - result.append(total) - return result - - def binarySubjectDependencyFeature(self, data, args): - model = self.resources[args[0]] - parser = self.resources[args[1]] - - #Get parsed sentences: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = self.temp_resources['dep_maps'] - else: - sentences = [l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - dep_map = dep_maps[i] - insts = set([]) - if head in dep_map: - for object in dep_map[head]: - for dep_link in dep_map[head][object]: - insts.add((dep_link, sent[object])) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - total = 1.0 - if len(insts)>0: - for inst in insts: - ngram = inst[0] + ' ' + word + ' ' + inst[1] - if ngram not in model: - total = 0.0 - else: - total = 1.0 - result.append(total) - return result - - def subjectDependencyFrequencyFeature(self, data, args): - model = self.resources[args[0]] - parser = self.resources[args[1]] - - #Get parsed sentences: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = 
self.temp_resources['dep_maps'] - else: - sentences = [l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - dep_map = dep_maps[i] - insts = set([]) - if head in dep_map: - for object in dep_map[head]: - for dep_link in dep_map[head][object]: - insts.add((dep_link, sent[object])) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - total = 0.0 - if len(insts)>0: - for inst in insts: - ngram = inst[0] + ' ' + word + ' ' + inst[1] - if ngram in model: - total += model[ngram] - if total>0.0: - total /= float(len(insts)) - else: - total = 99999.0 - result.append(total) - return result - - def objectDependencyProbabilityFeature(self, data, args): - model = self.resources[args[0]] - parser = self.resources[args[1]] - - #Get parsed sentences: - inv_dep_maps = None - if 'inv_dep_maps' in self.temp_resources: - inv_dep_maps = self.temp_resources['inv_dep_maps'] - else: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = self.temp_resources['dep_maps'] - else: - sentences = [l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - inv_dep_maps = [] - for inst in dep_maps: - inv_dep_map = {} - for subjectindex in inst: - for objectindex in inst[subjectindex]: - if objectindex not in inv_dep_map: - inv_dep_map[objectindex] = {} - inv_dep_map[objectindex][subjectindex] = inst[subjectindex][objectindex] - inv_dep_maps.append(inv_dep_map) - self.temp_resources['inv_dep_maps'] = inv_dep_maps - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - inv_dep_map = inv_dep_maps[i] - insts = set([]) - if head in inv_dep_map: - for object in inv_dep_map[head]: - for dep_link in inv_dep_map[head][object]: - insts.add((dep_link, sent[object])) - for subst in line[3:len(line)]: - word = 
subst.split(':')[1].strip() - total = 0.0 - if len(insts)>0: - for inst in insts: - ngram = inst[0] + ' ' + inst[1] + ' ' + word - # prob = math.exp(model.score(ngram, bos=False, eos=False)) - prob = math.exp(model.score(ngram)) - total += prob - total /= float(len(insts)) - else: - total = 1.0 - result.append(total) - return result - - def binaryObjectDependencyFeature(self, data, args): - model = self.resources[args[0]] - parser = self.resources[args[1]] - - #Get parsed sentences: - inv_dep_maps = None - if 'inv_dep_maps' in self.temp_resources: - inv_dep_maps = self.temp_resources['inv_dep_maps'] - else: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = self.temp_resources['dep_maps'] - else: - sentences = [l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - inv_dep_maps = [] - for inst in dep_maps: - inv_dep_map = {} - for subjectindex in inst: - for objectindex in inst[subjectindex]: - if objectindex not in inv_dep_map: - inv_dep_map[objectindex] = {} - inv_dep_map[objectindex][subjectindex] = inst[subjectindex][objectindex] - inv_dep_maps.append(inv_dep_map) - self.temp_resources['inv_dep_maps'] = inv_dep_maps - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - inv_dep_map = inv_dep_maps[i] - insts = set([]) - if head in inv_dep_map: - for object in inv_dep_map[head]: - for dep_link in inv_dep_map[head][object]: - insts.add((dep_link, sent[object])) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - total = 1.0 - if len(insts)>0: - for inst in insts: - ngram = inst[0] + ' ' + inst[1] + ' ' + word - if ngram not in model: - total = 0.0 - else: - total = 1.0 - result.append(total) - return result - - def objectDependencyFrequencyFeature(self, data, args): - model = self.resources[args[0]] - parser = self.resources[args[1]] - - #Get parsed sentences: - inv_dep_maps = None - if 'inv_dep_maps' in self.temp_resources: - inv_dep_maps = self.temp_resources['inv_dep_maps'] - else: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = self.temp_resources['dep_maps'] - else: - sentences = [l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif 
objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - inv_dep_maps = [] - for inst in dep_maps: - inv_dep_map = {} - for subjectindex in inst: - for objectindex in inst[subjectindex]: - if objectindex not in inv_dep_map: - inv_dep_map[objectindex] = {} - inv_dep_map[objectindex][subjectindex] = inst[subjectindex][objectindex] - inv_dep_maps.append(inv_dep_map) - self.temp_resources['inv_dep_maps'] = inv_dep_maps - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - inv_dep_map = inv_dep_maps[i] - insts = set([]) - if head in inv_dep_map: - for object in inv_dep_map[head]: - for dep_link in inv_dep_map[head][object]: - insts.add((dep_link, sent[object])) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - total = 0.0 - if len(insts)>0: - for inst in insts: - ngram = inst[0] + ' ' + inst[1] + ' ' + word - if ngram in model: - total += model[ngram] - if total>0.0: - total /= float(len(insts)) - else: - total = 99999.0 - result.append(total) - return result - - def allDependencyProbabilityFeature(self, data, args): - model = self.resources[args[0]] - parser = self.resources[args[1]] - - #Get parsed sentences: - if 'inv_dep_maps' in self.temp_resources: - inv_dep_maps = self.temp_resources['inv_dep_maps'] - else: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = self.temp_resources['dep_maps'] - else: - sentences = [l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - inv_dep_maps = [] - for inst in dep_maps: - inv_dep_map = {} - for subjectindex in inst: - for objectindex in inst[subjectindex]: - if objectindex not in inv_dep_map: - inv_dep_map[objectindex] = {} - inv_dep_map[objectindex][subjectindex] = inst[subjectindex][objectindex] - inv_dep_maps.append(inv_dep_map) - self.temp_resources['inv_dep_maps'] = inv_dep_maps - - dep_maps = self.temp_resources['dep_maps'] - inv_dep_maps = self.temp_resources['inv_dep_maps'] - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - - dep_map = dep_maps[i] - inv_dep_map = inv_dep_maps[i] - insts = set([]) - if head in dep_map: - for object in dep_map[head]: - for dep_link in dep_map[head][object]: - insts.add((dep_link, sent[object])) - insts_inv = set([]) - if head in inv_dep_map: - for object in inv_dep_map[head]: - for dep_link in inv_dep_map[head][object]: - insts_inv.add((dep_link, sent[object])) - for subst in line[3:len(line)]: - word = 
subst.split(':')[1].strip() - total = 0.0 - if len(insts)>0 or len(insts_inv)>0: - for inst in insts: - ngram = inst[0] + ' ' + word + ' ' + inst[1] - # prob = math.exp(model.score(ngram, bos=False, eos=False)) - prob = math.exp(model.score(ngram)) - total += prob - for inst in insts_inv: - ngram = inst[0] + ' ' + inst[1] + ' ' + word - # prob = math.exp(model.score(ngram, bos=False, eos=False)) - prob = math.exp(model.score(ngram)) - total += prob - total /= float(len(insts)+len(insts_inv)) - else: - total = 1.0 - result.append(total) - return result - - def binaryAllDependencyFeature(self, data, args): - model = self.resources[args[0]] - parser = self.resources[args[1]] - - #Get parsed sentences: - if 'inv_dep_maps' in self.temp_resources: - inv_dep_maps = self.temp_resources['inv_dep_maps'] - else: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = self.temp_resources['dep_maps'] - else: - sentences = [l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - inv_dep_maps = [] - for inst in dep_maps: - inv_dep_map = {} - for subjectindex in inst: - for objectindex in inst[subjectindex]: - if objectindex not in inv_dep_map: - inv_dep_map[objectindex] = {} - inv_dep_map[objectindex][subjectindex] = inst[subjectindex][objectindex] - inv_dep_maps.append(inv_dep_map) - self.temp_resources['inv_dep_maps'] = inv_dep_maps - - dep_maps = self.temp_resources['dep_maps'] - inv_dep_maps = self.temp_resources['inv_dep_maps'] - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - - dep_map = dep_maps[i] - inv_dep_map = inv_dep_maps[i] - insts = set([]) - if head in dep_map: - for object in dep_map[head]: - for dep_link in dep_map[head][object]: - insts.add((dep_link, sent[object])) - insts_inv = set([]) - if head in inv_dep_map: - for object in inv_dep_map[head]: - for dep_link in inv_dep_map[head][object]: - insts_inv.add((dep_link, sent[object])) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - total = 1.0 - if len(insts)>0 or len(insts_inv)>0: - for inst in insts: - ngram = inst[0] + ' ' + word + ' ' + inst[1] - if ngram not in model: - total = 0.0 - for inst in insts_inv: - ngram = inst[0] + ' ' + inst[1] + ' ' + word - if ngram not in model: - total = 0.0 - else: - total = 1.0 - result.append(total) - return result - - def allDependencyFrequencyFeature(self, data, args): - model = self.resources[args[0]] - parser = self.resources[args[1]] - - #Get parsed sentences: - if 'inv_dep_maps' in self.temp_resources: - inv_dep_maps = self.temp_resources['inv_dep_maps'] - else: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = self.temp_resources['dep_maps'] - else: - sentences = 
[l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - inv_dep_maps = [] - for inst in dep_maps: - inv_dep_map = {} - for subjectindex in inst: - for objectindex in inst[subjectindex]: - if objectindex not in inv_dep_map: - inv_dep_map[objectindex] = {} - inv_dep_map[objectindex][subjectindex] = inst[subjectindex][objectindex] - inv_dep_maps.append(inv_dep_map) - self.temp_resources['inv_dep_maps'] = inv_dep_maps - - dep_maps = self.temp_resources['dep_maps'] - inv_dep_maps = self.temp_resources['inv_dep_maps'] - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - - dep_map = dep_maps[i] - inv_dep_map = inv_dep_maps[i] - insts = set([]) - if head in dep_map: - for object in dep_map[head]: - for dep_link in dep_map[head][object]: - insts.add((dep_link, sent[object])) - insts_inv = set([]) - if head in inv_dep_map: - for object in inv_dep_map[head]: - for dep_link in inv_dep_map[head][object]: - insts_inv.add((dep_link, sent[object])) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - total = 0.0 - if len(insts)>0 or len(insts_inv)>0: - for inst in insts: - ngram = inst[0] + ' ' + word + ' ' + inst[1] - if ngram in model: - total += model[ngram] - for inst in insts_inv: - ngram = inst[0] + ' ' + inst[1] + ' ' + word - if ngram in model: - total += model[ngram] - if total>0.0: - total /= float(len(insts)+len(insts_inv)) - else: - total = 99999.0 - result.append(total) - return result - - def wordVectorContextSimilarityFeature(self, data, args): - model = self.resources[args[0]] - tagger = self.resources[args[1]] - result = [] - - #Get tagged sentences: - tagged_sents = None - if 'tagged_sents' in self.temp_resources: - tagged_sents = self.temp_resources['tagged_sents'] - else: - sentences = [l[0].strip().split(' ') for l in data] - tagged_sents = tagger.tag_sents(sentences) - self.temp_resources['tagged_sents'] = tagged_sents - - for i in range(0, len(data)): - line = data[i] - tokens = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - - #Get content words in sentence: - content_words = set([]) - for j in range(0, len(tokens)): - token = tokens[j] - tag = tagged_sents[i][j][1] - if self.isContentWord(token, tag): - content_words.add(token) - - #Produce divisor: - divisor = float(len(content_words)) - - for subst in line[3:len(line)]: - word = subst.strip().split(':')[1].strip() - similarity = 0.0 - for content_word in content_words: - try: - similarity += model.similarity(content_word, word) - except KeyError: - try: - similarity += model.similarity(content_word, word.lower()) - except KeyError: - pass - similarity /= divisor - result.append(similarity) - return 
result - - def taggedWordVectorContextSimilarityFeature(self, data, args): - model = self.resources[args[0]] - tagger = self.resources[args[1]] - pos_type = args[2] - result = [] - - #Get tagged sentences: - tagged_sents = None - if 'tagged_sents' in self.temp_resources: - tagged_sents = self.temp_resources['tagged_sents'] - else: - sentences = [l[0].strip().split(' ') for l in data] - tagged_sents = tagger.tag_sents(sentences) - self.temp_resources['tagged_sents'] = tagged_sents - - - #Produce embeddings vector tags: - model_tagged_sents = None - if pos_type=='paetzold': - transformed = [] - for sent in tagged_sents: - tokens = [] - for token in sent: - tokens.append((token[0], getGeneralisedPOS(token[1]))) - transformed.append(tokens) - model_tagged_sents = transformed - else: - model_tagged_sents = tagged_sents - - for i in range(0, len(data)): - line = data[i] - tokens = line[0].strip().split(' ') - target = line[1].strip().lower() - head = int(line[2].strip()) - target_pos = model_tagged_sents[i][head][1] - - #Get content words in sentence: - content_words = set([]) - for j in range(0, len(tokens)): - token = tokens[j] - tag = tagged_sents[i][j][1] - model_tag = model_tagged_sents[i][j][1] - if self.isContentWord(token, tag): - content_words.add(token+'|||'+model_tag) - - #Produce divisor: - divisor = float(len(content_words)) - - for subst in line[3:len(line)]: - word = subst.strip().split(':')[1].strip() - similarity = 0.0 - for content_word in content_words: - try: - similarity += model.similarity(content_word, word+'|||'+target_pos) - except KeyError: - try: - similarity += model.similarity(content_word, word.lower()+'|||'+target_pos) - except KeyError: - pass - similarity /= divisor - result.append(similarity) - return result - - def nullLinkNominalFeature(self, data, args): - parser = self.resources[args[0]] - - #Get parsed sentences: - if 'inv_dep_maps' in self.temp_resources: - inv_dep_maps = self.temp_resources['inv_dep_maps'] - else: - dep_maps = None - if 'dep_maps' in self.temp_resources: - dep_maps = self.temp_resources['dep_maps'] - else: - sentences = [l[0].strip().split(' ') for l in data] - dep_parsed_sents = None - if 'dep_parsed_sents' in self.temp_resources: - dep_parsed_sents = self.temp_resources['dep_parsed_sents'] - else: - dep_parsed_sents = dependencyParseSentences(parser, sentences) - self.temp_resources['dep_parsed_sents'] = dep_parsed_sents - dep_maps = [] - for sent in dep_parsed_sents: - dep_map = {} - for parse in sent: - deplink = str(parse[0]) - subjectindex = int(str(parse[2]))-1 - objectindex = int(str(parse[4]))-1 - if subjectindex not in dep_map: - dep_map[subjectindex] = {objectindex: set([deplink])} - elif objectindex not in dep_map[subjectindex]: - dep_map[subjectindex][objectindex] = set([deplink]) - else: - dep_map[subjectindex][objectindex].add(deplink) - dep_maps.append(dep_map) - self.temp_resources['dep_maps'] = dep_maps - - inv_dep_maps = [] - for inst in dep_maps: - inv_dep_map = {} - for subjectindex in inst: - for objectindex in inst[subjectindex]: - if objectindex not in inv_dep_map: - inv_dep_map[objectindex] = {} - inv_dep_map[objectindex][subjectindex] = inst[subjectindex][objectindex] - inv_dep_maps.append(inv_dep_map) - self.temp_resources['inv_dep_maps'] = inv_dep_maps - - dep_maps = self.temp_resources['dep_maps'] - inv_dep_maps = self.temp_resources['inv_dep_maps'] - - result = [] - for i in range(0, len(data)): - line = data[i] - sent = line[0].strip().split(' ') - target = line[1].strip().lower() - head = 
int(line[2].strip()) - - dep_map = dep_maps[i] - inv_dep_map = inv_dep_maps[i] - value = False - if head in dep_map or head in inv_dep_map: - value = True - - for subst in line[3:len(line)]: - result.append(value) - return result - - def backoffBehaviorNominalFeature(self, data, args): - ngrams = args[0] - result = [] - counts = self.resources[ngrams] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - ngram2t, bos2t, eos2t = self.getNgram(word, sent, head, 2, 0) - ngram1t, bos1t, eos1t = self.getNgram(word, sent, head, 1, 0) - ngram0t, bos0t, eos0t = self.getNgram(word, sent, head, 0, 0) - ngram2f, bos2f, eos2f = word, True, False - ngram1f, bos1f, eos1f = word, True, False - if head>0: - ngram2f, bos2f, eos2f = self.getNgram(sent[head-1], sent, head-1, 1, 0) - ngram1f, bos1f, eos1f = self.getNgram(sent[head-1], sent, head-1, 0, 0) - - backoff = -1 - if ngram2t in counts: - backoff = 7.0 - elif ngram2f in counts and ngram1t in counts: - backoff = 6.0 - elif ngram1t in counts: - backoff = 5.0 - elif ngram2f in counts and ngram0t in counts: - backoff = 4.0 - elif ngram1f in counts and ngram0t in counts: - backoff = 3.0 - elif ngram0t in counts: - backoff = 2.0 - else: - backoff = 1.0 - result.append(backoff) - return result - - def candidateNominalFeature(self, data, args): - result = [] - for line in data: - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - result.append(words) - return result - - def ngramNominalFeature(self, data, args): - spanl = args[0] - spanr = args[1] - result = [] - for line in data: - sent = line[0].strip().split(' ') - target = line[1] - head = int(line[2]) - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - ngram, bosv, eosv = self.getNgram(word, sent, head, spanl, spanr) - tokens = ngram.split(' ') - fngram = '' - for token in tokens: - fngram += token + '|||' - result.append(fngram[0:len(fngram)-3]) - return result - - def candidatePOSNominalFeature(self, data, args): - result = [] - - tagger = self.resources[args[0]] - pos_type = args[1] - - #Get tagged sentences: - tagged_sents = None - if 'tagged_sents' in self.temp_resources: - tagged_sents = self.temp_resources['tagged_sents'] - else: - sentences = [l[0].strip().split(' ') for l in data] - tagged_sents = tagger.tag_sents(sentences) - self.temp_resources['tagged_sents'] = tagged_sents - - #Transform them to the right format: - if pos_type=='paetzold': - transformed = [] - for sent in tagged_sents: - tokens = [] - for token in sent: - tokens.append((token[0], getGeneralisedPOS(token[1]))) - transformed.append(tokens) - tagged_sents = transformed - - for i in range(0, len(data)): - line = data[i] - target = line[1].strip().lower() - head = int(line[2].strip()) - target_pos = tagged_sents[i][head][1] - for subst in line[3:len(line)]: - result.append(target_pos) - return result - - def POSNgramNominalFeature(self, data, args): - result = [] - - spanl = args[0] - spanr = args[1] - tagger = self.resources[args[2]] - pos_type = args[3] - - #Get tagged sentences: - tagged_sents = None - if 'tagged_sents' in self.temp_resources: - tagged_sents = self.temp_resources['tagged_sents'] - else: - sentences = [l[0].strip().split(' ') for l in data] - tagged_sents = tagger.tag_sents(sentences) - self.temp_resources['tagged_sents'] = tagged_sents - - #Transform them to the right format: - if pos_type=='paetzold': - transformed = [] - for sent in 
tagged_sents: - tokens = [] - for token in sent: - tokens.append((token[0], getGeneralisedPOS(token[1]))) - transformed.append(tokens) - tagged_sents = transformed - - for i in range(0, len(data)): - line = data[i] - target = line[1].strip().lower() - head = int(line[2].strip()) - target_pos = tagged_sents[i][head][1] - POStokens = [posdata[1] for posdata in tagged_sents[i]] - for subst in line[3:len(line)]: - ngram, bosv, eosv = self.getNgram(target_pos, POStokens, head, spanl, spanr) - tokens = ngram.split(' ') - fngram = '' - for token in tokens: - fngram += token + '|||' - result.append(fngram[0:len(fngram)-3]) - return result - - def POSNgramWithCandidateNominalFeature(self, data, args): - result = [] - - spanl = args[0] - spanr = args[1] - tagger = self.resources[args[2]] - pos_type = args[3] - - #Get tagged sentences: - tagged_sents = None - if 'tagged_sents' in self.temp_resources: - tagged_sents = self.temp_resources['tagged_sents'] - else: - sentences = [l[0].strip().split(' ') for l in data] - tagged_sents = tagger.tag_sents(sentences) - self.temp_resources['tagged_sents'] = tagged_sents - - #Transform them to the right format: - if pos_type=='paetzold': - transformed = [] - for sent in tagged_sents: - tokens = [] - for token in sent: - tokens.append((token[0], getGeneralisedPOS(token[1]))) - transformed.append(tokens) - tagged_sents = transformed - - for i in range(0, len(data)): - line = data[i] - target = line[1].strip().lower() - head = int(line[2].strip()) - target_pos = tagged_sents[i][head][1] - POStokens = [posdata[1] for posdata in tagged_sents[i]] - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - ngram, bosv, eosv = self.getNgram(word, POStokens, head, spanl, spanr) - tokens = ngram.split(' ') - fngram = '' - for token in tokens: - fngram += token + '|||' - result.append(fngram[0:len(fngram)-3]) - return result - - def imageSearchCountFeature(self, data, args): - result = [] - - key = args[0] - - for i in range(0, len(data)): - line = data[i] - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - imagecount = None - if word not in self.resources['image_counts']: - imagecount = self.getImageCount(word, key) - self.resources['image_counts'][word] = imagecount - else: - imagecount = self.resources['image_counts'][word] - result.append(imagecount) - return result - - def webSearchCountFeature(self, data, args): - result = [] - - for i in range(0, len(data)): - line = data[i] - for subst in line[3:len(line)]: - word = subst.split(':')[1].strip() - pagecount = None - if word not in self.resources['page_counts']: - pagecount = self.getPageCount(word) - self.resources['page_counts'][word] = pagecount - else: - pagecount = self.resources['page_counts'][word] - result.append(pagecount) - return result - - def getImageCount(self, word, key): - headers = {} - headers['Api-Key'] = key - tokens = word.strip().split(' ') - suffix = '' - for token in tokens: - suffix += token + '+' - suffix = suffix[0:len(suffix)-1] - - #Make HTTP request: - url = 'https://api.gettyimages.com/v3/search/images?fields=id&phrase='+suffix - req = urllib.request.Request(url=url, headers=headers) - - #Send request: - count = None - try: - f = urllib.request.urlopen(req) - data = json.loads(f.read()) - count = int(data['result_count']) - except Exception: - count = 0 - return count - - def getPageCount(self, word): - tokens = word.strip().split(' ') - suffix = '' - for token in tokens: - suffix += token + '+' - suffix = suffix[0:len(suffix)-1] - - #Make HTTP request: 
- exp = re.compile('class=\"sb_count\"[^>]*>([^<]+)<') - url = 'https://www.bing.com/search?q='+suffix - req = urllib.request.Request(url=url) - - #Send request: - count = None - try: - f = urllib.request.urlopen(req) - data = f.read() - result = exp.findall(data) - count = int(result[0].strip().split(' ')[0].strip().replace(',', '')) - except Exception: - count = 0 - return count - - def morphologicalFeature(self, data, args): - dictionary = args[0] - result = [] - for line in data: - for subst in line[3:len(line)]: - words = subst.strip().split(':')[1].strip() - if words in dictionary: - result.append(dictionary[words]) - else: - result.append(0.0) - return result - - def readNgramFile(self, ngram_file): - counts = shelve.open(ngram_file, protocol=pickle.HIGHEST_PROTOCOL) - return counts - - def isContentWord(self, word, tag): - content_tags = set(['JJ', 'JJS', 'JJR', 'NN', 'NNS', 'RB', 'RBR', 'RBS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']) - if tag in content_tags: - return True - else: - return False - - def addWordVectorValues(self, model, size, orientation): - """ - Adds all the word vector values of a model to the estimator. - - @param model: Path to a binary word vector model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param size: Number of feature values that represent a word in the model. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if model not in self.resources: - m = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True) - self.resources[model] = m - self.features.append((self.wordVectorValuesFeature, [model, size])) - for i in range(0, size): - self.identifiers.append(('Word Vector Value '+str(i)+' (Model: '+model+')', orientation)) - - def addTargetPOSTagProbability(self, condprob_model, pos_model, stanford_tagger, java_path, orientation): - """ - Adds a target POS tag probability feature to the estimator. - The value will be the conditional probability between a candidate substitution and the POS tag of a given target word. - - @param condprob_model: Path to a binary conditional probability model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - os.environ['JAVAHOME'] = java_path - if pos_model not in self.resources: - tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.resources[pos_model] = tagger - if condprob_model not in self.resources: - m = pickle.load(open(condprob_model, 'rb')) - self.resources[condprob_model] = m - - self.features.append((self.targetPOSTagProbability, [condprob_model, pos_model])) - self.identifiers.append(('Target POS Tag Probability (Model:'+str(condprob_model)+')', orientation)) - - def addWordVectorSimilarityFeature(self, model, orientation): - """ - Adds a word vector similarity feature to the estimator. - The value will be the similarity between the word vector of a target complex word and the word vector of a candidate. - - @param model: Path to a binary word vector model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if model not in self.resources: - m = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True) - self.resources[model] = m - self.features.append((self.wordVectorSimilarityFeature, [model])) - self.identifiers.append(('Word Vector Similarity (Model: '+model+')', orientation)) - - def addTaggedWordVectorSimilarityFeature(self, model, pos_model, stanford_tagger, java_path, pos_type, orientation): - """ - Adds a tagged word vector similarity feature to the estimator. - The value will be the similarity between the word vector of a target complex word and the word vector of a candidate, while accompanied by their POS tags. - Each entry in the word vector model must be in the following format: ||| - To create a corpus for such model to be trained, one must tag each word in a corpus, and then concatenate words and tags using the aforementioned convention. - - @param model: Path to a binary tagged word vector model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param pos_type: The type of POS tags to be used. - Values supported: treebank, paetzold - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - os.environ['JAVAHOME'] = java_path - if model not in self.resources: - m = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True) - self.resources[model] = m - if pos_model not in self.resources: - tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.resources[pos_model] = tagger - self.features.append((self.taggedWordVectorSimilarityFeature, [model, pos_model, pos_type])) - self.identifiers.append(('Word Vector Similarity (Model: '+model+') (POS Model: '+pos_model+') (POS Type: '+pos_type+')', orientation)) - - def addTranslationProbabilityFeature(self, translation_probabilities, orientation): - """ - Adds a translation probability feature to the estimator. - The value will be the probability of a target complex word of being translated into a given candidate substitution. - - @param translation_probabilities: Path to a shelve file containing translation probabilities. - To produce the file, first run the following command through fast_align: - fast_align -i -v -d -o - Then, produce a shelve file with the "addTranslationProbabilitiesFileToShelve" function from LEXenstein's "util" module. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - probabilities = self.readNgramFile(translation_probabilities) - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if translation_probabilities not in self.resources: - self.resources[translation_probabilities] = probabilities - self.features.append((self.translationProbabilityFeature, [translation_probabilities])) - self.identifiers.append(('Translation Probability (File: '+translation_probabilities+')', orientation)) - - def addLexiconFeature(self, lexicon, orientation): - """ - Adds a lexicon feature to the estimator. - The value will be 1 if a given candidate is in the provided lexicon, and 0 otherwise. - - @param lexicon: Path to a file containing the words of the lexicon. - The file must have one word per line. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if lexicon not in self.resources: - words = set([w.strip() for w in open(lexicon)]) - self.resources[lexicon] = words - self.features.append((self.lexiconFeature, [lexicon])) - self.identifiers.append(('Lexicon Occurrence (Lexicon: '+lexicon+')', orientation)) - - def addLengthFeature(self, orientation): - """ - Adds a word length feature to the estimator. - The value will be the number of characters in each candidate. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.lengthFeature, [])) - self.identifiers.append(('Word Length', orientation)) - - def addSyllableFeature(self, mat, orientation): - """ - Adds a syllable count feature to the estimator. - The value will be the number of syllables of each candidate. - - @param mat: A configured MorphAdornerToolkit object. - @param orientation: Whether the feature is a simplicity of complexity measure. 
- Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.syllableFeature, [mat])) - self.identifiers.append(('Syllable Count', orientation)) - - def addCollocationalFeature(self, language_model, leftw, rightw, orientation): - """ - Adds a set of collocational features to the estimator. - The values will be the language model probabilities of all collocational features selected. - Each feature is the probability of an n-gram with 0<=l<=leftw tokens to the left and 0<=r<=rightw tokens to the right. - This method creates (leftw+1)*(rightw+1) features. - - @param language_model: Path to the language model from which to extract probabilities. - @param leftw: Maximum number of tokens to the left. - @param rightw: Maximum number of tokens to the right. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.collocationalFeature, [language_model, leftw, rightw])) - for i in range(0, leftw+1): - for j in range(0, rightw+1): - self.identifiers.append(('Collocational Feature ['+str(i)+', '+str(j)+'] (LM: '+language_model+')', orientation)) - - def addFrequencyCollocationalFeature(self, ngram_file, leftw, rightw, orientation): - """ - Adds a set of frequency collocational features to the estimator. - The values will be the n-gram frequencies of all collocational features selected. - Each feature is the frequency of an n-gram with 0<=l<=leftw tokens to the left and 0<=r<=rightw tokens to the right. - This method creates (leftw+1)*(rightw+1) features. - To produce the ngram counts file, the user must first acquire a large corpus of text. - In sequence, the user can then use SRILM to produce an ngram counts file with the "-write" option. - Finally, the user must create a shelve file using the "addNgramCountsFileToShelve" function from the "util" module. - - @param ngram_file: Path to a shelve file containing n-gram frequency counts. - @param leftw: Maximum number of tokens to the left. - @param rightw: Maximum number of tokens to the right. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if ngram_file not in self.resources: - counts = self.readNgramFile(ngram_file) - self.resources[ngram_file] = counts - self.features.append((self.frequencyCollocationalFeature, [ngram_file, leftw, rightw])) - for i in range(0, leftw+1): - for j in range(0, rightw+1): - self.identifiers.append(('Frequency Collocational Feature ['+str(i)+', '+str(j)+'] (N-Grams File: '+ngram_file+')', orientation)) - - def addTaggedFrequencyCollocationalFeature(self, ngram_file, leftw, rightw, pos_model, stanford_tagger, java_path, pos_type, orientation): - """ - Adds a set of frequency tagged n-gram frequency features to the estimator. - The values will be the n-gram frequencies of all tagged collocational features selected. 
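# A minimal sketch (not part of the removed module) of what the collocational
# features registered above compute, assuming kenlm and a whitespace-tokenised
# sentence; the helper name and the 'lm.bin' path are illustrative only. For
# every window of l tokens to the left and r tokens to the right of the target
# word (0 <= l <= leftw, 0 <= r <= rightw), the candidate is substituted into
# the target slot and the resulting n-gram is scored by the language model,
# yielding (leftw+1)*(rightw+1) values per candidate.
import kenlm

def collocational_scores(model, tokens, head, candidate, leftw, rightw):
    scores = []
    for l in range(leftw + 1):
        for r in range(rightw + 1):
            window = tokens[max(0, head - l):head] + [candidate] \
                     + tokens[head + 1:head + 1 + r]
            scores.append(model.score(' '.join(window)))
    return scores

# e.g.: collocational_scores(kenlm.LanguageModel('lm.bin'),
#       'the cat perched on the mat'.split(), 2, 'sat', leftw=2, rightw=2)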
- Each feature is the frequency of an n-gram with 0<=l<=leftw tagged tokens to the left and 0<=r<=rightw tagged tokens to the right. - This method creates (leftw+1)*(rightw+1) features. - This function requires for a special type of ngram counts file. - Each n-gram in the file must be composed of n-1 tags, and exactly 1 word. - To produce this file, one must first parse a corpus and create a corpus with n-grams in the aforementioned format. - The user can then use SRILM to produce an ngram counts file with the "-write" option. - Finally, the user must create a shelve file using the "addNgramCountsFileToShelve" function from the "util" module. - - @param ngram_file: Path to a shelve file containing n-gram frequency counts. - @param leftw: Maximum number of tokens to the left. - @param rightw: Maximum number of tokens to the right. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param pos_type: The type of POS tags to be used. - Values supported: treebank, paetzold - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if ngram_file not in self.resources: - counts = self.readNgramFile(ngram_file) - self.resources[ngram_file] = counts - os.environ['JAVAHOME'] = java_path - if pos_model not in self.resources: - tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.resources[pos_model] = tagger - self.features.append((self.taggedFrequencyCollocationalFeature, [ngram_file, leftw, rightw, pos_model, pos_type])) - for i in range(0, leftw+1): - for j in range(0, rightw+1): - self.identifiers.append(('Tagged Frequency Collocational Feature ['+str(i)+', '+str(j)+'] (N-Grams File: '+ngram_file+') (POS type: '+pos_type+')', orientation)) - - def addBinaryTaggedFrequencyCollocationalFeature(self, ngram_file, leftw, rightw, pos_model, stanford_tagger, java_path, pos_type, orientation): - """ - Adds a set of binary tagged frequency collocational features to the estimator. - The values will be the binary n-gram values of all tagged collocational features selected. - Each feature is the frequency of an n-gram with 0<=l<=leftw tagged tokens to the left and 0<=r<=rightw tagged tokens to the right. - This method creates (leftw+1)*(rightw+1) features. - This function requires for a special type of ngram counts file. - Each n-gram in the file must be composed of n-1 tags, and exactly 1 word. - To produce this file, one must first parse a corpus and create a corpus with n-grams in the aforementioned format. - The user can then use SRILM to produce an ngram counts file with the "-write" option. - Finally, the user must create a shelve file using the "addNgramCountsFileToShelve" function from the "util" module. - - @param ngram_file: Path to a shelve file containing n-gram frequency counts. - @param leftw: Maximum number of tokens to the left. - @param rightw: Maximum number of tokens to the right. 
- @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param pos_type: The type of POS tags to be used. - Values supported: treebank, paetzold - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if ngram_file not in self.resources: - counts = self.readNgramFile(ngram_file) - self.resources[ngram_file] = counts - os.environ['JAVAHOME'] = java_path - if pos_model not in self.resources: - tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.resources[pos_model] = tagger - self.features.append((self.binaryTaggedFrequencyCollocationalFeature, [ngram_file, leftw, rightw, pos_model, pos_type])) - for i in range(0, leftw+1): - for j in range(0, rightw+1): - self.identifiers.append(('Binary Tagged Frequency Collocational Feature ['+str(i)+', '+str(j)+'] (N-Grams File: '+ngram_file+') (POS type: '+pos_type+')', orientation)) - - def addPopCollocationalFeature(self, language_model, leftw, rightw, orientation): - """ - Adds a set of "pop" collocational features to the estimator. - Each feature is the probability of an n-gram with 0<=l<=leftw tokens to the left and 0<=r<=rightw tokens to the right. - The value of each feature will be the highest frequency between all "popping" n-gram combinations of one token to the left and right. - This method creates (leftw+1)*(rightw+1) features. - - @param language_model: Path to the language model from which to extract probabilities. - @param leftw: Maximum number of tokens to the left. - @param rightw: Maximum number of tokens to the right. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.popCollocationalFeature, [language_model, leftw, rightw])) - for i in range(0, leftw+1): - for j in range(0, rightw+1): - self.identifiers.append(('Pop Collocational Feature ['+str(i)+', '+str(j)+'] (LM: '+language_model+')', orientation)) - - def addNGramProbabilityFeature(self, language_model, leftw, rightw, orientation): - """ - Adds a n-gram probability feature to the estimator. - The value will be the language model probability of the n-gram composed by leftw tokens to the left and rightw tokens to the right of a given word. - - @param language_model: Path to the language model from which to extract probabilities. - @param leftw: Number of tokens to the left. - @param rightw: Number of tokens to the right. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.ngramProbabilityFeature, [language_model, leftw, rightw])) - self.identifiers.append(('N-Gram Probability Feature ['+str(leftw)+', '+str(rightw)+'] (LM: '+language_model+')', orientation)) - - def addNGramFrequencyFeature(self, ngram_file, leftw, rightw, orientation): - """ - Adds a n-gram frequency feature to the estimator. - The value will be the the frequency of the n-gram composed by leftw tokens to the left and rightw tokens to the right of a given word. - To produce the ngram counts file, the user must first acquire a large corpus of text. - In sequence, the user can then use SRILM to produce an ngram counts file with the "-write" option. - Finally, the user must create a shelve file using the "addNgramCountsFileToShelve" function from the "util" module. - - @param ngram_file: Path to a shelve file containing n-gram frequency counts. - @param leftw: Number of tokens to the left. - @param rightw: Number of tokens to the right. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if ngram_file not in self.resources: - counts = self.readNgramFile(ngram_file) - self.resources[ngram_file] = counts - self.features.append((self.ngramFrequencyFeature, [ngram_file, leftw, rightw])) - self.identifiers.append(('N-Gram Frequency Feature ['+str(leftw)+', '+str(rightw)+'] (N-grams File: '+ngram_file+')', orientation)) - - def addBinaryNGramFrequencyFeature(self, ngram_file, leftw, rightw, orientation): - """ - Adds a binary n-gram frequency feature to the estimator. - The value will be 1 if the n-gram composed by leftw tokens to the left and rightw tokens to the right of a given word are in the n-grams file, and 0 otherwise. - To produce the ngram counts file, the user must first acquire a large corpus of text. - In sequence, the user can then use SRILM to produce an ngram counts file with the "-write" option. - Finally, the user must create a shelve file using the "addNgramCountsFileToShelve" function from the "util" module. - - @param ngram_file: Path to a shelve file containing n-gram frequency counts. - @param leftw: Number of tokens to the left. - @param rightw: Number of tokens to the right. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if ngram_file not in self.resources: - counts = self.readNgramFile(ngram_file) - self.resources[ngram_file] = counts - self.features.append((self.binaryNgramFrequencyFeature, [ngram_file, leftw, rightw])) - self.identifiers.append(('Binary N-Gram Probability Feature ['+str(leftw)+', '+str(rightw)+'] (N-grams File: '+ngram_file+')', orientation)) - - def addPopNGramProbabilityFeature(self, language_model, leftw, rightw, orientation): - """ - Adds a pop n-gram probability feature to the estimator. 
- The value is the highest probability of the n-gram with leftw tokens to the left and rightw tokens to the right, with a popping window of one token to the left and right. - - @param language_model: Path to the language model from which to extract probabilities. - @param leftw: Number of tokens to the left. - @param rightw: Number of tokens to the right. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.popNgramProbabilityFeature, [language_model, leftw, rightw])) - self.identifiers.append(('Pop N-Gram Frequency Feature ['+str(leftw)+', '+str(rightw)+'] (LM: '+language_model+')', orientation)) - - def addPopNGramFrequencyFeature(self, ngram_file, leftw, rightw, orientation): - """ - Adds a pop n-gram frequency feature to the estimator. - The value is the highest raw frequency count of the n-gram with leftw tokens to the left and rightw tokens to the right, with a popping window of one token to the left and right. - To produce the ngram counts file, the user must first acquire a large corpus of text. - In sequence, the user can then use SRILM to produce an ngram counts file with the "-write" option. - Finally, the user must create a shelve file using the "addNgramCountsFileToShelve" function from the "util" module. - - @param ngram_file: Path to a shelve file containing n-gram frequency counts. - @param leftw: Number of tokens to the left. - @param rightw: Number of tokens to the right. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if ngram_file not in self.resources: - counts = self.readNgramFile(ngram_file) - self.resources[ngram_file] = counts - self.features.append((self.popNgramFrequencyFeature, [ngram_file, leftw, rightw])) - self.identifiers.append(('Pop N-Gram Frequency Feature ['+str(leftw)+', '+str(rightw)+'] (N-grams File: '+ngram_file+')', orientation)) - - def addSentenceProbabilityFeature(self, language_model, orientation): - """ - Adds a sentence probability feature to the estimator. - The value will be the language model probability of each sentence in the VICTOR corpus with its target complex word replaced by a candidate. - - @param language_model: Path to the language model from which to extract probabilities. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.sentenceProbabilityFeature, [language_model])) - self.identifiers.append(('Sentence Probability (LM: '+language_model+')', orientation)) - - def addReverseSentenceProbabilityFeature(self, language_model, orientation): - """ - Adds a reverse sentence probability feature to the estimator. 
- The value will be the language model probability of each inverted sentence in the VICTOR corpus with its target complex word replaced by a candidate. - - @param language_model: Path to the language model from which to extract probabilities. - This language model must be trained over a corpus composed of inverted sentences (Ex: ". sentence a is This"). - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.reverseSentenceProbabilityFeature, [language_model])) - self.identifiers.append(('Reverse Sentence Probability (LM: '+language_model+')', orientation)) - - def addPrefixProbabilityFeature(self, language_model, orientation): - """ - Adds a prefix probability feature to the estimator. - The value will be the language model probability of all words in each sentence in the VICTOR corpus until the target complex word, while replaced by a candidate. - - @param language_model: Path to the language model from which to extract probabilities. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.prefixProbabilityFeature, [language_model])) - self.identifiers.append(('Prefix Probability (LM: '+language_model+')', orientation)) - - def addReversePrefixProbabilityFeature(self, language_model, orientation): - """ - Adds a reverse prefix probability feature to the estimator. - The value will be the language model probability of all words in each inverted sentence in the VICTOR corpus until the target complex word, while replaced by a candidate. - - @param language_model: Path to the language model from which to extract probabilities. - This language model must be trained over a corpus composed of inverted sentences (Ex: ". sentence a is This"). - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.reversePrefixProbabilityFeature, [language_model])) - self.identifiers.append(('Reverse Prefix Probability (LM: '+language_model+')', orientation)) - - def addSenseCountFeature(self, orientation): - """ - Adds a sense count feature to the estimator. - Calculates the number of senses registered in WordNet of a candidate. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.senseCount ,[])) - self.identifiers.append(('Sense Count', orientation)) - - def addSynonymCountFeature(self, orientation): - """ - Adds a synonym count feature to the estimator. - Calculates the number of synonyms registered in WordNet of a candidate. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.synonymCount ,[])) - self.identifiers.append(('Synonym Count', orientation)) - - def addIsSynonymFeature(self, orientation): - """ - Adds a synonymy relation feature to the estimator. - If a candidate substitution is a synonym of the target word, then it returns 1, if not, it returns 0. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.isSynonym ,[])) - self.identifiers.append(('Is Synonym', orientation)) - - def addHypernymCountFeature(self, orientation): - """ - Adds a hypernym count feature to the estimator. - Calculates the number of hypernyms registered in WordNet of a candidate. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.hypernymCount ,[])) - self.identifiers.append(('Hypernym Count', orientation)) - - def addIsHypernymFeature(self, orientation): - """ - Adds a hypernymy relation feature to the estimator. - If a candidate substitution is a hypernym of the target word, then it returns 1, if not, it returns 0. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.isHypernym ,[])) - self.identifiers.append(('Is Hypernym', orientation)) - - def addHyponymCountFeature(self, orientation): - """ - Adds a hyponym count feature to the estimator. - Calculates the number of hyponyms registered in WordNet of a candidate. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.hyponymCount ,[])) - self.identifiers.append(('Hyponym Count', orientation)) - - def addIsHyponymFeature(self, orientation): - """ - Adds a hyponymy relation feature to the estimator. - If a candidate substitution is a hyponym of the target word, then it returns 1, if not, it returns 0. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.isHyponym ,[])) - self.identifiers.append(('Is Hyponym', orientation)) - - def addMinDepthFeature(self, orientation): - """ - Adds a minimum sense depth feature to the estimator. - Calculates the minimum distance between two senses of a given candidate. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.minDepth ,[])) - self.identifiers.append(('Minimal Sense Depth', orientation)) - - def addMaxDepthFeature(self, orientation): - """ - Adds a maximum sense depth feature to the estimator. - Calculates the maximum distance between two senses of a given candidate. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.maxDepth ,[])) - self.identifiers.append(('Maximal Sense Depth', orientation)) - - def addAverageDepthFeature(self, orientation): - """ - Adds an average sense depth feature to the estimator. - Calculates the average distance between two senses of a given candidate. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.averageDepth ,[])) - self.identifiers.append(('Average Sense Depth', orientation)) - - def addSubjectDependencyProbabilityFeature(self, language_model, stanford_parser, dependency_models, java_path, orientation): - """ - Adds a subject dependency probability feature to the estimator. - The value will be the average language model probability of all dependency links of which the target word is subject, with the target word replaced by a given candidate. - To train the language model used by this feature, one must first extract dependency links from a large corpora of sentences. - In sequence, the dependency links must be transformed into the following format: - In the format above, each token is space-separated. - Once transformed, the dependency links can then be placed in a text file, one per line. - Finally, one can then run any language modelling tool to produce a language model in ARPA format. - - @param language_model: Path to the language model from which to extract dependency link probabilities. - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - self.features.append((self.subjectDependencyProbabilityFeature, [language_model, dependency_models])) - self.identifiers.append(('Subject Dependency Probability Feature (Language Model: '+language_model+') (Models: '+dependency_models+')', orientation)) - - def addBinarySubjectDependencyFeature(self, dep_counts_file, stanford_parser, dependency_models, java_path, orientation): - """ - Adds a binary subject dependency feature to the estimator. - The value will be 1 if all dependency links of which the target word is subject exist for a given candidate, and 0 otherwise. - To produce the dependency link counts file used by this feature, one must first extract dependency links from a large corpora of sentences. - In sequence, the dependency links must be transformed into the following format: - In the format above, each token is space-separated. - Once transformed, the dependency links can then be placed in a text file, one per line. - Finally, one can then run any language modelling tool to produce a language model in ARPA format. - - @param dep_counts_file: Path to a shelve file containing dependency link counts. - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if dep_counts_file not in self.resources: - counts = self.readNgramFile(dep_counts_file) - self.resources[dep_counts_file] = counts - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - self.features.append((self.binarySubjectDependencyFeature, [dep_counts_file, dependency_models])) - self.identifiers.append(('Binary Subject Dependency Feature (Dependency Link Counts File: '+dep_counts_file+') (Models: '+dependency_models+')', orientation)) - - def addSubjectDependencyFrequencyFeature(self, dep_counts_file, stanford_parser, dependency_models, java_path, orientation): - """ - Adds a subject dependency frequency feature to the estimator. - The value will be the average raw frequency of all dependency links of which the target word is subject, with the target word replaced by a given candidate. - To produce the dependency link counts file used by this feature, one must first extract dependency links from a large corpora of sentences. 
- In sequence, the dependency links must be transformed into the following format: - In the format above, each token is space-separated. - Once transformed, the dependency links can then be placed in a text file, one per line. - Finally, one can then run any language modelling tool to produce a language model in ARPA format. - - @param dep_counts_file: Path to a shelve file containing dependency link counts. - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if dep_counts_file not in self.resources: - counts = self.readNgramFile(dep_counts_file) - self.resources[dep_counts_file] = counts - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - self.features.append((self.subjectDependencyFrequencyFeature, [dep_counts_file, dependency_models])) - self.identifiers.append(('Subject Dependency Frequency Feature (Dependency Link Counts File: '+dep_counts_file+') (Models: '+dependency_models+')', orientation)) - - def addObjectDependencyProbabilityFeature(self, language_model, stanford_parser, dependency_models, java_path, orientation): - """ - Adds an object dependency probability feature to the estimator. - The value will be the average language model probability of all dependency links of which the target word is object, with the target word replaced by a given candidate. - To train the language model used by this feature, one must first extract dependency links from a large corpora of sentences. - In sequence, the dependency links must be transformed into the following format: - In the format above, each token is space-separated. - Once transformed, the dependency links can then be placed in a text file, one per line. - Finally, one can then run any language modelling tool to produce a language model in ARPA format. - - @param language_model: Path to the language model from which to extract dependency link probabilities. - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - self.features.append((self.objectDependencyProbabilityFeature, [language_model, dependency_models])) - self.identifiers.append(('Object Dependency Probability Feature (Language Model: '+language_model+') (Models: '+dependency_models+')', orientation)) - - def addBinaryObjectDependencyFeature(self, dep_counts_file, stanford_parser, dependency_models, java_path, orientation): - """ - Adds a binary object dependency feature to the estimator. - The value will be 1 if all dependency links of which the target word is object exist for a given candidate, and 0 otherwise. - To produce the dependency link counts file used by this feature, one must first extract dependency links from a large corpora of sentences. - In sequence, the dependency links must be transformed into the following format: - In the format above, each token is space-separated. - Once transformed, the dependency links can then be placed in a text file, one per line. - Finally, one can then run any language modelling tool to produce a language model in ARPA format. - - @param dep_counts_file: Path to a shelve file containing dependency link counts. - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if dep_counts_file not in self.resources: - counts = self.readNgramFile(dep_counts_file) - self.resources[dep_counts_file] = counts - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - self.features.append((self.binaryObjectDependencyFeature, [dep_counts_file, dependency_models])) - self.identifiers.append(('Binary Object Dependency Feature (Dependency Link Counts File: '+dep_counts_file+') (Models: '+dependency_models+')', orientation)) - - def addObjectDependencyFrequencyFeature(self, dep_counts_file, stanford_parser, dependency_models, java_path, orientation): - """ - Adds an object dependency frequency feature to the estimator. - The value will be the average raw frequency of all dependency links of which the target word is object, with the target word replaced by a given candidate. - To produce the dependency link counts file used by this feature, one must first extract dependency links from a large corpora of sentences. 
- In sequence, the dependency links must be transformed into the following format: - In the format above, each token is space-separated. - Once transformed, the dependency links can then be placed in a text file, one per line. - Finally, one can then run any language modelling tool to produce a language model in ARPA format. - - @param dep_counts_file: Path to a shelve file containing dependency link counts. - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if dep_counts_file not in self.resources: - counts = self.readNgramFile(dep_counts_file) - self.resources[dep_counts_file] = counts - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - self.features.append((self.objectDependencyFrequencyFeature, [dep_counts_file, dependency_models])) - self.identifiers.append(('Object Dependency Frequency Feature (Dependency Link Counts File: '+dep_counts_file+') (Models: '+dependency_models+')', orientation)) - - def addAllDependencyProbabilityFeature(self, language_model, stanford_parser, dependency_models, java_path, orientation): - """ - Adds a dependency probability feature to the estimator. - The value will be the average language model probability of all the target word's dependency links, with the target word replaced by a given candidate. - To train the language model used by this feature, one must first extract dependency links from a large corpora of sentences. - In sequence, the dependency links must be transformed into the following format: - In the format above, each token is space-separated. - Once transformed, the dependency links can then be placed in a text file, one per line. - Finally, one can then run any language modelling tool to produce a language model in ARPA format. - - @param language_model: Path to the language model from which to extract dependency link probabilities. - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - self.features.append((self.allDependencyProbabilityFeature, [language_model, dependency_models])) - self.identifiers.append(('Dependency Probability Feature (Language Model: '+language_model+') (Models: '+dependency_models+')', orientation)) - - def addBinaryAllDependencyFeature(self, dep_counts_file, stanford_parser, dependency_models, java_path, orientation): - """ - Adds a binary dependency feature to the estimator. - The value will be 1 if all dependency links of the target word exist for a given candidate, and 0 otherwise. - To produce the dependency link counts file used by this feature, one must first extract dependency links from a large corpora of sentences. - In sequence, the dependency links must be transformed into the following format: - In the format above, each token is space-separated. - Once transformed, the dependency links can then be placed in a text file, one per line. - Finally, one can then run any language modelling tool to produce a language model in ARPA format. - - @param dep_counts_file: Path to a shelve file containing dependency link counts. - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if dep_counts_file not in self.resources: - counts = self.readNgramFile(dep_counts_file) - self.resources[dep_counts_file] = counts - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - self.features.append((self.binaryAllDependencyFeature, [dep_counts_file, dependency_models])) - self.identifiers.append(('Binary All Dependency Feature (Dependency Link Counts File: '+dep_counts_file+') (Models: '+dependency_models+')', orientation)) - - def addAllDependencyFrequencyFeature(self, dep_counts_file, stanford_parser, dependency_models, java_path, orientation): - """ - Adds a dependency frequency feature to the estimator. - The value will be the average raw frequency of all dependency links of the target word, with the target word replaced by a given candidate. - To produce the dependency link counts file used by this feature, one must first extract dependency links from a large corpora of sentences. 
- In sequence, the dependency links must be transformed into the following format: - In the format above, each token is space-separated. - Once transformed, the dependency links can then be placed in a text file, one per line. - Finally, one can then run any language modelling tool to produce a language model in ARPA format. - - @param dep_counts_file: Path to a shelve file containing dependency link counts. - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if dep_counts_file not in self.resources: - counts = self.readNgramFile(dep_counts_file) - self.resources[dep_counts_file] = counts - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - self.features.append((self.allDependencyFrequencyFeature, [dep_counts_file, dependency_models])) - self.identifiers.append(('All Dependency Frequency Feature (Dependency Link Counts File: '+dep_counts_file+') (Models: '+dependency_models+')', orientation)) - - def addWordVectorContextSimilarityFeature(self, model, pos_model, stanford_tagger, java_path, orientation): - """ - Adds a word vector context similarity feature to the estimator. - The value will be the average similarity between the word vector of a candidate and the vectors of all content word in the target word's context. - - @param model: Path to a binary word vector model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if model not in self.resources: - m = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True) - self.resources[model] = m - os.environ['JAVAHOME'] = java_path - if pos_model not in self.resources: - tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.resources[pos_model] = tagger - self.features.append((self.wordVectorContextSimilarityFeature, [model, pos_model])) - self.identifiers.append(('Word Vector Context Similarity (Model: '+model+') (POS Model: '+pos_model+')', orientation)) - - def addTaggedWordVectorContextSimilarityFeature(self, model, pos_model, stanford_tagger, java_path, pos_type, orientation): - """ - Adds a tagged word vector context similarity feature to the estimator. - The value will be the average similarity between the word vector of a candidate and the vectors of all content word in the target word's context. - Each entry in the word vector model must be in the following format: ||| - To create a corpus for such model to be trained, one must tag each word in a corpus, and then concatenate words and tags using the aforementioned convention. - - @param model: Path to a binary tagged word vector model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param pos_type: The type of POS tags to be used. - Values supported: treebank, paetzold - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if model not in self.resources: - m = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True) - self.resources[model] = m - os.environ['JAVAHOME'] = java_path - if pos_model not in self.resources: - tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.resources[pos_model] = tagger - self.features.append((self.taggedWordVectorContextSimilarityFeature, [model, pos_model, pos_type])) - self.identifiers.append(('Tagged Word Vector Context Similarity (Model: '+model+') (POS Model: '+pos_model+') (POS Type: '+pos_type+')', orientation)) - - def addNullLinkNominalFeature(self, stanford_parser, dependency_models, java_path, orientation): - """ - Adds a null link nominal feature to the estimator - The value will be 1 if there is at least one dependency link of which the candidate is part of, and 0 otherwise. - - @param stanford_parser: Path to the "stanford-parser.jar" file. - The parser can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param dependency_models: Path to a JAR file containing parsing models. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/lex-parser.shtml - @param java_path: Path to the system's "java" executable. 
- Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - os.environ['JAVAHOME'] = java_path - if dependency_models not in self.resources: - parser = StanfordParser(path_to_jar=stanford_parser, path_to_models_jar=dependency_models) - self.resources[dependency_models] = parser - - self.features.append((self.nullLinkNominalFeature, [dependency_models])) - self.identifiers.append(('Null Link Nominal Feature (Models: '+dependency_models+')', orientation)) - - def addBackoffBehaviorNominalFeature(self, ngram_file, orientation): - """ - Adds a nominal language model backoff behavior nominal feature to the estimator. - - @param ngram_file: Path to a shelve file containing n-gram frequency counts. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if ngram_file not in self.resources: - counts = self.readNgramFile(ngram_file) - self.resources[ngram_file] = counts - - self.features.append((self.backoffBehaviorNominalFeature, [ngram_file])) - self.identifiers.append(('N-Gram Nominal Feature (N-Grams File: '+ngram_file+')', orientation)) - - def addImageSearchCountFeature(self, key, orientation): - """ - Adds an image search count feature to the estimator. - The resulting value will be the number of distinct pictures retrieved by the Getty Images API. - This feature requires for a free "Connect Embed" key, which gives you access to 5 queries per second, and unlimited queries per day. - For more information on how to acquire a key, please visit their website at: https://developer.gettyimages.com - - @param key: Connect Embed key for the Getty Images API. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if key not in self.resources: - self.resources['GettyImagesKey'] = key - if 'image_counts' not in self.resources: - self.resources['image_counts'] = {} - - self.features.append((self.imageSearchCountFeature, [key])) - self.identifiers.append(('Image Search Count Feature (Key: '+key+')', orientation)) - - def addWebSearchCountFeature(self, orientation): - """ - Adds a web search count feature to the estimator. - The resulting value will be the number of websites retrieved by Bing. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if 'page_counts' not in self.resources: - self.resources['page_counts'] = {} - - self.features.append((self.webSearchCountFeature, [])) - self.identifiers.append((' Web Search Count Feature', orientation)) - - def addMorphologicalFeature(self, dictionary, description, orientation): - """ - Adds a generalized morphological feature to the estimator. - It requires for a dictionary that assigns words to their respective feature values. 
- For each word in a dataset, the value of this feature will be the one found in the dictionar provided, or 0 if it is not available. - - @param dictionary: A dictionary object assigning words to values. - Example: dictionary['chair'] = 45.33. - @param description: Description of the feature. - Example: "Age of Acquisition". - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.morphologicalFeature, [dictionary])) - self.identifiers.append((description, orientation)) - - # Nominal features: - - def addCandidateNominalFeature(self): - """ - Adds a candidate nominal feature to the estimator. - """ - self.features.append((self.candidateNominalFeature, [])) - self.identifiers.append(('Candidate Nominal Feature', 'Not Applicable')) - - def addNgramNominalFeature(self, leftw, rightw): - """ - Adds a n-gram nominal feature to the estimator. - - @param leftw: Number of tokens to the left. - @param rightw: Number of tokens to the right. - """ - self.features.append((self.ngramNominalFeature, [leftw, rightw])) - self.identifiers.append(('N-Gram Nominal Feature ['+str(leftw)+', '+str(rightw)+']', 'Not Applicable')) - - def addCandidatePOSNominalFeature(self, pos_model, stanford_tagger, java_path, pos_type): - """ - Adds a candidate POS tag nominal feature to the estimator. - - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param pos_type: The type of POS tags to be used. - Values supported: treebank, paetzold - """ - os.environ['JAVAHOME'] = java_path - if pos_model not in self.resources: - tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.resources[pos_model] = tagger - - self.features.append((self.candidatePOSNominalFeature, [pos_model, pos_type])) - self.identifiers.append(('Candidate POS Nominal Feature (POS Model: '+pos_model+') (POS Type: '+pos_type+')', 'Not Applicable')) - - def addPOSNgramNominalFeature(self, leftw, rightw, pos_model, stanford_tagger, java_path, pos_type): - """ - Adds a POS n-gram nominal feature to the estimator. - The n-gram will contain the candidate's POS tag surrounded by the POS tags of neighboring words. - - @param leftw: Number of tokens to the left. - @param rightw: Number of tokens to the right. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param pos_type: The type of POS tags to be used. 
- Values supported: treebank, paetzold - """ - os.environ['JAVAHOME'] = java_path - if pos_model not in self.resources: - tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.resources[pos_model] = tagger - - self.features.append((self.POSNgramNominalFeature, [leftw, rightw, pos_model, pos_type])) - self.identifiers.append(('POS N-gram Nominal Feature ['+str(leftw)+', '+str(rightw)+'] (POS Model: '+pos_model+') (POS Type: '+pos_type+')', 'Not Applicable')) - - def addPOSNgramWithCandidateNominalFeature(self, leftw, rightw, pos_model, stanford_tagger, java_path, pos_type): - """ - Adds a candidate centered POS n-gram nominal feature to the estimator. - The n-gram will contain the candidate surrounded by the POS tags of neighboring words. - - @param leftw: Number of tokens to the left. - @param rightw: Number of tokens to the right. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param pos_type: The type of POS tags to be used. - Values supported: treebank, paetzold - """ - os.environ['JAVAHOME'] = java_path - if pos_model not in self.resources: - tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.resources[pos_model] = tagger - - self.features.append((self.POSNgramWithCandidateNominalFeature, [leftw, rightw, pos_model, pos_type])) - self.identifiers.append(('POS N-gram with Candidate Nominal Feature ['+str(leftw)+', '+str(rightw)+'] (POS Model: '+pos_model+') (POS Type: '+pos_type+')', 'Not Applicable')) - -############################################################################################################################################################################################################################################## -#Phrase-level LS features: -############################################################################################################################################################################################################################################## - - def addNumberOfTokensFeature(self, orientation): - """ - Adds a number of tokens feature to the estimator. - The value will be the number of tokens in each candidate. - - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - self.features.append((self.numberOfTokens, [])) - self.identifiers.append(('Number of Tokens', orientation)) - - def addAverageTokenProbabilityFeature(self, language_model, orientation): - """ - Adds an average token probability feature to the estimator. - The value will be the average language model probability of all tokens that compose the candidate. - - @param language_model: Path to the language model from which to extract probabilities. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. 
- """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.averageTokenProbabilityFeature, [language_model])) - self.identifiers.append(('Average Token Probability (LM: '+language_model+')', orientation)) - - def addMaximumTokenProbabilityFeature(self, language_model, orientation): - """ - Adds an maximum token probability feature to the estimator. - The value will be the maximum language model probability of all tokens that compose the candidate. - - @param language_model: Path to the language model from which to extract probabilities. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.maximumTokenProbabilityFeature, [language_model])) - self.identifiers.append(('Maximum Token Probability (LM: '+language_model+')', orientation)) - - def addMinimumTokenProbabilityFeature(self, language_model, orientation): - """ - Adds an minimum token probability feature to the estimator. - The value will be the minimum language model probability of all tokens that compose the candidate. - - @param language_model: Path to the language model from which to extract probabilities. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if language_model not in self.resources: - model = kenlm.LanguageModel(language_model) - self.resources[language_model] = model - self.features.append((self.minimumTokenProbabilityFeature, [language_model])) - self.identifiers.append(('Minimum Token Probability (LM: '+language_model+')', orientation)) - - def addMinimumWordVectorSimilarityFeature(self, model, orientation): - """ - Adds a minimum word vector similarity feature to the estimator. - The value will be the minimum similarity between the word vectors of the words that compose the candidate and the word vector of the target complex word. - - @param model: Path to a binary word vector model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if model not in self.resources: - m = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True) - self.resources[model] = m - self.features.append((self.minimumWordVectorSimilarityFeature, [model])) - self.identifiers.append(('Minimum Word Vector Similarity (Model: '+model+')', orientation)) - - def addMaximumWordVectorSimilarityFeature(self, model, orientation): - """ - Adds a maximum word vector similarity feature to the estimator. - The value will be the maximum similarity between the word vectors of the words that compose the candidate and the word vector of the target complex word. 
- - @param model: Path to a binary word vector model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if model not in self.resources: - m = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True) - self.resources[model] = m - self.features.append((self.maximumWordVectorSimilarityFeature, [model])) - self.identifiers.append(('Maximum Word Vector Similarity (Model: '+model+')', orientation)) - - def addAverageWordVectorSimilarityFeature(self, model, orientation): - """ - Adds an average word vector similarity feature to the estimator. - The value will be the average similarity between the word vectors of the words that compose the candidate and the word vector of the target complex word. - - @param model: Path to a binary word vector model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param orientation: Whether the feature is a simplicity of complexity measure. - Possible values: Complexity, Simplicity. - """ - - if orientation not in ['Complexity', 'Simplicity']: - logger.debug('Orientation must be Complexity or Simplicity') - else: - if model not in self.resources: - m = gensim.models.KeyedVectors.load_word2vec_format(model, binary=True) - self.resources[model] = m - self.features.append((self.averageWordVectorSimilarityFeature, [model])) - self.identifiers.append(('Average Word Vector Similarity (Model: '+model+')', orientation)) diff --git a/lexi/lib/lexenstein/generators.py b/lexi/lib/lexenstein/generators.py deleted file mode 100755 index 86cae5f..0000000 --- a/lexi/lib/lexenstein/generators.py +++ /dev/null @@ -1,2129 +0,0 @@ -import xml.etree.ElementTree as ET -import re -import urllib2 as urllib -import subprocess -import nltk -from nltk.tag.stanford import StanfordPOSTagger -import kenlm -import codecs -import os -import gensim -from nltk.corpus import wordnet as wn -from nltk.stem.wordnet import WordNetLemmatizer -from nltk.stem.porter import PorterStemmer - - - -class PaetzoldGenerator: - - def __init__(self, posw2vmodel, nc, pos_model, stanford_tagger, java_path): - """ - Creates a PaetzoldGenerator instance. - - @param posw2vmodel: Binary parsed word vector model. - For more information on how to produce the model, please refer to the LEXenstein Manual. - @param nc: NorvigCorrector object. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. 
- """ - self.lemmatizer = WordNetLemmatizer() - self.stemmer = PorterStemmer() - self.model = gensim.models.KeyedVectors.load_word2vec_format(posw2vmodel, binary=True) - self.nc = nc - os.environ['JAVAHOME'] = java_path - self.tagger = StanfordPOSTagger(pos_model, stanford_tagger) - - def getSubstitutions(self, victor_corpus, amount): - """ - Generates substitutions for the target words of a corpus in VICTOR format. - - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A dictionary that assigns target complex words to sets of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - """ - #Get candidate->pos map: - tagged_sents = self.getParsedSentences(victor_corpus) - - #Get initial set of substitutions: - substitutions = self.getInitialSet(victor_corpus, tagged_sents, amount) - return substitutions - - def getParsedSentences(self, victor_corpus): - lexf = open(victor_corpus) - sents = [] - for line in lexf: - data = line.strip().split('\t') - sent = data[0].strip().split(' ') - sents.append(sent) - lexf.close() - - tagged_sents = self.tagger.tag_sents(sents) - return tagged_sents - - def getInitialSet(self, victor_corpus, tsents, amount): - lexf = open(victor_corpus) - data = [] - for line in lexf: - d = line.strip().split('\t') - data.append(d) - lexf.close() - - trgs = [] - trgsc = [] - trgsstems = [] - trgslemmas = [] - trgscstems = [] - trgsclemmas = [] - for i in range(0, len(data)): - d = data[i] - tags = tsents[i] - target = d[1].strip().lower() - head = int(d[2].strip()) - tag = self.getClass(tags[head][1]) - targetc = self.nc.correct(target) - trgs.append(target) - trgsc.append(targetc) - trgslemmas = self.lemmatizeWords(trgs) - trgsclemmas = self.lemmatizeWords(trgsc) - trgsstems = self.stemWords(trgs) - trgscstems = self.stemWords(trgsc) - trgmap = {} - for i in range(0, len(trgslemmas)): - target = data[i][1].strip().lower() - head = int(data[i][2].strip()) - tag = self.getClass(tsents[i][head][1]) - lemma = trgslemmas[i] - stem = trgsstems[i] - trgmap[target] = (lemma, stem) - - subs = [] - cands = set([]) - for i in range(0, len(data)): - d = data[i] - - t = trgs[i] - tstem = trgsstems[i] - tlemma = trgslemmas[i] - tc = trgsc[i] - tcstem = trgscstems[i] - tclemma = trgsclemmas[i] - - tags = tsents[i] - head = int(d[2].strip()) - tag = tags[head][1] - - word = t+'|||'+self.getClass(tag) - wordc = tc+'|||'+self.getClass(tag) - - most_sim = [] - try: - most_sim = self.model.most_similar(positive=[word], topn=50) - except KeyError: - try: - most_sim = self.model.most_similar(positive=[wordc], topn=50) - except KeyError: - most_sim = [] - - subs.append([word[0] for word in most_sim]) - - subsr = subs - subs = [] - for l in subsr: - lr = [] - for inst in l: - cand = inst.split('|||')[0].strip() - encc = None - try: - encc = cand.encode('ascii') - except Exception: - encc = None - if encc: - cands.add(cand) - lr.append(inst) - subs.append(lr) - - cands = list(cands) - candslemmas = self.lemmatizeWords(cands) - candsstems = self.stemWords(cands) - candmap = {} - for i in range(0, len(cands)): - cand = cands[i] - lemma = candslemmas[i] - stem = candsstems[i] - candmap[cand] = (lemma, stem) - - subs_filtered = self.filterSubs(data, tsents, subs, candmap, trgs, trgsc, trgsstems, trgscstems, trgslemmas, trgsclemmas) - - final_cands = {} - for i in range(0, len(data)): - target = data[i][1] - cands = subs_filtered[i][0:min(amount, subs_filtered[i])] - cands = 
[str(word.split('|||')[0].strip()) for word in cands] - if target not in final_cands: - final_cands[target] = set([]) - final_cands[target].update(set(cands)) - - return final_cands - - def lemmatizeWords(self, words): - result = [] - for word in words: - result.append(self.lemmatizer.lemmatize(word)) - return result - - def stemWords(self, words): - result = [] - for word in words: - result.append(self.stemmer.stem(word)) - return result - - def filterSubs(self, data, tsents, subs, candmap, trgs, trgsc, trgsstems, trgscstems, trgslemmas, trgsclemmas): - result = [] - for i in range(0, len(data)): - d = data[i] - - t = trgs[i] - tstem = trgsstems[i] - tlemma = trgslemmas[i] - tc = trgsc[i] - tcstem = trgscstems[i] - tclemma = trgsclemmas[i] - - tags = tsents[i] - head = int(d[2].strip()) - tag = self.getClass(tags[head][1]) - - word = t+'|||'+self.getClass(tag) - wordc = tc+'|||'+self.getClass(tag) - - most_sim = subs[i] - most_simf = [] - - for cand in most_sim: - candd = cand.split('|||') - cword = candd[0].strip() - ctag = candd[1].strip() - clemma = candmap[cword][0] - cstem = candmap[cword][1] - - if ctag==tag: - if clemma!=tlemma and clemma!=tclemma and cstem!=tstem and cstem!=tcstem: - if cword not in t and cword not in tc and t not in cword and tc not in cword: - most_simf.append(cand) - - class_filtered = [] - for cand in most_simf: - candd = cand.split('|||') - cword = candd[0].strip() - ctag = candd[1].strip() - clemma = candmap[cword][0] - cstem = candmap[cword][1] - - if tag=='V': - if (t.endswith('ing') or tc.endswith('ing')) and cword.endswith('ing'): - class_filtered.append(cand) - elif (t.endswith('d') or tc.endswith('d')) and cword.endswith('d'): - class_filtered.append(cand) - else: - class_filtered.append(cand) - - result.append(most_simf) - return result - - def getClass(self, tag): - result = None - if tag.startswith('N'): - result = 'N' - elif tag.startswith('V'): - result = 'V' - elif tag.startswith('RB'): - result = 'A' - elif tag.startswith('J'): - result = 'J' - elif tag.startswith('W'): - result = 'W' - elif tag.startswith('PRP'): - result = 'P' - else: - result = tag.strip() - return result - -class GlavasGenerator: - - def __init__(self, w2vmodel): - """ - Creates a GlavasGenerator instance. - - @param w2vmodel: Binary parsed word vector model. - For more information on how to produce the model, please refer to the LEXenstein Manual. - """ - self.lemmatizer = WordNetLemmatizer() - self.stemmer = PorterStemmer() - self.model = gensim.models.KeyedVectors.load_word2vec_format(w2vmodel, binary=True) - - def getSubstitutions(self, victor_corpus, amount): - """ - Generates substitutions for the target words of a corpus in VICTOR format. - - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A dictionary that assigns target complex words to sets of candidate substitutions. 
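Editorial note: both removed generators build their initial candidate set from nearest neighbours in an embedding space: GlavasGenerator queries the target word directly, while PaetzoldGenerator queries a POS-annotated key of the form `word|||CLASS`. A sketch of that lookup; `pos_vectors.bin` is a hypothetical model path.

```python
import gensim

# Hypothetical POS-annotated word2vec model whose keys look like 'perched|||V',
# as expected by the removed PaetzoldGenerator; pass pos_class=None for the
# plain-word lookup done by GlavasGenerator.
model = gensim.models.KeyedVectors.load_word2vec_format('pos_vectors.bin', binary=True)

def embedding_candidates(target, pos_class=None, topn=50):
    key = target if pos_class is None else target + '|||' + pos_class
    try:
        neighbours = model.most_similar(positive=[key], topn=topn)
    except KeyError:
        return []                        # out-of-vocabulary target
    # Strip the POS annotation back off, as getInitialSet above does.
    return [word.split('|||')[0] for word, _similarity in neighbours]

print(embedding_candidates('perched', pos_class='V'))
```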
- Example: substitutions['perched'] = {'sat', 'roosted'} - """ - - #Get initial set of substitutions: - substitutions = self.getInitialSet(victor_corpus, amount) - return substitutions - - def getInitialSet(self, victor_corpus, amount): - lexf = open(victor_corpus) - data = [] - for line in lexf: - d = line.strip().split('\t') - data.append(d) - lexf.close() - - trgs = [] - trgsstems = [] - trgslemmas = [] - for i in range(0, len(data)): - d = data[i] - target = d[1].strip().lower() - head = int(d[2].strip()) - trgs.append(target) - trgslemmas = self.lemmatizeWords(trgs) - trgsstems = self.stemWords(trgs) - - trgmap = {} - for i in range(0, len(trgslemmas)): - target = data[i][1].strip().lower() - head = int(data[i][2].strip()) - lemma = trgslemmas[i] - stem = trgsstems[i] - trgmap[target] = (lemma, stem) - - subs = [] - cands = set([]) - for i in range(0, len(data)): - d = data[i] - - t = trgs[i] - tstem = trgsstems[i] - tlemma = trgslemmas[i] - - word = t - - most_sim = [] - try: - most_sim = self.model.most_similar(positive=[word], topn=50) - except KeyError: - most_sim = [] - - subs.append([word[0] for word in most_sim]) - - subsr = subs - subs = [] - for l in subsr: - lr = [] - for inst in l: - cand = inst.split('|||')[0].strip() - encc = None - try: - encc = cand.encode('ascii') - except Exception: - encc = None - if encc: - cands.add(cand) - lr.append(inst) - subs.append(lr) - - cands = list(cands) - candslemmas = self.lemmatizeWords(cands) - candsstems = self.stemWords(cands) - candmap = {} - for i in range(0, len(cands)): - cand = cands[i] - lemma = candslemmas[i] - stem = candsstems[i] - candmap[cand] = (lemma, stem) - - subs_filtered = self.filterSubs(data, subs, candmap, trgs, trgsstems, trgslemmas) - - final_cands = {} - for i in range(0, len(data)): - target = data[i][1] - cands = subs_filtered[i][0:min(amount, subs_filtered[i])] - cands = [str(word.split('|||')[0].strip()) for word in cands] - if target not in final_cands: - final_cands[target] = set([]) - final_cands[target].update(set(cands)) - - return final_cands - - def lemmatizeWords(self, words): - result = [] - for word in words: - result.append(self.lemmatizer.lemmatize(word)) - return result - - def stemWords(self, words): - result = [] - for word in words: - result.append(self.stemmer.stem(word)) - return result - - def filterSubs(self, data, subs, candmap, trgs, trgsstems, trgslemmas): - result = [] - for i in range(0, len(data)): - d = data[i] - - t = trgs[i] - tstem = trgsstems[i] - tlemma = trgslemmas[i] - - word = t - - most_sim = subs[i] - most_simf = [] - - for cand in most_sim: - cword = cand - clemma = candmap[cword][0] - cstem = candmap[cword][1] - - if clemma!=tlemma and cstem!=tstem: - most_simf.append(cand) - - result.append(most_simf) - return result - -class KauchakGenerator: - - def __init__(self, mat, parallel_pos_file, alignments_file, stop_words, nc): - """ - Creates a KauchakGenerator instance. - - @param mat: MorphAdornerToolkit object. - @param parallel_pos_file: Path to the parsed parallel corpus from which to extract substitutions. - For more information about the file's format, refer to the LEXenstein Manual. - @param alignments_file: Path to the alignments for the parsed parallel corpus from which to extract substitutions. - For more information about the file's format, refer to the LEXenstein Manual. - @param stop_words: Path to the file containing stop words of the desired language. - The file must contain one stop word per line. - @param nc: NorvigCorrector object. 
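Editorial note: after generation, the filterSubs methods above drop candidates that are merely inflectional variants of the target: a candidate is rejected if it shares the target's lemma or stem, or (in the Paetzold variant) if one string contains the other. A sketch of that filter with the same NLTK lemmatizer and stemmer the removed file imports (requires the NLTK WordNet data).

```python
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()

def keep_candidate(candidate, target):
    # Reject inflectional variants of the target, mirroring filterSubs above.
    if lemmatizer.lemmatize(candidate) == lemmatizer.lemmatize(target):
        return False
    if stemmer.stem(candidate) == stemmer.stem(target):
        return False
    if candidate in target or target in candidate:
        return False
    return True

candidates = ['perch', 'perching', 'sat', 'roosted']
print([c for c in candidates if keep_candidate(c, 'perched')])  # ['sat', 'roosted']
```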
- """ - self.mat = mat - self.parallel_pos_file = parallel_pos_file - self.alignments_file = alignments_file - self.stop_words = set([word.strip() for word in open(stop_words)]) - self.nc = nc - - def getSubstitutions(self, victor_corpus): - """ - Generates substitutions for the target words of a corpus in VICTOR format. - - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A dictionary that assigns target complex words to sets of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - """ - #Get candidate->pos map: - print('Getting POS map...') - target_pos = self.getPOSMap(victor_corpus) - - #Get initial set of substitutions: - print('Getting initial set of substitutions...') - substitutions_initial = self.getInitialSet(victor_corpus, target_pos) - - #Get final substitutions: - print('Inflecting substitutions...') - substitutions_inflected = self.getInflectedSet(substitutions_initial) - - #Return final set: - print('Finished!') - return substitutions_inflected - - def getInitialSet(self, victor_corpus, pos_map): - substitutions_initial = {} - - targets = set([line.strip().split('\t')[1].strip() for line in open(victor_corpus)]) - - fparallel = open(self.parallel_pos_file) - falignments = open(self.alignments_file) - - for line in fparallel: - data = line.strip().split('\t') - source = data[0].strip().split(' ') - target = data[1].strip().split(' ') - - alignments = set(falignments.readline().strip().split(' ')) - - for alignment in alignments: - adata = alignment.strip().split('-') - left = int(adata[0].strip()) - right = int(adata[1].strip()) - leftraw = source[left].strip() - leftp = leftraw.split('|||')[1].strip().lower() - leftw = leftraw.split('|||')[0].strip() - rightraw = target[right].strip() - rightp = rightraw.split('|||')[1].strip().lower() - rightw = rightraw.split('|||')[0].strip() - - if len(leftw)>0 and len(rightw)>0 and leftp!='nnp' and rightp!='nnp' and rightp==leftp and leftw not in self.stop_words and rightw not in self.stop_words and leftw!=rightw: - if leftw in substitutions_initial: - if leftp in substitutions_initial[leftw]: - substitutions_initial[leftw][leftp].add(rightw) - else: - substitutions_initial[leftw][leftp] = set([rightw]) - else: - substitutions_initial[leftw] = {leftp:set([rightw])} - fparallel.close() - falignments.close() - return substitutions_initial - - def getPOSMap(self, path): - result = {} - lex = open(path) - for line in lex: - data = line.strip().split('\t') - sent = data[0].strip().lower().split(' ') - target = data[1].strip().lower() - head = int(data[2].strip()) - - posd = nltk.pos_tag(sent) - postarget = posd[head][1].lower().strip() - if target in result: - result[target].add(postarget) - else: - result[target] = set([postarget]) - lex.close() - return result - - def getInflectedSet(self, result): - final_substitutions = {} - - #Get inflections: - allkeys = sorted(list(result.keys())) - - singulars = {} - plurals = {} - verbs = {} - - singularsk = {} - pluralsk = {} - verbsk = {} - - for i in range(0, len(allkeys)): - key = allkeys[i] - leftw = key - - for leftp in result[leftw]: - if leftp.startswith('n'): - if leftp=='nns': - pluralsk[leftw] = set([]) - for subst in result[key][leftp]: - plurals[subst] = set([]) - else: - singularsk[leftw] = set([]) - for subst in result[key][leftp]: - singulars[subst] = set([]) - elif leftp.startswith('v'): - verbsk[leftw] = {} - for subst in result[key][leftp]: - 
verbs[subst] = {} - - #------------------------------------------------------------------------------------------------ - - #Generate keys input: - singkeys = sorted(list(singularsk.keys())) - plurkeys = sorted(list(pluralsk.keys())) - verbkeys = sorted(list(verbsk.keys())) - - #Get stems: - singstems, plurstems, verbstems = self.getStems(singkeys, plurkeys, verbkeys) - - #Get plurals: - singres = self.getPlurals(singstems) - - #Get singulars: - plurres = self.getSingulars(plurstems) - - #Get verb inflections: - verbres1, verbres2, verbres3, verbres4, verbres5 = self.getInflections(verbstems) - - #Add information to dictionaries: - for i in range(0, len(singkeys)): - k = singkeys[i] - singre = singres[i] - singularsk[k] = singre - for i in range(0, len(plurkeys)): - k = plurkeys[i] - plurre = plurres[i] - pluralsk[k] = plurre - for i in range(0, len(verbkeys)): - k = verbkeys[i] - verbre1 = verbres1[i] - verbre2 = verbres2[i] - verbre3 = verbres3[i] - verbre4 = verbres4[i] - verbre5 = verbres5[i] - verbsk[k] = {'PAST_PERFECT_PARTICIPLE': verbre1, 'PAST_PARTICIPLE': verbre2, 'PRESENT_PARTICIPLE': verbre3, 'PRESENT': verbre4, 'PAST': verbre5} - - #------------------------------------------------------------------------------------------------ - - #Generate substs input: - singkeys = sorted(list(singulars.keys())) - plurkeys = sorted(list(plurals.keys())) - verbkeys = sorted(list(verbs.keys())) - - #Get stems: - singstems, plurstems, verbstems = self.getStems(singkeys, plurkeys, verbkeys) - - #Get plurals: - singres = self.getPlurals(singstems) - - #Get singulars: - plurres = self.getSingulars(plurstems) - - #Get verb inflections: - verbres1, verbres2, verbres3, verbres4, verbres5 = self.getInflections(verbstems) - - #Add information to dictionaries: - for i in range(0, len(singkeys)): - k = singkeys[i] - singre = singres[i] - singulars[k] = singre - for i in range(0, len(plurkeys)): - k = plurkeys[i] - plurre = plurres[i] - plurals[k] = plurre - for i in range(0, len(verbkeys)): - k = verbkeys[i] - verbre1 = verbres1[i] - verbre2 = verbres2[i] - verbre3 = verbres3[i] - verbre4 = verbres4[i] - verbre5 = verbres5[i] - verbs[k] = {'PAST_PERFECT_PARTICIPLE': verbre1, 'PAST_PARTICIPLE': verbre2, 'PRESENT_PARTICIPLE': verbre3, 'PRESENT': verbre4, 'PAST': verbre5} - - #------------------------------------------------------------------------------------------------ - - #Generate final substitution list: - for i in range(0, len(allkeys)): - key = allkeys[i] - leftw = key - for leftp in result[leftw]: - - #Add final version to candidates: - if leftw not in final_substitutions: - final_substitutions[leftw] = result[key][leftp] - else: - final_substitutions[leftw] = final_substitutions[leftw].union(result[key][leftp]) - #If left is a noun: - if leftp.startswith('n'): - #If it is a plural: - if leftp=='nns': - plurl = pluralsk[leftw] - newcands = set([]) - for candidate in result[key][leftp]: - candplurl = plurals[candidate] - newcands.add(candplurl) - if plurl not in final_substitutions: - final_substitutions[plurl] = newcands - else: - final_substitutions[plurl] = final_substitutions[plurl].union(newcands) - #If it is singular: - else: - singl = singularsk[leftw] - newcands = set([]) - for candidate in result[key][leftp]: - candsingl = singulars[candidate] - newcands.add(candsingl) - if singl not in final_substitutions: - final_substitutions[singl] = newcands - else: - final_substitutions[singl] = final_substitutions[singl].union(newcands) - #If left is a verb: - elif leftp.startswith('v'): - for 
verb_tense in ['PAST_PERFECT_PARTICIPLE', 'PAST_PARTICIPLE', 'PRESENT_PARTICIPLE', 'PRESENT', 'PAST']: - tensedl = verbsk[leftw][verb_tense] - newcands = set([]) - for candidate in result[key][leftp]: - candtensedl = verbs[candidate][verb_tense] - newcands.add(candtensedl) - if tensedl not in final_substitutions: - final_substitutions[tensedl] = newcands - else: - final_substitutions[tensedl] = final_substitutions[tensedl].union(newcands) - return final_substitutions - - def getInflections(self, verbstems): - data1 = self.mat.conjugateVerbs(verbstems, 'PAST_PERFECT_PARTICIPLE', 'FIRST_PERSON_SINGULAR') - data2 = self.mat.conjugateVerbs(verbstems, 'PAST_PARTICIPLE', 'FIRST_PERSON_SINGULAR') - data3 = self.mat.conjugateVerbs(verbstems, 'PRESENT_PARTICIPLE', 'FIRST_PERSON_SINGULAR') - data4 = self.mat.conjugateVerbs(verbstems, 'PRESENT', 'FIRST_PERSON_SINGULAR') - data5 = self.mat.conjugateVerbs(verbstems, 'PAST', 'FIRST_PERSON_SINGULAR') - return self.correctWords(data1), self.correctWords(data2), self.correctWords(data3), self.correctWords(data4), self.correctWords(data5) - - def getSingulars(self, plurstems): - data = self.mat.inflectNouns(plurstems, 'singular') - return self.correctWords(data) - - def getPlurals(self, singstems): - data = self.mat.inflectNouns(singstems, 'plural') - return self.correctWords(data) - - def getStems(self, sings, plurs, verbs): - data = self.mat.lemmatizeWords(sings+plurs+verbs) - rsings = [] - rplurs = [] - rverbs = [] - c = -1 - for sing in sings: - c += 1 - if len(data[c])>0: - rsings.append(data[c]) - else: - rsings.append(sing) - for plur in plurs: - c += 1 - if len(data[c])>0: - rplurs.append(data[c]) - else: - rplurs.append(plur) - for verb in verbs: - c += 1 - if len(data[c])>0: - rverbs.append(data[c]) - else: - rverbs.append(verb) - return self.correctWords(rsings), self.correctWords(rplurs), self.correctWords(rverbs) - - def correctWords(self, words): - result = [] - for word in words: - result.append(self.nc.correct(word)) - return result - -class YamamotoGenerator: - - def __init__(self, mat, dictionary_key, nc): - """ - Creates a YamamotoGenerator instance. - - @param mat: MorphAdornerToolkit object. - @param dictionary_key: Key for the Merriam Dictionary. - @param nc: NorvigCorrector object. - For more information on how to get the key for free, please refer to the LEXenstein Manual - """ - self.mat = mat - self.dictionary_key = dictionary_key - self.nc = nc - - def getSubstitutions(self, victor_corpus): - """ - Generates substitutions for the target words of a corpus in VICTOR format. - - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A dictionary that assigns target complex words to sets of candidate substitutions. 
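Editorial note: the getInflectedSet machinery above (repeated in the Kauchak, Yamamoto and Merriam generators in this file) takes lemma-level substitutions and re-inflects them to agree with the target's surface form, via the MorphAdornerToolkit wrapper that this patch also removes (lexi/lib/lexenstein/morphadorner.py). A reduced sketch of the noun branch only, assuming `mat` is a MorphAdornerToolkit instance exposing the same batch calls used above.

```python
# Sketch of the noun branch of getInflectedSet: if the target was tagged 'nns',
# candidates are re-inflected to plural, otherwise to singular.
# `mat` is assumed to come from the removed lexi/lib/lexenstein/morphadorner.py.

def reinflect_nouns(mat, target_pos, candidates):
    stems = mat.lemmatizeWords(candidates)           # batch lemmatization
    form = 'plural' if target_pos == 'nns' else 'singular'
    return mat.inflectNouns(stems, form)             # batch re-inflection

# e.g. reinflect_nouns(mat, 'nns', ['chair', 'seat']) -> ['chairs', 'seats']
```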
- Example: substitutions['perched'] = {'sat', 'roosted'} - """ - #Get initial set of substitutions: - print('Getting initial set of substitutions...') - substitutions_initial = self.getInitialSet(victor_corpus) - - #Get final substitutions: - print('Inflecting substitutions...') - substitutions_inflected = self.getInflectedSet(substitutions_initial) - - #Return final set: - print('Finished!') - return substitutions_inflected - - def getInflectedSet(self, result): - final_substitutions = {} - - #Get inflections: - allkeys = sorted(list(result.keys())) - - singulars = {} - plurals = {} - verbs = {} - - singularsk = {} - pluralsk = {} - verbsk = {} - - for i in range(0, len(allkeys)): - key = allkeys[i] - leftw = key - - for leftp in result[leftw]: - if leftp.startswith('n'): - if leftp=='nns': - pluralsk[leftw] = set([]) - for subst in result[key][leftp]: - plurals[subst] = set([]) - else: - singularsk[leftw] = set([]) - for subst in result[key][leftp]: - singulars[subst] = set([]) - elif leftp.startswith('v'): - verbsk[leftw] = {} - for subst in result[key][leftp]: - verbs[subst] = {} - - #------------------------------------------------------------------------------------------------ - - #Generate keys input: - singkeys = sorted(list(singularsk.keys())) - plurkeys = sorted(list(pluralsk.keys())) - verbkeys = sorted(list(verbsk.keys())) - - #Get stems: - singstems, plurstems, verbstems = self.getStems(singkeys, plurkeys, verbkeys) - - #Get plurals: - singres = self.getPlurals(singstems) - - #Get singulars: - plurres = self.getSingulars(plurstems) - - #Get verb inflections: - verbres1, verbres2, verbres3, verbres4, verbres5 = self.getInflections(verbstems) - - #Add information to dictionaries: - for i in range(0, len(singkeys)): - k = singkeys[i] - singre = singres[i] - singularsk[k] = singre - for i in range(0, len(plurkeys)): - k = plurkeys[i] - plurre = plurres[i] - pluralsk[k] = plurre - for i in range(0, len(verbkeys)): - k = verbkeys[i] - verbre1 = verbres1[i] - verbre2 = verbres2[i] - verbre3 = verbres3[i] - verbre4 = verbres4[i] - verbre5 = verbres5[i] - verbsk[k] = {'PAST_PERFECT_PARTICIPLE': verbre1, 'PAST_PARTICIPLE': verbre2, 'PRESENT_PARTICIPLE': verbre3, 'PRESENT': verbre4, 'PAST': verbre5} - - #------------------------------------------------------------------------------------------------ - - #Generate substs input: - singkeys = sorted(list(singulars.keys())) - plurkeys = sorted(list(plurals.keys())) - verbkeys = sorted(list(verbs.keys())) - - #Get stems: - singstems, plurstems, verbstems = self.getStems(singkeys, plurkeys, verbkeys) - - #Get plurals: - singres = self.getPlurals(singstems) - - #Get singulars: - plurres = self.getSingulars(plurstems) - - #Get verb inflections: - verbres1, verbres2, verbres3, verbres4, verbres5 = self.getInflections(verbstems) - - #Add information to dictionaries: - for i in range(0, len(singkeys)): - k = singkeys[i] - singre = singres[i] - singulars[k] = singre - for i in range(0, len(plurkeys)): - k = plurkeys[i] - plurre = plurres[i] - plurals[k] = plurre - for i in range(0, len(verbkeys)): - k = verbkeys[i] - verbre1 = verbres1[i] - verbre2 = verbres2[i] - verbre3 = verbres3[i] - verbre4 = verbres4[i] - verbre5 = verbres5[i] - verbs[k] = {'PAST_PERFECT_PARTICIPLE': verbre1, 'PAST_PARTICIPLE': verbre2, 'PRESENT_PARTICIPLE': verbre3, 'PRESENT': verbre4, 'PAST': verbre5} - - #------------------------------------------------------------------------------------------------ - - #Generate final substitution list: - for i in range(0, len(allkeys)): 
- key = allkeys[i] - leftw = key - for leftp in result[leftw]: - - #Add final version to candidates: - if leftw not in final_substitutions: - final_substitutions[leftw] = result[key][leftp] - else: - final_substitutions[leftw] = final_substitutions[leftw].union(result[key][leftp]) - #If left is a noun: - if leftp.startswith('n'): - #If it is a plural: - if leftp=='nns': - plurl = pluralsk[leftw] - newcands = set([]) - for candidate in result[key][leftp]: - candplurl = plurals[candidate] - newcands.add(candplurl) - if plurl not in final_substitutions: - final_substitutions[plurl] = newcands - else: - final_substitutions[plurl] = final_substitutions[plurl].union(newcands) - #If it is singular: - else: - singl = singularsk[leftw] - newcands = set([]) - for candidate in result[key][leftp]: - candsingl = singulars[candidate] - newcands.add(candsingl) - if singl not in final_substitutions: - final_substitutions[singl] = newcands - else: - final_substitutions[singl] = final_substitutions[singl].union(newcands) - #If left is a verb: - elif leftp.startswith('v'): - for verb_tense in ['PAST_PERFECT_PARTICIPLE', 'PAST_PARTICIPLE', 'PRESENT_PARTICIPLE', 'PRESENT', 'PAST']: - tensedl = verbsk[leftw][verb_tense] - newcands = set([]) - for candidate in result[key][leftp]: - candtensedl = verbs[candidate][verb_tense] - newcands.add(candtensedl) - if tensedl not in final_substitutions: - final_substitutions[tensedl] = newcands - else: - final_substitutions[tensedl] = final_substitutions[tensedl].union(newcands) - return final_substitutions - - def getInflections(self, verbstems): - data1 = self.mat.conjugateVerbs(verbstems, 'PAST_PERFECT_PARTICIPLE', 'FIRST_PERSON_SINGULAR') - data2 = self.mat.conjugateVerbs(verbstems, 'PAST_PARTICIPLE', 'FIRST_PERSON_SINGULAR') - data3 = self.mat.conjugateVerbs(verbstems, 'PRESENT_PARTICIPLE', 'FIRST_PERSON_SINGULAR') - data4 = self.mat.conjugateVerbs(verbstems, 'PRESENT', 'FIRST_PERSON_SINGULAR') - data5 = self.mat.conjugateVerbs(verbstems, 'PAST', 'FIRST_PERSON_SINGULAR') - return self.correctWords(data1), self.correctWords(data2), self.correctWords(data3), self.correctWords(data4), self.correctWords(data5) - - def getSingulars(self, plurstems): - data = self.mat.inflectNouns(plurstems, 'singular') - return self.correctWords(data) - - def getPlurals(self, singstems): - data = self.mat.inflectNouns(singstems, 'plural') - return self.correctWords(data) - - def getStems(self, sings, plurs, verbs): - data = self.mat.lemmatizeWords(sings+plurs+verbs) - rsings = [] - rplurs = [] - rverbs = [] - c = -1 - for sing in sings: - c += 1 - if len(data[c])>0: - rsings.append(data[c]) - else: - rsings.append(sing) - for plur in plurs: - c += 1 - if len(data[c])>0: - rplurs.append(data[c]) - else: - rplurs.append(plur) - for verb in verbs: - c += 1 - if len(data[c])>0: - rverbs.append(data[c]) - else: - rverbs.append(verb) - return self.correctWords(rsings), self.correctWords(rplurs), self.correctWords(rverbs) - - def getInitialSet(self, victor_corpus): - substitutions_initial = {} - - lex = open(victor_corpus) - for line in lex: - data = line.strip().split('\t') - target = data[1].strip() - head = int(data[2].strip()) - - url = 'http://www.dictionaryapi.com/api/v1/references/collegiate/xml/' + target + '?key=' + self.dictionary_key - conn = urllib.request.urlopen(url) - root = ET.fromstring(conn.read()) - - newline = target + '\t' - cands = {} - - entries = root.iter('entry') - for entry in entries: - node_pos = entry.find('fl') - if node_pos != None: - node_pos = 
node_pos.text.strip()[0].lower() - if node_pos not in cands: - cands[node_pos] = set([]) - for definition in entry.iter('dt'): - if definition.text!=None: - text = definition.text.strip() - text = text[1:len(text)] - tokens = nltk.word_tokenize(text) - postags = nltk.pos_tag(tokens) - for p in postags: - postag = p[1].strip()[0].lower() - cand = p[0].strip() - if postag==node_pos: - cands[node_pos].add(cand) - for pos in cands: - if target in cands[pos]: - cands[pos].remove(target) - if len(list(cands.keys()))>0: - substitutions_initial[target] = cands - lex.close() - return substitutions_initial - - def correctWords(self, words): - result = [] - for word in words: - result.append(self.nc.correct(word)) - return result - -class MerriamGenerator: - - def __init__(self, mat, thesaurus_key, nc): - """ - Creates a MerriamGenerator instance. - - @param mat: MorphAdornerToolkit object. - @param thesaurus_key: Key for the Merriam Thesaurus. - For more information on how to get the key for free, please refer to the LEXenstein Manual - @param nc: NorvigCorrector object. - """ - self.mat = mat - self.thesaurus_key = thesaurus_key - self.nc = nc - - def getSubstitutions(self, victor_corpus): - """ - Generates substitutions for the target words of a corpus in VICTOR format. - - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A dictionary that assigns target complex words to sets of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - """ - #Get initial set of substitutions: - print('Getting initial set of substitutions...') - substitutions_initial = self.getInitialSet(victor_corpus) - - #Get final substitutions: - print('Inflecting substitutions...') - substitutions_inflected = self.getInflectedSet(substitutions_initial) - - #Return final set: - print('Finished!') - return substitutions_inflected - - def getInflectedSet(self, result): - final_substitutions = {} - - #Get inflections: - allkeys = sorted(list(result.keys())) - - singulars = {} - plurals = {} - verbs = {} - - singularsk = {} - pluralsk = {} - verbsk = {} - - for i in range(0, len(allkeys)): - key = allkeys[i] - leftw = key - - for leftp in result[leftw]: - if leftp.startswith('n'): - if leftp=='nns': - pluralsk[leftw] = set([]) - for subst in result[key][leftp]: - plurals[subst] = set([]) - else: - singularsk[leftw] = set([]) - for subst in result[key][leftp]: - singulars[subst] = set([]) - elif leftp.startswith('v'): - verbsk[leftw] = {} - for subst in result[key][leftp]: - verbs[subst] = {} - - #------------------------------------------------------------------------------------------------ - - #Generate keys input: - singkeys = sorted(list(singularsk.keys())) - plurkeys = sorted(list(pluralsk.keys())) - verbkeys = sorted(list(verbsk.keys())) - - #Get stems: - singstems, plurstems, verbstems = self.getStems(singkeys, plurkeys, verbkeys) - - #Get plurals: - singres = self.getPlurals(singstems) - - #Get singulars: - plurres = self.getSingulars(plurstems) - - #Get verb inflections: - verbres1, verbres2, verbres3, verbres4, verbres5 = self.getInflections(verbstems) - - #Add information to dictionaries: - for i in range(0, len(singkeys)): - k = singkeys[i] - singre = singres[i] - singularsk[k] = singre - for i in range(0, len(plurkeys)): - k = plurkeys[i] - plurre = plurres[i] - pluralsk[k] = plurre - for i in range(0, len(verbkeys)): - k = verbkeys[i] - verbre1 = verbres1[i] - verbre2 = verbres2[i] - verbre3 
= verbres3[i] - verbre4 = verbres4[i] - verbre5 = verbres5[i] - verbsk[k] = {'PAST_PERFECT_PARTICIPLE': verbre1, 'PAST_PARTICIPLE': verbre2, 'PRESENT_PARTICIPLE': verbre3, 'PRESENT': verbre4, 'PAST': verbre5} - - #------------------------------------------------------------------------------------------------ - - #Generate substs input: - singkeys = sorted(list(singulars.keys())) - plurkeys = sorted(list(plurals.keys())) - verbkeys = sorted(list(verbs.keys())) - - #Get stems: - singstems, plurstems, verbstems = self.getStems(singkeys, plurkeys, verbkeys) - - #Get plurals: - singres = self.getPlurals(singstems) - - #Get singulars: - plurres = self.getSingulars(plurstems) - - #Get verb inflections: - verbres1, verbres2, verbres3, verbres4, verbres5 = self.getInflections(verbstems) - - #Add information to dictionaries: - for i in range(0, len(singkeys)): - k = singkeys[i] - singre = singres[i] - singulars[k] = singre - for i in range(0, len(plurkeys)): - k = plurkeys[i] - plurre = plurres[i] - plurals[k] = plurre - for i in range(0, len(verbkeys)): - k = verbkeys[i] - verbre1 = verbres1[i] - verbre2 = verbres2[i] - verbre3 = verbres3[i] - verbre4 = verbres4[i] - verbre5 = verbres5[i] - verbs[k] = {'PAST_PERFECT_PARTICIPLE': verbre1, 'PAST_PARTICIPLE': verbre2, 'PRESENT_PARTICIPLE': verbre3, 'PRESENT': verbre4, 'PAST': verbre5} - - #------------------------------------------------------------------------------------------------ - - #Generate final substitution list: - for i in range(0, len(allkeys)): - key = allkeys[i] - leftw = key - for leftp in result[leftw]: - - #Add final version to candidates: - if leftw not in final_substitutions: - final_substitutions[leftw] = result[key][leftp] - else: - final_substitutions[leftw] = final_substitutions[leftw].union(result[key][leftp]) - #If left is a noun: - if leftp.startswith('n'): - #If it is a plural: - if leftp=='nns': - plurl = pluralsk[leftw] - newcands = set([]) - for candidate in result[key][leftp]: - candplurl = plurals[candidate] - newcands.add(candplurl) - if plurl not in final_substitutions: - final_substitutions[plurl] = newcands - else: - final_substitutions[plurl] = final_substitutions[plurl].union(newcands) - #If it is singular: - else: - singl = singularsk[leftw] - newcands = set([]) - for candidate in result[key][leftp]: - candsingl = singulars[candidate] - newcands.add(candsingl) - if singl not in final_substitutions: - final_substitutions[singl] = newcands - else: - final_substitutions[singl] = final_substitutions[singl].union(newcands) - #If left is a verb: - elif leftp.startswith('v'): - for verb_tense in ['PAST_PERFECT_PARTICIPLE', 'PAST_PARTICIPLE', 'PRESENT_PARTICIPLE', 'PRESENT', 'PAST']: - tensedl = verbsk[leftw][verb_tense] - newcands = set([]) - for candidate in result[key][leftp]: - candtensedl = verbs[candidate][verb_tense] - newcands.add(candtensedl) - if tensedl not in final_substitutions: - final_substitutions[tensedl] = newcands - else: - final_substitutions[tensedl] = final_substitutions[tensedl].union(newcands) - return final_substitutions - - def getInflections(self, verbstems): - data1 = self.mat.conjugateVerbs(verbstems, 'PAST_PERFECT_PARTICIPLE', 'FIRST_PERSON_SINGULAR') - data2 = self.mat.conjugateVerbs(verbstems, 'PAST_PARTICIPLE', 'FIRST_PERSON_SINGULAR') - data3 = self.mat.conjugateVerbs(verbstems, 'PRESENT_PARTICIPLE', 'FIRST_PERSON_SINGULAR') - data4 = self.mat.conjugateVerbs(verbstems, 'PRESENT', 'FIRST_PERSON_SINGULAR') - data5 = self.mat.conjugateVerbs(verbstems, 'PAST', 'FIRST_PERSON_SINGULAR') - 
return self.correctWords(data1), self.correctWords(data2), self.correctWords(data3), self.correctWords(data4), self.correctWords(data5) - - def getSingulars(self, plurstems): - data = self.mat.inflectNouns(plurstems, 'singular') - return self.correctWords(data) - - def getPlurals(self, singstems): - data = self.mat.inflectNouns(singstems, 'plural') - return self.correctWords(data) - - def getStems(self, sings, plurs, verbs): - data = self.mat.lemmatizeWords(sings+plurs+verbs) - rsings = [] - rplurs = [] - rverbs = [] - c = -1 - for sing in sings: - c += 1 - if len(data[c])>0: - rsings.append(data[c]) - else: - rsings.append(sing) - for plur in plurs: - c += 1 - if len(data[c])>0: - rplurs.append(data[c]) - else: - rplurs.append(plur) - for verb in verbs: - c += 1 - if len(data[c])>0: - rverbs.append(data[c]) - else: - rverbs.append(verb) - return self.correctWords(rsings), self.correctWords(rplurs), self.correctWords(rverbs) - - def getInitialSet(self, victor_corpus): - substitutions_initial = {} - - lex = open(victor_corpus) - for line in lex: - data = line.strip().split('\t') - target = data[1].strip() - url = 'http://www.dictionaryapi.com/api/v1/references/thesaurus/xml/' + target + '?key=' + self.thesaurus_key - conn = urllib.request.urlopen(url) - root = ET.fromstring(conn.read()) - root = root.findall('entry') - - cands = {} - if len(root)>0: - for root_node in root: - node_pos = root_node.find('fl') - if node_pos != None: - node_pos = node_pos.text.strip()[0].lower() - if node_pos not in cands: - cands[node_pos] = set([]) - for sense in root_node.iter('sens'): - syn = sense.findall('syn')[0] - res = '' - for snip in syn.itertext(): - res += snip + ' ' - finds = re.findall('\([^\)]+\)', res) - for find in finds: - res = res.replace(find, '') - - synonyms = [s.strip() for s in res.split(',')] - - for synonym in synonyms: - if len(synonym.split(' '))==1: - try: - test = codecs.ascii_encode(synonym) - cands[node_pos].add(synonym) - except UnicodeEncodeError: - cands = cands - for pos in cands: - if target in cands[pos]: - cands[pos].remove(target) - if len(list(cands.keys()))>0: - substitutions_initial[target] = cands - lex.close() - return substitutions_initial - - def correctWords(self, words): - result = [] - for word in words: - result.append(self.nc.correct(word)) - return result - - -#Class for the Wordnet Generator -class WordnetGenerator: - - def __init__(self, mat, nc, pos_model, stanford_tagger, java_path): - """ - Creates a WordnetGenerator instance. - - @param mat: MorphAdornerToolkit object. - @param nc: NorvigCorrector object. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - """ - self.mat = mat - self.nc = nc - os.environ['JAVAHOME'] = java_path - self.tagger = StanfordPOSTagger(pos_model, stanford_tagger) - - def getSubstitutions(self, victor_corpus): - """ - Generates substitutions for the target words of a corpus in VICTOR format. - - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. 
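Editorial note: MerriamGenerator.getInitialSet above queries the Merriam-Webster thesaurus XML API with a licence key and walks `entry`/`fl`/`sens`/`syn` nodes to collect one-word synonyms per part of speech. A sketch of that parsing step over an inline XML snippet shaped like the response the removed code expects; the snippet is illustrative, not real API output.

```python
import re
import xml.etree.ElementTree as ET

# Illustrative stand-in for one thesaurus API response; the removed code
# fetches this XML over HTTP and parses it the same way.
xml_payload = """
<entry_list>
  <entry><fl>verb</fl>
    <sens><syn>sat (up), roosted, rested (on)</syn></sens>
  </entry>
</entry_list>
"""

root = ET.fromstring(xml_payload)
cands = {}
for entry in root.findall('entry'):
    fl = entry.find('fl')
    if fl is None:
        continue
    pos = fl.text.strip()[0].lower()               # first letter of the POS label
    bucket = cands.setdefault(pos, set())
    for sense in entry.iter('sens'):
        syn = sense.findall('syn')[0]
        text = ' '.join(syn.itertext())
        text = re.sub(r'\([^)]+\)', '', text)      # drop parenthesised usage notes
        bucket.update(s.strip() for s in text.split(',')
                      if len(s.strip().split(' ')) == 1)

print(cands)   # {'v': {'sat', 'roosted', 'rested'}}
```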
- @return: A dictionary that assigns target complex words to sets of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - """ - - #Get initial set of substitutions: - print('Getting initial set of substitutions...') - substitutions_initial = self.getInitialSet(victor_corpus) - - #Get final substitutions: - print('Inflecting substitutions...') - substitutions_inflected = self.getInflectedSet(substitutions_initial) - - #Return final set: - print('Finished!') - return substitutions_inflected - - def getInflectedSet(self, subs): - #Create list of targets: - targets = [] - - #Create lists for inflection: - toNothing = [] - toSingular = [] - toPlural = [] - toPAPEPA = [] - toPA = [] - toPRPA = [] - toPAPA = [] - toPE = [] - toPR = [] - toComparative = [] - toSuperlative = [] - toOriginal = [] - - #Fill lists: - for target in subs: - targets.append(target) - for pos in subs[target]: - #Get cands for a target and tag combination: - cands = list(subs[target][pos]) - - #Add candidates to lists: - if pos == 'NN': - toSingular.extend(cands) - elif pos == 'NNS': - toPlural.extend(cands) - elif pos == 'VB': - toPAPEPA.extend(cands) - elif pos == 'VBD': - toPA.extend(cands) - toPAPA.extend(cands) - elif pos == 'VBG': - toPRPA.extend(cands) - elif pos == 'VBN': - toPA.extend(cands) - toPAPA.extend(cands) - elif pos == 'VBP': - toPE.extend(cands) - elif pos == 'VBZ': - toPR.extend(cands) - elif pos == 'JJR' or pos == 'RBR': - toComparative.extend(cands) - elif pos == 'JJS' or pos == 'RBS': - toSuperlative.extend(cands) - else: - toNothing.extend(cands) - - #Lemmatize targets: - targetsL = self.mat.lemmatizeWords(targets) - - #Lemmatize words: - toNothingL = self.correctWords(self.mat.lemmatizeWords(toNothing)) - toSingularL = self.correctWords(self.mat.lemmatizeWords(toSingular)) - toPluralL = self.correctWords(self.mat.lemmatizeWords(toPlural)) - toPAPEPAL = self.correctWords(self.mat.lemmatizeWords(toPAPEPA)) - toPAL = self.correctWords(self.mat.lemmatizeWords(toPA)) - toPRPAL = self.correctWords(self.mat.lemmatizeWords(toPRPA)) - toPAPAL = self.correctWords(self.mat.lemmatizeWords(toPAPA)) - toPEL = self.correctWords(self.mat.lemmatizeWords(toPE)) - toPRL = self.correctWords(self.mat.lemmatizeWords(toPR)) - toComparativeL = self.correctWords(self.mat.lemmatizeWords(toComparative)) - toSuperlativeL = self.correctWords(self.mat.lemmatizeWords(toSuperlative)) - - #Inflect nouns: - singulars = self.correctWords(self.mat.inflectNouns(toSingularL, 'singular')) - plurals = self.correctWords(self.mat.inflectNouns(toPluralL, 'plural')) - - #Inflect verbs: - papepas = self.correctWords(self.mat.conjugateVerbs(toPAPEPAL, 'PAST_PERFECT_PARTICIPLE', 'FIRST_PERSON_SINGULAR')) - pas = self.correctWords(self.mat.conjugateVerbs(toPAL, 'PAST', 'FIRST_PERSON_SINGULAR')) - prpas = self.correctWords(self.mat.conjugateVerbs(toPRPAL, 'PRESENT_PARTICIPLE', 'FIRST_PERSON_SINGULAR')) - papas = self.correctWords(self.mat.conjugateVerbs(toPAPAL, 'PAST_PARTICIPLE', 'FIRST_PERSON_SINGULAR')) - pes = self.correctWords(self.mat.conjugateVerbs(toPEL, 'PERFECT', 'FIRST_PERSON_SINGULAR')) - prs = self.correctWords(self.mat.conjugateVerbs(toPRL, 'PRESENT', 'THIRD_PERSON_SINGULAR')) - - #Inflect adjectives and adverbs: - comparatives = self.correctWords(self.mat.inflectAdjectives(toComparativeL, 'comparative')) - superlatives = self.correctWords(self.mat.inflectAdjectives(toSuperlativeL, 'superlative')) - - #Create maps: - stemM = {} - singularM = {} - pluralM = {} - papepaM = {} - paM = {} - prpaM = {} - 
papaM = {} - peM = {} - prM = {} - comparativeM = {} - superlativeM = {} - - for i in range(0, len(toNothing)): - stemM[toNothing[i]] = toNothingL[i] - for i in range(0, len(targets)): - stemM[targets[i]] = targetsL[i] - for i in range(0, len(toSingular)): - stemM[toSingular[i]] = toSingularL[i] - singularM[toSingular[i]] = singulars[i] - for i in range(0, len(toPlural)): - stemM[toPlural[i]] = toPluralL[i] - pluralM[toPlural[i]] = plurals[i] - for i in range(0, len(toPAPEPA)): - stemM[toPAPEPA[i]] = toPAPEPAL[i] - papepaM[toPAPEPA[i]] = papepas[i] - for i in range(0, len(toPA)): - stemM[toPA[i]] = toPAL[i] - paM[toPA[i]] = pas[i] - for i in range(0, len(toPRPA)): - stemM[toPRPA[i]] = toPRPAL[i] - prpaM[toPRPA[i]] = prpas[i] - for i in range(0, len(toPAPA)): - stemM[toPAPA[i]] = toPAPAL[i] - papaM[toPAPA[i]] = papas[i] - for i in range(0, len(toPE)): - stemM[toPE[i]] = toPEL[i] - peM[toPE[i]] = pes[i] - for i in range(0, len(toPR)): - stemM[toPR[i]] = toPRL[i] - prM[toPR[i]] = prs[i] - for i in range(0, len(toComparative)): - stemM[toComparative[i]] = toComparativeL[i] - comparativeM[toComparative[i]] = comparatives[i] - for i in range(0, len(toSuperlative)): - stemM[toSuperlative[i]] = toSuperlativeL[i] - superlativeM[toSuperlative[i]] = superlatives[i] - - #Create final substitutions: - final_substitutions = {} - for target in subs: - #Get lemma of target: - targetL = stemM[target] - - #Create instance in final_substitutions: - final_substitutions[target] = set([]) - - #Iterate through pos tags of target: - for pos in subs[target]: - #Create final cands: - final_cands = set([]) - - #Get cands for a target and tag combination: - cands = list(subs[target][pos]) - - #Add candidates to lists: - if pos == 'NN': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(singularM[cand]) - final_cands.add(cand) - elif pos == 'NNS': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(pluralM[cand]) - final_cands.add(cand) - elif pos == 'VB': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(papepaM[cand]) - elif pos == 'VBD': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(paM[cand]) - final_cands.add(papaM[cand]) - elif pos == 'VBG': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(prpaM[cand]) - elif pos == 'VBN': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(paM[cand]) - final_cands.add(papaM[cand]) - elif pos == 'VBP': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(peM[cand]) - elif pos == 'VBZ': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(prM[cand]) - elif pos == 'JJR' or pos == 'RBR': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(comparativeM[cand]) - elif pos == 'JJS' or pos == 'RBS': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(superlativeM[cand]) - else: - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(cand) - - #Add final cands to final substitutions: - final_substitutions[target].update(final_cands) - return final_substitutions - - def getExpandedSet(self, subs): - #Create lists for inflection: - nouns = set([]) - verbs = set([]) - adjectives = set([]) - - #Fill lists: - for target in subs: - for pos in subs[target]: - #Get cands for a target and tag combination: - cands = list(subs[target][pos]) - - #Add candidates to lists: - if pos == 'NN' or pos == 'NNS': - nouns.add(target) - elif pos.startswith('V'): - verbs.add(target) - elif pos.startswith('J') or pos.startswith('RB'): - 
adjectives.add(target) - - #Transform sets in lists: - nouns = list(nouns) - verbs = list(verbs) - adjectives = list(adjectives) - - #Lemmatize words: - nounsL = self.correctWords(self.mat.lemmatizeWords(nouns)) - verbsL = self.correctWords(self.mat.lemmatizeWords(verbs)) - adjectivesL = self.correctWords(self.mat.lemmatizeWords(adjectives)) - - #Create lemma maps: - nounM = {} - verbM = {} - adjectiveM = {} - for i in range(0, len(nouns)): - nounM[nouns[i]] = nounsL[i] - for i in range(0, len(verbs)): - verbM[verbs[i]] = verbsL[i] - for i in range(0, len(adjectives)): - adjectiveM[adjectives[i]] = adjectivesL[i] - - #Inflect words: - plurals = self.correctWords(self.mat.inflectNouns(nounsL, 'plural')) - pas = self.correctWords(self.mat.conjugateVerbs(verbsL, 'PAST')) - prpas = self.correctWords(self.mat.conjugateVerbs(verbsL, 'PRESENT_PARTICIPLE')) - papas = self.correctWords(self.mat.conjugateVerbs(verbsL, 'PAST_PARTICIPLE')) - prs = self.correctWords(self.mat.conjugateVerbs(verbsL, 'PRESENT')) - comparatives = self.correctWords(self.mat.inflectAdjectives(adjectives, 'comparative')) - superlatives = self.correctWords(self.mat.inflectAdjectives(adjectives, 'superlative')) - - #Create inflected maps: - pluralM = {} - paM = {} - prpaM = {} - papaM = {} - prM = {} - comparativeM = {} - superlativeM = {} - for i in range(0, len(nouns)): - pluralM[nouns[i]] = plurals[i] - for i in range(0, len(verbs)): - paM[verbs[i]] = pas[i] - prpaM[verbs[i]] = prpas[i] - papaM[verbs[i]] = papas[i] - prM[verbs[i]] = prs[i] - for i in range(0, len(adjectives)): - comparativeM[adjectives[i]] = comparatives[i] - superlativeM[adjectives[i]] = superlatives[i] - - #Create extended substitutions: - substitutions_extended = {} - for target in subs: - for pos in subs[target]: - #Get cands for a target and tag combination: - cands = list(subs[target][pos]) - - #Add original to substitution dictionary: - self.addToExtended(target, pos, cands, substitutions_extended) - - #Add candidates to lists: - if pos == 'NN': - pluralT = pluralM[target] - self.addToExtended(pluralT, 'NNS', cands, substitutions_extended) - elif pos == 'NNS': - singularT = nounM[target] - self.addToExtended(singularT, 'NN', cands, substitutions_extended) - elif pos == 'VB': - paT = paM[target] - prpaT = prpaM[target] - papaT = papaM[target] - prT = prM[target] - self.addToExtended(paT, 'VBD', cands, substitutions_extended) - self.addToExtended(prpaT, 'VBG', cands, substitutions_extended) - self.addToExtended(papaT, 'VBN', cands, substitutions_extended) - self.addToExtended(prT, 'VBP', cands, substitutions_extended) - self.addToExtended(prT, 'VBZ', cands, substitutions_extended) - elif pos == 'VBD': - lemmaT = verbM[target] - prpaT = prpaM[target] - papaT = papaM[target] - prT = prM[target] - self.addToExtended(lemmaT, 'VB', cands, substitutions_extended) - self.addToExtended(prpaT, 'VBG', cands, substitutions_extended) - self.addToExtended(papaT, 'VBN', cands, substitutions_extended) - self.addToExtended(prT, 'VBP', cands, substitutions_extended) - self.addToExtended(prT, 'VBZ', cands, substitutions_extended) - elif pos == 'VBG': - lemmaT = verbM[target] - paT = paM[target] - papaT = papaM[target] - prT = prM[target] - self.addToExtended(lemmaT, 'VB', cands, substitutions_extended) - self.addToExtended(paT, 'VBD', cands, substitutions_extended) - self.addToExtended(papaT, 'VBN', cands, substitutions_extended) - self.addToExtended(prT, 'VBP', cands, substitutions_extended) - self.addToExtended(prT, 'VBZ', cands, substitutions_extended) - elif pos == 
'VBN': - lemmaT = verbM[target] - paT = paM[target] - prpaT = prpaM[target] - prT = prM[target] - self.addToExtended(lemmaT, 'VB', cands, substitutions_extended) - self.addToExtended(paT, 'VBD', cands, substitutions_extended) - self.addToExtended(prpaT, 'VBG', cands, substitutions_extended) - self.addToExtended(prT, 'VBP', cands, substitutions_extended) - self.addToExtended(prT, 'VBZ', cands, substitutions_extended) - elif pos == 'VBP': - lemmaT = verbM[target] - paT = paM[target] - prpaT = prpaM[target] - papaT = prM[target] - self.addToExtended(target, 'VBZ', cands, substitutions_extended) - self.addToExtended(lemmaT, 'VB', cands, substitutions_extended) - self.addToExtended(paT, 'VBD', cands, substitutions_extended) - self.addToExtended(prpaT, 'VBG', cands, substitutions_extended) - self.addToExtended(papaT, 'VBN', cands, substitutions_extended) - elif pos == 'VBZ': - lemmaT = verbM[target] - paT = paM[target] - prpaT = prpaM[target] - papaT = prM[target] - self.addToExtended(target, 'VBP', cands, substitutions_extended) - self.addToExtended(lemmaT, 'VB', cands, substitutions_extended) - self.addToExtended(paT, 'VBD', cands, substitutions_extended) - self.addToExtended(prpaT, 'VBG', cands, substitutions_extended) - self.addToExtended(papaT, 'VBN', cands, substitutions_extended) - elif pos == 'JJ': - comparativeT = comparativeM[target] - superlativeT = superlativeM[target] - self.addToExtended(comparativeT, 'JJR', cands, substitutions_extended) - self.addToExtended(superlativeT, 'JJS', cands, substitutions_extended) - elif pos == 'JJR': - lemmaT = adjectiveM[target] - superlativeT = superlativeM[target] - self.addToExtended(lemmaT, 'JJ', cands, substitutions_extended) - self.addToExtended(superlativeT, 'JJS', cands, substitutions_extended) - elif pos == 'JJS': - lemmaT = adjectiveM[target] - comparativeT = comparativeM[target] - self.addToExtended(lemmaT, 'JJ', cands, substitutions_extended) - self.addToExtended(comparativeT, 'JJR', cands, substitutions_extended) - elif pos == 'RB': - comparativeT = comparativeM[target] - superlativeT = superlativeM[target] - self.addToExtended(comparativeT, 'RBR', cands, substitutions_extended) - self.addToExtended(superlativeT, 'RBS', cands, substitutions_extended) - elif pos == 'RBR': - lemmaT = adjectiveM[target] - superlativeT = superlativeM[target] - self.addToExtended(lemmaT, 'RB', cands, substitutions_extended) - self.addToExtended(superlativeT, 'RBS', cands, substitutions_extended) - elif pos == 'RBS': - lemmaT = adjectiveM[target] - comparativeT = comparativeM[target] - self.addToExtended(lemmaT, 'RB', cands, substitutions_extended) - self.addToExtended(comparativeT, 'RBR', cands, substitutions_extended) - return substitutions_extended - - def getInitialSet(self, victor_corpus): - substitutions_initial = {} - lexf = open(victor_corpus) - sents = [] - targets = [] - heads = [] - for line in lexf: - data = line.strip().split('\t') - sent = data[0].strip().split(' ') - target = data[1].strip() - head = int(data[2].strip()) - sents.append(sent) - targets.append(target) - heads.append(head) - lexf.close() - - tagged_sents = self.tagger.tag_sents(sents) - - for i in range(0, len(sents)): - target = targets[i] - head = heads[i] - target_pos = str(tagged_sents[i][head][1]) - target_wnpos = self.getWordnetPOS(target_pos) - - syns = wn.synsets(target) - - cands = set([]) - for syn in syns: - for lem in syn.lemmas(): - candidate = self.cleanLemma(lem.name()) - if len(candidate.split(' '))==1: - cands.add(candidate) - if len(cands)>0: - if target in 
substitutions_initial: - substitutions_initial[target][target_pos] = cands - else: - substitutions_initial[target] = {target_pos:cands} - return substitutions_initial - - def addToExtended(self, target, tag, cands, subs): - if target not in subs: - subs[target] = {tag:cands} - else: - if tag not in subs[target]: - subs[target][tag] = cands - else: - subs[target][tag].extend(cands) - - def correctWords(self, words): - result = [] - for word in words: - result.append(self.nc.correct(word)) - return result - - def cleanLemma(self, lem): - result = '' - aux = lem.strip().split('_') - for word in aux: - result += word + ' ' - return result.strip() - - def getWordnetPOS(self, pos): - if pos[0] == 'N' or pos[0] == 'V' or pos == 'RBR' or pos == 'RBS': - return pos[0].lower() - elif pos[0] == 'J': - return 'a' - else: - return None - -#Class for the Biran Generator: -class BiranGenerator: - - def __init__(self, mat, complex_vocab, simple_vocab, complex_lm, simple_lm, nc, pos_model, stanford_tagger, java_path): - """ - Creates a BiranGenerator instance. - - @param mat: MorphAdornerToolkit object. - @param complex_vocab: Path to a vocabulary of complex words. - For more information on how to create the file, refer to the LEXenstein Manual. - @param simple_vocab: Path to a vocabulary of simple words. - For more information on how to create the file, refer to the LEXenstein Manual. - @param complex_lm: Path to a language model built over complex text. - For more information on how to create the file, refer to the LEXenstein Manual. - @param simple_lm: Path to a language model built over simple text. - For more information on how to create the file, refer to the LEXenstein Manual. - @param nc: NorvigCorrector object. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - """ - - self.complex_vocab = self.getVocab(complex_vocab) - self.simple_vocab = self.getVocab(simple_vocab) - self.complex_lm = kenlm.LanguageModel(complex_lm) - self.simple_lm = kenlm.LanguageModel(simple_lm) - self.mat = mat - self.nc = nc - os.environ['JAVAHOME'] = java_path - self.tagger = StanfordPOSTagger(pos_model, stanford_tagger) - - def getSubstitutions(self, victor_corpus): - """ - Generates substitutions for the target words of a corpus in VICTOR format. - - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A dictionary that assigns target complex words to sets of candidate substitutions. 
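# The deleted getInitialSet/getWordnetPOS code above collects single-word WordNet
# lemmas as substitution candidates for a POS-tagged target word. A minimal
# standalone sketch of that idea, assuming NLTK with the WordNet corpus installed;
# the helper names below are illustrative and not part of Lexi or LEXenstein.
from nltk.corpus import wordnet as wn

def penn_to_wordnet(pos):
    # Same mapping as the deleted getWordnetPOS: nouns/verbs by first letter,
    # RBR/RBS to 'r', adjectives to 'a', anything else ignored.
    if pos[0] == 'N' or pos[0] == 'V' or pos in ('RBR', 'RBS'):
        return pos[0].lower()
    if pos[0] == 'J':
        return 'a'
    return None

def wordnet_candidates(target, penn_pos):
    cands = set()
    for syn in wn.synsets(target, pos=penn_to_wordnet(penn_pos)):
        for lem in syn.lemmas():
            candidate = lem.name().replace('_', ' ')
            if len(candidate.split(' ')) == 1 and candidate != target:
                cands.add(candidate)
    return cands

# Example: wordnet_candidates('perched', 'VBD') might include 'sat' and 'roosted'.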
- Example: substitutions['perched'] = {'sat', 'roosted'} - """ - - #Get initial set of substitutions: - print('Getting initial set of substitutions...') - substitutions_initial = self.getInitialSet(victor_corpus) - - #Get inflected substitutions: - print('Inflecting substitutions...') - substitutions_inflected = self.getInflectedSet(substitutions_initial) - - #Get final substitutions: - print('Filtering simple->complex substitutions...') - substitutions_final = self.getFinalSet(substitutions_inflected) - - #Return final set: - print('Finished!') - return substitutions_final - - def getFinalSet(self, substitutions_inflected): - #Remove simple->complex substitutions: - substitutions_final = {} - - for key in substitutions_inflected: - candidate_list = set([]) - key_score = self.getComplexity(key, self.complex_lm, self.simple_lm) - for cand in substitutions_inflected[key]: - cand_score = self.getComplexity(cand, self.complex_lm, self.simple_lm) - if key_score>=cand_score: - candidate_list.add(cand) - if len(candidate_list)>0: - substitutions_final[key] = candidate_list - return substitutions_final - - def getInflectedSet(self, subs): - #Create list of targets: - targets = [] - - #Create lists for inflection: - toNothing = [] - toSingular = [] - toPlural = [] - toPAPEPA = [] - toPA = [] - toPRPA = [] - toPAPA = [] - toPE = [] - toPR = [] - toComparative = [] - toSuperlative = [] - toOriginal = [] - - #Fill lists: - for target in subs: - targets.append(target) - for pos in subs[target]: - #Get cands for a target and tag combination: - cands = list(subs[target][pos]) - - #Add candidates to lists: - if pos == 'NN': - toSingular.extend(cands) - elif pos == 'NNS': - toPlural.extend(cands) - elif pos == 'VB': - toPAPEPA.extend(cands) - elif pos == 'VBD': - toPA.extend(cands) - toPAPA.extend(cands) - elif pos == 'VBG': - toPRPA.extend(cands) - elif pos == 'VBN': - toPA.extend(cands) - toPAPA.extend(cands) - elif pos == 'VBP': - toPE.extend(cands) - elif pos == 'VBZ': - toPR.extend(cands) - elif pos == 'JJR' or pos == 'RBR': - toComparative.extend(cands) - elif pos == 'JJS' or pos == 'RBS': - toSuperlative.extend(cands) - else: - toNothing.extend(cands) - - #Lemmatize targets: - targetsL = self.mat.lemmatizeWords(targets) - - #Lemmatize words: - toNothingL = self.correctWords(self.mat.lemmatizeWords(toNothing)) - toSingularL = self.correctWords(self.mat.lemmatizeWords(toSingular)) - toPluralL = self.correctWords(self.mat.lemmatizeWords(toPlural)) - toPAPEPAL = self.correctWords(self.mat.lemmatizeWords(toPAPEPA)) - toPAL = self.correctWords(self.mat.lemmatizeWords(toPA)) - toPRPAL = self.correctWords(self.mat.lemmatizeWords(toPRPA)) - toPAPAL = self.correctWords(self.mat.lemmatizeWords(toPAPA)) - toPEL = self.correctWords(self.mat.lemmatizeWords(toPE)) - toPRL = self.correctWords(self.mat.lemmatizeWords(toPR)) - toComparativeL = self.correctWords(self.mat.lemmatizeWords(toComparative)) - toSuperlativeL = self.correctWords(self.mat.lemmatizeWords(toSuperlative)) - - #Inflect nouns: - singulars = self.correctWords(self.mat.inflectNouns(toSingularL, 'singular')) - plurals = self.correctWords(self.mat.inflectNouns(toPluralL, 'plural')) - - #Inflect verbs: - papepas = self.correctWords(self.mat.conjugateVerbs(toPAPEPAL, 'PAST_PERFECT_PARTICIPLE', 'FIRST_PERSON_SINGULAR')) - pas = self.correctWords(self.mat.conjugateVerbs(toPAL, 'PAST', 'FIRST_PERSON_SINGULAR')) - prpas = self.correctWords(self.mat.conjugateVerbs(toPRPAL, 'PRESENT_PARTICIPLE', 'FIRST_PERSON_SINGULAR')) - papas = 
self.correctWords(self.mat.conjugateVerbs(toPAPAL, 'PAST_PARTICIPLE', 'FIRST_PERSON_SINGULAR')) - pes = self.correctWords(self.mat.conjugateVerbs(toPEL, 'PERFECT', 'FIRST_PERSON_SINGULAR')) - prs = self.correctWords(self.mat.conjugateVerbs(toPRL, 'PRESENT', 'THIRD_PERSON_SINGULAR')) - - #Inflect adjectives and adverbs: - comparatives = self.correctWords(self.mat.inflectAdjectives(toComparativeL, 'comparative')) - superlatives = self.correctWords(self.mat.inflectAdjectives(toSuperlativeL, 'superlative')) - - #Create maps: - stemM = {} - singularM = {} - pluralM = {} - papepaM = {} - paM = {} - prpaM = {} - papaM = {} - peM = {} - prM = {} - comparativeM = {} - superlativeM = {} - - for i in range(0, len(toNothing)): - stemM[toNothing[i]] = toNothingL[i] - for i in range(0, len(targets)): - stemM[targets[i]] = targetsL[i] - for i in range(0, len(toSingular)): - stemM[toSingular[i]] = toSingularL[i] - singularM[toSingular[i]] = singulars[i] - for i in range(0, len(toPlural)): - stemM[toPlural[i]] = toPluralL[i] - pluralM[toPlural[i]] = plurals[i] - for i in range(0, len(toPAPEPA)): - stemM[toPAPEPA[i]] = toPAPEPAL[i] - papepaM[toPAPEPA[i]] = papepas[i] - for i in range(0, len(toPA)): - stemM[toPA[i]] = toPAL[i] - paM[toPA[i]] = pas[i] - for i in range(0, len(toPRPA)): - stemM[toPRPA[i]] = toPRPAL[i] - prpaM[toPRPA[i]] = prpas[i] - for i in range(0, len(toPAPA)): - stemM[toPAPA[i]] = toPAPAL[i] - papaM[toPAPA[i]] = papas[i] - for i in range(0, len(toPE)): - stemM[toPE[i]] = toPEL[i] - peM[toPE[i]] = pes[i] - for i in range(0, len(toPR)): - stemM[toPR[i]] = toPRL[i] - prM[toPR[i]] = prs[i] - for i in range(0, len(toComparative)): - stemM[toComparative[i]] = toComparativeL[i] - comparativeM[toComparative[i]] = comparatives[i] - for i in range(0, len(toSuperlative)): - stemM[toSuperlative[i]] = toSuperlativeL[i] - superlativeM[toSuperlative[i]] = superlatives[i] - - #Create final substitutions: - final_substitutions = {} - for target in subs: - #Get lemma of target: - targetL = stemM[target] - - #Create instance in final_substitutions: - final_substitutions[target] = set([]) - - #Iterate through pos tags of target: - for pos in subs[target]: - #Create final cands: - final_cands = set([]) - - #Get cands for a target and tag combination: - cands = list(subs[target][pos]) - - #Add candidates to lists: - if pos == 'NN': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(singularM[cand]) - final_cands.add(cand) - elif pos == 'NNS': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(pluralM[cand]) - final_cands.add(cand) - elif pos == 'VB': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(papepaM[cand]) - elif pos == 'VBD': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(paM[cand]) - final_cands.add(papaM[cand]) - elif pos == 'VBG': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(prpaM[cand]) - elif pos == 'VBN': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(paM[cand]) - final_cands.add(papaM[cand]) - elif pos == 'VBP': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(peM[cand]) - elif pos == 'VBZ': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(prM[cand]) - elif pos == 'JJR' or pos == 'RBR': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(comparativeM[cand]) - elif pos == 'JJS' or pos == 'RBS': - for cand in cands: - if targetL!=stemM[cand]: - final_cands.add(superlativeM[cand]) - else: - for cand in cands: - if targetL!=stemM[cand]: - 
final_cands.add(cand) - - #Add final cands to final substitutions: - final_substitutions[target].update(final_cands) - return final_substitutions - - def getInitialSet(self, victor_corpus): - substitutions_initial = {} - lexf = open(victor_corpus) - sents = [] - targets = [] - heads = [] - for line in lexf: - data = line.strip().split('\t') - sent = data[0].strip().split(' ') - target = data[1].strip() - head = int(data[2].strip()) - sents.append(sent) - targets.append(target) - heads.append(head) - lexf.close() - - tagged_sents = self.tagger.tag_sents(sents) - - for i in range(0, len(sents)): - target = targets[i] - head = heads[i] - target_pos = str(tagged_sents[i][head][1]) - target_wnpos = self.getWordnetPOS(target_pos) - - if target in self.complex_vocab: - syns = wn.synsets(target) - cands = set([]) - for syn in syns: - for lem in syn.lemmas(): - candidate = self.cleanLemma(lem.name()) - if len(candidate.split(' '))==1 and candidate in self.simple_vocab: - cands.add(candidate) - for hyp in syn.hypernyms(): - for lem in hyp.lemmas(): - candidate = self.cleanLemma(lem.name()) - if len(candidate.split(' '))==1 and candidate in self.simple_vocab: - cands.add(candidate) - if target in cands: - cands.remove(target) - if len(cands)>0: - if target in substitutions_initial: - substitutions_initial[target][target_pos] = cands - else: - substitutions_initial[target] = {target_pos:cands} - return substitutions_initial - - def getComplexity(self, word, clm, slm): - C = (clm.score(word, bos=False, eos=False))/(slm.score(word, bos=False, eos=False)) - #C = (clm.score(word)/(slm.score(word)) - L = float(len(word)) - return C*L - - def getVocab(self, path): - return set([line.strip() for line in open(path)]) - - def cleanLemma(self, lem): - result = '' - aux = lem.strip().split('_') - for word in aux: - result += word + ' ' - return result.strip() - - def getWordnetPOS(self, pos): - if pos[0] == 'N' or pos[0] == 'V' or pos == 'RBR' or pos == 'RBS': - return pos[0].lower() - elif pos[0] == 'J': - return 'a' - else: - return None - - def correctWords(self, words): - result = [] - for word in words: - result.append(self.nc.correct(word)) - return result diff --git a/lexi/lib/lexenstein/identifiers.py b/lexi/lib/lexenstein/identifiers.py deleted file mode 100755 index 700fa75..0000000 --- a/lexi/lib/lexenstein/identifiers.py +++ /dev/null @@ -1,395 +0,0 @@ -import numpy as np -from sklearn import svm -from sklearn.linear_model import * -from sklearn.tree import * -from sklearn.ensemble import * -from sklearn.feature_selection import SelectKBest -from sklearn.feature_selection import f_classif -from sklearn.preprocessing import normalize - -class MachineLearningIdentifier: - - def __init__(self, fe): - """ - Creates a MachineLearningIdentifier instance. - - @param fe: FeatureEstimator object. - """ - self.fe = fe - self.classifier = None - - def calculateTrainingFeatures(self, training_corpus): - """ - Calculate features of a corpus in CWICTOR format. - - @param training_corpus: Path to a corpus in the CWICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - """ - self.Xtr = self.fe.calculateFeatures(training_corpus, format='cwictor') - self.Ytr = [] - f = open(training_corpus) - for line in f: - data = line.strip().split('\t') - y = int(data[3].strip()) - self.Ytr.append(y) - f.close() - - def calculateTestingFeatures(self, testing_corpus): - """ - Calculate testing features of a corpus in VICTOR or CWICTOR format. 
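# The generator and identifier classes in this patch all read corpora in the
# VICTOR / CWICTOR formats referenced in their docstrings. A small sketch of how
# such a tab-separated line is parsed, following the field layout used by the
# deleted readers: sentence, target word, token index of the target, then either
# ranked "rank:candidate" pairs (VICTOR) or a 0/1 complexity label (CWICTOR).
def parse_victor_line(line):
    fields = line.strip().split('\t')
    sentence = fields[0].split(' ')
    target = fields[1]
    head = int(fields[2])
    candidates = [(int(c.split(':')[0]), c.split(':')[1]) for c in fields[3:]]
    return sentence, target, head, candidates

def parse_cwictor_line(line):
    fields = line.strip().split('\t')
    return fields[0].split(' '), fields[1], int(fields[2]), int(fields[3])

# Example VICTOR line (tab-separated):
# "the cat perched on the mat\tperched\t2\t1:sat\t2:roosted"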
- - @param testing_corpus: Path to a corpus in the VICTOR or CWICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - """ - self.Xte = self.fe.calculateFeatures(testing_corpus, format='cwictor') - - def selectKBestFeatures(self, k='all'): - """ - Selects the k best features through univariate feature selection. - - @param k: Number of features to be selected. - """ - feature_selector = SelectKBest(f_classif, k=k) - feature_selector.fit(self.Xtr, self.Ytr) - self.Xtr = feature_selector.transform(self.Xtr) - self.Xte = feature_selector.transform(self.Xte) - - def trainSVM(self, C=1.0, kernel='rbf', degree=3, gamma=0.0, coef0=0.0, class_weight={0:1.0, 1:1.0}): - """ - Trains an SVM classifier. To know more about the meaning of each parameter, - please refer to http://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html#sklearn.svm.SVC - """ - self.classifier = svm.SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0, class_weight=class_weight) - self.classifier.fit(self.Xtr, self.Ytr) - - def trainPassiveAggressiveClassifier(self, C=1.0, loss='hinge'): - """ - Trains a Passive Agressive classifier. To know more about the meaning of each parameter, - please refer to http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html - """ - self.classifier = PassiveAggressiveClassifier(C=C, loss=loss) - self.classifier.fit(self.Xtr, self.Ytr) - - def trainSGDClassifier(self, loss='hinge', penalty='l2', alpha=0.0001, l1_ratio=0.15, epsilon=0.001, class_weight={0:1.0, 1:1.0}): - """ - Trains a Stochastic Gradient Descent classifier. To know more about the meaning of each parameter, - please refer to http://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html - """ - self.classifier = SGDClassifier(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, epsilon=epsilon, class_weight=class_weight) - self.classifier.fit(self.Xtr, self.Ytr) - - def trainDecisionTreeClassifier(self, criterion='gini', splitter='best', max_features=None, max_depth=None): - """ - Trains a Decision Tree classifier. To know more about the meaning of each parameter, - please refer to http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html - """ - self.classifier = DecisionTreeClassifier(criterion=criterion, splitter=splitter, max_features=max_features, max_depth=max_depth) - self.classifier.fit(self.Xtr, self.Ytr) - - def trainAdaBoostClassifier(self, n_estimators=50, learning_rate=1, algorithm='SAMME.R'): - """ - Trains an Ada Boost Classifier. To know more about the meaning of each parameter, - please refer to http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html - """ - self.classifier = AdaBoostClassifier(n_estimators=n_estimators, learning_rate=learning_rate, algorithm=algorithm) - self.classifier.fit(self.Xtr, self.Ytr) - - def trainGradientBoostClassifier(self, loss='deviance', n_estimators=50, learning_rate=1, max_features=None): - """ - Trains an Gradient Boost Classifier. 
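# The MachineLearningIdentifier methods above are thin wrappers around standard
# scikit-learn classifiers trained on per-word feature vectors with 0/1 labels
# taken from a CWICTOR corpus. A condensed, self-contained sketch of that setup
# with toy data; in Lexi/LEXenstein the features come from a FeatureEstimator.
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectKBest, f_classif

X_train = np.array([[7, 1.2], [3, 4.5], [9, 0.8], [4, 3.9]])  # e.g. length, log-frequency
y_train = np.array([1, 0, 1, 0])                              # 1 = complex, 0 = simple

# Optional univariate feature selection, as in selectKBestFeatures:
selector = SelectKBest(f_classif, k='all').fit(X_train, y_train)
X_train = selector.transform(X_train)

clf = DecisionTreeClassifier().fit(X_train, y_train)
X_test = selector.transform(np.array([[8, 1.0]]))
print(clf.predict(X_test))  # a long, rare word comes out as complex: [1]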
To know more about the meaning of each parameter, - please refer to http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html - """ - self.classifier = GradientBoostingClassifier(loss=loss, n_estimators=n_estimators, learning_rate=learning_rate, max_features=max_features) - self.classifier.fit(self.Xtr, self.Ytr) - - def trainExtraTreesClassifier(self, n_estimators=50, criterion='gini', max_features=None): - """ - Trains an Extra Trees Classifier. To know more about the meaning of each parameter, - please refer to http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html - """ - self.classifier = ExtraTreesClassifier(n_estimators=n_estimators, criterion=criterion, max_features=max_features) - self.classifier.fit(self.Xtr, self.Ytr) - - def trainRandomForestClassifier(self, n_estimators=50, criterion='gini', max_features=None): - """ - Trains an Random Trees Classifier. To know more about the meaning of each parameter, - please refer to http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.ExtraTreesClassifier.html - """ - self.classifier = RandomForestClassifier(n_estimators=n_estimators, criterion=criterion, max_features=max_features) - self.classifier.fit(self.Xtr, self.Ytr) - - def identifyComplexWords(self): - return self.classifier.predict(self.Xte) - -class SimplifyAllIdentifier: - - def identifyComplexWords(self, corpus): - """ - Assign label 1 (complex) to all target words in the VICTOR or CWICTOR corpus. - - @param corpus: Path to a corpus in the VICTOR or CWICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A list of binary values, one per line, with value 1. - """ - result = [] - f = open(corpus) - for line in f: - result.append(1) - f.close() - return result - -class SimplifyNoneIdentifier: - - def identifyComplexWords(self, corpus): - """ - Assign label 0 (simple) to all target words in the VICTOR or CWICTOR corpus. - - @param corpus: Path to a corpus in the VICTOR or CWICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A list of binary values, one per line, with value 0. - """ - result = [] - f = open(corpus) - for line in f: - result.append(0) - f.close() - return result - -class LexiconIdentifier: - - def __init__(self, lexicon, type): - """ - Creates a LexiconIdentifier instance. - - @param lexicon: Lexicon containing simple or complex, one word per line. - @param type: Type of lexicon. - Values: 'complex', 'simple' - """ - self.lexicon = set([line.strip() for line in open(lexicon)]) - self.type = type - self.feature_index = None - - def identifyComplexWords(self, corpus): - """ - Judge if the target words of a corpus in VICTOR or CWICTOR format are complex or not - - @param corpus: Path to a corpus in the VICTOR or CWICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A list of binary values, one per line, with value 1 if a target word is complex, and 0 otherwise. - """ - result = [] - f = open(corpus) - for line in f: - data = line.strip().split('\t') - target = data[1].strip() - if target in self.lexicon: - if self.type=='simple': - result.append(0) - else: - result.append(1) - else: - if self.type=='simple': - result.append(1) - else: - result.append(0) - f.close() - return result - -class ThresholdIdentifier: - - def __init__(self, fe): - """ - Creates a ThresholdIdentifier instance. - - @param fe: FeatureEstimator object. 
- """ - self.fe = fe - - def calculateTrainingFeatures(self, training_corpus): - """ - Calculate features of a corpus in CWICTOR format. - - @param training_corpus: Path to a corpus in the CWICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - """ - self.Xtr = self.fe.calculateFeatures(training_corpus, format='cwictor') - self.Ytr = [] - f = open(training_corpus) - for line in f: - data = line.strip().split('\t') - y = int(data[3].strip()) - self.Ytr.append(y) - f.close() - - def calculateTestingFeatures(self, testing_corpus): - """ - Calculate testing features of a corpus in VICTOR or CWICTOR format. - - @param testing_corpus: Path to a corpus in the VICTOR or CWICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - """ - self.Xte = self.fe.calculateFeatures(testing_corpus, format='cwictor') - - def trainIdentifierBruteForce(self, feature_index, step=None): - """ - Trains the threshold identifier with respect to a certain feature through brute force. - - @param feature_index: Index of the feature to be used in training. - """ - #Save feature index: - self.feature_index = feature_index - - #Estimate min and max: - self.minX, self.maxX = self.getMinMax() - - #Set initial min, max and pivot: - min = float(self.minX) - max = float(self.maxX) - - #Set step: - if step==None: - step = (max-min)/1000 - - #Find best threshold: - best = -1 - bestIndex = None - i = min+step - while ibest: - best=score - bestIndex = i - i += step - - #Set threshold and score: - self.threshold = bestIndex - - def trainIdentifierBinarySearch(self, feature_index, diff=None, order=None): - """ - Trains the threshold identifier with respect to a certain feature through binary search. - - @param feature_index: Index of the feature to be used in training. - """ - #Save feature index: - self.feature_index = feature_index - - #Estimate min and max: - self.minX, self.maxX = self.getMinMax() - - #Set initial min, max and pivot: - min = float(self.minX) - max = float(self.maxX) - - #Define difference threshold: - if diff==None: - diff = (max-min)/1000 - - #Define order: - if order==None or order<1: - order = 1 - - #Estimate best threshold: - best = -1 - bestIndex = None - divisor = float(2**order) - step = (max-min)/divisor - for i in range(1, int(divisor)): - pivot = i*step - index, score = self.findMaxBinary(min, max, pivot, diff) - if score>best: - best = score - bestIndex = index - - #Set threshold and score: - self.threshold = bestIndex - - def findMaxBinary(self, min, max, pivot, diff): - #Estimate best threshold: - best = -1 - bestIndex = None - while (max-min)>diff: - left = (min+pivot)/2.0 - right = (pivot+max)/2.0 - scoreL = self.getScore(left) - scoreR = self.getScore(right) - if scoreL>scoreR: - max = pivot - pivot = left - if scoreL>best: - best = scoreL - bestIndex = left - else: - min = pivot - pivot = right - if scoreR>best: - best = scoreR - bestIndex = right - - #Set threshold and score: - return bestIndex, best - - def identifyComplexWords(self): - """ - Judge if the target words of the testing instances are complex or not. - - @return: A list of binary values, one per line, with value 1 if a target word is complex, and 0 otherwise. 
- """ - result = [] - for i in range(0, len(self.Xte)): - x = self.Xte[i][self.feature_index] - if self.fe.identifiers[self.feature_index][1]=='Complexity': - if x>self.threshold: - result.append(1) - else: - result.append(0) - else: - if xmax: - max = value - if valuethreshold and y==1) or (xthreshold and y==0): - precisionc += 1 - if y==1: - recallc += 1 - precisiont += 1 - if y==1: - recallt += 1 - - precision = float(precisionc)/float(precisiont) - recall = float(recallc)/float(recallt) - fmean = 0.0 - if precision==0.0 and recall==0.0: - fmean = 0.0 - else: - fmean = 2*(precision*recall)/(precision+recall) - - #Return F-Measure: - return fmean diff --git a/lexi/lib/lexenstein/morphadorner.py b/lexi/lib/lexenstein/morphadorner.py deleted file mode 100755 index edd9f17..0000000 --- a/lexi/lib/lexenstein/morphadorner.py +++ /dev/null @@ -1,175 +0,0 @@ -import subprocess - -class MorphAdornerToolkit: - - def __init__(self, path): - """ - Creates an instance of the MorphAdornerToolkit class. - - @param path: Path to the root installation folder of Morph Adorner Toolkit. - """ - - self.root = path - if not self.root.endswith('/'): - self.root += '/' - self.lemmatizer = self.root + 'WordLemmatizer/WordLemmatizer.jar' - self.stemmer = self.root + 'WordStemmer/WordStemmer.jar' - self.conjugator = self.root + 'VerbConjugator/VerbConjugator.jar' - self.inflector = self.root + 'NounInflector/NounInflector.jar' - self.tenser = self.root + 'VerbTenser/VerbTenser.jar' - self.syllabler = self.root + 'SyllableSplitter/SyllableSplitter.jar' - self.adjinflector = self.root + 'AdjectiveInflector/AdjectiveInflector.jar' - - def lemmatizeWords(self, words): - """ - Lemmatizes a set of words. - - @param words: List of words to be lemmatized. - @return: List of the lemmas of the words passed as input. - """ - - input = '' - for word in words: - input += word + '\n' - input += '\n' - - args = ['java', '-jar', self.lemmatizer] - proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False) - (out, err) = proc.communicate(input) - - result = out.strip().split('\n') - return result - - def stemWords(self, words): - """ - Porter stems a set of words. - - @param words: List of words to be Porter stemmed. - @return: List of the Porter stems of the words passed as input. - """ - - input = '' - for word in words: - input += word + '\n' - input += '\n' - - args = ['java', '-jar', self.stemmer] - proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False) - (out, err) = proc.communicate(input) - - result = out.strip().split('\n') - return result - - def conjugateVerbs(self, lemmas, tense, person): - """ - Conjugate a set of verbs in a given tense. - - @param lemmas: Lemmas of verbs to be conjugated. - @param tense: Tense in which to conjugate the verbs. - Tenses available: PAST, PAST_PARTICIPLE, PAST_PERFECT, PAST_PERFECT_PARTICIPLE, PERFECT, PRESENT, PRESENT_PARTICIPLE. - @param person: Person in which to conjugate the verbs. - Tenses available: FIRST_PERSON_SINGULAR, FIRST_PERSON_PLURAL, SECOND_PERSON_SINGULAR, SECOND_PERSON_PLURAL, THIRD_PERSON_SINGULAR, THIRD_PERSON_PLURAL. - @return: List of the conjugated versions of the verb lemmas passed as input. 
- """ - - input = '' - for lemma in lemmas: - input += lemma + ' ' + tense + ' ' + person + '\n' - input += '\n' - - args = ['java', '-jar', self.conjugator] - proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False) - (out, err) = proc.communicate(input) - - result = out.strip().split('\n') - return result - - - def inflectNouns(self, lemmas, number): - """ - Inflect a list of nouns to its singular or plural form. - - @param lemmas: Lemmas of nouns to be inflected. - @param number: Form in which to inflect the lemmas. - Forms available: singular, plural. - @return: List of the inflected versions of the noun lemmas passed as input. - """ - - input = '' - for lemma in lemmas: - input += lemma + ' ' + number + '\n' - input += '\n' - - args = ['java', '-jar', self.inflector] - proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False) - (out, err) = proc.communicate(input) - - result = out.strip().split('\n') - return result - - def tenseVerbs(self, lemmas, verbs): - """ - Retrieve the tense of a given set of verbs. - - @param lemmas: Lemmas of verbs to be tensed. - @param verbs: Verbs in their original forms. - @return: List of the tenses and persons of the verb passed as input. - Tenses available: PAST, PAST_PARTICIPLE, PAST_PERFECT, PAST_PERFECT_PARTICIPLE, PERFECT, PRESENT, PRESENT_PARTICIPLE. - Persons available: FIRST_PERSON_SINGULAR, FIRST_PERSON_PLURAL, SECOND_PERSON_SINGULAR, SECOND_PERSON_PLURAL, THIRD_PERSON_SINGULAR, THIRD_PERSON_PLURAL. - """ - - input = '' - for i in range(0, len(lemmas)): - input += lemmas[i] + ' ' + verbs[i] + '\n' - input += '\n' - - args = ['java', '-jar', self.tenser] - proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False) - (out, err) = proc.communicate(input) - - result = [line.strip().split(' ') for line in out.strip().split('\n')] - return result - - - def splitSyllables(self, words): - """ - Splits a set of words in syllables. - - @param words: List of words to be lemmatized. - @return: List of words with their syllables separated by hyphen markers. - """ - - input = '' - for word in words: - input += word + '\n' - input += '\n' - - args = ['java', '-jar', self.syllabler] - proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False) - (out, err) = proc.communicate(input) - - out = out.replace('\xc2\xad', '-') - result = out.strip().split('\n') - return result - - def inflectAdjectives(self, lemmas, form): - """ - Inflect a list of adjectives/adverbs to its singular or plural form. - - @param lemmas: Lemmas of adjectives/adverbs to be inflected. - @param form: Form in which to inflect the lemmas. - Forms available: comparative, superlative. - @return: List of the inflected versions of the adjective/adverb lemmas passed as input. 
- """ - - input = '' - for lemma in lemmas: - input += lemma + ' ' + form + '\n' - input += '\n' - - args = ['java', '-jar', self.adjinflector] - proc = subprocess.Popen(args, stdin=subprocess.PIPE, stdout=subprocess.PIPE, shell=False) - (out, err) = proc.communicate(input) - - result = out.strip().split('\n') - return result diff --git a/lexi/lib/lexenstein/rankers.py b/lexi/lib/lexenstein/rankers.py deleted file mode 100755 index e38fe04..0000000 --- a/lexi/lib/lexenstein/rankers.py +++ /dev/null @@ -1,1450 +0,0 @@ -import os -import kenlm -import math -from keras.optimizers import * -from keras.models import * -from keras.layers.core import * -from nltk.corpus import wordnet as wn -from sklearn.preprocessing import normalize -from sklearn.feature_selection import f_classif -from sklearn import linear_model -from sklearn.svm import SVC -from sklearn.cross_validation import train_test_split -from sklearn.feature_selection import SelectKBest - -class NNRegressionRanker: - - def __init__(self, fe, model=None): - """ - Creates an instance of the NNRegressionRanker class. - This ranker was introduced by "Lexical Simplification with Neural Ranking, Proceedings of the 15th EACL, 2017". - - @param fe: A configured FeatureEstimator object. - @param model: A trained neural ranking model. If provided, it must be an instance created by the ranker itself, and the features provided must be the same used for its training. - """ - self.fe = fe - self.model = model - - def createRanker(self, layers, hidden_size): - """ - Creates a new neural ranker based on the architecture specifications provided. - - @param layers: number of hidden layers of the neural ranker. - @param hidden_size: size of the hidden layers of the neural ranker. - """ - model = Sequential() - model.add(Dense(output_dim=hidden_size, input_dim=len(self.fe.identifiers)*2, init="glorot_uniform")) - model.add(Activation("tanh")) - model.add(Dropout(0.25)) - for i in range(0, layers): - model.add(Dense(output_dim=hidden_size, init="glorot_uniform")) - model.add(Activation("tanh")) - model.add(Dropout(0.10)) - model.add(Dense(output_dim=1)) - model.add(Activation("linear")) - model.compile(loss='mean_squared_error', optimizer='adam') - self.model = model - return model - - def saveRanker(self, json_path, h5_path): - """ - Saves the ranker's neural model. - - @param json_path: Path in which to save the JSON file containing the structure of the neural network. - @param h5_path: Path in which to save the H5 file containing the weights of the neural network. - """ - json_string = self.model.to_json() - open(json_path, 'w').write(json_string) - self.model.save_weights(h5_path, overwrite=True) - - def loadRanker(self, json_path, h5_path): - """ - Loads the ranker's neural model. - - @param json_path: Path of JSON file from which to load the structure of the neural network. - @param h5_path: Path of H5 file from which to load the weights of the neural network. 
- """ - model = model_from_json(open(json_path).read()) - model.load_weights(h5_path) - model.compile(loss='mean_squared_error', optimizer='adam') - self.model = model - return model - - def trainRanker(self, victor_corpus, epochs, batch_size): - features = self.fe.calculateFeatures(victor_corpus) - Xtr = [] - Ytr = [] - f = open(victor_corpus) - c = -1 - for line in f: - data = line.strip().split('\t') - cands = [cand.strip().split(':')[1] for cand in data[3:]] - indexes = [int(cand.strip().split(':')[0]) for cand in data[3:]] - featmap = {} - for cand in cands: - c += 1 - featmap[cand] = features[c] - for i in range(0, len(cands)-1): - for j in range(i+1, len(cands)): - indexi = indexes[i] - indexj = indexes[j] - indexdiffji = indexj-indexi - indexdiffij = indexi-indexj - positive = featmap[cands[i]] - negative = featmap[cands[j]] - v1 = np.concatenate((positive,negative)) - v2 = np.concatenate((negative,positive)) - Xtr.append(v1) - Xtr.append(v2) - Ytr.append(indexdiffji) - Ytr.append(indexdiffij) - f.close() - Xtr = np.array(Xtr) - Ytr = np.array(Ytr) - self.model.fit(Xtr, Ytr, nb_epoch=epochs, batch_size=batch_size, verbose=0) - - def getRankings(self, victor_corpus): - """ - Ranks candidates using a neural ranker. - Candidates are ranked according to their simplicity score, which is calculated as the sum of the simplicity difference between a given candidate and the remainder. - - @param victor_corpus: Path to a testing corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A list of ranked candidates for each instance in the VICTOR corpus, from simplest to most complex. - """ - #If feature values are not available, then estimate them: - features = self.fe.calculateFeatures(victor_corpus) - - #Read feature values for each candidate in victor corpus: - ranks = [] - c = -1 - f = open(victor_corpus) - index = 0 - for l in f: - #Get all substitutions in ranking instance: - line = l.strip().split('\t') - cands = [cand.strip().split(':')[1].strip() for cand in line[3:]] - - #Estimate feature and candidate maps: - featmap = {} - scoremap = {} - for cand in cands: - c += 1 - featmap[cand] = features[c] - scoremap[cand] = 0.0 - - #Calculate simplicity differences between candidates and update scores: - for i in range(0, len(cands)-1): - cand1 = cands[i] - for j in range(i+1, len(cands)): - cand2 = cands[j] - posneg = np.concatenate((featmap[cand1], featmap[cand2])) - probs = self.model.predict(np.array([posneg])) - score = probs[0] - scoremap[cand1] += score - negpos = np.concatenate((featmap[cand2], featmap[cand1])) - probs = self.model.predict(np.array([negpos])) - score = probs[0] - scoremap[cand1] -= score - - #Rank candidates according to score: - rank = sorted(list(scoremap.keys()), key=scoremap.__getitem__, reverse=True) - ranks.append(rank) - return ranks - -class GlavasRanker: - - def __init__(self, fe): - """ - Creates an instance of the GlavasRanker class. - This ranker was introduced by "Simplifying Lexical Simplification: Do We Need Simplified Corpora?, Proceedings of the 2015 ACL, 2015". - - @param fe: A configured FeatureEstimator object. - """ - - self.fe = fe - self.feature_values = None - - def getRankings(self, victor_corpus): - """ - Ranks candidates with respect to a set of features. - Candidates are ranked according to their average ranking position obtained with all feature values. - - @param victor_corpus: Path to a testing corpus in VICTOR format. 
- For more information about the file's format, refer to the LEXenstein Manual. - @return: A list of ranked candidates for each instance in the VICTOR corpus, from simplest to most complex. - """ - - #If feature values are not available, then estimate them: - self.feature_values = self.fe.calculateFeatures(victor_corpus) - - #Create object for results: - result = [] - - #Read feature values for each candidate in victor corpus: - f = open(victor_corpus) - index = 0 - for line in f: - #Get all substitutions in ranking instance: - data = line.strip().split('\t') - substitutions = data[3:len(data)] - - #Get instance's feature values: - instance_features = [] - for substitution in substitutions: - instance_features.append(self.feature_values[index]) - index += 1 - - rankings = {} - for i in range(0, len(self.fe.identifiers)): - #Create dictionary of substitution to feature value: - scores = {} - for j in range(0, len(substitutions)): - substitution = substitutions[j] - word = substitution.strip().split(':')[1].strip() - scores[word] = instance_features[j][i] - - #Check if feature is simplicity or complexity measure: - rev = False - if self.fe.identifiers[i][1]=='Simplicity': - rev = True - - #Sort substitutions: - words = list(scores.keys()) - sorted_substitutions = sorted(words, key=scores.__getitem__, reverse=rev) - - #Update rankings: - for j in range(0, len(sorted_substitutions)): - word = sorted_substitutions[j] - if word in rankings: - rankings[word] += j - else: - rankings[word] = j - - #Produce final rankings: - final_rankings = sorted(list(rankings.keys()), key=rankings.__getitem__) - - #Add them to result: - result.append(final_rankings) - f.close() - - #Return result: - return result - - def size(self): - """ - Returns the number of features available for a given MetricRanker. - - @return: The number of features in the MetricRanker's FeatureEstimator object. - """ - return len(self.fe.identifiers) - -class SVMBoundaryRanker: - - def __init__(self, fe): - """ - Creates an instance of the SVMBoundaryRanker class. - This simplifier was introduced by "LEXenstein: A Framework for Lexical Simplification, Proceedings of the 2015 ACL, 2015". - - @param fe: A configured FeatureEstimator object. - """ - - self.fe = fe - self.classifier = None - self.feature_selector = None - - def trainRanker(self, victor_corpus, positive_range, C, kernel, degree, gamma, coef0, k='all'): - """ - Trains a SVM Boundary Ranker according to the parameters provided. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param positive_range: Maximum rank to which label 1 is assigned in the binary classification setup. - Recommended value: 1. - @param C: Penalty parameter. - Recommended values: 0.1, 1, 10. - @param kernel: Kernel function to be used. - Supported values: 'linear', 'poly', 'rbf', 'sigmoid'. - @param degree: Degree of the polynomial kernel. - Recommended values: 2, 3. - @param gamma: Kernel coefficient. - Recommended values: 0.01, 0.1, 1. - @param coef0: Independent term value. - Recommended values: 0, 1. - @param k: Number of best features to be selected through univariate feature selection. - If k='all', then no feature selection is performed. 
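# GlavasRanker above ranks the candidates once per feature and then orders them by
# their summed rank positions, flipping the sort direction for features that
# measure simplicity rather than complexity. A standalone sketch with toy scores.
def aggregate_rankings(candidates, feature_scores, higher_is_simpler):
    totals = {c: 0 for c in candidates}
    for scores, simpler_first in zip(feature_scores, higher_is_simpler):
        ordered = sorted(candidates, key=lambda c: scores[c], reverse=simpler_first)
        for position, cand in enumerate(ordered):
            totals[cand] += position
    return sorted(candidates, key=lambda c: totals[c])

cands = ['sat', 'roosted', 'perched']
freq = {'sat': 5.2, 'roosted': 2.1, 'perched': 3.0}    # higher = simpler
length = {'sat': 3, 'roosted': 7, 'perched': 7}        # lower  = simpler
print(aggregate_rankings(cands, [freq, length], [True, False]))  # 'sat' comes first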
- """ - - #Read victor corpus: - data = [] - f = open(victor_corpus) - for line in f: - data.append(line.strip().split('\t')) - f.close() - - #Create matrixes: - X = self.fe.calculateFeatures(victor_corpus) - Y = self.generateLabels(data, positive_range) - - #Select features: - self.feature_selector = SelectKBest(f_classif, k=k) - self.feature_selector.fit(X, Y) - X = self.feature_selector.transform(X) - - #Train classifier: - self.classifier = SVC(C=C, kernel=kernel, degree=degree, gamma=gamma, coef0=coef0) - self.classifier.fit(X, Y) - - def trainRankerWithCrossValidation(self, victor_corpus, positive_range, folds, test_size, Cs=[0.1, 1, 10], kernels=['linear', 'rbf', 'poly', 'sigmoid'], degrees=[2], gammas=[0.01, 0.1, 1], coef0s=[0, 1], k='all'): - """ - Trains a SVM Boundary Ranker while maximizing hyper-parameters through cross-validation. - It uses the TRank-at-1 as an optimization metric. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param positive_range: Maximum rank to which label 1 is assigned in the binary classification setup. - Recommended value: 1. - @param folds: Number of folds to be used in cross-validation. - @param test_size: Percentage of the dataset to be used in testing. - Recommended values: 0.2, 0.25, 0.33 - @param Cs: Penalty parameters. - Recommended values: 0.1, 1, 10. - @param kernels: Kernel functions to be used. - Supported values: 'linear', 'poly', 'rbf', 'sigmoid'. - @param degrees: Degrees of the polynomial kernel. - Recommended values: 2, 3. - @param gammas: Kernel coefficients. - Recommended values: 0.01, 0.1, 1. - @param coef0s: Independent term values. - Recommended values: 0, 1. - @param k: Number of best features to be selected through univariate feature selection. - If k='all', then no feature selection is performed. 
- """ - #Read victor corpus: - data = [] - f = open(victor_corpus) - for line in f: - data.append(line.strip().split('\t')) - f.close() - - #Create matrixes: - X = self.fe.calculateFeatures(victor_corpus) - Y = self.generateLabels(data, positive_range) - - #Select features: - self.feature_selector = SelectKBest(f_classif, k=k) - self.feature_selector.fit(X, Y) - X = self.feature_selector.transform(X) - - #Extract ranking problems: - firsts = [] - candidates = [] - Xsets = [] - Ysets = [] - index = -1 - for line in data: - fs = set([]) - cs = [] - Xs = [] - Ys = [] - for cand in line[3:len(line)]: - index += 1 - candd = cand.split(':') - rank = candd[0].strip() - word = candd[1].strip() - - cs.append(word) - Xs.append(X[index]) - Ys.append(Y[index]) - if rank=='1': - fs.add(word) - firsts.append(fs) - candidates.append(cs) - Xsets.append(Xs) - Ysets.append(Ys) - - #Create data splits: - datasets = [] - for i in range(0, folds): - Xtr, Xte, Ytr, Yte, Ftr, Fte, Ctr, Cte = train_test_split(Xsets, Ysets, firsts, candidates, test_size=test_size, random_state=i) - Xtra = [] - for matrix in Xtr: - Xtra += matrix - Xtea = [] - for matrix in Xte: - Xtea += matrix - Ytra = [] - for matrix in Ytr: - Ytra += matrix - datasets.append((Xtra, Ytra, Xte, Xtea, Fte, Cte)) - - #Get classifier with best parameters for the RBF kernel: - max_score = -1.0 - parameters = () - if 'rbf' in kernels: - for C in Cs: - for g in gammas: - sum = 0.0 - sum_total = 0 - for dataset in datasets: - Xtra = dataset[0] - Ytra = dataset[1] - Xte = dataset[2] - Xtea = dataset[3] - Fte = dataset[4] - Cte = dataset[5] - - classifier = SVC(kernel='rbf', C=C, gamma=g) - try: - classifier.fit(Xtra, Ytra) - t1 = self.getCrossValidationScore(classifier, Xtea, Xte, Fte, Cte) - sum += t1 - sum_total += 1 - except Exception: - pass - sum_total = max(1, sum_total) - if (sum/sum_total)>max_score: - max_score = sum - parameters = (C, 'rbf', 1, g, 0) - - #Get classifier with best parameters for the Polynomial kernel: - if 'poly' in kernels: - for C in Cs: - for d in degrees: - for g in gammas: - for c in coef0s: - sum = 0.0 - sum_total = 0 - for dataset in datasets: - Xtra = dataset[0] - Ytra = dataset[1] - Xte = dataset[2] - Xtea = dataset[3] - Fte = dataset[4] - Cte = dataset[5] - - classifier = SVC(kernel='poly', C=C, degree=d, gamma=g, coef0=c) - try: - classifier.fit(Xtra, Ytra) - t1 = self.getCrossValidationScore(classifier, Xtea, Xte, Fte, Cte) - sum += t1 - sum_total += 1 - except Exception: - pass - sum_total = max(1, sum_total) - if (sum/sum_total)>max_score: - max_score = sum - parameters = (C, 'poly', d, g, c) - - #Get classifier with best parameters for the Sigmoid kernel: - if 'sigmoid' in kernels: - for C in Cs: - for g in gammas: - for c in coef0s: - sum = 0.0 - sum_total = 0 - for dataset in datasets: - Xtra = dataset[0] - Ytra = dataset[1] - Xte = dataset[2] - Xtea = dataset[3] - Fte = dataset[4] - Cte = dataset[5] - - classifier = SVC(kernel='sigmoid', C=C, gamma=g, coef0=c) - try: - classifier.fit(Xtra, Ytra) - t1 = self.getCrossValidationScore(classifier, Xtea, Xte, Fte, Cte) - sum += t1 - sum_total += 1 - except Exception: - pass - sum_total = max(1, sum_total) - if (sum/sum_total)>max_score: - max_score = sum - parameters = (C, 'sigmoid', d, g, c) - - #Get classifier with best parameters for the Linear kernel: - if 'linear' in kernels: - for C in Cs: - sum = 0.0 - sum_total = 0 - for dataset in datasets: - Xtra = dataset[0] - Ytra = dataset[1] - Xte = dataset[2] - Xtea = dataset[3] - Fte = dataset[4] - Cte = dataset[5] - 
- classifier = SVC(kernel='linear', C=C, gamma=g, coef0=c) - try: - classifier.fit(Xtra, Ytra) - t1 = self.getCrossValidationScore(classifier, Xtea, Xte, Fte, Cte) - sum += t1 - sum_total += 1 - except Exception: - pass - sum_total = max(1, sum_total) - if (sum/sum_total)>max_score: - max_score = sum - parameters = (C, 'linear', d, g, c) - self.classifier = SVC(C=parameters[0], kernel=parameters[1], degree=parameters[2], gamma=parameters[3], coef0=parameters[4]) - self.classifier.fit(X, Y) - - def getCrossValidationScore(self, classifier, Xtea, Xte, firsts, candidates): - distances = classifier.decision_function(Xtea) - index = -1 - corrects = 0 - total = 0 - for i in range(0, len(Xte)): - xset = Xte[i] - maxd = -999999 - for j in range(0, len(xset)): - index += 1 - distance = distances[index] - if distance>maxd: - maxd = distance - maxc = candidates[i][j] - if maxc in firsts[i]: - corrects += 1 - total += 1 - return float(corrects)/float(total) - - def getRankings(self, victor_corpus): - """ - Ranks candidates with respect to their simplicity. - Requires for the trainRanker function to be previously called so that a model can be trained. - - @param victor_corpus: Path to a testing corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A list of ranked candidates for each instance in the VICTOR corpus, from simplest to most complex. - """ - - #Read victor corpus: - data = [] - f = open(victor_corpus) - for line in f: - data.append(line.strip().split('\t')) - f.close() - - #Create matrixes: - X = self.fe.calculateFeatures(victor_corpus) - - #Select features: - X = self.feature_selector.transform(X) - - #Get boundary distances: - distances = self.classifier.decision_function(X) - - #Get rankings: - result = [] - index = 0 - for i in range(0, len(data)): - line = data[i] - scores = {} - for subst in line[3:len(line)]: - word = subst.strip().split(':')[1].strip() - scores[word] = distances[index] - index += 1 - ranking_data = sorted(list(scores.keys()), key=scores.__getitem__, reverse=True) - result.append(ranking_data) - - #Return rankings: - return result - - def generateLabels(self, data, positive_range): - Y = [] - for line in data: - max_range = min(int(line[len(line)-1].split(':')[0].strip()), positive_range) - for i in range(3, len(line)): - rank_index = int(line[i].split(':')[0].strip()) - if rank_index<=max_range: - Y.append(1) - else: - Y.append(0) - return Y - -class BottRanker: - - def __init__(self, simple_lm): - """ - Creates an instance of the BottRanker class. - This simplifier was introduced by "Can Spanish Be Simpler? LexSiS: Lexical Simplification for Spanish, Proceedings of the 2012 COLING, 2012". - - @param simple_lm: Path to a language model built over simple text. - For more information on how to create the file, refer to the LEXenstein Manual. - """ - - self.simple_lm = kenlm.LanguageModel(simple_lm) - - def getRankings(self, victor_corpus, a1=1.0, a2=1.0): - """ - Ranks candidates with respect to their simplicity. - - @param victor_corpus: Path to a testing corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param a1: Weight of the word's length score. - @param a2: Weight of the word's frequency score. - @return: A list of ranked candidates for each instance in the VICTOR corpus, from simplest to most complex. 
- """ - #Create object for results: - result = [] - - #Read feature values for each candidate in victor corpus: - f = open(victor_corpus) - for line in f: - #Get all substitutions in ranking instance: - data = line.strip().split('\t') - substitutions = data[3:len(data)] - - #Create dictionary of substitution to feature value: - scores = {} - for substitution in substitutions: - word = substitution.strip().split(':')[1].strip() - scores[word] = self.getCandidateComplexity(word, a1, a2) - - #Sort substitutions: - sorted_substitutions = sorted(list(scores.keys()), key=scores.__getitem__, reverse=False) - - #Add them to result: - result.append(sorted_substitutions) - f.close() - - #Return result: - return result - - def getCandidateComplexity(self, word, a1, a2): - ScoreWL = 0 - if len(word)>4: - ScoreWL = math.sqrt(len(word)-4) - ScoreFreq = -1*self.simple_lm.score(word, bos=False, eos=False) - #ScoreFreq = -1*self.simple_lm.score(word) - return a1*ScoreWL + a2*ScoreFreq - -class YamamotoRanker: - - def __init__(self, simple_lm, cooc_model): - """ - Creates an instance of the YamamotoRanker class. - This simplifier was introduced by "Selecting Proper Lexical Paraphrase for Children, Proceedings of the 2013 ROCLING, 2013". - - @param simple_lm: Path to a language model built over simple text. - For more information on how to create the file, refer to the LEXenstein Manual. - @param cooc_model: Path to a word co-occurrence model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - """ - - self.simple_lm = kenlm.LanguageModel(simple_lm) - self.cooc_model = self.getModel(cooc_model) - - def getRankings(self, victor_corpus, a1=1.0, a2=1.0, a3=1.0, a4=1.0, a5=1.0): - """ - Ranks candidates with respect to their simplicity. - - @param victor_corpus: Path to a testing corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param a1: Weight of the word's frequency score. - @param a2: Weight of the word's sense score. - @param a3: Weight of the word's collocational score. - @param a4: Weight of the word's log score. - @param a5: Weight of the word's trigram score. - @return: A list of ranked candidates for each instance in the VICTOR corpus, from simplest to most complex. 
- """ - #Create object for results: - result = [] - - #Read feature values for each candidate in victor corpus: - f = open(victor_corpus) - for line in f: - #Get all substitutions in ranking instance: - data = line.strip().split('\t') - sent = data[0].strip() - target = data[1].strip() - head = int(data[2].strip()) - substitutions = data[3:len(data)] - - #Create dictionary of substitution to feature value: - scores = {} - for substitution in substitutions: - word = substitution.strip().split(':')[1].strip() - scores[word] = self.getCandidateScore(sent, target, head, word, a1, a2, a3, a4, a5) - - #Sort substitutions: - sorted_substitutions = sorted(list(scores.keys()), key=scores.__getitem__, reverse=True) - - #Add them to result: - result.append(sorted_substitutions) - f.close() - - #Return result: - return result - - def getModel(self, path): - result = {} - f = open(path) - for line in f: - data = line.strip().split('\t') - target = data[0].strip() - coocs = data[1:len(data)] - result[target] = {} - for cooc in coocs: - coocd = cooc.strip().split(':') - word = coocd[0].strip() - count = int(coocd[1].strip()) - result[target][word] = count - return result - - def getCandidateScore(self, sent, target, head, word, a1, a2, a3, a4, a5): - Fcorpus = a1*self.simple_lm.score(word, bos=False, eos=False) - #Fcorpus = a1*self.simple_lm.score(word) - Sense = a2*self.getSenseScore(word, target) - Cooc = a3*self.getCoocScore(word, sent) - Log = a4*self.getLogScore(Cooc, sent, word) - Trigram = a5*self.getTrigramScore(sent, head, word) - - score = Fcorpus+Sense+Cooc+Log+Trigram - return score - - def getTrigramScore(self, sent, head, word): - tokens = ['', ''] + sent.strip().split(' ') + ['', ''] - h = head + 2 - t1 = tokens[h-2] + ' ' + tokens[h-1] + ' ' + word - t2 = tokens[h-1] + ' ' + word + ' ' + tokens[h+1] - t3 = word + ' ' + tokens[h+1] + ' ' + tokens[h+2] - bos = False - eos = False - if tokens[h-1]=='': - bos = True - if tokens[h+1]=='': - eos = True - result = self.simple_lm.score(t1, bos=bos, eos=eos)+self.simple_lm.score(t2, bos=bos, eos=eos)+self.simple_lm.score(t3, bos=bos, eos=eos) - #result = self.simple_lm.score(t1)+self.simple_lm.score(t2)+self.simple_lm.score(t3) - return result - - def getLogScore(self, Cooc, sent, word): - dividend = Cooc - divisor = self.simple_lm.score(word, bos=False, eos=False)*self.simple_lm.score(sent, bos=True, eos=True) - #divisor = self.simple_lm.score(word)*self.simple_lm.score(sent) - if divisor==0: - return 0 - else: - result = 0 - try: - result = math.log(dividend/divisor) - except ValueError: - result = 0 - return result - - def getCoocScore(self, word, sent): - tokens = sent.strip().split(' ') - if word not in self.cooc_model: - return 0 - else: - result = 0 - for token in tokens: - if token in self.cooc_model[word]: - result += self.cooc_model[word][token] - return result - - def getSenseScore(self, word, target): - candidate_sense = None - try: - candidate_sense = wn.synsets(word)[0] - except Exception: - candidate_sense = None - target_sense = None - try: - target_sense = wn.synsets(target)[0] - except Exception: - target_sense = None - result = 999999 - if candidate_sense and target_sense: - result = candidate_sense.shortest_path_distance(target_sense) - if not result: - result = 999999 - return result - -class BiranRanker: - - def __init__(self, complex_lm, simple_lm): - """ - Creates an instance of the BiranRanker class. 
- This simplifier was introduced by "Putting it Simply: a Context-Aware Approach to Lexical Simplification, Proceedings of the 2012 ACL, 2012". - - @param complex_lm: Path to a language model built over complex text. - For more information on how to create the file, refer to the LEXenstein Manual. - @param simple_lm: Path to a language model built over simple text. - For more information on how to create the file, refer to the LEXenstein Manual. - """ - - self.complex_lm = kenlm.LanguageModel(complex_lm) - self.simple_lm = kenlm.LanguageModel(simple_lm) - - def getRankings(self, victor_corpus): - """ - Ranks candidates with respect to their simplicity. - - @param victor_corpus: Path to a testing corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A list of ranked candidates for each instance in the VICTOR corpus, from simplest to most complex. - """ - #Create object for results: - result = [] - - #Read feature values for each candidate in victor corpus: - f = open(victor_corpus) - for line in f: - #Get all substitutions in ranking instance: - data = line.strip().split('\t') - substitutions = data[3:len(data)] - - #Create dictionary of substitution to feature value: - scores = {} - for substitution in substitutions: - word = substitution.strip().split(':')[1].strip() - scores[word] = self.getCandidateComplexity(word) - - #Sort substitutions: - sorted_substitutions = sorted(list(scores.keys()), key=scores.__getitem__, reverse=False) - - #Add them to result: - result.append(sorted_substitutions) - f.close() - - #Return result: - return result - - def getCandidateComplexity(self, word): - C = (self.complex_lm.score(word, bos=False, eos=False))/(self.simple_lm.score(word, bos=False, eos=False)) - #C = (self.complex_lm.score(word))/(self.simple_lm.score(word)) - L = float(len(word)) - return C*L - -class BoundaryRanker: - - def __init__(self, fe): - """ - Creates an instance of the BoundaryRanker class. - This simplifier was introduced by "LEXenstein: A Framework for Lexical Simplification, Proceedings of the 2015 ACL, 2015". - - @param fe: A configured FeatureEstimator object. - """ - - self.fe = fe - self.classifier = None - self.feature_selector = None - - def trainRanker(self, victor_corpus, positive_range, loss, penalty, alpha, l1_ratio, epsilon, k='all'): - """ - Trains a Boundary Ranker according to the parameters provided. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param positive_range: Maximum rank to which label 1 is assigned in the binary classification setup. - Recommended value: 1. - @param loss: Loss function to be used. - Values available: hinge, log, modified_huber, squared_hinge, perceptron. - @param penalty: Regularization term to be used. - Values available: l2, l1, elasticnet. - @param alpha: Constant that multiplies the regularization term. - Recommended values: 0.0001, 0.001, 0.01, 0.1 - @param l1_ratio: Elastic net mixing parameter. - Recommended values: 0.05, 0.10, 0.15 - @param epsilon: Acceptable error margin. - Recommended values: 0.0001, 0.001 - @param k: Number of best features to be selected through univariate feature selection. - If k='all', then no feature selection is performed. 
- """ - - #Read victor corpus: - data = [] - f = open(victor_corpus) - for line in f: - data.append(line.strip().split('\t')) - f.close() - - #Create matrixes: - X = self.fe.calculateFeatures(victor_corpus) - Y = self.generateLabels(data, positive_range) - - #Select features: - self.feature_selector = SelectKBest(f_classif, k=k) - self.feature_selector.fit(X, Y) - X = self.feature_selector.transform(X) - - #Train classifier: - self.classifier = linear_model.SGDClassifier(loss=loss, penalty=penalty, alpha=alpha, l1_ratio=l1_ratio, epsilon=epsilon) - self.classifier.fit(X, Y) - - def trainRankerWithCrossValidation(self, victor_corpus, positive_range, folds, test_size, losses=['hinge', 'modified_huber'], penalties=['elasticnet'], alphas=[0.0001, 0.001, 0.01], l1_ratios=[0.0, 0.15, 0.25, 0.5, 0.75, 1.0], k='all'): - """ - Trains a Boundary Ranker while maximizing hyper-parameters through cross-validation. - It uses the TRank-at-1 as an optimization metric. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param positive_range: Maximum rank to which label 1 is assigned in the binary classification setup. - Recommended value: 1. - @param folds: Number of folds to be used in cross-validation. - @param test_size: Percentage of the dataset to be used in testing. - Recommended values: 0.2, 0.25, 0.33 - @param losses: Loss functions to be considered. - Values available: hinge, log, modified_huber, squared_hinge, perceptron. - @param penalties: Regularization terms to be considered. - Values available: l2, l1, elasticnet. - @param alphas: Constants that multiplies the regularization term. - Recommended values: 0.0001, 0.001, 0.01, 0.1 - @param l1_ratios: Elastic net mixing parameters. - Recommended values: 0.05, 0.10, 0.15 - @param k: Number of best features to be selected through univariate feature selection. - If k='all', then no feature selection is performed. 
- """ - #Read victor corpus: - data = [] - f = open(victor_corpus) - for line in f: - data.append(line.strip().split('\t')) - f.close() - - #Create matrixes: - X = self.fe.calculateFeatures(victor_corpus) - Y = self.generateLabels(data, positive_range) - - #Select features: - self.feature_selector = SelectKBest(f_classif, k=k) - self.feature_selector.fit(X, Y) - X = self.feature_selector.transform(X) - - #Extract ranking problems: - firsts = [] - candidates = [] - Xsets = [] - Ysets = [] - index = -1 - for line in data: - fs = set([]) - cs = [] - Xs = [] - Ys = [] - for cand in line[3:len(line)]: - index += 1 - candd = cand.split(':') - rank = candd[0].strip() - word = candd[1].strip() - - cs.append(word) - Xs.append(X[index]) - Ys.append(Y[index]) - if rank=='1': - fs.add(word) - firsts.append(fs) - candidates.append(cs) - Xsets.append(Xs) - Ysets.append(Ys) - - #Create data splits: - datasets = [] - for i in range(0, folds): - Xtr, Xte, Ytr, Yte, Ftr, Fte, Ctr, Cte = train_test_split(Xsets, Ysets, firsts, candidates, test_size=test_size, random_state=i) - Xtra = [] - for matrix in Xtr: - Xtra += matrix - Xtea = [] - for matrix in Xte: - Xtea += matrix - Ytra = [] - for matrix in Ytr: - Ytra += matrix - datasets.append((Xtra, Ytra, Xte, Xtea, Fte, Cte)) - - #Get classifier with best parameters: - max_score = -1.0 - parameters = () - for l in losses: - for p in penalties: - for a in alphas: - for r in l1_ratios: - sum = 0.0 - sum_total = 0 - for dataset in datasets: - Xtra = dataset[0] - Ytra = dataset[1] - Xte = dataset[2] - Xtea = dataset[3] - Fte = dataset[4] - Cte = dataset[5] - - classifier = linear_model.SGDClassifier(loss=l, penalty=p, alpha=a, l1_ratio=r, epsilon=0.0001) - try: - classifier.fit(Xtra, Ytra) - t1 = self.getCrossValidationScore(classifier, Xtea, Xte, Fte, Cte) - sum += t1 - sum_total += 1 - except Exception: - pass - sum_total = max(1, sum_total) - if (sum/sum_total)>max_score: - max_score = sum - parameters = (l, p, a, r) - self.classifier = linear_model.SGDClassifier(loss=parameters[0], penalty=parameters[1], alpha=parameters[2], l1_ratio=parameters[3], epsilon=0.0001) - self.classifier.fit(X, Y) - - def getCrossValidationScore(self, classifier, Xtea, Xte, firsts, candidates): - distances = classifier.decision_function(Xtea) - index = -1 - corrects = 0 - total = 0 - for i in range(0, len(Xte)): - xset = Xte[i] - maxd = -999999 - for j in range(0, len(xset)): - index += 1 - distance = distances[index] - if distance>maxd: - maxd = distance - maxc = candidates[i][j] - if maxc in firsts[i]: - corrects += 1 - total += 1 - return float(corrects)/float(total) - - def getRankings(self, victor_corpus): - """ - Ranks candidates with respect to their simplicity. - Requires for the trainRanker function to be previously called so that a model can be trained. - - @param victor_corpus: Path to a testing corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: A list of ranked candidates for each instance in the VICTOR corpus, from simplest to most complex. 
- """ - - #Read victor corpus: - data = [] - f = open(victor_corpus) - for line in f: - data.append(line.strip().split('\t')) - f.close() - - #Create matrixes: - X = self.fe.calculateFeatures(victor_corpus) - - #Select features: - X = self.feature_selector.transform(X) - - #Get boundary distances: - distances = self.classifier.decision_function(X) - - #Get rankings: - result = [] - index = 0 - for i in range(0, len(data)): - line = data[i] - scores = {} - for subst in line[3:len(line)]: - word = subst.strip().split(':')[1].strip() - scores[word] = distances[index] - index += 1 - ranking_data = sorted(list(scores.keys()), key=scores.__getitem__, reverse=True) - result.append(ranking_data) - - #Return rankings: - return result - - def generateLabels(self, data, positive_range): - Y = [] - for line in data: - max_range = min(int(line[len(line)-1].split(':')[0].strip()), positive_range) - for i in range(3, len(line)): - rank_index = int(line[i].split(':')[0].strip()) - if rank_index<=max_range: - Y.append(1) - else: - Y.append(0) - return Y - -class SVMRanker: - - def __init__(self, fe, svmrank_path): - """ - Creates an instance of the SVMRanker class. - This ranker was introduced in Lexical Simplification by "Learning a Lexical Simplifier Using Wikipedia, Proceedings of the 2014 ACL, 2014". - - @param fe: A configured FeatureEstimator object. - @param svmrank_path: Path to SVM-Rank's root installation folder. - """ - - self.fe = fe - self.svmrank = svmrank_path - if not self.svmrank.endswith('/'): - self.svmrank += '/' - - def trainRankerWithCrossValidation(self, victor_corpus, folds, test_size, temp_folder, temp_id, Cs=['0.01', '0.001'], epsilons=[0.0001, 0.001], kernels=['0', '2', '3']): - """ - Trains a SVM Ranker while maximizing hyper-parameters through cross-validation. - It uses the TRank-at-1 as an optimization metric. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param folds: Number of folds to be used in cross-validation. - @param test_size: Percentage of the dataset to be used in testing. - Recommended values: 0.2, 0.25, 0.33 - @param temp_folder: Folder in which to save temporary files. - @param temp_id: ID to be used in the identification of temporary files. - @param Cs: Trade-offs between training error and margin. - Recommended values: 0.001, 0.01 - @param epsilons: Acceptable error margins. - Recommended values: 0.00001, 0.0001 - @param kernels: ID for the kernels to be considered. 
- Kernels available: - 0 - Linear - 1 - Polynomial - 2 - Radial Basis Function - 3 - Sigmoid - """ - #Read victor corpus: - data = [] - f = open(victor_corpus) - for line in f: - data.append(line.strip().split('\t')) - f.close() - - #Create matrixes: - X = self.fe.calculateFeatures(victor_corpus) - X = normalize(X, axis=0) - #X = self.toSVMRankFormat(data, X) - - #Extract ranking problems: - firsts = [] - candidates = [] - Xsets = [] - index = -1 - for line in data: - fs = set([]) - cs = [] - Xs = [] - for cand in line[3:len(line)]: - index += 1 - candd = cand.split(':') - rank = candd[0].strip() - word = candd[1].strip() - - cs.append(word) - Xs.append(X[index]) - if rank=='1': - fs.add(word) - firsts.append(fs) - candidates.append(cs) - Xsets.append(Xs) - - #Create data splits: - datasets = [] - for i in range(0, folds): - Xtr, Xte, Ftr, Fte, Ctr, Cte, Dtr, Dte = train_test_split(Xsets, firsts, candidates, data, test_size=test_size, random_state=i) - Xtra = [] - for matrix in Xtr: - Xtra += matrix - Xtra_path = temp_folder + '/' + str(temp_id) + '_' + str(i) + '_training_features_file.txt' - self.fromMatrixToFile(Dtr, Xtra, Xtra_path) - - Xtea = [] - for matrix in Xte: - Xtea += matrix - Xtea_path = temp_folder + '/' + str(temp_id) + '_' + str(i) + '_testing_features_file.txt' - self.fromMatrixToFile(Dte, Xtea, Xtea_path) - datasets.append((Xtra_path, Xte, Xtea_path, Fte, Cte)) - - #Get classifier with best parameters: - max_score = -1.0 - parameters = () - for C in Cs: - for k in kernels: - for e in epsilons: - sum = 0.0 - sum_total = 0 - for dataset in datasets: - Xtra_path = dataset[0] - Xte = dataset[1] - Xtea_path = dataset[2] - Fte = dataset[3] - Cte = dataset[4] - - model_path = temp_folder + '/' + str(temp_id) + '_' + str(i) + '_model_file.txt' - scores_path = temp_folder + '/' + str(temp_id) + '_' + str(i) + '_scores_file.txt' - self.getTrainingModel(Xtra_path, C, e, k, model_path) - self.getScoresFile(Xtea_path, model_path, scores_path) - - t1 = self.getCrossValidationScore(scores_path, Xte, Fte, Cte) - sum += t1 - sum_total += 1 - sum_total = max(1, sum_total) - if (sum/sum_total)>max_score: - max_score = sum - parameters = (C, k, e) - return parameters - - def getCrossValidationScore(self, scores_path, Xte, firsts, candidates): - scores = [str(value.strip()) for value in open(scores_path)] - index = -1 - corrects = 0 - total = 0 - for i in range(0, len(Xte)): - xset = Xte[i] - mind = 999999 - minc = '' - for j in range(0, len(xset)): - index += 1 - distance = scores[index] - if distancetgtvalue: - scoremap[cand] += 1.0 - else: - print('Feature has an invalid Complexity/Simplicity identifier!') - - #Filter candidates: - final_candidates = [] - total_features = float(len(self.fe.identifiers)) - for cand in scoremap: - proportion = scoremap[cand]/total_features - if proportion>=minimum_proportion: - final_candidates.append(cand) - selected_substitutions.append(final_candidates) - - lexf.close() - return selected_substitutions - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. 
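The toVictorFormat helpers that recur in these classes all serialise the same tab-separated layout: sentence, target word, the target's token index, then one rank:candidate field per substitution. A minimal writer for that layout (the name write_victor and the sample path are illustrative):

    def write_victor(instances, output_path):
        # instances: (sentence, target, head_index, [(rank, candidate), ...]) tuples
        with open(output_path, 'w') as out:
            for sentence, target, head, candidates in instances:
                fields = ['{}:{}'.format(rank, cand) for rank, cand in candidates]
                out.write('\t'.join([sentence, target, str(head)] + fields) + '\n')

    write_victor([('He perched on the wall', 'perched', 1,
                   [(1, 'sat'), (2, 'roosted')])], 'example.victor.txt')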
- """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class SVMRankSelector: - - def __init__(self, svm_ranker): - """ - Creates an instance of the SVMRankSelector class. - - @param svm_ranker: An instance of the SVMRanker class. - """ - self.ranker = svm_ranker - - def trainSelector(self, tr_victor_corpus, tr_features_file, model_file, c, epsilon, kernel): - """ - Trains a SVM Ranker according to the parameters provided. - - @param tr_victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param tr_features_file: File in which to save the training features file. - @param model_file: File in which to save the trained model. - @param c: Trade-off between training error and margin. - Recommended values: 0.001, 0.01 - @param epsilon: Acceptable error margin. - Recommended values: 0.00001, 0.0001 - @param kernel: ID for the kernel to be used. - Kernels available: - 0 - Linear - 1 - Polynomial - 2 - Radial Basis Function - 3 - Sigmoid - """ - self.ranker.getFeaturesFile(tr_victor_corpus, tr_features_file) - self.ranker.getTrainingModel(tr_features_file, c, epsilon, kernel, model_file) - self.model = model_file - - def trainSelectorWithCrossValidation(self, victor_corpus, features_file, model_file, folds, test_size, temp_folder, temp_id, Cs=['0.01', '0.001'], epsilons=[0.0001, 0.001], kernels=['0', '2', '3']): - """ - Trains a SVM Selector while maximizing hyper-parameters through cross-validation. - It uses the TRank-at-1 as an optimization metric. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param features_file: File in which to save the training features file. - @param model_file: File in which to save the trained model. - @param folds: Number of folds to be used in cross-validation. - @param test_size: Percentage of the dataset to be used in testing. - Recommended values: 0.2, 0.25, 0.33 - @param temp_folder: Folder in which to save temporary files. - @param temp_id: ID to be used in the identification of temporary files. - @param Cs: Trade-offs between training error and margin. - Recommended values: 0.001, 0.01 - @param epsilons: Acceptable error margins. - Recommended values: 0.00001, 0.0001 - @param kernels: ID for the kernels to be considered. - Kernels available: - 0 - Linear - 1 - Polynomial - 2 - Radial Basis Function - 3 - Sigmoid - """ - parameters = self.ranker.trainRankerWithCrossValidation(victor_corpus, folds, test_size, temp_folder, temp_id, Cs=Cs, epsilons=epsilons, kernels=kernels) - self.ranker.getFeaturesFile(victor_corpus, features_file) - self.ranker.getTrainingModel(features_file, parameters[0], parameters[2], parameters[1], model_file) - self.model = model_file - - def selectCandidates(self, substitutions, victor_corpus, features_file, scores_file, temp_file, proportion, proportion_type='percentage'): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. 
- It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. - Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param features_file: File in which to save the testing features file. - @param scores_file: File in which to save the scores file. - User must have the privilege to delete such file without administrator privileges. - @param temp_file: File in which to save a temporary victor corpus. - The file is removed after the algorithm is concluded. - @param proportion: Proportion of substitutions to keep. - If proportion_type is set to "percentage", then this parameter must be a floating point number between 0 and 1. - If proportion_type is set to "integer", then this parameter must be an integer number. - @param proportion_type: Type of proportion to be kept. - Values supported: percentage, integer. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. - """ - void = VoidSelector() - selected_void = void.selectCandidates(substitutions, victor_corpus) - void.toVictorFormat(victor_corpus, selected_void, temp_file) - - self.ranker.getFeaturesFile(temp_file, features_file) - self.ranker.getScoresFile(features_file, self.model, scores_file) - rankings = self.getRankings(temp_file, features_file, scores_file) - - selected_substitutions = [] - - lexf = open(victor_corpus) - index = -1 - for line in lexf: - index += 1 - - selected_candidates = None - if proportion_type == 'percentage': - toselect = None - if proportion > 1.0: - toselect = 1.0 - else: - toselect = proportion - selected_candidates = rankings[index][0:max(1, int(toselect*float(len(rankings[index]))))] - else: - toselect = None - if proportion < 1: - toselect = 1 - elif proportion > len(rankings[index]): - toselect = len(rankings[index]) - else: - toselect = proportion - selected_candidates = rankings[index][0:toselect] - - selected_substitutions.append(selected_candidates) - lexf.close() - - #Delete temp_file: - os.system('rm ' + temp_file) - return selected_substitutions - - def getRankings(self, victor_corpus, features_file, scores_file): - #Read features file: - f = open(features_file) - data = [] - for line in f: - data.append(line.strip().split(' ')) - f.close() - - #Read scores file: - f = open(scores_file) - scores = [] - for line in f: - scores.append(float(line.strip())) - f.close() - - #Combine data: - ranking_data = {} - index = 0 - for line in data: - id = int(line[1].strip().split(':')[1].strip()) - starti = 0 - while line[starti]!='#': - starti += 1 - word = '' - for i in range(starti+1, len(line)): - word += line[i] + ' ' - word = word.strip() - score = scores[index] - index += 1 - if id in ranking_data: - ranking_data[id][word] = score - else: - ranking_data[id] = {word:score} - - #Produce rankings: - result = [] - f = open(victor_corpus) - id = 0 - for line in f: - id += 1 - candidates = [] - if id in ranking_data: - candidates = list(ranking_data[id].keys()) - candidates = sorted(candidates, key=ranking_data[id].__getitem__, reverse=False) - result.append(candidates) - - #Return rankings: - return result - - def toVictorFormat(self, victor_corpus, substitutions, 
output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. - """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class SVMBoundarySelector: - - def __init__(self, svm_boundary_ranker): - """ - Creates an instance of the SVMBoundarySelector class. - - @param svm_boundary_ranker: An instance of the BoundaryRanker class. - """ - self.ranker = svm_boundary_ranker - - def trainSelector(self, victor_corpus, positive_range, C, kernel, degree, gamma, coef0, k='all'): - """ - Trains a Boundary Ranker according to the parameters provided. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param positive_range: Maximum rank to which label 1 is assigned in the binary classification setup. - Recommended value: 1. - @param C: Penalty parameter. - Recommended values: 0.1, 1, 10. - @param kernel: Kernel function to be used. - Supported values: 'linear', 'poly', 'rbf', 'sigmoid'. - @param degree: Degree of the polynomial kernel. - Recommended values: 2, 3. - @param gamma: Kernel coefficient. - Recommended values: 0.01, 0.1, 1. - @param coef0: Independent term value. - Recommended values: 0, 1. - @param k: Number of best features to be selected through univariate feature selection. - If k='all', then no feature selection is performed. - """ - self.ranker.trainRanker(victor_corpus, positive_range, C, kernel, degree, gamma, coef0, k=k) - - def trainSelectorWithCrossValidation(self, victor_corpus, positive_range, folds, test_size, Cs=[0.1, 1, 10], kernels=['linear', 'rbf', 'poly', 'sigmoid'], degrees=[2], gammas=[0.01, 0.1, 1], coef0s=[0, 1], k='all'): - """ - Trains a Boundary Selector while maximizing hyper-parameters through cross-validation. - It uses the TRank-at-1 as an optimization metric. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param positive_range: Maximum rank to which label 1 is assigned in the binary classification setup. - Recommended value: 1. - @param folds: Number of folds to be used in cross-validation. - @param test_size: Percentage of the dataset to be used in testing. - Recommended values: 0.2, 0.25, 0.33 - @param Cs: Penalty parameters. - Recommended values: 0.1, 1, 10. - @param kernels: Kernel functions to be used. - Supported values: 'linear', 'poly', 'rbf', 'sigmoid'. - @param degrees: Degrees of the polynomial kernel. - Recommended values: 2, 3. - @param gammas: Kernel coefficients. - Recommended values: 0.01, 0.1, 1. - @param coef0s: Independent term values. - Recommended values: 0, 1. - @param k: Number of best features to be selected through univariate feature selection. 
- If k='all', then no feature selection is performed. - """ - self.ranker.trainRankerWithCrossValidation(victor_corpus, positive_range, folds, test_size, Cs=Cs, kernels=kernels, degrees=degrees, gammas=gammas, coef0s=coef0s, k=k) - - def selectCandidates(self, substitutions, victor_corpus, temp_file, proportion, proportion_type='percentage'): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. - It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. - Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - User must have the privilege to delete such file without administrator privileges. - @param temp_file: File in which to save a temporary victor corpus. - The file is removed after the algorithm is concluded. - @param proportion: Proportion of substitutions to keep. - If proportion_type is set to "percentage", then this parameter must be a floating point number between 0 and 1. - If proportion_type is set to "integer", then this parameter must be an integer number. - @param proportion_type: Type of proportion to be kept. - Values supported: percentage, integer. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. - """ - void = VoidSelector() - selected_void = void.selectCandidates(substitutions, victor_corpus) - void.toVictorFormat(victor_corpus, selected_void, temp_file) - - rankings = self.ranker.getRankings(temp_file) - - selected_substitutions = [] - - lexf = open(victor_corpus) - index = -1 - for line in lexf: - index += 1 - - selected_candidates = None - if proportion_type == 'percentage': - toselect = None - if proportion > 1.0: - toselect = 1.0 - else: - toselect = proportion - selected_candidates = rankings[index][0:max(1, int(toselect*float(len(rankings[index]))))] - else: - toselect = None - if proportion < 1: - toselect = 1 - elif proportion > len(rankings[index]): - toselect = len(rankings[index]) - else: - toselect = proportion - selected_candidates = rankings[index][0:toselect] - - selected_substitutions.append(selected_candidates) - lexf.close() - - #Delete temp_file: - os.system('rm ' + temp_file) - return selected_substitutions - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. 
- """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class BoundarySelector: - - def __init__(self, boundary_ranker): - """ - Creates an instance of the BoundarySelector class. - - @param boundary_ranker: An instance of the BoundaryRanker class. - """ - self.ranker = boundary_ranker - - def trainSelector(self, victor_corpus, positive_range, loss, penalty, alpha, l1_ratio, epsilon, k='all'): - """ - Trains a Boundary Ranker according to the parameters provided. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param positive_range: Maximum rank to which label 1 is assigned in the binary classification setup. - Recommended value: 1. - @param loss: Loss function to be used. - Values available: hinge, log, modified_huber, squared_hinge, perceptron. - @param penalty: Regularization term to be used. - Values available: l2, l1, elasticnet. - @param alpha: Constant that multiplies the regularization term. - Recommended values: 0.0001, 0.001, 0.01, 0.1 - @param l1_ratio: Elastic net mixing parameter. - Recommended values: 0.05, 0.10, 0.15 - @param epsilon: Acceptable error margin. - Recommended values: 0.0001, 0.001 - @param k: Number of best features to be selected through univariate feature selection. - If k='all', then no feature selection is performed. - """ - self.ranker.trainRanker(victor_corpus, positive_range, loss, penalty, alpha, l1_ratio, epsilon, k=k) - - def trainSelectorWithCrossValidation(self, victor_corpus, positive_range, folds, test_size, losses=['hinge', 'modified_huber'], penalties=['elasticnet'], alphas=[0.0001, 0.001, 0.01], l1_ratios=[0.0, 0.15, 0.25, 0.5, 0.75, 1.0], k='all'): - """ - Trains a Boundary Selector while maximizing hyper-parameters through cross-validation. - It uses the TRank-at-1 as an optimization metric. - - @param victor_corpus: Path to a training corpus in VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param positive_range: Maximum rank to which label 1 is assigned in the binary classification setup. - Recommended value: 1. - @param folds: Number of folds to be used in cross-validation. - @param test_size: Percentage of the dataset to be used in testing. - Recommended values: 0.2, 0.25, 0.33 - @param losses: Loss functions to be considered. - Values available: hinge, log, modified_huber, squared_hinge, perceptron. - @param penalties: Regularization terms to be considered. - Values available: l2, l1, elasticnet. - @param alphas: Constants that multiplies the regularization term. - Recommended values: 0.0001, 0.001, 0.01, 0.1 - @param l1_ratios: Elastic net mixing parameters. - Recommended values: 0.05, 0.10, 0.15 - @param k: Number of best features to be selected through univariate feature selection. - If k='all', then no feature selection is performed. 
- """ - self.ranker.trainRankerWithCrossValidation(victor_corpus, positive_range, folds, test_size, losses=losses, penalties=penalties, alphas=alphas, l1_ratios=l1_ratios, k=k) - - def selectCandidates(self, substitutions, victor_corpus, temp_file, proportion, proportion_type='percentage'): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. - It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. - Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - User must have the privilege to delete such file without administrator privileges. - @param temp_file: File in which to save a temporary victor corpus. - The file is removed after the algorithm is concluded. - @param proportion: Proportion of substitutions to keep. - If proportion_type is set to "percentage", then this parameter must be a floating point number between 0 and 1. - If proportion_type is set to "integer", then this parameter must be an integer number. - @param proportion_type: Type of proportion to be kept. - Values supported: percentage, integer. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. - """ - void = VoidSelector() - selected_void = void.selectCandidates(substitutions, victor_corpus) - void.toVictorFormat(victor_corpus, selected_void, temp_file) - - rankings = self.ranker.getRankings(temp_file) - - selected_substitutions = [] - - lexf = open(victor_corpus) - index = -1 - for line in lexf: - index += 1 - - selected_candidates = None - if proportion_type == 'percentage': - toselect = None - if proportion > 1.0: - toselect = 1.0 - else: - toselect = proportion - selected_candidates = rankings[index][0:max(1, int(toselect*float(len(rankings[index]))))] - else: - toselect = None - if proportion < 1: - toselect = 1 - elif proportion > len(rankings[index]): - toselect = len(rankings[index]) - else: - toselect = proportion - selected_candidates = rankings[index][0:toselect] - - selected_substitutions.append(selected_candidates) - lexf.close() - - #Delete temp_file: - os.system('rm ' + temp_file) - return selected_substitutions - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. 
- """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class BelderSelector: - - def __init__(self, clusters): - """ - Creates an instance of the BelderSelector class. - - @param clusters: Path to a file containing clusters of words. - For instructions on how to create the file, please refer to the LEXenstein Manual. - """ - self.clusters_to_words, self.words_to_clusters = self.getClusterData(clusters) - - def selectCandidates(self, substitutions, victor_corpus): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. - It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. - Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. - """ - selected_substitutions = [] - - substitution_candidates = [] - if isinstance(substitutions, list): - substitution_candidates = substitutions - elif isinstance(substitutions, dict): - void = VoidSelector() - substitution_candidates = void.selectCandidates(substitutions, victor_corpus) - else: - print('ERROR: Substitutions are neither a dictionary or a list!') - return selected_substitutions - - c = -1 - lexf = open(victor_corpus) - for line in lexf: - c += 1 - data = line.strip().split('\t') - sent = data[0].strip() - target = data[1].strip() - - selected_candidates = set([]) - if target in self.words_to_clusters: - cluster = self.words_to_clusters[target] - candidates = set(substitution_candidates[c]) - selected_candidates = candidates.intersection(self.clusters_to_words[cluster]) - - selected_substitutions.append(selected_candidates) - lexf.close() - return selected_substitutions - - def getClusterData(self, clusters): - cw = {} - wc = {} - f = open(clusters) - for line in f: - data = line.strip().split('\t') - cluster = data[0].strip() - word = data[1].strip() - - if cluster in cw: - cw[cluster].add(word) - else: - cw[cluster] = set([word]) - - wc[word] = cluster - f.close() - return cw, wc - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. 
- """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class POSProbSelector: - - def __init__(self, condprob_model, pos_model, stanford_tagger, java_path): - """ - Creates a POSProbSelector instance. - It selects only the candidate substitutions of which the most likely POS tag is that of the target word. - - @param condprob_model: Path to a binary conditional probability model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - """ - os.environ['JAVAHOME'] = java_path - self.tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.model = pickle.load(open(condprob_model, 'rb')) - - def selectCandidates(self, substitutions, victor_corpus): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. - It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. - Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. 
- """ - selected_substitutions = [] - - substitution_candidates = [] - if isinstance(substitutions, list): - substitution_candidates = substitutions - elif isinstance(substitutions, dict): - void = VoidSelector() - substitution_candidates = void.selectCandidates(substitutions, victor_corpus) - else: - print('ERROR: Substitutions are neither a dictionary or a list!') - return selected_substitutions - - #Read VICTOR corpus: - lexf = open(victor_corpus) - sents = [] - targets = [] - heads = [] - c = -1 - for line in lexf: - c += 1 - data = line.strip().split('\t') - sent = data[0].strip().split(' ') - target = data[1].strip() - head = int(data[2].strip()) - sents.append(sent) - targets.append(target) - heads.append(head) - lexf.close() - - #Tag sentences: - tagged_sents = self.tagger.tag_sents(sents) - - for i in range(0, len(sents)): - target = targets[i] - head = heads[i] - target_pos = str(tagged_sents[i][head][1]) - - candidates = [] - candidates = set(substitution_candidates[i]) - candidates = self.getCandidatesWithSamePOS(candidates, target_pos) - - selected_substitutions.append(candidates) - lexf.close() - return selected_substitutions - - def getTargetPOS(self, sent, target, head): - pos_data = [] - try: - pos_data = nltk.pos_tag(sent) - return pos_data[head][1] - except UnicodeDecodeError: - try: - pos_data = nltk.pos_tag(target) - return pos_data[0][1] - except UnicodeDecodeError: - return 'None' - - def getCandidatesWithSamePOS(self, candidates, target_pos): - result = set([]) - for candidate in candidates: - cand_tag = None - try: - cand_tag = self.model[candidate].max() - except Exception: - pass - if cand_tag and cand_tag==target_pos: - result.add(candidate) - return result - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. - """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class AluisioSelector: - - def __init__(self, condprob_model, pos_model, stanford_tagger, java_path): - """ - Creates an AluisioSelector instance. - It selects only candidate substitutions that can assume the same POS tag of the target word. - - @param condprob_model: Path to a binary conditional probability model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. 
- Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - """ - os.environ['JAVAHOME'] = java_path - self.tagger = StanfordPOSTagger(pos_model, stanford_tagger) - self.model = pickle.load(open(condprob_model, 'rb')) - - def selectCandidates(self, substitutions, victor_corpus): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. - It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. - Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. - """ - selected_substitutions = [] - - substitution_candidates = [] - if isinstance(substitutions, list): - substitution_candidates = substitutions - elif isinstance(substitutions, dict): - void = VoidSelector() - substitution_candidates = void.selectCandidates(substitutions, victor_corpus) - else: - print('ERROR: Substitutions are neither a dictionary or a list!') - return selected_substitutions - - #Read VICTOR corpus: - lexf = open(victor_corpus) - sents = [] - targets = [] - heads = [] - c = -1 - for line in lexf: - c += 1 - data = line.strip().split('\t') - sent = data[0].strip().split(' ') - target = data[1].strip() - head = int(data[2].strip()) - sents.append(sent) - targets.append(target) - heads.append(head) - lexf.close() - - #Tag sentences: - tagged_sents = self.tagger.tag_sents(sents) - - for i in range(0, len(sents)): - target = targets[i] - head = heads[i] - target_pos = str(tagged_sents[i][head][1]) - - candidates = [] - candidates = set(substitution_candidates[i]) - candidates = self.getCandidatesWithSamePOS(candidates, target_pos) - - selected_substitutions.append(candidates) - lexf.close() - return selected_substitutions - - def getTargetPOS(self, sent, target, head): - pos_data = [] - try: - pos_data = nltk.pos_tag(sent) - return pos_data[head][1] - except UnicodeDecodeError: - try: - pos_data = nltk.pos_tag(target) - return pos_data[0][1] - except UnicodeDecodeError: - return 'None' - - def getCandidatesWithSamePOS(self, candidates, target_pos): - result = set([]) - for candidate in candidates: - tag_freq = 0 - try: - tag_freq = self.model[candidate].prob(target_pos) - except Exception: - pass - if tag_freq>0: - result.add(candidate) - return result - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. 
- """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class VoidSelector: - - def selectCandidates(self, substitutions, victor_corpus): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. - It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. - Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. - """ - selected_substitutions = [] - - if isinstance(substitutions, list): - return substitutions - - lexf = open(victor_corpus) - for line in lexf: - data = line.strip().split('\t') - sent = data[0].strip() - target = data[1].strip() - - candidates = [] - if target in substitutions: - candidates = substitutions[target] - - selected_substitutions.append(candidates) - lexf.close() - return selected_substitutions - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. - """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class BiranSelector: - - def __init__(self, cooc_model): - """ - Creates an instance of the BiranSelector class. - - @param cooc_model: Path to a word co-occurrence model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - """ - self.model = self.getModel(cooc_model) - - def selectCandidates(self, substitutions, victor_corpus, common_distance=0.01, candidate_distance=0.9): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. - It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. 
- Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param common_distance: The cutoff minimum distance from the sentence's co-occurrence vector and the common vector between the target complex word and the candidate. - We recommend using very small values, such as 0.01, or even 0.0. - @param candidate_distance: The cutoff maximum distance from the sentence's co-occurrence vector and the candidate vector. - We recommend using values close to 1.0, such as 0.8, or 0.9. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. - """ - selected_substitutions = [] - - substitution_candidates = [] - if isinstance(substitutions, list): - substitution_candidates = substitutions - elif isinstance(substitutions, dict): - void = VoidSelector() - substitution_candidates = void.selectCandidates(substitutions, victor_corpus) - else: - print('ERROR: Substitutions are neither a dictionary or a list!') - return selected_substitutions - - c = -1 - lexf = open(victor_corpus) - for line in lexf: - c += 1 - data = line.strip().split('\t') - sent = data[0].strip() - target = data[1].strip() - head = int(data[2].strip()) - - target_vec = self.getSentVec(sent, head) - - candidates = set(substitution_candidates[c]) - - final_candidates = set([]) - for candidate_raw in candidates: - candidate = str(candidate_raw) - candidate_vec = self.getVec(candidate) - candidate_dist = 1.0 - try: - candidate_dist = self.getCosine(candidate_vec, target_vec) - except ValueError: - candidate_dist = 1.0 - - common_vec = self.getCommonVec(target, candidate) - common_dist = 0.0 - try: - common_dist = self.getCosine(common_vec, target_vec) - except ValueError: - common_dist = 0.0 - if common_dist>=common_distance and candidate_dist<=candidate_distance: - final_candidates.add(candidate) - selected_substitutions.append(final_candidates) - lexf.close() - return selected_substitutions - - def getModel(self, path): - result = {} - f = open(path) - for line in f: - data = line.strip().split('\t') - target = data[0].strip() - coocs = data[1:len(data)] - result[target] = {} - for cooc in coocs: - coocd = cooc.strip().split(':') - word = coocd[0].strip() - count = int(coocd[1].strip()) - result[target][word] = count - return result - - def getCosine(self, vec1, vec2): - all_keys = sorted(list(set(vec1.keys()).union(set(vec2.keys())))) - v1 = [] - v2 = [] - for k in all_keys: - if k in vec1: - v1.append(vec1[k]) - else: - v1.append(0.0) - if k in vec2: - v2.append(vec2[k]) - else: - v2.append(0.0) - return cosine(v1, v2) - - def getCommonVec(self, target, candidate): - if target not in list(self.model.keys()) or candidate not in self.model: - return {} - else: - result = {} - common_keys = set(self.model[target].keys()).intersection(set(self.model[candidate].keys())) - for k in common_keys: - if self.model[target][k]>self.model[candidate][k]: - result[k] = self.model[candidate][k] - else: - result[k] = self.model[target][k] - return result - - def isNumeral(self, text): - try: - num = float(text.strip()) - return True - except ValueError: - return False - - def getSentVec(self, sent, head): - coocs = {} - tokens = sent.strip().split(' ') - left = max(0, head-5) - right = min(len(tokens), head+6) - for j in range(left, right): - if j!=head: - cooc = tokens[j] - if self.isNumeral(cooc): - cooc = '#NUMERAL#' - if cooc not in coocs: - 
coocs[cooc] = 1 - else: - coocs[cooc] += 1 - return coocs - - def getVec(self, word): - result = {} - try: - result = self.model[word] - except KeyError: - try: - result = self.model[word.lower()] - except KeyError: - result = {} - return result - - def getCandidateSentence(self, sentence, candidate, head): - tokens = sentence.strip().split(' ') - result = '' - for i in range(0, head): - result += tokens[i] + ' ' - result += candidate + ' ' - for i in range(head+1, len(tokens)): - result += tokens[i] + ' ' - return result.strip() - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. - """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class WordVectorSelector: - - def __init__(self, vector_model, pos_model, stanford_tagger, java_path, pos_type='none'): - """ - Creates an instance of the WordVectorSelector class. - - @param vector_model: Path to a binary word vector model. - For instructions on how to create the model, please refer to the LEXenstein Manual. - @param pos_model: Path to a POS tagging model for the Stanford POS Tagger. - The models can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param stanford_tagger: Path to the "stanford-postagger.jar" file. - The tagger can be downloaded from the following link: http://nlp.stanford.edu/software/tagger.shtml - @param java_path: Path to the system's "java" executable. - Can be commonly found in "/usr/bin/java" in Unix/Linux systems, or in "C:/Program Files/Java/jdk_version/java.exe" in Windows systems. - @param pos_type: The type of POS tags with which the model's words are annotated, if any. - Values supported: none, treebank, paetzold - """ - self.model = gensim.models.KeyedVectors.load_word2vec_format(vector_model, binary=True) - self.pos_type = pos_type - os.environ['JAVAHOME'] = java_path - self.tagger = StanfordPOSTagger(pos_model, stanford_tagger) - - def selectCandidates(self, substitutions, victor_corpus, proportion=1.0, proportion_type='percentage', stop_words_file=None, window=99999, onlyInformative=False, keepTarget=False, onePerWord=False): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. - It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. - Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. 
- For more information about the file's format, refer to the LEXenstein Manual. - @param proportion: Percentage of substitutions to keep. - If proportion_type is set to "percentage", then this parameter must be a floating point number between 0 and 1. - If proportion_type is set to "integer", then this parameter must be an integer number. - @param proportion_type: Type of proportion to be kept. - Values supported: percentage, integer. - @param stop_words_file: Path to the file containing stop words of the desired language. - The file must contain one stop word per line. - @param window: Number of tokens around the target complex sentence to consider as its context. - @param onlyInformative: If True, only content words are considered as part of the complex word's context, such as nouns, verbs, adjectives and adverbs. - @param keepTarget: If True, the complex target word is also included as part of its context. - @param onePerWord: If True, a word in the complex word's context can only contribute once to its resulting word vector. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. - """ - #Initialize selected substitutions: - selected_substitutions = [] - - #Read stop words: - stop_words = set([]) - if stop_words_file != None: - stop_words = set([word.strip() for word in open(stop_words_file)]) - - #Configure input: - substitution_candidates = [] - if isinstance(substitutions, list): - substitution_candidates = substitutions - elif isinstance(substitutions, dict): - void = VoidSelector() - substitution_candidates = void.selectCandidates(substitutions, victor_corpus) - else: - print('ERROR: Substitutions are neither a dictionary or a list!') - return selected_substitutions - - #Parse sentences: - lexf = open(victor_corpus) - sents = [line.strip().split('\t')[0].strip().split(' ') for line in lexf] - lexf.close() - tagged_sents = self.tagger.tag_sents(sents) - - #Transform them to the right format: - if self.pos_type=='paetzold': - transformed = [] - for sent in tagged_sents: - tokens = [] - for token in sent: - tokens.append((token[0], getGeneralisedPOS(token[1]))) - transformed.append(tokens) - tagged_sents = transformed - - #Rank candidates: - c = -1 - lexf = open(victor_corpus) - for line in lexf: - c += 1 - data = line.strip().split('\t') - sent = data[0].strip() - target = data[1].strip() - head = int(data[2].strip()) - pos_tags = tagged_sents[c] - target_pos = pos_tags[head][1] - - target_vec = self.getSentVec(sent, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tags) - candidates = substitution_candidates[c] - - candidate_dists = {} - for candidate in candidates: - candidate_vec = self.getWordVec(candidate, target_pos) - try: - candidate_dists[candidate] = cosine(candidate_vec, target_vec) - except ValueError: - candidate_dists = candidate_dists - - final_candidates = self.getFinalCandidates(candidate_dists, proportion, proportion_type) - - selected_substitutions.append(final_candidates) - lexf.close() - return selected_substitutions - - def getSentVec(self, sentence, head, stop_words, window, onlyInformative, keepTarget, onePerWord, pos_tokens): - informative_tags = set([]) - if onlyInformative: - if self.pos_type=='treebank': - informative_tags = set(['NN', 'NNS', 'JJ', 'JJS', 'JJR', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'RB', 'RBR', 'RBS']) - if self.pos_type=='paetzold': - informative_tags = set(['N', 'V', 'J', 'R']) - - tokens = sentence.split(' ') - - valid_tokens = [] - if 
keepTarget: - valid = tokens[head].strip() - if self.pos_type!='none': - valid += '|||' + pos_tokens[head][1] - valid_tokens.append(valid) - - if head>0: - for i in range(max(0, head-window), head): - if len(informative_tags)==0 or pos_tokens[i][1].lower().strip() in informative_tags: - if tokens[i] not in stop_words: - valid = tokens[i] - if self.pos_type!='none': - valid += '|||' + pos_tokens[i][1] - valid_tokens.append(valid) - - if head=len(result): - return result - else: - return result[0:max(1, int(proportion))] - else: - print('Unrecognized proportion type.') - return result - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. - """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -class WSDSelector: - - def __init__(self, method): - """ - Creates an instance of the WSDSelector class. - - @param method: Type of Word Sense Disambiguation algorithm to use. - Options available: - lesk - Original lesk algorithm. - path - Path similarity algorithm. - random - Random sense from WordNet. - first - First sense from WordNet. - """ - - if method == 'lesk': - self.WSDfunction = self.getLeskSense - elif method == 'path': - self.WSDfunction = self.getPathSense - elif method == 'random': - self.WSDfunction = self.getRandomSense - elif method == 'first': - self.WSDfunction = self.getFirstSense - else: - self.WSDfunction = self.getLeskSense - - def selectCandidates(self, substitutions, victor_corpus): - """ - Selects which candidates can replace the target complex words in each instance of a VICTOR corpus. - - @param substitutions: Candidate substitutions to be filtered. - It can be in two formats: - A dictionary produced by a Substitution Generator linking complex words to a set of candidate substitutions. - Example: substitutions['perched'] = {'sat', 'roosted'} - A list of candidate substitutions selected for the "victor_corpus" dataset by a Substitution Selector. - Example: [['sat', 'roosted'], ['easy', 'uncomplicated']] - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @return: Returns a vector of size N, containing a set of selected substitutions for each instance in the VICTOR corpus. 
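
The WordVectorSelector above ranks candidate substitutions by cosine distance to a context vector and then keeps either a percentage or a fixed number of them. A rough sketch of that final selection step follows; it is an approximation reconstructed from the surrounding code, not a verbatim copy of the removed LEXenstein implementation:

```python
def get_final_candidates(candidate_dists, proportion, proportion_type="percentage"):
    """Keep the candidates closest to the context vector.

    candidate_dists maps candidate -> cosine distance (smaller = closer);
    this approximates the selection step of the removed WordVectorSelector.
    """
    ranked = sorted(candidate_dists, key=candidate_dists.get)  # closest first
    if proportion_type == "percentage":
        return ranked[:max(1, int(proportion * len(ranked)))]
    elif proportion_type == "integer":
        return ranked[:max(1, min(int(proportion), len(ranked)))]
    else:
        print("Unrecognized proportion type.")
        return ranked
```
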
- """ - - selected_substitutions = [] - - substitution_candidates = [] - if isinstance(substitutions, list): - substitution_candidates = substitutions - elif isinstance(substitutions, dict): - void = VoidSelector() - substitution_candidates = void.selectCandidates(substitutions, victor_corpus) - else: - print('ERROR: Substitutions are neither a dictionary or a list!') - return selected_substitutions - - c = -1 - lexf = open(victor_corpus) - for line in lexf: - c += 1 - data = line.strip().split('\t') - sent = data[0].strip() - target = data[1].strip() - head = int(data[2].strip()) - - target_sense = self.WSDfunction.__call__(sent, target) - - candidates = substitution_candidates[c] - - selected_candidates = set([]) - for candidate in candidates: - candidate_sense = None - try: - unic = str(candidate) - candidate_sense = self.WSDfunction.__call__(self.getCandidateSentence(sent, candidate, head), candidate) - except UnicodeDecodeError: - candidate_sense = None - if target_sense or not candidate_sense: - if not candidate_sense or candidate_sense==target_sense: - selected_candidates.add(candidate) - selected_substitutions.append(selected_candidates) - lexf.close() - return selected_substitutions - - def getLeskSense(self, sentence, target): - try: - result = pywsd.lesk.original_lesk(sentence, target) - return result - except IndexError: - return None - - def getPathSense(self, sentence, target): - try: - result = pywsd.similarity.max_similarity(sentence, target, option="path", best=False) - return result - except IndexError: - return None - - def getRandomSense(self, sentence, target): - try: - result = pywsd.baseline.random_sense(target) - return result - except IndexError: - return None - - def getFirstSense(self, sentence, target): - try: - result = pywsd.baseline.first_sense(target) - return result - except IndexError: - return None - - def getMaxLemmaSense(self, sentence, target): - try: - result = pywsd.baseline.max_lemma_count(target) - return result - except IndexError: - return None - - def getCandidateSentence(self, sentence, candidate, head): - tokens = sentence.strip().split(' ') - result = '' - for i in range(0, head): - result += tokens[i] + ' ' - result += candidate + ' ' - for i in range(head+1, len(tokens)): - result += tokens[i] + ' ' - return result.strip() - - def toVictorFormat(self, victor_corpus, substitutions, output_path, addTargetAsCandidate=False): - """ - Saves a set of selected substitutions in a file in VICTOR format. - - @param victor_corpus: Path to the corpus in the VICTOR format to which the substitutions were selected. - @param substitutions: The vector of substitutions selected for the VICTOR corpus. - @param output_path: The path in which to save the resulting VICTOR corpus. - @param addTargetAsCandidate: If True, adds the target complex word of each instance as a candidate substitution. 
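
The sense-based filter above keeps a candidate only when its disambiguated sense is missing or matches the sense assigned to the target word in context. A small, hedged sketch of the same filtering idea, using NLTK's built-in Lesk implementation as a stand-in for the pywsd calls (and without re-inserting the candidate into the sentence):

```python
from nltk.wsd import lesk  # requires the NLTK 'wordnet' corpus

def filter_by_sense(sentence, target, candidates):
    """Keep candidates whose Lesk sense is unknown or equals the target's sense."""
    tokens = sentence.split(" ")
    target_sense = lesk(tokens, target)
    kept = set()
    for candidate in candidates:
        cand_sense = lesk(tokens, candidate)
        if cand_sense is None or cand_sense == target_sense:
            kept.add(candidate)
    return kept
```
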
- """ - o = open(output_path, 'w') - f = open(victor_corpus) - for subs in substitutions: - data = f.readline().strip().split('\t') - sentence = data[0].strip() - target = data[1].strip() - head = data[2].strip() - - newline = sentence + '\t' + target + '\t' + head + '\t' - for sub in subs: - newline += '0:'+sub + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() diff --git a/lexi/lib/lexenstein/spelling.py b/lexi/lib/lexenstein/spelling.py deleted file mode 100755 index 6aff446..0000000 --- a/lexi/lib/lexenstein/spelling.py +++ /dev/null @@ -1,74 +0,0 @@ -import re, collections, pickle - -class NorvigCorrector: - - def __init__(self, model_file, format='text'): - """ - Creates an instance of the NorvigCorrector class. - - @param model_file: Path to a file containing either raw, untokenized text, or a binary spelling correction model. - If "model_file" is the path to a text file, then the value of "format" must be "text". - If "model_file" is the path to a binary spelling correction model, then the value of "format" must be "bin". - @param format: Indicator of the type of input provided. - Possible values: "text", "bin". - """ - - #If input is text, then train a model: - if format=='text': - #Read text file: - file = open(model_file) - text = file.read() - file.close() - - #Create model: - self.model = self.getSpellingModel(re.findall('[a-z]+', text)) - #If input is binary, then load the model: - elif format=='bin': - self.model = pickle.load(open(model_file, 'rb')) - else: - self.model = None - print(('Input format \"' + format + '\" no supported, see documentation for available formats.')) - - #Create alphabet: - self.alphabet = 'abcdefghijklmnopqrstuvwxyz' - - def correct(self, word): - """ - Returns the spell-corrected version of a word. - If the model determines that the word has no spelling errors, it returns the word itself. - - @param word: Word to be spell-corrected. - """ - - candidates = self.getKnown([word]) or self.getKnown(self.getEdits(word)) or self.getKnownEdits(word) or [word] - return max(candidates, key=self.model.get) - - def saveBinaryModel(self, model_path): - """ - Saves the spelling correction model in binary format. - The saved model can then be loaded with the "bin" format during the creation of a NorvigCorrector. - - @param model_path: Path in which to save the model. - """ - - pickle.dump(self.model, open(model_path, 'wb')) - - def getSpellingModel(self, words): - model = collections.defaultdict(int) - for f in words: - model[f] += 1 - return model - - def getEdits(self, word): - splits = [(word[:i], word[i:]) for i in range(len(word) + 1)] - deletes = [a + b[1:] for a, b in splits if b] - transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b)>1] - replaces = [a + c + b[1:] for a, b in splits for c in self.alphabet if b] - inserts = [a + c + b for a, b in splits for c in self.alphabet] - return set(deletes + transposes + replaces + inserts) - - def getKnownEdits(self, word): - return set(e2 for e1 in self.getEdits(word) for e2 in self.getEdits(e1) if e2 in self.model) - - def getKnown(self, words): - return set(w for w in words if w in self.model) diff --git a/lexi/lib/lexenstein/util.py b/lexi/lib/lexenstein/util.py deleted file mode 100755 index 54612ad..0000000 --- a/lexi/lib/lexenstein/util.py +++ /dev/null @@ -1,383 +0,0 @@ -import nltk -import pickle -import shelve -import re - - -def dependencyParseSentences(parser, sentences): - """ - Use StanfordParser to parse multiple sentences. 
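
The NorvigCorrector above is a standard Norvig-style spell checker: it builds word frequencies from raw text and returns the most frequent known word within one or two edits of the input. A hypothetical usage sketch (the corpus and model paths are placeholders):

```python
# Hypothetical usage of the Norvig-style corrector defined above;
# "plain_text_corpus.txt" and "spelling_model.bin" are placeholder paths.
corrector = NorvigCorrector("plain_text_corpus.txt", format="text")
print(corrector.correct("exampel"))   # likely "example" if seen in the corpus
corrector.saveBinaryModel("spelling_model.bin")
reloaded = NorvigCorrector("spelling_model.bin", format="bin")
```
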
- Takes multiple sentences as a list where each sentence is a list of words. - Each sentence will be automatically tagged with this StanfordParser instance's tagger. - If whitespaces exists inside a token, then the token will be treated as separate tokens. - This method is an adaptation of the code provided by NLTK. - - @param parser: An instance of the nltk.parse.stanford.StanfordParser class. - @param sentences: Input sentences to parse. - Each sentence must be a list of tokens. - @return A list of the dependency links of each sentence. - Each dependency link is composed by the relation type, the source word, its position in the sentence, the target word, and its position in the sentence. - """ - cmd = [ - 'edu.stanford.nlp.parser.lexparser.LexicalizedParser', - '-model', parser.model_path, - '-sentences', 'newline', - '-outputFormat', 'typedDependencies', - '-tokenized', - '-escaper', 'edu.stanford.nlp.process.PTBEscapingProcessor', - ] - - output=parser._execute(cmd, '\n'.join(' '.join(sentence) for sentence in sentences), False) - - depexp = re.compile("([^\\(]+)\\(([^\\,]+)\\,\s([^\\)]+)\\)") - - res = [] - cur_lines = [] - for line in output.splitlines(False): - if line == '': - res.append(cur_lines) - cur_lines = [] - else: - depdata = re.findall(depexp, line) - if len(depdata)>0: - link = depdata[0] - subjecth = link[1].rfind('-') - objecth = link[2].rfind('-') - subjectindex = link[1][subjecth+1:len(link[1])] - if subjectindex.endswith(r"'"): - subjectindex = subjectindex[0:len(subjectindex)-1] - objectindex = link[2][objecth+1:len(link[2])] - if objectindex.endswith(r"'"): - objectindex = objectindex[0:len(objectindex)-1] - clean_link = (link[0], link[1][0:subjecth], subjectindex, link[2][0:objecth], objectindex) - try: - a = int(subjectindex) - b = int(objectindex) - cur_lines.append(clean_link) - except Exception: - pass - return res - -def getGeneralisedPOS(tag): - """ - Returns a generalised version of a POS tag in Treebank format. - - @param tag: POS tag in Treebank format. - @return A generalised POS tag. - """ - result = None - if tag.startswith('N'): - result = 'N' - elif tag.startswith('V'): - result = 'V' - elif tag.startswith('RB'): - result = 'A' - elif tag.startswith('J'): - result = 'J' - elif tag.startswith('W'): - result = 'W' - elif tag.startswith('PRP'): - result = 'P' - else: - result = tag.strip() - return result - -def createTaggedNgramsFile(ngrams_file, tagged_ngrams_file): - """ - Creates a tagged version of an annotated n-gram counts file. - - @param ngrams_file: File containing POS tag annotated n-gram counts. - The file must be in the format produced by the "-write" option of SRILM. - Each word in the corpus used must be in the following format: ||| - @param tagged_ngrams_file: File with tagged n-gram counts. 
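
getGeneralisedPOS above collapses Treebank tags into the coarse tag set used by the 'paetzold' POS mode elsewhere in LEXenstein. For reference, a few concrete mappings implied by the code:

```python
# Concrete mappings produced by getGeneralisedPOS as defined above.
for tag in ["NNS", "VBD", "RBR", "JJ", "WDT", "PRP$", ","]:
    print(tag, "->", getGeneralisedPOS(tag))
# NNS -> N, VBD -> V, RBR -> A, JJ -> J, WDT -> W, PRP$ -> P, , -> ,
```
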
- """ - o = open(tagged_ngrams_file, 'w') - - print('Opening input n-gram counts file...') - c = 0 - f = open(ngrams_file) - for line in f: - c += 1 - if c % 1000000 == 0: - print((str(c) + ' n-grams processed.')) - data = line.strip().split('\t') - tokens = [t.split('|||') for t in data[0].split(' ')] - if len(tokens)==2: - o.write(tokens[0][0] + ' ' + tokens[1][min(1, len(tokens[1])-1)] + '\t' + data[1] + '\n') - o.write(tokens[0][min(1, len(tokens[0])-1)] + ' ' + tokens[1][0] + '\t' + data[1] + '\n') - elif len(tokens)==3: - o.write(tokens[0][0] + ' ' + tokens[1][min(1, len(tokens[1])-1)] + ' ' + tokens[2][min(1, len(tokens[2])-1)] + '\t' + data[1] + '\n') - o.write(tokens[0][min(1, len(tokens[0])-1)] + ' ' + tokens[1][0] + ' ' + tokens[2][min(1, len(tokens[2])-1)] + '\t' + data[1] + '\n') - o.write(tokens[0][min(1, len(tokens[0])-1)] + ' ' + tokens[1][min(1, len(tokens[1])-1)] + ' ' + tokens[2][0] + '\t' + data[1] + '\n') - elif len(tokens)==4: - o.write(tokens[0][min(1, len(tokens[0])-1)] + ' ' + tokens[1][min(1, len(tokens[1])-1)] + ' ' + tokens[2][0] + ' ' + tokens[3][min(1, len(tokens[3])-1)] + '\t' + data[1] + '\n') - o.write(tokens[0][min(1, len(tokens[0])-1)] + ' ' + tokens[1][0] + ' ' + tokens[2][min(1, len(tokens[2])-1)] + ' ' + tokens[3][min(1, len(tokens[3])-1)] + '\t' + data[1] + '\n') - elif len(tokens)==5: - o.write(tokens[0][min(1, len(tokens[0])-1)] + ' ' + tokens[1][min(1, len(tokens[1])-1)] + ' ' + tokens[2][0] + ' ' + tokens[3][min(1, len(tokens[3])-1)] + ' ' + tokens[4][min(1, len(tokens[4])-1)] + '\t' + data[1] + '\n') - f.close() - print('N-grams file read!') - - print('Saving model...') - o.close() - print('Finished!') - -def removeUnkFromNgramsFile(ngrams_file, output): - """ - Removes n-grams with "" tokens from an SRILM n-grams file. - - @param ngrams_file: Input n-grams file. - @param output: Filtered n-grams file. - """ - f = open(ngrams_file) - o = open(output, 'w') - c = 0 - for line in f: - c += 1 - if c % 1000000==0: - print((str(c) + ' tokens filtered.')) - if '' not in line: - o.write(line) - f.close() - o.close() - -def getVocabularyFromDataset(dataset, vocab_file, leftw, rightw, format='victor'): - """ - Extracts the vocabulary from a dataset in VICTOR or CWICTOR format. - This vocabularies can be used along with SRILM in order for smaller n-gram count files to be produced. - - @param dataset: Dataset from which to extract the vocabulary. - @param vocab_file: File in which to save the vocabulary. - @param leftw: Window to consider from the left of the target word. - @param rightw: Window to consider from the right of the target word. - @param format: Format of the dataset. 
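
These utilities all assume the VICTOR format: one instance per line with the sentence, the target word, its token index, and then rank:candidate pairs, all tab-separated. A minimal parsing example (the sentence and candidates are invented for illustration):

```python
# Minimal illustration of the VICTOR format assumed by the utilities above.
line = "the cat perched on the mat\tperched\t2\t1:sat\t2:roosted"
data = line.strip().split("\t")
tokens, target, head = data[0].split(" "), data[1], int(data[2])
candidates = [sub.split(":", 1)[1] for sub in data[3:]]
print(tokens[head], target, candidates)   # perched perched ['sat', 'roosted']
```
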
- Values accepted: victor, cwictor - """ - #Obtain vocabulary: - vocab = set([]) - if format=='victor': - f = open(dataset) - for line in f: - data = line.strip().split('\t') - sent = data[0].strip().split(' ') - head = int(data[2].strip()) - for i in range(max(0, head-leftw), head): - vocab.add(sent[i]) - for i in range(head, min(len(sent), head+rightw+1)): - vocab.add(sent[i]) - target = data[1].strip() - vocab.add(target) - for sub in data[3:len(data)]: - words = sub.strip().split(':')[1].strip().split(' ') - for word in words: - vocab.add(word.strip()) - f.close() - elif format=='cwictor': - f = open(dataset) - for line in f: - data = line.strip().split('\t') - sent = data[0].strip().split(' ') - head = int(data[2].strip()) - for i in range(max(0, head-leftw), head): - vocab.add(sent[i]) - for i in range(head, min(len(sent), head+rightw+1)): - vocab.add(sent[i]) - target = data[1].strip() - vocab.update(sent) - vocab.add(target) - f.close() - - #Save vocabulary: - f = open(vocab_file, 'w') - for word in vocab: - if len(word.strip())>0: - f.write(word.strip() + '\n') - f.close() - -def addTranslationProbabilitiesFileToShelve(transprob_file, model_file): - """ - Adds a translation probabilities file to an either new, or existing shelve dictionary. - The shelve file can then be used for the calculation of features. - To produce the translation probabilities file, first run the following command through fast_align: - fast_align -i -v -d -o - - @param transprob_file: File containing translation probabilities. - @param model_file: Shelve file in which to save the translation probabilities. - """ - print('Opening shelve file...') - d = shelve.open(model_file, protocol=pickle.HIGHEST_PROTOCOL) - print('Shelve file open!') - - print('Reading translation probabilities file...') - c = 0 - f = open(transprob_file) - for line in f: - c += 1 - if c % 1000000 == 0: - print((str(c) + ' translation probabilities read.')) - data = line.strip().split('\t') - key = data[0] + '\t' + data[1] - value = float(data[2]) - if key not in d: - d[key] = value - else: - d[key] += value - f.close() - print('Translation probabilities file read!') - - print('Saving model...') - d.close() - print('Finished!') - -def addNgramCountsFileToShelve(ngrams_file, model_file): - """ - Adds a n-gram counts file to an either new, or existing shelve dictionary. - The shelve file can then be used for the calculation of several features. - The file must be in the format produced by the "-write" option of SRILM ngram-count application. - - @param ngrams_file: File containing n-gram counts. - @param model_file: Shelve file in which to save the n-gram counts file. - """ - print('Opening shelve file...') - d = shelve.open(model_file, protocol=pickle.HIGHEST_PROTOCOL) - print('Shelve file open!') - - print('Reading n-grams file...') - c = 0 - f = open(ngrams_file) - for line in f: - c += 1 - if c % 1000000 == 0: - print((str(c) + ' n-grams read.')) - data = line.strip().split('\t') - if data[0] not in d: - d[data[0]] = int(data[1]) - else: - d[data[0]] += int(data[1]) - f.close() - print('N-grams file read!') - - print('Saving model...') - d.close() - print('Finished!') - -def createConditionalProbabilityModel(folder, fileids, model, sep='/', encoding='utf8'): - """ - Creates an tagging probability model to be used along with the FeatureEstimator object. - Files of tagged data must contain one sentence per line, and each line must follow the following format: - ... - - @param folder: Folder containing files of tagged sentences. 
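
Both shelve helpers above accumulate counts or probabilities into an on-disk dictionary so that feature extraction can look values up without holding the full table in memory. A hedged lookup sketch (the shelve path is a placeholder):

```python
import shelve

# Hypothetical lookup against a counts database built with
# addNgramCountsFileToShelve above; "ngram_counts.shelve" is a placeholder.
with shelve.open("ngram_counts.shelve") as counts:
    for ngram in ["the black cat", "black cat sat"]:
        print(ngram, counts.get(ngram, 0))
```
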
- @param fileids: A list or regular expressions specifying the file names with tagged data in "folder". - @param model: File in which to save the trained model. - @param sep: Separator between words and tags in the files with tagged data. - @param encoding: Encoding of the files with tagged data. - """ - print('Reading files...') - tcr = nltk.corpus.reader.tagged.TaggedCorpusReader(folder, fileids, sep=sep, encoding=encoding) - - print('Extracting tagged data...') - data = tcr.tagged_words() - - print('Creating conditional probability maps...') - cfd_tagwords = nltk.ConditionalFreqDist(data) - cpd_tagwords = nltk.ConditionalProbDist(cfd_tagwords, nltk.MLEProbDist) - - print('Saving model...') - pickle.dump(cpd_tagwords, open(model, "wb")) - print('Finished!') - -def fitTranslationProbabilityFileToCorpus(translation_probabilities, corpus, output): - """ - Creates a translation probabilities file that has only translations pertaining to the target complex words of a given VICTOR or CWICTOR corpus. - - @param translation_probabilities: Path to a file containing the translation probabilities. - The file must produced by the following command through fast_align: - fast_align -i -v -d -o - @param corpus: Path to a corpus in the VICTOR or CWICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param output: Path in which to save the filtered translation probabilities file. - """ - targets = set([]) - f = open(corpus) - for line in f: - data = line.strip().split('\t') - target = data[1].strip() - targets.add(target) - f.close() - - o = open(output, 'w') - f = open(translation_probabilities) - for line in f: - data = line.strip().split('\t') - word = data[0].strip() - if word in targets: - o.write(line.strip() + '\n') - f.close() - o.close() - -def addTargetAsFirstToVictorCorpus(self, victor_corpus, output): - """ - Creates a modified version of an input VICTOR corpus in which the target complex word is ranked first. - Can be very useful for the training of Substitution Selection Models - - @param victor_corpus: Path to a corpus in the VICTOR format. - For more information about the file's format, refer to the LEXenstein Manual. - @param output: Path in which to save the modified VICTOR corpus. - """ - f = open(victor_corpus) - o = open(output, 'w') - for line in f: - data = line.strip().split('\t') - newline = data[0].strip() + '\t' + data[1].strip() + '\t' + data[2].strip() + '\t' + '1:'+data[1].strip() + '\t' - for subst in data[3:len(data)]: - substd = subst.strip().split(':') - rank = int(substd[0].strip()) - word = substd[1].strip() - newline += str(rank+1)+':'+word + '\t' - o.write(newline.strip() + '\n') - f.close() - o.close() - -def produceWordCooccurrenceModel(text_file, window, model_file): - """ - Creates a co-occurrence model from a text file. - These models can be used by certain classes in LEXenstein, such as the Yamamoto Ranker and the Biran Selector. - - @param text_file: Text from which to estimate the word co-occurrence model. - @param window: Number of tokens to the left and right of a word to be included as a co-occurring word. - @param model_file: Path in which to save the word co-occurrence model. 
- """ - inp = open(text_file) - - coocs = {} - - c = 0 - for line in inp: - c += 1 - print(('At line: ' + str(c))) - tokens = line.strip().lower().split(' ') - for i in range(0, len(tokens)): - target = tokens[i] - if target not in list(coocs.keys()): - coocs[target] = {} - left = max(0, i-window) - right = min(len(tokens), i+window+1) - for j in range(left, right): - if j!=i: - cooc = tokens[j] - if cooc not in list(coocs[target].keys()): - coocs[target][cooc] = 1 - else: - coocs[target][cooc] += 1 - inp.close() - - targets = sorted(coocs.keys()) - - out = open(model_file, 'w') - for target in targets: - newline = target + '\t' - words = sorted(coocs[target].keys()) - for word in words: - newline += word + ':' + str(coocs[target][word]) + '\t' - out.write(newline.strip() + '\n') - out.close() diff --git a/lexi/lib/lib.py b/lexi/lib/lib.py deleted file mode 100644 index bb73de3..0000000 --- a/lexi/lib/lib.py +++ /dev/null @@ -1,795 +0,0 @@ -import logging -import pickle - -import numpy as np -from sklearn import linear_model -from sklearn.feature_selection import SelectKBest -from sklearn.feature_selection import f_classif -from sklearn.model_selection import train_test_split -from collections import defaultdict - -from lexi.config import RANKER_MODEL_PATH_TEMPLATE - -logger = logging.getLogger('lexi') - - -def make_synonyms_dict(synonyms_file): - """ - - :param synonyms_file: - :return: - """ - words2synonyms = defaultdict(set) - for line in open(synonyms_file): - tgt, syns = line.strip().split("\t", 1) - words2synonyms[tgt].update(syns.split(";")) - return words2synonyms - - -class Generator: - def __init__(self): - raise NotImplementedError - - def getSubstitutionsSingle(self, sentence, target, index, **kwargs): - raise NotImplementedError - - -class SynonymDBGenerator(Generator): - """ - Generates candidates from a serialized WordNet-like list of synonymy - relations. - """ - - def __init__(self, synonyms_file): - self.word2synonmys = make_synonyms_dict(synonyms_file) - - def getSubstitutionsSingle(self, sentence, target, index, **kwargs): - # TODO get POS of word for filtering? 
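
make_synonyms_dict above expects one target word per line, separated by a tab from a ';'-joined list of synonyms. A small, self-contained illustration (the file name and the Danish entries are invented for the example):

```python
# Illustration of the synonym-file format consumed by make_synonyms_dict /
# SynonymDBGenerator above; file name and entries are placeholders.
with open("example_synonyms.csv", "w", encoding="utf-8") as f:
    f.write("hurtig\trask;kvik\n")
    f.write("vanskelig\tsvær;besværlig\n")

syns = make_synonyms_dict("example_synonyms.csv")
print(syns["vanskelig"])   # {'svær', 'besværlig'}
```
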
- """ - - :param sentence: - :param target: - :param index: - :return: - """ - return {target: self.word2synonmys.get(target, {})} - - -class LexensteinGenerator(Generator): - - def __init__(self, w2vmodels): - import gensim - self.model = None - self.individual_models = [] - for model_file in w2vmodels: - try: - _model = gensim.models.KeyedVectors.load_word2vec_format( - model_file, binary=True, unicode_errors='ignore') - except UnicodeDecodeError: - try: - _model = gensim.models.KeyedVectors.load(model_file) - except: - continue - self.individual_models.append(_model) - logger.debug(self.individual_models) - self.model = W2VModelEnsemble(self.individual_models) - - def getSubstitutionsSingle(self, sentence, target, index, - min_similarity=0.2): - """ - :param sentence: - :param target: - :param index: - :param min_similarity: minimum similarity score - :return: - """ - if min_similarity <= 0 or min_similarity > 1: - raise ValueError("'min_similarity' must be between 0 and 1 " - "(you provided {}).".format(min_similarity)) - substitutions = self.getInitialSet([[sentence, target, index]], - min_similarity) - return substitutions - - def getInitialSet(self, data, min_similarity): - trgs = [] - for i in range(len(data)): - d = data[i] - logger.debug(d) - target = d[1].strip().lower() - head = int(d[2].strip()) - trgs.append(target) - - logger.debug("tgts: {}".format(trgs)) - logger.debug(" getting candidates with min_similarity={}". - format(min_similarity)) - subs = [] - cands = set([]) - for i in range(len(data)): - d = data[i] - t = trgs[i] - - word = t - - most_sim = self.model.most_similar(word) - - subs.append([word for word, score in most_sim - if score >= min_similarity]) - - logger.debug("subs: {}".format(subs)) - subsr = subs - subs = [] - for l in subsr: - lr = [] - for inst in l: - cand = inst.split('|||')[0].strip() - cands.add(cand) - lr.append(inst) - subs.append(lr) - - cands = list(cands) - - subs_filtered = self.filterSubs(data, subs, trgs) - - final_cands = {} - for i in range(0, len(data)): - target = data[i][1] - logger.debug(subs_filtered) - cands = subs_filtered[i][0:len(subs_filtered[i])] - cands = [word.split('|||')[0].strip() for word in cands] - if target not in final_cands: - final_cands[target] = set([]) - final_cands[target].update(set(cands)) - - return final_cands - - def filterSubs(self, data, subs, trgs): - result = [] - for i in range(0, len(data)): - d = data[i] - - t = trgs[i] - - most_sim = subs[i] - most_simf = [] - - for cand in most_sim: - if cand!=t: - most_simf.append(cand) - - result.append(most_simf) - return result - - -class EnsembleLexensteinGenerator(LexensteinGenerator): - - def __init__(self, w2vmodels): - import gensim - self.model = None - self.individual_models = [] - for model_file in w2vmodels: - try: - _model = gensim.models.KeyedVectors.load_word2vec_format( - model_file, binary=True, unicode_errors='ignore') - except UnicodeDecodeError: - try: - _model = gensim.models.KeyedVectors.load(model_file) - except: - continue - self.individual_models.append(_model) - self.model = W2VModelEnsemble(self.individual_models) - - def getInitialSet(self, data, amount=5, min_similarity=0.5): - - trgs = [] - for i in range(len(data)): - d = data[i] - logger.debug(d) - target = d[1].strip().lower() - head = int(d[2].strip()) - trgs.append(target) - - logger.debug("tgts: {}".format(trgs)) - subs = [] - cands = set([]) - candidates = set() - for i in range(len(data)): - d = data[i] - t = trgs[i] - for model in self.models: - try: - 
candidates.update([(w, v) for w, v in - model.most_similar(t.decode('utf-8'), topn=10) - if v > min_similarity]) - except Exception: - try: - candidates.update([(w, v) for w, v in - model.most_similar(t, topn=10) - if v > min_similarity]) - except Exception: - pass - - candidate_mean_scores = [] - for candidate in candidates: - # compute mean score for every candidate across models - mean_score = np.mean([model.similarity(t, candidate) - for model in self.models - if candidate in model]) - candidate_mean_scores.append((candidate, mean_score)) - - # sort candidates by score (best first) - candidate_mean_scores = sorted(candidate_mean_scores, - key=lambda x: x[1], reversed=True) - # select top n - best_candidates = [cand for cand, sim in - candidate_mean_scores][:amount] - # subs.append([word[0] for word in most_sim]) - subs.append(best_candidates) - - logger.debug("tgts: {}".format(trgs)) - subsr = subs - subs = [] - for l in subsr: - lr = [] - for inst in l: - cand = inst.split('|||')[0].strip() - cands.add(cand) - lr.append(inst) - subs.append(lr) - - cands = list(cands) - - subs_filtered = self.filterSubs(data, subs, trgs) - - final_cands = {} - for i in range(0, len(data)): - target = data[i][1] - logger.debug(subs_filtered, amount, i) - cands = subs_filtered[i][0:min(amount, len(subs_filtered[i]))] - cands = [word.split('|||')[0].strip() for word in cands] - if target not in final_cands: - final_cands[target] = set([]) - final_cands[target].update(set(cands)) - - return final_cands - - def filterSubs(self, data, subs, trgs): - result = [] - for i in range(0, len(data)): - d = data[i] - - t = trgs[i] - - most_sim = subs[i] - most_simf = [] - - for cand in most_sim: - if cand!=t: - most_simf.append(cand) - - result.append(most_simf) - return result - - -class W2VModelEnsemble: - - def __init__(self, models): - self.models = models - - def most_similar(self, target, min_similarity=0.5, topn=10): - - all_similar_words = set() - for model in self.models: - if target in model: - all_similar_words.update([w for w, sim in - model.most_similar(target, topn=topn) - if sim > min_similarity]) - candidate_mean_scores = [] - for w in all_similar_words: - mean_score = np.mean([model.similarity(target, w) - for model in self.models - if w in model and target in model]) - candidate_mean_scores.append((w, mean_score)) - - # sort - most_similar = sorted(candidate_mean_scores, key=lambda x: x[1], - reverse=True) - # select top n - return most_similar[:topn] - - def similarity(self, w1, w2): - return np.mean([model.similarity(w1, w2) for model in self.models]) - - -class BoundaryRanker: - - def __init__(self, fe=None, userId=None): - self.fe = fe - self.classifier = None - self.feature_selector = None - self.userId = userId - - def trainRankerWithCrossValidation( - self, victor_corpus, positive_range, folds, test_size, - losses=['hinge', 'modified_huber'], penalties=['elasticnet'], - alphas=[0.0001, 0.001, 0.01], - l1_ratios=[0.0, 0.15, 0.25, 0.5, 0.75, 1.0], k='all'): - # Read victor corpus: - data = [] - f = open(victor_corpus) - for line in f: - data.append(line.strip().split('\t')) - f.close() - - # Create matrixes: - X = self.fe.calculateFeatures(victor_corpus) - Y = self.generateLabels(data, positive_range) - - # Select features: - self.feature_selector = SelectKBest(f_classif, k=k) - self.feature_selector.fit(X, Y) - X = self.feature_selector.transform(X) - - # Extract ranking problems: - firsts = [] - candidates = [] - Xsets = [] - Ysets = [] - index = -1 - for line in data: - fs = set([]) - cs = 
[] - Xs = [] - Ys = [] - for cand in line[3:len(line)]: - index += 1 - candd = cand.split(':') - rank = candd[0].strip() - word = candd[1].strip() - - cs.append(word) - Xs.append(X[index]) - Ys.append(Y[index]) - if rank=='1': - fs.add(word) - firsts.append(fs) - candidates.append(cs) - Xsets.append(Xs) - Ysets.append(Ys) - - # Create data splits: - datasets = [] - for i in range(0, folds): - Xtr, Xte, Ytr, Yte, Ftr, Fte, Ctr, Cte = train_test_split( - Xsets, Ysets, firsts, candidates, test_size=test_size, - random_state=i) - Xtra = [] - for matrix in Xtr: - Xtra += matrix - Xtea = [] - for matrix in Xte: - Xtea += matrix - Ytra = [] - for matrix in Ytr: - Ytra += matrix - datasets.append((Xtra, Ytra, Xte, Xtea, Fte, Cte)) - - # Get classifier with best parameters: - max_score = -1.0 - parameters = () - for l in losses: - for p in penalties: - for a in alphas: - for r in l1_ratios: - sum = 0.0 - sum_total = 0 - for dataset in datasets: - Xtra = dataset[0] - Ytra = dataset[1] - Xte = dataset[2] - Xtea = dataset[3] - Fte = dataset[4] - Cte = dataset[5] - - classifier = linear_model.SGDClassifier(loss=l, penalty=p, alpha=a, l1_ratio=r, epsilon=0.0001) - try: - classifier.fit(Xtra, Ytra) - t1 = self.getCrossValidationScore(classifier, Xtea, Xte, Fte, Cte) - sum += t1 - sum_total += 1 - except Exception: - pass - sum_total = max(1, sum_total) - if (sum/sum_total)>max_score: - max_score = sum - parameters = (l, p, a, r) - self.classifier = linear_model.SGDClassifier(loss=parameters[0], penalty=parameters[1], alpha=parameters[2], l1_ratio=parameters[3], epsilon=0.0001) - self.classifier.fit(X, Y) - - def getCrossValidationScore(self, classifier, Xtea, Xte, firsts, candidates): - distances = classifier.decision_function(Xtea) - index = -1 - corrects = 0 - total = 0 - for i in range(0, len(Xte)): - xset = Xte[i] - maxd = -999999 - for j in range(0, len(xset)): - index += 1 - distance = distances[index] - if distance>maxd: - maxd = distance - maxc = candidates[i][j] - if maxc in firsts[i]: - corrects += 1 - total += 1 - return float(corrects)/float(total) - - def getRankings(self, data): - #Transform data: - textdata = '' - for inst in data: - for token in inst: - textdata += token+'\t' - textdata += '\n' - textdata = textdata.strip() - - #Create matrixes: - X = self.fe.calculateFeatures(textdata, input='text') - - #Select features: - X = self.feature_selector.transform(X) - - #Get boundary distances: - distances = self.classifier.decision_function(X) - - #Get rankings: - result = [] - index = 0 - for i in range(0, len(data)): - line = data[i] - scores = {} - for subst in line[3:len(line)]: - word = subst.strip().split(':')[1].strip() - scores[word] = distances[index] - index += 1 - ranking_data = sorted(list(scores.keys()), key=scores.__getitem__, reverse=True) - result.append(ranking_data) - - #Return rankings: - return result - - def generateLabels(self, data, positive_range): - Y = [] - for line in data: - max_range = min(int(line[len(line)-1].split(':')[0].strip()), positive_range) - for i in range(3, len(line)): - rank_index = int(line[i].split(':')[0].strip()) - if rank_index<=max_range: - Y.append(1) - else: - Y.append(0) - return Y - - def save(self, userId): - with open(RANKER_MODEL_PATH_TEMPLATE.format(userId), 'wb') as pf: - pickle.dump((self.fe, self.classifier, self.feature_selector), pf, - pickle.HIGHEST_PROTOCOL) - - def load(self, userId=None): - if not userId: - userId = self.userId - with open(RANKER_MODEL_PATH_TEMPLATE.format(userId), 'rb') as pf: - (self.fe, self.classifier, 
self.feature_selector) = pickle.load(pf) - return self - - -class BoundarySelector: - - def __init__(self, boundary_ranker): - self.ranker = boundary_ranker - - def trainSelectorWithCrossValidation(self, victor_corpus, positive_range, - folds, test_size, - losses=['hinge', 'modified_huber'], - penalties=['elasticnet'], - alphas=[0.0001, 0.001, 0.01], - l1_ratios=[0.0, 0.15, 0.25, 0.5, 0.75, 1.0], - k='all'): - self.ranker.trainRankerWithCrossValidation(victor_corpus, positive_range, - folds, test_size, losses=losses, penalties=penalties, alphas=alphas, l1_ratios=l1_ratios, k=k) - - def selectCandidates(self, data, proportion, proportion_type='percentage'): - rankings = self.ranker.getRankings(data) - logger.debug((data, rankings)) - selected_substitutions = [] - - index = -1 - for line in data: - index += 1 - - if proportion_type == 'percentage': - if proportion > 1.0: - select_n = len(rankings[index]) - else: - select_n = int(float(proportion) * len(rankings[index])) - selected_candidates = rankings[index][:max(1, select_n)] - else: - if proportion < 1: - toselect = 1 - elif proportion > len(rankings[index]): - toselect = len(rankings[index]) - else: - toselect = proportion - selected_candidates = rankings[index][:toselect] - - selected_substitutions.append(selected_candidates) - - return selected_substitutions - - -class GlavasRanker: - - def __init__(self, fe): - """ - Creates an instance of the GlavasRanker class. - - @param fe: A configured FeatureEstimator object. - """ - - self.fe = fe - self.feature_values = None - - def getRankings(self, alldata): - - #Calculate features: - textdata = '' - for inst in alldata: - for token in inst: - textdata += token+'\t' - textdata += '\n' - textdata = textdata.strip() - self.feature_values = self.fe.calculateFeatures(textdata, input='text') - - #Create object for results: - result = [] - - #Read feature values for each candidate in victor corpus: - index = 0 - for data in alldata: - #Get all substitutions in ranking instance: - substitutions = data[3:len(data)] - - #Get instance's feature values: - instance_features = [] - for substitution in substitutions: - instance_features.append(self.feature_values[index]) - index += 1 - - rankings = {} - for i in range(0, len(self.fe.identifiers)): - #Create dictionary of substitution to feature value: - scores = {} - for j in range(0, len(substitutions)): - substitution = substitutions[j] - word = substitution.strip().split(':')[1].strip() - scores[word] = instance_features[j][i] - - #Check if feature is simplicity or complexity measure: - rev = False - if self.fe.identifiers[i][1]=='Simplicity': - rev = True - - #Sort substitutions: - words = list(scores.keys()) - sorted_substitutions = sorted(words, key=scores.__getitem__, reverse=rev) - - #Update rankings: - for j in range(0, len(sorted_substitutions)): - word = sorted_substitutions[j] - if word in rankings: - rankings[word] += j - else: - rankings[word] = j - - #Produce final rankings: - final_rankings = sorted(list(rankings.keys()), key=rankings.__getitem__) - - #Add them to result: - result.append(final_rankings) - - #Return result: - return result - - -class NNRegressionRanker: - - def __init__(self, fe, model): - self.fe = fe - self.model = model - - def getRankings(self, data): - #Transform data: - textdata = '' - for inst in data: - for token in inst: - textdata += token+'\t' - textdata += '\n' - textdata = textdata.strip() - - #Create matrix: - features = self.fe.calculateFeatures(textdata, input='text') - - ranks = [] - c = -1 - for line in 
data: - cands = [cand.strip().split(':')[1].strip() for cand in line[3:]] - featmap = {} - scoremap = {} - for cand in cands: - c += 1 - featmap[cand] = features[c] - scoremap[cand] = 0.0 - for i in range(0, len(cands)-1): - cand1 = cands[i] - for j in range(i+1, len(cands)): - cand2 = cands[j] - posneg = np.concatenate((featmap[cand1], featmap[cand2])) - probs = self.model.predict(np.array([posneg])) - score = probs[0] - scoremap[cand1] += score - negpos = np.concatenate((featmap[cand2], featmap[cand1])) - probs = self.model.predict(np.array([negpos])) - score = probs[0] - scoremap[cand1] -= score - rank = sorted(list(scoremap.keys()), key=scoremap.__getitem__, reverse=True) - if len(rank)>1: - if rank[0]==line[1].strip(): - rank = rank[1:] - ranks.append(rank) - return ranks - - -class OnlineRegressionRanker: - - def __init__(self, fe, model, training_dataset=None, userId=None): - self.fe = fe - self.userId = userId - if model: - self.model = model - elif training_dataset: - self.model = self.trainRegressionModel(training_dataset) - else: - self.model = None - - def trainRegressionModel(self, training_dataset): - # Create matrix: - features = self.fe.calculateFeatures(training_dataset, input='file') - Xtr = [] - Ytr = [] - f = open(training_dataset) - c = -1 - for line in f: - data = line.strip().split('\t') - cands = [cand.strip().split(':')[1] for cand in data[3:]] - indexes = [int(cand.strip().split(':')[0]) for cand in data[3:]] - featmap = {} - for cand in cands: - c += 1 - featmap[cand] = features[c] - for i in range(0, len(cands)-1): - for j in range(i+1, len(cands)): - indexi = indexes[i] - indexj = indexes[j] - indexdiffji = indexj-indexi - indexdiffij = indexi-indexj - positive = featmap[cands[i]] - negative = featmap[cands[j]] - v1 = np.concatenate((positive,negative)) - v2 = np.concatenate((negative,positive)) - Xtr.append(v1) - Xtr.append(v2) - Ytr.append(indexdiffji) - Ytr.append(indexdiffij) - f.close() - Xtr = np.array(Xtr) - Ytr = np.array(Ytr) - - model = linear_model.SGDRegressor() - model.fit(Xtr, Ytr) - return model - - def onlineTrainRegressionModel(self, training_data_text): - logger.info("Partially fitting the ranker") - # Create matrix: - features = self.fe.calculateFeatures(training_data_text, - format='victor', input='text') - Xtr = [] - Ytr = [] - c = -1 - for line in training_data_text.strip().split('\n'): - logger.debug(line) - data = line.strip().split('\t') - cands = [cand.strip().split(':')[1] for cand in data[3:]] - indexes = [int(cand.strip().split(':')[0]) for cand in data[3:]] - featmap = {} - for cand in cands: - c += 1 - featmap[cand] = features[c] - for i in range(0, len(cands) - 1): - for j in range(i + 1, len(cands)): - indexi = indexes[i] - indexj = indexes[j] - indexdiffji = indexj - indexi - indexdiffij = indexi - indexj - positive = featmap[cands[i]] - negative = featmap[cands[j]] - v1 = np.concatenate((positive, negative)) - v2 = np.concatenate((negative, positive)) - Xtr.append(v1) - Xtr.append(v2) - Ytr.append(indexdiffji) - Ytr.append(indexdiffij) - Xtr = np.array(Xtr) - Ytr = np.array(Ytr) - - self.model.partial_fit(Xtr, Ytr) - return self.model - - def getRankings(self, data): - #Transform data: - textdata = '' - for inst in data: - for token in inst: - textdata += token+'\t' - textdata += '\n' - textdata = textdata.strip() - - #Create matrix: - features = self.fe.calculateFeatures(textdata, input='text') - - ranks = [] - c = -1 - for line in data: - cands = [cand.strip().split(':')[1].strip() for cand in line[3:]] - featmap = {} - 
scoremap = {} - for cand in cands: - c += 1 - featmap[cand] = features[c] - scoremap[cand] = 0.0 - for i in range(0, len(cands)-1): - cand1 = cands[i] - for j in range(i+1, len(cands)): - cand2 = cands[j] - posneg = np.concatenate((featmap[cand1], featmap[cand2])) - probs = self.model.predict(np.array([posneg])) - score = probs[0] - scoremap[cand1] += score - negpos = np.concatenate((featmap[cand2], featmap[cand1])) - probs = self.model.predict(np.array([negpos])) - score = probs[0] - scoremap[cand1] -= score - rank = sorted(list(scoremap.keys()), key=scoremap.__getitem__, reverse=True) - if len(rank)>1: - if rank[0]==line[1].strip(): - rank = rank[1:] - ranks.append(rank) - return ranks - - def save(self, userId): - logger.info("Saving new model for user {}".format(userId)) - with open(RANKER_MODEL_PATH_TEMPLATE.format(userId), 'wb') as pf: - # pickle.dump((self.fe, self.model), pf, pickle.HIGHEST_PROTOCOL) - pickle.dump(self, pf, pickle.HIGHEST_PROTOCOL) - - # def load(self, userId=None): - # if not userId: - # userId = self.userId - # with open(RANKER_MODEL_TEMPLATE.format(userId), 'rb') as pf: - # (self.fe, self.model) = pickle.load(pf) - # return self - - @staticmethod - def staticload(userId): - with open(RANKER_MODEL_PATH_TEMPLATE.format(userId), 'rb') as pf: - return pickle.load(pf) diff --git a/lexi/server/run_lexi_server.py b/lexi/server/run_lexi_server.py index ee9f528..a79f76d 100644 --- a/lexi/server/run_lexi_server.py +++ b/lexi/server/run_lexi_server.py @@ -14,14 +14,14 @@ from werkzeug.exceptions import HTTPException from lexi.config import LEXI_BASE, LOG_DIR, RANKER_MODEL_PATH_TEMPLATE, \ - MODELS_DIR + CWI_MODEL_PATH_TEMPLATE, MODELS_DIR, RESOURCES from lexi.core.endpoints import update_ranker -from lexi.core.simplification.lexical import LexensteinSimplifier -from lexi.core.util.io import load_pickled_model +from lexi.core.simplification.lexical import LexicalSimplificationPipeline, \ + LexiCWI, LexiRanker, LexiGenerator from lexi.server.util import statuscodes from lexi.server.util.html import process_html from lexi.server.util.communication import make_response -from lexi.lib.lib import OnlineRegressionRanker +# from lexi.lib.lib import OnlineRegressionRanker SCRIPTDIR = os.path.dirname(os.path.realpath(__file__)) @@ -118,10 +118,18 @@ # LOADING DEFAULT MODEL -simplifier = LexensteinSimplifier("default").load() -default_ranker = load_pickled_model( - RANKER_MODEL_PATH_TEMPLATE.format("default")) +simplification_pipeline = LexicalSimplificationPipeline("default") +generator = LexiGenerator(synonyms_files=RESOURCES["da"]["synonyms"], + embedding_files=RESOURCES["da"]["embeddings"]) +simplification_pipeline.setGenerator(generator) +# default_ranker = load_pickled_model( +# RANKER_MODEL_PATH_TEMPLATE.format("default")) +# default_cwi = load_pickled_model( +# CWI_MODEL_PATH_TEMPLATE.format("default")) +default_ranker = LexiRanker("default") +default_cwi = LexiCWI("default") # TODO pretrain offline and load personalized_rankers = {"default": default_ranker} +personalized_cwi = {"default": default_cwi} logger.debug("Default ranker:") logger.debug(type(default_ranker)) logger.debug(default_ranker) @@ -162,32 +170,24 @@ def process(): frontend_version=frontend_version, language=language) - if user_id in personalized_rankers: - logger.info("Using personalized ranker, still in memory.") - ranker = personalized_rankers[user_id] - else: - logger.info("Gotta load ranker or use default...") - try: - # retrieve model - model_path = db_connection.get_model_path(user_id) - ranker = 
OnlineRegressionRanker.staticload(model_path) - except: - logger.warning("Could not load personalized model. " - "Loading default ranker.") - ranker = copy.copy(personalized_rankers["default"]) - logger.debug(ranker) - ranker.userId = user_id - personalized_rankers[user_id] = ranker + cwi = None + single_word_request = request.json.get("single_word_request", False) + if not single_word_request: + cwi = get_personalized_cwi(user_id) + + ranker = get_personalized_ranker(user_id) + + logger.info("Loaded CWI: "+str(cwi)) logger.info("Loaded ranker: "+str(ranker)) min_similarity = request.json.get("min_similarity", 0.65) if not type(min_similarity) == float: raise ValueError("'min_similarity' must be a float. You " "provided a {}".format(type(min_similarity))) - html_out, simplifications = process_html(simplifier, + html_out, simplifications = process_html(simplification_pipeline, request.json["html"], request.json.get("startOffset"), request.json.get("endOffset"), - ranker, mode="lexical", + cwi, ranker, mode="lexical", requestId=request_id, min_similarity=min_similarity, blacklist=GENERIC_BLACKLIST) @@ -295,6 +295,46 @@ def versioncheck(): download_url=download_url) +def get_personalized_ranker(user_id): + if user_id in personalized_rankers: + logger.info("Using personalized ranker, still in memory.") + ranker = personalized_rankers[user_id] + else: + logger.info("Gotta load ranker or use default...") + try: + # retrieve model + model_path = db_connection.get_model_path(user_id) + ranker = LexiRanker(user_id) + except: + logger.warning("Could not load personalized model. " + "Loading default ranker.") + ranker = copy.copy(personalized_rankers["default"]) + logger.debug(ranker) + ranker.userId = user_id + personalized_rankers[user_id] = ranker + return ranker + + +def get_personalized_cwi(user_id): + if user_id in personalized_cwi: + logger.info("Using personalized cwi, still in memory.") + cwi = personalized_cwi[user_id] + else: + logger.info("Gotta load cwi or use default...") + try: + # retrieve model + model_path = db_connection.get_model_path(user_id) + cwi = LexiCWI(user_id) + except: + logger.warning("Could not load personalized model. 
" + "Loading default cwi.") + cwi = copy.copy(personalized_cwi["default"]) + logger.debug(cwi) + cwi.userId = user_id + personalized_cwi[user_id] = cwi + return cwi + + if __name__ == "__main__": app.run(threaded=True) logger.debug("Rules: " + str([rule for rule in app.url_map.iter_rules()])) diff --git a/lexi/server/util/database.py b/lexi/server/util/database.py index 899a237..9c33ccb 100644 --- a/lexi/server/util/database.py +++ b/lexi/server/util/database.py @@ -1,6 +1,7 @@ import psycopg2 import logging import json +# import MySQLdb from collections import defaultdict logger = logging.getLogger('lexi') @@ -8,9 +9,12 @@ class DatabaseConnection: - def __init__(self, kwargs): + def __init__(self, kwargs, type="postgres"): try: - self.pg_connection = psycopg2.connect(**kwargs) + if type == "postgres": + self.pg_connection = psycopg2.connect(**kwargs) + # elif type == "mysql": + # self.pg_connection = MySQLdb.connect(**kwargs) self.cursor = self.pg_connection.cursor() logger.info("Connected to database '{}' at '{}'.".format( kwargs["dbname"], kwargs["host"] diff --git a/lexi/server/util/html.py b/lexi/server/util/html.py index 0205e03..6cfd17a 100644 --- a/lexi/server/util/html.py +++ b/lexi/server/util/html.py @@ -24,12 +24,13 @@ def map_text_to_html_offsets(html_src): return mapping -def process_html(classifier, html_src, startOffset, endOffset, ranker, +def process_html(pipeline, html_src, startOffset, endOffset, cwi, ranker, mode="lexical", requestId=0, min_similarity=0.7, blacklist=None): """ - :param classifier: + :param pipeline: :param html_src: The HTML source in question + :param ranker: CWI module to use with this classifier :param ranker: Ranker to use with this classifier :param mode: simplification mode (whether to perform lexical simplification, sentence simplification, ...). Only "lexical" accepted for now. @@ -43,7 +44,8 @@ def process_html(classifier, html_src, startOffset, endOffset, ranker, html_out = "" if mode == "lexical": _output, _simplifications = process_html_lexical( - classifier, html_src, startOffset, endOffset, requestId=requestId, + pipeline, html_src, startOffset, endOffset, requestId=requestId, + cwi=cwi, ranker=ranker, min_similarity=min_similarity, blacklist=blacklist) diff --git a/requirements.txt b/requirements.txt index b29ca48..2f696fc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -37,7 +37,7 @@ scipy==0.19.1 singledispatch==3.4.0.3 six==1.10.0 smart-open==1.5.3 -spacy==2.0.18 +stanfordnlp==0.1.1 thinc==6.12.1 toolz==0.8.2 tqdm==4.29.0 @@ -47,3 +47,4 @@ wcwidth==0.1.7 webencodings==0.5.1 Werkzeug==0.12.2 wrapt==1.10.11 +git+https://github.com/jbingel/rippletagger.git diff --git a/scripts/train_default_classifier.py b/scripts/train_default_classifier.py index d8e4b0a..e297882 100644 --- a/scripts/train_default_classifier.py +++ b/scripts/train_default_classifier.py @@ -1,45 +1,26 @@ -from lexi.config import RESOURCES, RESOURCES_TEST +from lexi.config import RESOURCES from lexi.core.simplification.lexical import * def fresh_train(userId="default", language="da", resources=None): - c = LexensteinSimplifier(userId=userId, language=language) + c = LexicalSimplificationPipeline(userId=userId, language=language) if not resources: try: - #resources = RESOURCES[language] - resources = RESOURCES_TEST[language] - print("WARNING: CHECK FOR CORRECT RESOURCES! 
(using test)") + resources = RESOURCES[language] except KeyError: print("Couldn't find resources for language {}".format(language)) - # General purpose - w2vpm = resources['embeddings'] + # Generator - # gg = LexensteinGenerator(w2vpm) - gg = SynonymDBGenerator(resources['synonyms']) - # gg = LexensteinGenerator(w2vpm) + g = LexiGenerator(synonyms_files=resources["synonyms"], + embedding_files=resources["embeddings"]) + c.setGenerator(g) - # Selector - fe = FeatureEstimator() - # fe.resources[w2vpm[0]] = gg.model - fe.addCollocationalFeature(resources['lm'], 2, 2, 'Complexity') - fe.addWordVectorSimilarityFeature(w2vpm[0], 'Simplicity') - br = BoundaryRanker(fe) - bs = BoundarySelector(br) - bs.trainSelectorWithCrossValidation(resources['ubr'], 1, 5, 0.25, k='all') # Ranker - fe = FeatureEstimator() - fe.addLengthFeature('Complexity') - fe.addCollocationalFeature(resources['lm'], 2, 2, 'Simplicity') - orr = OnlineRegressionRanker(fe, None, training_dataset=resources[ - 'ranking_training_dataset']) - # Return LexicalSimplifier object - c.generator = gg - c.selector = bs - c.ranker = orr + c.setRanker(LexiRanker("default")) return c + c = fresh_train() -c.save() -r = c.ranker -r.save("default") +c.ranker.save("default") +c.cwi.save("default")
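
fresh_train only needs a resources mapping with "synonyms" and "embeddings" entries for the chosen language; when none is passed it falls back to RESOURCES[language]. A hedged example of overriding it (the paths are placeholders, not the shipped Danish resources):

```python
# Hypothetical override of the default resources; the paths are placeholders.
custom_resources = {
    "synonyms": ["res/da/synonyms/my_synonyms.csv"],
    "embeddings": ["res/da/embeddings/my_vectors.bin"],
}
pipeline = fresh_train(userId="default", language="da",
                       resources=custom_resources)
pipeline.ranker.save("default")
```
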