forked from jbingel/lexi-server
Commit a120b17 (0 parents): 48 changed files, with 15,630 additions and 0 deletions.
@@ -0,0 +1,7 @@
logs/*
build/*
lexi-venv/*
models/*
*.pyc
trash/
.idea
@@ -0,0 +1,30 @@
# Backend for Lexi software

## Changelog

### Version 0.2.4
+ bugfix in database connection: roll back the connection on error

### Version 0.2.3
+ use synonym list for Danish
+ return simplification objects with unique sessionIds
+ accommodate on-demand simplifications

### Version 0.2.2
+ simplify HTML only between a given start and end character offset

### Version 0.2.1
+ blacklist words per user

### Version 0.2.0
+ massive restructuring of the source
+ mark whether the original word is displayed as the first alternative

### Version 0.1.2
+ log frontend_version in the database

### Version 0.1.1
+ small bugfix in database calls

### Version 0.1
+ initial tagged release
@@ -0,0 +1,10 @@
[postgres]
dbname = lexi
user = lexi
#password = ENTER_PW
host = localhost
port = 5432

[frontend]
most_recent_version = 0.4.8
download_url = https://chrome.google.com/webstore/detail/lexi/emlpaamieadedfhniaablanidblekmha
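The `[postgres]` and `[frontend]` sections are plain INI-style settings, so they can be read with Python's built-in `configparser`. A minimal sketch, assuming the file is saved as `lexi.cfg` (the file name is not visible in this diff):

```python
# Minimal sketch: reading the INI-style config above with the standard library.
# The file name "lexi.cfg" is an assumption; this diff does not show it.
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read("lexi.cfg")

dbname = cfg.get("postgres", "dbname")    # "lexi"
port = cfg.getint("postgres", "port")     # 5432
# The password line is commented out in the template, so fall back gracefully.
password = cfg.get("postgres", "password", fallback=None)

frontend_version = cfg.get("frontend", "most_recent_version")  # "0.4.8"
```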
@@ -0,0 +1 @@
BACKEND_VERSION = "0.2.5"
@@ -0,0 +1,45 @@
import os

SOURCE_BASE = os.path.dirname(os.path.realpath(__file__))
LEXI_BASE = os.path.join(SOURCE_BASE, "..")
LOG_DIR = os.path.join(LEXI_BASE, "logs")
MODELS_DIR = os.path.join(LEXI_BASE, "models")
RANKER_MODELS_DIR = os.path.join(MODELS_DIR, "rankers")
RESOURCES_DIR = os.path.join(LEXI_BASE, "res")

RANKER_MODEL_PATH_TEMPLATE = os.path.join(RANKER_MODELS_DIR, "{}.pickle")
LEXICAL_MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}-lexical.pickle")
MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}.pickle")

RESOURCES = {
    "da": {
        "embeddings":
            # [RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_
            #  filtered.bin",
            [RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow.bin",
             RESOURCES_DIR+"/da/embeddings/da.bin"],
        "lm":
            RESOURCES_DIR+"/da/lm/danish_lm.bin",
        "ubr":
            RESOURCES_DIR+"/da/simplification/danish_dataset_ubr.txt",
        "ranking_training_dataset":
            RESOURCES_DIR+"/da/simplification/clean_danish_ls_dataset.txt",
        "synonyms":
            RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
}

RESOURCES_TEST = {
    "da": {
        "embeddings":
            [RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_"
             "filtered.bin",
             RESOURCES_DIR + "/da/embeddings/da.bin"],
        "lm":
            RESOURCES_DIR + "/da/lm/danish_lm.bin",
        "ubr":
            RESOURCES_DIR + "/da/simplification/danish_dataset_ubr.txt",
        "ranking_training_dataset":
            RESOURCES_DIR + "/da/simplification/clean_danish_ls_dataset.txt",
        "synonyms":
            RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
}
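These path templates and the RESOURCES dictionary centralize where per-language data and per-user models live. A minimal sketch of how they might be consumed elsewhere in the backend; the module path `lexi.config`, the helper `ranker_path_for_user`, and the example user id are assumptions, not shown in this commit:

```python
# Minimal sketch: resolving model and resource paths from the constants above.
# The import path "lexi.config" and the helper below are assumptions.
from lexi.config import RANKER_MODEL_PATH_TEMPLATE, RESOURCES

def ranker_path_for_user(user_id):
    # e.g. <LEXI_BASE>/models/rankers/<user_id>.pickle
    return RANKER_MODEL_PATH_TEMPLATE.format(user_id)

print(ranker_path_for_user("default"))
danish_synonyms = RESOURCES["da"]["synonyms"]      # CSV of Danish synonym pairs
danish_embeddings = RESOURCES["da"]["embeddings"]  # list of embedding binaries
```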
Empty file.
@@ -0,0 +1,267 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
import logging
from collections import defaultdict
import pickle
import random

from textblob.base import BaseTagger
from textblob.tokenizers import WordTokenizer, SentenceTokenizer
from textblob.exceptions import MissingCorpusError

PICKLE = "trontagger-0.1.0.pickle"

"""
Averaged perceptron classifier. Implementation geared for simplicity rather than
efficiency.
"""


class AveragedPerceptron(object):

    '''An averaged perceptron, as implemented by Matthew Honnibal.
    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None
    def save(self, path):
        '''Save the pickled model weights.'''
        # Pickle requires binary mode; text mode 'w' breaks on Python 3.
        return pickle.dump(dict(self.weights), open(path, 'wb'))

    def load(self, path):
        '''Load the pickled model weights.'''
        self.weights = pickle.load(open(path, 'rb'))
        return None


def train(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples`` for
    ``nr_iter`` iterations.
    '''
    model = AveragedPerceptron()
    for i in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            # predict() returns a single label, so register the gold class
            # first and compare the guess against it directly.
            model.classes.add(class_)
            guess = model.predict(features)
            if guess != class_:
                model.update(class_, guess, features)
    model.average_weights()
    return model


class PerceptronTagger(BaseTagger):

    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            self.load(self.AP_MODEL_LOC)

    def tag(self, corpus, tokenize=True):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = SentenceTokenizer().tokenize if tokenize else lambda t: t.split('\n')
        w_split = WordTokenizer().tokenize if tokenize else lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w) for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.
        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        prev, prev2 = self.START
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                context = self.START + [self._normalize(w) for w in words] \
                          + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = "Missing trontagger.pickle file."
            raise MissingCorpusError(msg)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.
        - All words are lower cased
        - Digits in the range 1800-2100 are represented as !YEAR;
        - Other digits are represented as !DIGITS
        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i-1])
        add('i-1 suffix', context[i-1][-3:])
        add('i-2 word', context[i-2])
        add('i+1 word', context[i+1])
        add('i+1 suffix', context[i+1][-3:])
        add('i+2 word', context[i+2])
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag


def _pc(n, d):
    return (float(n) / d) * 100
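The file above is the textblob-style averaged perceptron tagger: `train()` consumes a list of (words, tags) sentence pairs and `tag()` returns (token, tag) tuples. A minimal usage sketch with toy data; the sentences, tag names, and save path are made up for illustration and are not part of this commit:

```python
# Minimal usage sketch for PerceptronTagger with toy data.
# The sentences, tags, and save path below are illustrative only.
tagger = PerceptronTagger(load=False)   # don't try to load the bundled pickle

sentences = [
    (["the", "cat", "sat"], ["DET", "NOUN", "VERB"]),
    (["a", "dog", "barked"], ["DET", "NOUN", "VERB"]),
]
tagger.train(sentences, save_loc="toy_tagger.pickle", nr_iter=5)

# tokenize=False splits on whitespace instead of using the textblob tokenizers
print(tagger.tag("the dog sat", tokenize=False))
# expected to look like [('the', 'DET'), ('dog', 'NOUN'), ('sat', 'VERB')]
```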