forked from jbingel/lexi-server
Commit a120b17 (0 parents): 48 changed files, with 15,630 additions and 0 deletions.
@@ -0,0 +1,7 @@
logs/*
build/*
lexi-venv/*
models/*
*.pyc
trash/
.idea
@@ -0,0 +1,30 @@
# Backend for Lexi software

## Changelog

### Version 0.2.4
+ bugfix in database connection: roll back the connection on error

### Version 0.2.3
+ use synonym list for Danish
+ return simplification objects with unique sessionIds
+ accommodate on-demand simplifications

### Version 0.2.2
+ simplify HTML only between a given start and end character offset

### Version 0.2.1
+ blacklist words per user

### Version 0.2.0
+ massive restructuring of the source
+ mark whether the original word is displayed as the first alternative

### Version 0.1.2
+ log frontend_version in the database

### Version 0.1.1
+ small bugfix in database calls

### Version 0.1
+ initial tagged release
@@ -0,0 +1,10 @@
[postgres]
dbname = lexi
user = lexi
#password = ENTER_PW
host = localhost
port = 5432

[frontend]
most_recent_version = 0.4.8
download_url = https://chrome.google.com/webstore/detail/lexi/emlpaamieadedfhniaablanidblekmha
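The `[postgres]` and `[frontend]` sections are plain INI-style settings, so they can be read with Python's built-in `configparser`. A minimal sketch, assuming the file is saved as `lexi.cfg` (the file name is not visible in this diff):

```python
# Minimal sketch: reading the INI-style config above with the standard library.
# The file name "lexi.cfg" is an assumption; this diff does not show it.
from configparser import ConfigParser

cfg = ConfigParser()
cfg.read("lexi.cfg")

dbname = cfg.get("postgres", "dbname")    # "lexi"
port = cfg.getint("postgres", "port")     # 5432
# The password line is commented out in the template, so fall back gracefully.
password = cfg.get("postgres", "password", fallback=None)

frontend_version = cfg.get("frontend", "most_recent_version")  # "0.4.8"
```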
@@ -0,0 +1 @@
BACKEND_VERSION = "0.2.5"
@@ -0,0 +1,45 @@
import os

SOURCE_BASE = os.path.dirname(os.path.realpath(__file__))
LEXI_BASE = os.path.join(SOURCE_BASE, "..")
LOG_DIR = os.path.join(LEXI_BASE, "logs")
MODELS_DIR = os.path.join(LEXI_BASE, "models")
RANKER_MODELS_DIR = os.path.join(MODELS_DIR, "rankers")
RESOURCES_DIR = os.path.join(LEXI_BASE, "res")

RANKER_MODEL_PATH_TEMPLATE = os.path.join(RANKER_MODELS_DIR, "{}.pickle")
LEXICAL_MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}-lexical.pickle")
MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}.pickle")

RESOURCES = {
    "da": {
        "embeddings":
            # [RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_
            #  filtered.bin",
            [RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow.bin",
             RESOURCES_DIR+"/da/embeddings/da.bin"],
        "lm":
            RESOURCES_DIR+"/da/lm/danish_lm.bin",
        "ubr":
            RESOURCES_DIR+"/da/simplification/danish_dataset_ubr.txt",
        "ranking_training_dataset":
            RESOURCES_DIR+"/da/simplification/clean_danish_ls_dataset.txt",
        "synonyms":
            RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
}

RESOURCES_TEST = {
    "da": {
        "embeddings":
            [RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_"
             "filtered.bin",
             RESOURCES_DIR + "/da/embeddings/da.bin"],
        "lm":
            RESOURCES_DIR + "/da/lm/danish_lm.bin",
        "ubr":
            RESOURCES_DIR + "/da/simplification/danish_dataset_ubr.txt",
        "ranking_training_dataset":
            RESOURCES_DIR + "/da/simplification/clean_danish_ls_dataset.txt",
        "synonyms":
            RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
}
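These path templates and the RESOURCES dictionary centralize where per-language data and per-user models live. A minimal sketch of how they might be consumed elsewhere in the backend; the module path `lexi.config`, the helper `ranker_path_for_user`, and the example user id are assumptions, not shown in this commit:

```python
# Minimal sketch: resolving model and resource paths from the constants above.
# The import path "lexi.config" and the helper below are assumptions.
from lexi.config import RANKER_MODEL_PATH_TEMPLATE, RESOURCES

def ranker_path_for_user(user_id):
    # e.g. <LEXI_BASE>/models/rankers/<user_id>.pickle
    return RANKER_MODEL_PATH_TEMPLATE.format(user_id)

print(ranker_path_for_user("default"))
danish_synonyms = RESOURCES["da"]["synonyms"]      # CSV of Danish synonym pairs
danish_embeddings = RESOURCES["da"]["embeddings"]  # list of embedding binaries
```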
Empty file.
@@ -0,0 +1,267 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
import logging
from collections import defaultdict
import pickle
import random

from textblob.base import BaseTagger
from textblob.tokenizers import WordTokenizer, SentenceTokenizer
from textblob.exceptions import MissingCorpusError

PICKLE = "trontagger-0.1.0.pickle"

"""
Averaged perceptron classifier. Implementation geared for simplicity rather than
efficiency.
"""


class AveragedPerceptron(object):

    '''An averaged perceptron, as implemented by Matthew Honnibal.
    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    '''

    def __init__(self):
        # Each feature gets its own weight vector, so weights is a dict-of-dicts
        self.weights = {}
        self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/clas tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/clas tuples
        # (tstamps is short for timestamps)
        self._tstamps = defaultdict(int)
        # Number of instances seen
        self.i = 0

    def predict(self, features):
        '''Dot-product the features and current weights and return the best label.'''
        scores = defaultdict(float)
        for feat, value in features.items():
            if feat not in self.weights or value == 0:
                continue
            weights = self.weights[feat]
            for label, weight in weights.items():
                scores[label] += value * weight
        # Do a secondary alphabetic sort, for stability
        return max(self.classes, key=lambda label: (scores[label], label))

    def update(self, truth, guess, features):
        '''Update the feature weights.'''
        def upd_feat(c, f, w, v):
            param = (f, c)
            self._totals[param] += (self.i - self._tstamps[param]) * w
            self._tstamps[param] = self.i
            self.weights[f][c] = w + v

        self.i += 1
        if truth == guess:
            return None
        for f in features:
            weights = self.weights.setdefault(f, {})
            upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
            upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
        return None

    def average_weights(self):
        '''Average weights from all iterations.'''
        for feat, weights in self.weights.items():
            new_feat_weights = {}
            for clas, weight in weights.items():
                param = (feat, clas)
                total = self._totals[param]
                total += (self.i - self._tstamps[param]) * weight
                averaged = round(total / float(self.i), 3)
                if averaged:
                    new_feat_weights[clas] = averaged
            self.weights[feat] = new_feat_weights
        return None
    def save(self, path):
        '''Save the pickled model weights.'''
        # Pickle requires binary mode; text mode 'w' breaks on Python 3.
        return pickle.dump(dict(self.weights), open(path, 'wb'))

    def load(self, path):
        '''Load the pickled model weights.'''
        self.weights = pickle.load(open(path, 'rb'))
        return None


def train(nr_iter, examples):
    '''Return an averaged perceptron model trained on ``examples`` for
    ``nr_iter`` iterations.
    '''
    model = AveragedPerceptron()
    for i in range(nr_iter):
        random.shuffle(examples)
        for features, class_ in examples:
            # predict() returns a single label, so register the gold class
            # first and compare the guess against it directly.
            model.classes.add(class_)
            guess = model.predict(features)
            if guess != class_:
                model.update(class_, guess, features)
    model.average_weights()
    return model


class PerceptronTagger(BaseTagger):

    '''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
    See more implementation details here:
    http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
    :param load: Load the pickled model upon instantiation.
    '''

    START = ['-START-', '-START2-']
    END = ['-END-', '-END2-']
    AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

    def __init__(self, load=True):
        self.model = AveragedPerceptron()
        self.tagdict = {}
        self.classes = set()
        if load:
            self.load(self.AP_MODEL_LOC)

    def tag(self, corpus, tokenize=True):
        '''Tags a string `corpus`.'''
        # Assume untokenized corpus has \n between sentences and ' ' between words
        s_split = SentenceTokenizer().tokenize if tokenize else lambda t: t.split('\n')
        w_split = WordTokenizer().tokenize if tokenize else lambda s: s.split()

        def split_sents(corpus):
            for s in s_split(corpus):
                yield w_split(s)

        prev, prev2 = self.START
        tokens = []
        for words in split_sents(corpus):
            context = self.START + [self._normalize(w) for w in words] + self.END
            for i, word in enumerate(words):
                tag = self.tagdict.get(word)
                if not tag:
                    features = self._get_features(i, word, context, prev, prev2)
                    tag = self.model.predict(features)
                tokens.append((word, tag))
                prev2 = prev
                prev = tag
        return tokens

    def train(self, sentences, save_loc=None, nr_iter=5):
        '''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
        controls the number of Perceptron training iterations.
        :param sentences: A list of (words, tags) tuples.
        :param save_loc: If not ``None``, saves a pickled model in this location.
        :param nr_iter: Number of training iterations.
        '''
        self._make_tagdict(sentences)
        self.model.classes = self.classes
        prev, prev2 = self.START
        for iter_ in range(nr_iter):
            c = 0
            n = 0
            for words, tags in sentences:
                context = self.START + [self._normalize(w) for w in words] \
                          + self.END
                for i, word in enumerate(words):
                    guess = self.tagdict.get(word)
                    if not guess:
                        feats = self._get_features(i, word, context, prev, prev2)
                        guess = self.model.predict(feats)
                        self.model.update(tags[i], guess, feats)
                    prev2 = prev
                    prev = guess
                    c += guess == tags[i]
                    n += 1
            random.shuffle(sentences)
            logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
        self.model.average_weights()
        # Pickle as a binary file
        if save_loc is not None:
            pickle.dump((self.model.weights, self.tagdict, self.classes),
                        open(save_loc, 'wb'), -1)
        return None

    def load(self, loc):
        '''Load a pickled model.'''
        try:
            w_td_c = pickle.load(open(loc, 'rb'))
        except IOError:
            msg = "Missing trontagger.pickle file."
            raise MissingCorpusError(msg)
        self.model.weights, self.tagdict, self.classes = w_td_c
        self.model.classes = self.classes
        return None

    def _normalize(self, word):
        '''Normalization used in pre-processing.
        - All words are lower cased
        - Digits in the range 1800-2100 are represented as !YEAR;
        - Other digits are represented as !DIGITS
        :rtype: str
        '''
        if '-' in word and word[0] != '-':
            return '!HYPHEN'
        elif word.isdigit() and len(word) == 4:
            return '!YEAR'
        elif word[0].isdigit():
            return '!DIGITS'
        else:
            return word.lower()

    def _get_features(self, i, word, context, prev, prev2):
        '''Map tokens into a feature representation, implemented as a
        {hashable: float} dict. If the features change, a new model must be
        trained.
        '''
        def add(name, *args):
            features[' '.join((name,) + tuple(args))] += 1

        i += len(self.START)
        features = defaultdict(int)
        # It's useful to have a constant feature, which acts sort of like a prior
        add('bias')
        add('i suffix', word[-3:])
        add('i pref1', word[0])
        add('i-1 tag', prev)
        add('i-2 tag', prev2)
        add('i tag+i-2 tag', prev, prev2)
        add('i word', context[i])
        add('i-1 tag+i word', prev, context[i])
        add('i-1 word', context[i-1])
        add('i-1 suffix', context[i-1][-3:])
        add('i-2 word', context[i-2])
        add('i+1 word', context[i+1])
        add('i+1 suffix', context[i+1][-3:])
        add('i+2 word', context[i+2])
        return features

    def _make_tagdict(self, sentences):
        '''Make a tag dictionary for single-tag words.'''
        counts = defaultdict(lambda: defaultdict(int))
        for words, tags in sentences:
            for word, tag in zip(words, tags):
                counts[word][tag] += 1
                self.classes.add(tag)
        freq_thresh = 20
        ambiguity_thresh = 0.97
        for word, tag_freqs in counts.items():
            tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
            n = sum(tag_freqs.values())
            # Don't add rare words to the tag dictionary
            # Only add quite unambiguous words
            if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
                self.tagdict[word] = tag


def _pc(n, d):
    return (float(n) / d) * 100
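The file above is the textblob-style averaged perceptron tagger: `train()` consumes a list of (words, tags) sentence pairs and `tag()` returns (token, tag) tuples. A minimal usage sketch with toy data; the sentences, tag names, and save path are made up for illustration and are not part of this commit:

```python
# Minimal usage sketch for PerceptronTagger with toy data.
# The sentences, tags, and save path below are illustrative only.
tagger = PerceptronTagger(load=False)   # don't try to load the bundled pickle

sentences = [
    (["the", "cat", "sat"], ["DET", "NOUN", "VERB"]),
    (["a", "dog", "barked"], ["DET", "NOUN", "VERB"]),
]
tagger.train(sentences, save_loc="toy_tagger.pickle", nr_iter=5)

# tokenize=False splits on whitespace instead of using the textblob tokenizers
print(tagger.tag("the dog sat", tokenize=False))
# expected to look like [('the', 'DET'), ('dog', 'NOUN'), ('sat', 'VERB')]
```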