Commit

initial public commit
jbingel committed Jan 11, 2019
0 parents commit a120b17
Showing 48 changed files with 15,630 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitignore
@@ -0,0 +1,7 @@
logs/*
build/*
lexi-venv/*
models/*
*.pyc
trash/
.idea
30 changes: 30 additions & 0 deletions README.md
@@ -0,0 +1,30 @@
# Backend for Lexi software

## Changelog

### Version 0.2.4
+ bugfix in Database connection: roll back connection on error

### Version 0.2.3
+ using synonym list for Danish
+ return simplification objects with unique sessionIds
+ accommodate on-demand simplifications

### Version 0.2.2
+ simplify HTML only between given start and end character offsets

### Version 0.2.1
+ blacklist words per user

### Version 0.2.0
+ massive restructuring of source
+ mark whether the original word is displayed as the first alternative

### Version 0.1.2
+ log frontend_version in database

### Version 0.1.1
+ small bugfix in database calls

### Version 0.1
+ initial tagged release
10 changes: 10 additions & 0 deletions lexi.cfg
@@ -0,0 +1,10 @@
[postgres]
dbname = lexi
user = lexi
#password = ENTER_PW
host = localhost
port = 5432

[frontend]
most_recent_version = 0.4.8
download_url = https://chrome.google.com/webstore/detail/lexi/emlpaamieadedfhniaablanidblekmha
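
The backend presumably consumes this file with Python's configparser. Below is a minimal sketch of how that could look; the psycopg2 driver and the connection call are assumptions for illustration and are not confirmed by this commit.

# Sketch only: read lexi.cfg and open a Postgres connection.
# Assumes psycopg2 is installed; the password option is commented out
# in the shipped file, hence the fallback.
from configparser import ConfigParser
import psycopg2

config = ConfigParser()
config.read("lexi.cfg")

pg = config["postgres"]
conn = psycopg2.connect(
    dbname=pg["dbname"],
    user=pg["user"],
    password=pg.get("password", ""),
    host=pg["host"],
    port=pg.getint("port"),
)

print(config["frontend"]["most_recent_version"])  # 0.4.8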
1 change: 1 addition & 0 deletions lexi/__init__.py
@@ -0,0 +1 @@
BACKEND_VERSION = "0.2.5"
45 changes: 45 additions & 0 deletions lexi/config.py
@@ -0,0 +1,45 @@
import os

SOURCE_BASE = os.path.dirname(os.path.realpath(__file__))
LEXI_BASE = os.path.join(SOURCE_BASE, "..")
LOG_DIR = os.path.join(LEXI_BASE, "logs")
MODELS_DIR = os.path.join(LEXI_BASE, "models")
RANKER_MODELS_DIR = os.path.join(MODELS_DIR, "rankers")
RESOURCES_DIR = os.path.join(LEXI_BASE, "res")

RANKER_MODEL_PATH_TEMPLATE = os.path.join(RANKER_MODELS_DIR, "{}.pickle")
LEXICAL_MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}-lexical.pickle")
MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}.pickle")

RESOURCES = {
"da": {
"embeddings":
#[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_
# filtered.bin",
[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow.bin",
RESOURCES_DIR+"/da/embeddings/da.bin"],
"lm":
RESOURCES_DIR+"/da/lm/danish_lm.bin",
"ubr":
RESOURCES_DIR+"/da/simplification/danish_dataset_ubr.txt",
"ranking_training_dataset":
RESOURCES_DIR+"/da/simplification/clean_danish_ls_dataset.txt",
"synonyms":
RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
}

RESOURCES_TEST = {
"da": {
"embeddings":
[RESOURCES_DIR+"/da/embeddings/danish_word_vectors_1300_cbow_"
"filtered.bin",
RESOURCES_DIR + "/da/embeddings/da.bin"],
"lm":
RESOURCES_DIR + "/da/lm/danish_lm.bin",
"ubr":
RESOURCES_DIR + "/da/simplification/danish_dataset_ubr.txt",
"ranking_training_dataset":
RESOURCES_DIR + "/da/simplification/clean_danish_ls_dataset.txt",
"synonyms":
RESOURCES_DIR + "/da/synonyms/da_synonyms_combined.csv"}
}
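
For illustration, the path templates and the resource registry above could be used as follows; the ranker name "da_default" is hypothetical and does not appear in this commit.

# Sketch only: expanding the pickle-path templates and looking up resources.
from lexi.config import RANKER_MODEL_PATH_TEMPLATE, RESOURCES

ranker_path = RANKER_MODEL_PATH_TEMPLATE.format("da_default")
# -> <LEXI_BASE>/models/rankers/da_default.pickle

danish_embeddings = RESOURCES["da"]["embeddings"]  # list of two .bin paths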
Empty file added lexi/core/__init__.py
Empty file.
267 changes: 267 additions & 0 deletions lexi/core/avgperceptron.py
@@ -0,0 +1,267 @@
# -*- coding: utf-8 -*-
from __future__ import absolute_import
import os
import logging
from collections import defaultdict
import pickle
import random

from textblob.base import BaseTagger
from textblob.tokenizers import WordTokenizer, SentenceTokenizer
from textblob.exceptions import MissingCorpusError

PICKLE = "trontagger-0.1.0.pickle"

"""
Averaged perceptron classifier. Implementation geared for simplicity rather than
efficiency.
"""


class AveragedPerceptron(object):

'''An averaged perceptron, as implemented by Matthew Honnibal.
See more implementation details here:
http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
'''

def __init__(self):
# Each feature gets its own weight vector, so weights is a dict-of-dicts
self.weights = {}
self.classes = set()
        # The accumulated values, for the averaging. These will be keyed by
        # feature/class tuples
        self._totals = defaultdict(int)
        # The last time the feature was changed, for the averaging. Also
        # keyed by feature/class tuples
        # (tstamps is short for timestamps)
self._tstamps = defaultdict(int)
# Number of instances seen
self.i = 0

def predict(self, features):
'''Dot-product the features and current weights and return the best label.'''
scores = defaultdict(float)
for feat, value in features.items():
if feat not in self.weights or value == 0:
continue
weights = self.weights[feat]
for label, weight in weights.items():
scores[label] += value * weight
# Do a secondary alphabetic sort, for stability
return max(self.classes, key=lambda label: (scores[label], label))

def update(self, truth, guess, features):
'''Update the feature weights.'''
def upd_feat(c, f, w, v):
param = (f, c)
self._totals[param] += (self.i - self._tstamps[param]) * w
self._tstamps[param] = self.i
self.weights[f][c] = w + v

self.i += 1
if truth == guess:
return None
for f in features:
weights = self.weights.setdefault(f, {})
upd_feat(truth, f, weights.get(truth, 0.0), 1.0)
upd_feat(guess, f, weights.get(guess, 0.0), -1.0)
return None

def average_weights(self):
'''Average weights from all iterations.'''
for feat, weights in self.weights.items():
new_feat_weights = {}
for clas, weight in weights.items():
param = (feat, clas)
total = self._totals[param]
total += (self.i - self._tstamps[param]) * weight
averaged = round(total / float(self.i), 3)
if averaged:
new_feat_weights[clas] = averaged
self.weights[feat] = new_feat_weights
return None

def save(self, path):
'''Save the pickled model weights.'''
        return pickle.dump(dict(self.weights), open(path, 'wb'))

def load(self, path):
'''Load the pickled model weights.'''
        self.weights = pickle.load(open(path, 'rb'))
return None


def train(nr_iter, examples):
'''Return an averaged perceptron model trained on ``examples`` for
``nr_iter`` iterations.
'''
model = AveragedPerceptron()
for i in range(nr_iter):
random.shuffle(examples)
for features, class_ in examples:
            # predict() already returns the highest-scoring label
            guess = model.predict(features)
if guess != class_:
model.update(class_, guess, features)
model.average_weights()
return model


class PerceptronTagger(BaseTagger):

'''Greedy Averaged Perceptron tagger, as implemented by Matthew Honnibal.
See more implementation details here:
http://honnibal.wordpress.com/2013/09/11/a-good-part-of-speechpos-tagger-in-about-200-lines-of-python/
:param load: Load the pickled model upon instantiation.
'''

START = ['-START-', '-START2-']
END = ['-END-', '-END2-']
AP_MODEL_LOC = os.path.join(os.path.dirname(__file__), PICKLE)

def __init__(self, load=True):
self.model = AveragedPerceptron()
self.tagdict = {}
self.classes = set()
if load:
self.load(self.AP_MODEL_LOC)

def tag(self, corpus, tokenize=True):
'''Tags a string `corpus`.'''
# Assume untokenized corpus has \n between sentences and ' ' between words
s_split = SentenceTokenizer().tokenize if tokenize else lambda t: t.split('\n')
w_split = WordTokenizer().tokenize if tokenize else lambda s: s.split()
def split_sents(corpus):
for s in s_split(corpus):
yield w_split(s)

prev, prev2 = self.START
tokens = []
for words in split_sents(corpus):
context = self.START + [self._normalize(w) for w in words] + self.END
for i, word in enumerate(words):
tag = self.tagdict.get(word)
if not tag:
features = self._get_features(i, word, context, prev, prev2)
tag = self.model.predict(features)
tokens.append((word, tag))
prev2 = prev
prev = tag
return tokens

def train(self, sentences, save_loc=None, nr_iter=5):
'''Train a model from sentences, and save it at ``save_loc``. ``nr_iter``
controls the number of Perceptron training iterations.
:param sentences: A list of (words, tags) tuples.
:param save_loc: If not ``None``, saves a pickled model in this location.
:param nr_iter: Number of training iterations.
'''
self._make_tagdict(sentences)
self.model.classes = self.classes
prev, prev2 = self.START
for iter_ in range(nr_iter):
c = 0
n = 0
for words, tags in sentences:
context = self.START + [self._normalize(w) for w in words] \
+ self.END
for i, word in enumerate(words):
guess = self.tagdict.get(word)
if not guess:
feats = self._get_features(i, word, context, prev, prev2)
guess = self.model.predict(feats)
self.model.update(tags[i], guess, feats)
prev2 = prev
prev = guess
c += guess == tags[i]
n += 1
random.shuffle(sentences)
logging.info("Iter {0}: {1}/{2}={3}".format(iter_, c, n, _pc(c, n)))
self.model.average_weights()
# Pickle as a binary file
if save_loc is not None:
pickle.dump((self.model.weights, self.tagdict, self.classes),
open(save_loc, 'wb'), -1)
return None

def load(self, loc):
'''Load a pickled model.'''
try:
w_td_c = pickle.load(open(loc, 'rb'))
except IOError:
msg = ("Missing trontagger.pickle file.")
raise MissingCorpusError(msg)
self.model.weights, self.tagdict, self.classes = w_td_c
self.model.classes = self.classes
return None

def _normalize(self, word):
'''Normalization used in pre-processing.
- All words are lower cased
- Digits in the range 1800-2100 are represented as !YEAR;
- Other digits are represented as !DIGITS
:rtype: str
'''
if '-' in word and word[0] != '-':
return '!HYPHEN'
elif word.isdigit() and len(word) == 4:
return '!YEAR'
elif word[0].isdigit():
return '!DIGITS'
else:
return word.lower()

def _get_features(self, i, word, context, prev, prev2):
'''Map tokens into a feature representation, implemented as a
{hashable: float} dict. If the features change, a new model must be
trained.
'''
def add(name, *args):
features[' '.join((name,) + tuple(args))] += 1

i += len(self.START)
features = defaultdict(int)
# It's useful to have a constant feature, which acts sort of like a prior
add('bias')
add('i suffix', word[-3:])
add('i pref1', word[0])
add('i-1 tag', prev)
add('i-2 tag', prev2)
add('i tag+i-2 tag', prev, prev2)
add('i word', context[i])
add('i-1 tag+i word', prev, context[i])
add('i-1 word', context[i-1])
add('i-1 suffix', context[i-1][-3:])
add('i-2 word', context[i-2])
add('i+1 word', context[i+1])
add('i+1 suffix', context[i+1][-3:])
add('i+2 word', context[i+2])
return features

def _make_tagdict(self, sentences):
'''Make a tag dictionary for single-tag words.'''
counts = defaultdict(lambda: defaultdict(int))
for words, tags in sentences:
for word, tag in zip(words, tags):
counts[word][tag] += 1
self.classes.add(tag)
freq_thresh = 20
ambiguity_thresh = 0.97
for word, tag_freqs in counts.items():
tag, mode = max(tag_freqs.items(), key=lambda item: item[1])
n = sum(tag_freqs.values())
# Don't add rare words to the tag dictionary
# Only add quite unambiguous words
if n >= freq_thresh and (float(mode) / n) >= ambiguity_thresh:
self.tagdict[word] = tag


def _pc(n, d):
return (float(n) / d) * 100
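
A toy usage sketch of the tagger defined above: the two training sentences and their tags are invented for illustration, and running it requires the textblob dependency (and its tokenizer data) that the module imports.

# Sketch only: train the averaged-perceptron tagger on a tiny invented corpus.
from lexi.core.avgperceptron import PerceptronTagger

sentences = [
    (["the", "cat", "sleeps"], ["DET", "NOUN", "VERB"]),
    (["a", "dog", "barks"], ["DET", "NOUN", "VERB"]),
]

tagger = PerceptronTagger(load=False)  # skip loading the missing default pickle
tagger.train(sentences, nr_iter=5)     # updates weights, then averages them
print(tagger.tag("the dog sleeps"))    # e.g. [('the', 'DET'), ('dog', 'NOUN'), ...]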