Commit 54bcce7

- Moving the ML part to a dedicated class, a scorer, which is used by ranker and CWI

- Ranking by individual scoring, expecting binary user ratings on new items for updates
jbingel committed May 8, 2019
1 parent e126eba commit 54bcce7
Showing 7 changed files with 335 additions and 457 deletions.
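
In outline, the commit makes the ranker and the CWI component delegate featurization and scoring to a shared scorer object (visible below as ranker.scorer.featurizer). A minimal sketch of that assumed arrangement; the class bodies are illustrative, not the repository's actual API:

# Hypothetical sketch of the new division of labour (not lexi's actual API)
class Scorer:
    """Owns the ML part: a featurizer plus a trained model."""

    def __init__(self, featurizer, model):
        self.featurizer = featurizer  # e.g. the new LexicalFeaturizer below
        self.model = model            # any callable: feature vector -> difficulty score

    def score(self, sentence, start, end):
        return self.model(self.featurizer.featurize(sentence, start, end))


class Ranker:
    """Ranks substitution candidates by their individual scores."""

    def __init__(self, scorer):
        self.scorer = scorer          # shared with the CWI component

    def rank(self, candidates):
        # candidates: (sentence, start_offset, end_offset) triples
        return sorted(candidates, key=lambda c: self.scorer.score(*c))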
11 changes: 7 additions & 4 deletions lexi/config.py

@@ -4,13 +4,16 @@
 LEXI_BASE = os.path.join(SOURCE_BASE, "..")
 LOG_DIR = os.path.join(LEXI_BASE, "logs")
 MODELS_DIR = os.path.join(LEXI_BASE, "models")
-RANKER_MODELS_DIR = os.path.join(MODELS_DIR, "rankers")
-CWI_MODELS_DIR = os.path.join(MODELS_DIR, "cwi")
+RANKER_DIR = os.path.join(MODELS_DIR, "rankers")
+CWI_DIR = os.path.join(MODELS_DIR, "cwi")
+SCORERS_DIR = os.path.join(MODELS_DIR, "scorers")
 RESOURCES_DIR = os.path.join(LEXI_BASE, "res")
 STANFORDNLP = os.path.join(RESOURCES_DIR, "stanfordnlp_resources")
 
-RANKER_MODEL_PATH_TEMPLATE = os.path.join(RANKER_MODELS_DIR, "{}.pickle")
-CWI_MODEL_PATH_TEMPLATE = os.path.join(CWI_MODELS_DIR, "{}.pickle")
+RANKER_PATH_TEMPLATE = os.path.join(RANKER_DIR, "{}.json")
+CWI_PATH_TEMPLATE = os.path.join(CWI_DIR, "{}.json")
+SCORER_PATH_TEMPLATE = os.path.join(SCORERS_DIR, "{}.json")
+SCORER_MODEL_PATH_TEMPLATE = os.path.join(SCORERS_DIR, "{}.pt")
 
 LEXICAL_MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}-lexical.pickle")
 MODEL_PATH_TEMPLATE = os.path.join(MODELS_DIR, "{}.pickle")
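
Each template's {} slot is filled with a model identifier via str.format; for illustration (the identifier "default" is invented):

from lexi.config import RANKER_PATH_TEMPLATE, SCORER_MODEL_PATH_TEMPLATE

# "default" is an invented identifier, purely for illustration
ranker_path = RANKER_PATH_TEMPLATE.format("default")           # .../models/rankers/default.json
scorer_weights = SCORER_MODEL_PATH_TEMPLATE.format("default")  # .../models/scorers/default.pt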
41 changes: 16 additions & 25 deletions lexi/core/endpoints.py

@@ -181,10 +181,9 @@ def update_ranker(ranker, user_id, feedback, overall_rating=0):
     :return:
     """
     update_batch = []
-    featurized_words = {}
 
     logger.debug("Updating ranker: {}".format(ranker))
-    logger.debug("Ranker has featurizer: {}".format(ranker.featurizer))
+    logger.debug("Ranker has featurizer: {}".format(ranker.scorer.featurizer))
 
     # iterate over feedback items (user choices for simplified words)
     for _, simplification in feedback.items():
@@ -207,35 +206,27 @@ def update_ranker(ranker, user_id, feedback, overall_rating=0):
         original_sentence = simplification.get("sentence")
         original_start_offset = simplification.get("word_offset_start")
         original_end_offset = simplification.get("word_offset_end")
-        for w in choices:
-            if w not in featurized_words:
-                # construct modified sentence
-                modified_sentence = "{}{}{}".format(
-                    original_sentence[:original_start_offset],
-                    w,
-                    original_sentence[original_end_offset:])
-                # featurize word in modified context
-                logger.debug("Word in modified context: {} {} {} {}".format(
-                    modified_sentence, w, original_start_offset,
-                    original_start_offset+len(w)))
-                featurized_words[w] = ranker.featurizer.featurize(
-                    modified_sentence, original_start_offset,
-                    original_start_offset+len(w))
 
         simple_index = selection % len(choices)
         simple_word = choices[simple_index]
         difficult_words = [w for w in choices if not w == simple_word]
 
-        # add feature vectors to update batch
-        update_batch.append((featurized_words[simple_word], 0))
-        for difficult in difficult_words:
-            update_batch.append((featurized_words[difficult], 1))
-            # update_batch.append((featurized_words[simple_word],
-            #                      featurized_words[difficult]))
+        for w in choices:
+            # construct modified sentence
+            modified_sentence = "{}{}{}".format(
+                original_sentence[:original_start_offset],
+                w,
+                original_sentence[original_end_offset:])
+
+            # put modified sentences into update batch (list of tuples
+            # (items, label), where items are 3-tuples
+            # (modified_sent, start_offset, end_offset))
+            update_batch.append((
+                (modified_sentence,  # item
+                 original_start_offset,
+                 original_start_offset + len(w)),
+                int(w != simple_word)))  # label: 0 if w simple, 1 if difficult
 
     if update_batch:
         update_batch = list(zip(*update_batch))
-        # print(help(ranker))
         ranker.update(update_batch)
         ranker.save(user_id)
     else:
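
To make the new batch format concrete: a self-contained sketch of how the (item, label) pairs are built and then transposed by list(zip(*update_batch)) before being handed to ranker.update. The sentence and candidate words are invented example data.

# Invented example data, mirroring the shapes used in update_ranker
sentence = "The colonel gave an unequivocal answer."
start, end = 20, 31              # offsets of the original word "unequivocal"
choices = ["unequivocal", "exact"]
simple_word = "exact"            # the candidate the user selected

update_batch = []
for w in choices:
    modified = sentence[:start] + w + sentence[end:]
    update_batch.append(((modified, start, start + len(w)),
                         int(w != simple_word)))

items, labels = list(zip(*update_batch))  # transpose into (items, labels)
# items  == (("The colonel gave an unequivocal answer.", 20, 31),
#            ("The colonel gave an exact answer.", 20, 25))
# labels == (1, 0)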
307 changes: 60 additions & 247 deletions lexi/core/featurize/featurizers.py

@@ -1,254 +1,67 @@
-import numpy as np
+import logging
+import jsonpickle
 from sklearn.feature_extraction import DictVectorizer
+from sklearn.preprocessing import MinMaxScaler
 
-from lexi.core.featurize import extract_lexical_feats, feat_util
-from lexi.core.featurize.extract_sentence_feats import TreeNode
-from abc import ABCMeta, abstractmethod
-
-
-class LabelMapper:
-    def __init__(self):
-        self.label2id = dict()
-        self.id2label = {}
-
-    def map_batch(self, labels):
-        out = []
-        for label in labels:
-            out.append(self.map(label))
-        return out
-
-    def map(self, label):
-        if label not in self.label2id:
-            newId = len(self.label2id)
-            self.label2id[label] = newId
-            self.id2label[newId] = label
-        return self.label2id[label]
-
-    def map_inv(self, ids):
-        out = []
-        for _id in ids:
-            out.append(self.id2label.get(_id, "?"))
-        return out
-
-
-class Featurizer:
-
-    def __init__(self, features=None):
-        self.mapper = LabelMapper()
-        self.features = features
-
-    def fit_transform(self, data):
-        return self.transform(data, fit=True)
-
-    def transform(self, data, fit=False):
-        raise NotImplementedError
-
-    def transform_plain(self, data):
-        raise NotImplementedError
-
-    def map_labels(self, data):
-        return self.mapper.map_batch(data)
-
-    def map_inv(self, ids):
-        return self.mapper.map_inv(ids)
-
-
-class LexicalFeaturizer(Featurizer):
-
-    def __init__(self, features=None):
-        super().__init__(features)
-        self.vectorizer = DictVectorizer()
-
-    def featurize_word(self, w):
-        word = extract_lexical_feats.Word(w)
-        return word.featurize_by_type(self.features)
-
-    def transform(self, data, fit=False):
-        feats = []
-        labels = []
-        for word in data:
-            feats.append(self.featurize_word(word))
-            # labels.append(label)
-        if fit:
-            feats = self.vectorizer.fit_transform(feats)
-        else:
-            feats = self.vectorizer.transform(feats)
-
-        # labels = np.array(labels)
-        return feats, labels
-
-    def transform_plain(self, data):
-        return self.transform(data, fit=False)
-
-
-class PystructEdgeFeaturizer(Featurizer):
-
-    def __init__(self, features=None):
-        super().__init__(features)
-        self.node_vectorizer = DictVectorizer()
-        self.edge_vectorizer = DictVectorizer()
-        # self.nlp = spacy.load('en')
-        print("Loaded natural language processor.")
-
-    def prettyprintweights(self, linearmodel):
-        for name, value in zip(self.node_vectorizer.feature_names_, linearmodel.coef_[0]):
-            print("\t".join([name, str(value)]))
-
-    def featurize_sentence(self, s):
-        nodefeats = []
-        edges = []
-        edgefeats = []
-        labels = []
-        for l, i in zip(s["label"], s["idx"]):
-            i -= 1
-            w = TreeNode(s, i, s["form"][i], s["lemma"][i], s["pos"][i],
-                         s["ne"][i], s["head"], s["deprel"], l)
-            nodefeats.append(w.featurize_by_type(self.features))
-            head = int(s["head"][i])
-            tgt = head if head > 0 else i+1
-            edges.append((tgt-1, i))
-            edgefeats.append(w.featurize_by_type(["dependency"]))
-            labels.append(l)
-        return nodefeats, edges, edgefeats, labels
-
-    def fit_transform(self, data):
-        return self.transform(data, fit=True)
-
-    def transform(self, data, fit=False):
-        labels = []
-        X = []
-        y = []
-        sentence_lengths = []
-        Xnodefeats = []
-        Xedges = []
-        Xedgefeats = []
-        print("Collecting features...")
-        # for s in feat_util.read_sentences_plain(data):
-        for s in feat_util.read_sentences(data):
-            nodefeats, edges, edgefeats, nodelabels = self.featurize_sentence(s)
-            sentence_lengths.append(len(nodefeats))
-            Xnodefeats.extend(nodefeats)
-            Xedges.extend(edges)
-            Xedgefeats.extend(edgefeats)
-            labels.extend(nodelabels)
-
-        if fit:
-            Xnodefeats = self.node_vectorizer.fit_transform(Xnodefeats).toarray()
-            Xedgefeats = self.edge_vectorizer.fit_transform(Xedgefeats).toarray()
-        else:
-            Xnodefeats = self.node_vectorizer.transform(Xnodefeats).toarray()
-            Xedgefeats = self.edge_vectorizer.transform(Xedgefeats).toarray()
-        i = 0
-        for sl in sentence_lengths:
-            X.append((Xnodefeats[i:i+sl], np.array(Xedges[i:i+sl]), Xedgefeats[i:i+sl]))
-            y.append(np.array(self.mapper.map_batch(labels[i:i + sl])))
-            i = i+sl
-
-        for i in range(len(X)):
-            if not len(X[i][0]) == len(y[i]):
-                print("unequal {}: {} vs {}".format(i, len(X[i][0]), len(y[i])))
-        return X, y
-
-    def transform_plain(self, data):
-        X = []
-        parses = []
-        sentence_lengths = []
-        Xnodefeats = []
-        Xedges = []
-        Xedgefeats = []
-        print("Collecting features...")
-        for s in feat_util.read_sentences_plain(data):
-            nodefeats, edges, edgefeats, _ = self.featurize_sentence(s)
-            sentence_lengths.append(len(nodefeats))
-            Xnodefeats.extend(nodefeats)
-            Xedges.extend(edges)
-            Xedgefeats.extend(edgefeats)
-            parses.append(s)
-
-        Xnodefeats = self.node_vectorizer.transform(Xnodefeats).toarray()
-        Xedgefeats = self.edge_vectorizer.transform(Xedgefeats).toarray()
-        i = 0
-        for sl in sentence_lengths:
-            X.append((Xnodefeats[i:i+sl], np.array(Xedges[i:i+sl]), Xedgefeats[i:i+sl]))
-            i = i+sl
-        return X, parses
-
-
-class PystructChainFeaturizer(Featurizer):
-
-    def __init__(self, features=None):
-        super().__init__(features)
-        self.node_vectorizer = DictVectorizer()
-        self.edge_vectorizer = DictVectorizer()
-        # self.nlp = spacy.load('en')
-        print("Loaded natural language processor.")
-
-    def prettyprintweights(self, linearmodel):
-        for name, value in zip(self.node_vectorizer.feature_names_, linearmodel.coef_[0]):
-            print("\t".join([name, str(value)]))
-
-    def featurize_sentence(self, s):
-        nodefeats = []
-        labels = []
-        for l, i in zip(s["label"], s["idx"]):
-            i -= 1
-            w = TreeNode(s, i, s["form"][i], s["lemma"][i], s["pos"][i],
-                         s["ne"][i], s["head"], s["deprel"], l)
-            nodefeats.append(w.featurize_by_type(self.features))
-            labels.append(l)
-        return nodefeats, labels
-
-    def fit_transform(self, data):
-        return self.transform(data, fit=True)
-
-    def transform(self, data, fit=False):
-        labels = []
-        X = []
-        y = []
-        sentence_lengths = []
-        Xnodefeats = []
-        Xedges = []
-        Xedgefeats = []
-        print("Collecting features...")
-        # for s in feat_util.read_sentences_plain(data):
-        for s in feat_util.read_sentences(data):
-            nodefeats, nodelabels = self.featurize_sentence(s)
-            sentence_lengths.append(len(nodefeats))
-            Xnodefeats.extend(nodefeats)
-            labels.extend(nodelabels)
-
-        if fit:
-            Xnodefeats = self.node_vectorizer.fit_transform(Xnodefeats).toarray()
-            Xedgefeats = self.edge_vectorizer.fit_transform(Xedgefeats).toarray()
-        else:
-            Xnodefeats = self.node_vectorizer.transform(Xnodefeats).toarray()
-            Xedgefeats = self.edge_vectorizer.transform(Xedgefeats).toarray()
-        i = 0
-        for sl in sentence_lengths:
-            X.append((Xnodefeats[i:i+sl], np.array(Xedges[i:i+sl]), Xedgefeats[i:i+sl]))
-            y.append(np.array(self.mapper.map_batch(labels[i:i + sl])))
-            i = i+sl
-
-        for i in range(len(X)):
-            if not len(X[i][0]) == len(y[i]):
-                print("unequal {}: {} vs {}".format(i, len(X[i][0]), len(y[i])))
-        return X, y
-
-    def transform_plain(self, data):
-        X = []
-        parses = []
-        sentence_lengths = []
-        Xnodefeats = []
-        print("Collecting features...")
-        for s in feat_util.read_sentences_plain(data):
-            nodefeats, _ = self.featurize_sentence(s)
-            sentence_lengths.append(len(nodefeats))
-            Xnodefeats.extend(nodefeats)
-            parses.append(s)
-
-        Xnodefeats = self.node_vectorizer.transform(Xnodefeats).toarray()
-        i = 0
-        for sl in sentence_lengths:
-            X.append(Xnodefeats[i:i+sl])
-            i = i+sl
-        return X, parses
+logger = logging.getLogger('lexi')
+
+
+class LexicalFeaturizer(DictVectorizer):
+
+    def __init__(self):
+        super().__init__(sparse=False)
+        self.scaler = MinMaxScaler()
+        self.cache = {}
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        if 'cache' in state:
+            del state['cache']
+        return state
+
+    def __setstate__(self, state):
+        self.__dict__.update(state)
+        self.cache = {}
+
+    def dimensions(self):
+        if hasattr(self, "feature_names_"):
+            return len(self.get_feature_names())
+        else:
+            logger.warning("Asking for vectorizer dimensionality, "
+                           "but vectorizer has not been fit yet. Returning 0.")
+            return 0
+
+    def to_dict(self, sentence, start_offset, end_offset):
+        featuredict = dict()
+        featuredict["word_length"] = end_offset - start_offset
+        featuredict["sentence_length"] = len(sentence)
+        return featuredict
+
+    def fit(self, words_in_context):
+        wic_dicts = [self.to_dict(*wic) for wic in words_in_context]
+        vecs = super().fit_transform(wic_dicts)
+        self.scaler.fit(vecs)
+
+    def featurize(self, sentence, start_offset, end_offset, scale=True):
+        cached = self.cache.get((sentence, start_offset, end_offset))
+        if cached is not None:
+            return cached
+        x = self.transform(self.to_dict(sentence, start_offset, end_offset))
+        if scale:
+            x = self.scaler.transform(x)
+        self.cache[(sentence, start_offset, end_offset)] = x
+        return x
+
+    def save(self, path):
+        json = jsonpickle.encode(self)
+        with open(path, "w") as jsonfile:
+            jsonfile.write(json)
+
+    @staticmethod
+    def staticload(path):
+        with open(path) as jsonfile:
+            json = jsonfile.read()
+        featurizer = jsonpickle.decode(json)
+        featurizer.cache = {}
+        return featurizer
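
For orientation, a brief usage sketch of the new featurizer; the sentences, offsets, and file path below are invented for the example:

from lexi.core.featurize.featurizers import LexicalFeaturizer

# Invented words-in-context: (sentence, start_offset, end_offset) triples
wic = [("The colonel gave an unequivocal answer.", 20, 31),
       ("She gave a clear answer.", 11, 16)]

featurizer = LexicalFeaturizer()
featurizer.fit(wic)                # fits both the DictVectorizer and the MinMaxScaler
x = featurizer.featurize(*wic[0])  # 1x2 vector of word_length and sentence_length, scaled
print(featurizer.dimensions())     # -> 2

featurizer.save("/tmp/featurizer.json")   # jsonpickle round-trip; the cache is dropped
restored = LexicalFeaturizer.staticload("/tmp/featurizer.json")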