-
Notifications
You must be signed in to change notification settings - Fork 11
/
ML_Comparisons.py
71 lines (59 loc) · 2.71 KB
/
ML_Comparisons.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
import sqlite3
import sys
from geopy.distance import great_circle
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from preprocessing import generate_arrays_from_file_map2vec, index_to_coord, get_coordinates, generate_strings_from_file
from preprocessing import REVERSE_MAP_2x2
from preprocessing import print_stats
import numpy as np
from sklearn.externals import joblib
# Dataset selection: pass the dataset name (e.g. lgl_gold or wiki, see the
# evaluation file names) as the first command-line argument; without one the
# script falls back to "lgl".
data = sys.argv[1] if len(sys.argv) > 1 else u"lgl"
# ------------------------------------------- TRAINING -------------------------------------------
# Stream the training file, buffer examples in X/Y and feed them to the Naive
# Bayes classifier incrementally, then persist the fitted model to disk.
#
# To train a Random Forest instead: build clf = RandomForestClassifier(), remove
# the in-loop flush (Random Forest has no partial_fit) and replace the final
# flush with a single clf.fit(X, Y).
BATCH_SIZE = 25000  # flush the buffer once it holds more than this many examples
train_file = u"../data/train_wiki_uniform.txt"

X, Y = [], []
clf = MultinomialNB()
# partial_fit must be told the full label set up front, because a single batch
# is not guaranteed to contain every class.
classes = np.arange(len(REVERSE_MAP_2x2))
trained_examples = 0

for (x, y) in generate_arrays_from_file_map2vec(train_file, looping=False):
    X.extend(x[0])  # only the first element of x is used as the feature rows
    # y holds one score row per example (presumably one-hot); keep the class index.
    Y.extend(np.argmax(y, axis=1))
    if len(X) > BATCH_SIZE:
        print(u"Training with:", len(X), u"examples.")
        clf.partial_fit(X, Y, classes=classes)
        trained_examples += len(X)
        X, Y = [], []

# Flush whatever is left over. The buffer is empty when the last in-loop flush
# consumed everything, and partial_fit rejects empty input, so guard the call.
if X:
    print(u"Training with:", len(X), u"examples.")
    clf.partial_fit(X, Y, classes=classes)  # Naive Bayes only!
    trained_examples += len(X)

if trained_examples == 0:
    # Fail here with a clear message rather than saving an unfitted model.
    raise ValueError(u"No training examples were read from " + train_file)

# NOTE(review): joblib is imported from sklearn.externals at the top of this
# file; recent scikit-learn releases no longer ship that module - confirm the
# pinned version or switch to the standalone joblib package.
joblib.dump(clf, u'../data/bayes.pkl')  # saves the model to file
# ------------------------------------- END OF TRAINING, BEGINNING OF TESTING -----------------------------------
# Evaluation: predict a map cell for every test instance, turn it into a
# coordinate, pick the GeoNames candidate with the best population-discounted
# distance to that coordinate, and record how far the pick is from the gold
# coordinates. Summary statistics are printed at the end.
final_errors = []
clf = joblib.load(u'../data/bayes.pkl')  # reload the model persisted by the training stage

# NOTE(review): every other path in this script is rooted at "../data/", this
# one at "data/" - confirm which working directory is intended.
test_file = u"data/eval_" + data + u".txt"  # which data to test on?

X = []
for (x, y) in generate_arrays_from_file_map2vec(test_file, looping=False):
    X.extend(x[0])  # Load test instances
print(u"Testing with:", len(X), u"examples.")

# Share of the distance that is discounted for the most populous candidate;
# less populous candidates get a proportionally smaller discount.
bias = 0.9

conn = sqlite3.connect(u'../data/geonames.db')
try:
    cursor = conn.cursor()  # one cursor reused for every lookup
    # Predictions and the (gold coordinates, name, context) records are paired
    # positionally, so both readers must yield the instances in the same order.
    for cell, (gold, name, _context) in zip(clf.predict(X), generate_strings_from_file(test_file)):
        predicted = index_to_coord(REVERSE_MAP_2x2[cell], 2)
        candidates = get_coordinates(cursor, name)
        if len(candidates) == 0:
            print(u"Don't have an entry for", name, u"in GeoNames")
            raise Exception(u"Check your database, buddy :-)")
        # Population-only baseline: uncomment the next line to keep just the
        # first candidate.
        # candidates = [candidates[0]]

        # Assumes the first candidate carries the largest population (index 2)
        # - TODO confirm against get_coordinates.
        max_pop = max(1, candidates[0][2])  # clamp to 1 to avoid dividing by zero
        scored = []
        for candidate in candidates:
            coord = (float(candidate[0]), float(candidate[1]))
            err = great_circle(predicted, coord).km
            # Distance to the predicted point, reduced in proportion to the
            # candidate's population relative to max_pop.
            score = err - (err * max(1, candidate[2]) / max_pop) * bias
            scored.append((score, coord))

        # Lowest score wins; ties resolve to the earliest candidate, exactly as
        # the previous stable sort did. A plain single-argument lambda is used
        # because tuple-unpacking parameters are a syntax error on Python 3.
        best_candidate = min(scored, key=lambda pair: pair[0])
        final_errors.append(great_circle(best_candidate[1], gold).km)
finally:
    conn.close()  # release the database handle even if a lookup fails

print_stats(final_errors)
print(u"Done testing:", test_file)