-
Notifications
You must be signed in to change notification settings - Fork 1
/
translate_frequency.py
202 lines (157 loc) · 7.24 KB
/
translate_frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
# -*- coding: utf-8 -*-
"""
word features:
This script processes the data files to create features about translations
and frequencies.
Specifically, word frequency, levenshtein distance (cognate), age of acquisition
Saves features as .txt file for specified language
There is code to do this for the root words (e.g. "is" becomes "be") and code
to do this for all words ("be" and "is" as two separate words).
March 2018
"""
# imports
# do pip installs to get libraries: wordfreq, googletrans, editdistance, unidecode
from wordfreq import word_frequency
from googletrans import Translator
import editdistance
from unidecode import unidecode
import time
import pandas as pd
import numpy as np
# Shared googletrans client used by both feature-extraction passes below.
translator = Translator()
# choose language to translate
# Pair code '<learning>_<known>': src is the language being learned, dest the
# language the user already knows (e.g. 'en_es' — presumably English words for
# Spanish speakers; verify against the data set's track naming).
lang='en_es'
# Token-level training data for this pair (Duolingo SLAM-style format,
# judging by the filename — one whitespace-separated token per line).
fn = 'data/data_{0}/{0}.slam.20171218.train.new'.format(lang)
(src, dest)=lang.split('_')
print("Translating %s to %s" %(src,dest))
# Age-of-acquisition norms spreadsheet; the 'Word' and 'Rating.Mean' columns
# are looked up later. NOTE(review): filename suggests the Kuperman et al.
# English AoA norms supplement — confirm, since lookups assume English words.
aoa_xl = pd.ExcelFile("data/13428_2013_348_MOESM1_ESM.xlsx")
aoa_df = aoa_xl.parse('Sheet1')
# FOR EVERY ROOT WORD (e.g, "be" instead of "is")
# Translate each unique root token, compute a simple "cognate" metric
# (Levenshtein distance between word and translation), look up age of
# acquisition, and write one CSV row per unique root token.
#output filename
fnout = "{}_{}_rootwordfeats.txt".format(src,dest)
print("Will save translations and features to: ",fnout)
trans_cache = {}  # root token -> raw translation; also marks tokens already processed
newlines=[]
with open(fn, 'r', encoding='utf-8') as f:  # data holds non-ASCII text; be explicit
    for line in f:
        # Skip blank lines and '#' comment/header lines.
        if len(line)>1 and line[0]!='#':
            pieces = line.split()
            # Only the root token (column 2) is used in this pass.
            root_token = pieces[2]
            clean_word = root_token.lower() # Only using root token here rather than word
            # Skip contractions (apostrophes) and anything already translated.
            if '\'' not in clean_word and clean_word not in trans_cache:
                # Wait so as not to annoy the API (otherwise will give JSON error)
                time.sleep(0.2)
                # Translate so we know the word in the language the user already knows
                translation = translator.translate(clean_word, src=src, dest=dest).text
                # Clean up translation:
                # remove accents, make lowercase, and remove any non-informative first words
                clean_trans = unidecode(translation).lower()
                g = clean_trans.split(" ")
                if len(g) == 2:
                    if g[0] in ['to','i','we','you','they','he','she','the',"i'll"]:
                        clean_trans = g[1]
                elif len(g) > 2:
                    if g[1] in ['will']:  # e.g. "i will eat" -> "eat"
                        clean_trans = ' '.join(g[2:])
                # Levenshtein distance features; the extra 1 in max() guards
                # against ZeroDivisionError if both strings come back empty.
                levdist = editdistance.eval(clean_word, clean_trans)
                levdistfrac = levdist/max(len(clean_word), len(clean_trans), 1)
                # print to console translation and LD
                print("%s to %s (LD=%i, LDfrac=%.2f)" % (clean_word, clean_trans, levdist, levdistfrac))
                trans_cache[clean_word]=translation
                # Age of acquisition: the norms list is English-only, so look
                # up whichever side of the pair is the English word.
                if dest == 'en':
                    aoa_word = clean_trans
                else:
                    aoa_word = clean_word
                word_df = aoa_df[aoa_df['Word'] == aoa_word]
                if not word_df.empty:
                    aoa = word_df['Rating.Mean'].values[0]
                else:
                    aoa = np.nan
                # Word frequency of the source-language root token.
                # NOTE(review): the original also computed
                # word_frequency(translation, dest) but never wrote it to the
                # output row, so that dead computation is dropped here.
                src_freq = word_frequency(root_token, src)
                # write to csv file
                newrow = [root_token, clean_trans, str(src_freq), str(levdist), str(levdistfrac), str(aoa)]
                newlines.append(",".join(newrow)+'\n')
print(len(newlines))
with open(fnout, 'w', encoding='utf-8') as fp:
    fp.writelines(newlines)
# FOR EVERY WORD (NOT root word)
# Same pipeline as the root-word pass above, but keyed on the surface form
# (column 1), so inflected forms like "is" and "be" get separate rows.
#output filename
fnout = "{}_{}_wordwordfeats.txt".format(src,dest)
print("Will save translations and features to: ",fnout)
trans_cache = {}  # surface word -> raw translation; also marks words already processed
newlines=[]
with open(fn, 'r', encoding='utf-8') as f:  # data holds non-ASCII text; be explicit
    for line in f:
        # Skip blank lines and '#' comment/header lines.
        if len(line)>1 and line[0]!='#':
            pieces = line.split()
            # Only the surface word (column 1) is used in this pass.
            word = pieces[1]
            clean_word = word.lower() # Using WORD not root
            # Skip contractions (apostrophes) and anything already translated.
            if '\'' not in clean_word and clean_word not in trans_cache:
                # Wait so as not to annoy the API (otherwise will give JSON error)
                time.sleep(0.2)
                # Translate so we know the word in the language the user already knows
                translation = translator.translate(clean_word, src=src, dest=dest).text
                # Clean up translation:
                # remove accents, make lowercase, and remove any non-informative first words
                clean_trans = unidecode(translation).lower()
                g = clean_trans.split(" ")
                if len(g) == 2:
                    # NOTE(review): the root-word pass also strips "i'll" here;
                    # this list omits it — confirm whether that is intentional.
                    if g[0] in ['to','i','we','you','they','he','she','the']:
                        clean_trans = g[1]
                elif len(g) > 2:
                    if g[1] in ['will']: # e.g. comere, i will eat
                        clean_trans = ' '.join(g[2:])
                # Levenshtein distance features; the extra 1 in max() guards
                # against ZeroDivisionError if both strings come back empty.
                levdist = editdistance.eval(clean_word, clean_trans)
                levdistfrac = levdist/max(len(clean_word), len(clean_trans), 1)
                # age of acquisition: the norms list is English-only, so look
                # up whichever side of the pair is the English word.
                if dest == 'en':
                    aoa_word = clean_trans
                else:
                    aoa_word = clean_word
                word_df = aoa_df[aoa_df['Word'] == aoa_word]
                if not word_df.empty:
                    aoa = word_df['Rating.Mean'].values[0]
                else:
                    aoa = np.nan
                # print to console translation and LD
                print("%s to %s (LD=%i, LDfrac=%.2f, AOA=%.2f)" % (clean_word, clean_trans, levdist, levdistfrac, aoa))
                trans_cache[clean_word]=translation
                # Word frequency of the source-language surface word.
                # NOTE(review): the original also computed
                # word_frequency(translation, dest) but never wrote it to the
                # output row, so that dead computation is dropped here.
                src_freq = word_frequency(word, src)
                # write to csv file
                newrow = [word, clean_trans, str(src_freq), str(levdist), str(levdistfrac), str(aoa)]
                newlines.append(",".join(newrow)+'\n')
print(len(newlines))
with open(fnout, 'w', encoding='utf-8') as fp:
    fp.writelines(newlines)
# Quick exploratory look at the AoA workbook (re-opens the same spreadsheet
# that was already parsed into aoa_df near the top of the script).
xl = pd.ExcelFile("data/13428_2013_348_MOESM1_ESM.xlsx")
# Show which sheets the workbook contains
print(xl.sheet_names)
# Load the first sheet into a DataFrame by name
df1 = pd.read_excel(xl, sheet_name='Sheet1')
# Bare expression: selects the 'Word' column but discards the result when
# executed as a script (only useful interactively, e.g. in a notebook).
df1['Word']