-
Notifications
You must be signed in to change notification settings - Fork 10
/
phone_frequency.py
177 lines (156 loc) · 4.43 KB
/
phone_frequency.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
# Calculate the frequency of a set of phones, provided a dictionary and a set of .lab
# files.
import codecs
from glob import glob
def dirclean(string):
    """Clean a raw input string so that it is usable as a directory prefix.

    Trailing whitespace is removed and exactly one trailing "/" is guaranteed,
    so the result can be concatenated directly with a file name or a glob
    pattern.  Whitespace *inside* the path is preserved (the previous
    implementation deleted every space in the path whenever the input ended
    with one).

    An empty or all-whitespace string becomes "./" (the current directory)
    instead of raising IndexError.
    """
    string = string.rstrip()
    if string == "":
        return "./"
    if not string.endswith("/"):
        string = string + "/"
    return string
def parse(string, sep):
    """Split a string on a separator and return the fields as a list.

    Behaves like ``string.split(sep)`` except that one empty trailing field
    is dropped, so:

    * an empty string yields ``[]`` (callers that index element 0 of the
      result must check for an empty list first);
    * a single trailing separator does not produce a trailing ``""``.

    Leading or repeated separators still produce empty fields.  ``sep`` may
    be longer than one character.  An empty ``sep`` raises ValueError.

    The loop this replaces terminated on ``string is not ""``, an identity
    test against a literal, so its result depended on string interning and on
    whether the input was a byte string or a unicode string.
    """
    fields = string.split(sep)
    # split() always returns at least one field, so fields[-1] is safe.
    if fields[-1] == "":
        fields.pop()
    return fields
def find_replace(text, list):
    """Apply a sequence of substring substitutions to a string.

    ``list`` holds ordered pairs ``[old, new]``.  The pairs are applied one
    after another, in order, each to the result of the previous one, and the
    final string is returned.
    """
    result = text
    for substitution in list:
        target = substitution[0]
        if target not in result:
            continue
        result = result.replace(target, substitution[1])
    return result
# Define user form: ask for the dictionary file and the .lab directory.
print("\nphone_frequency.py")
print("""
What is the dictionary file that you want to use?
You can drag and drop it into the Terminal window below.
Press enter to use the dictionary stored in the
most current version of the aligner.""")
# Strip only trailing whitespace (presumably left behind by drag and drop) so
# that spaces inside the path survive; stripping before the emptiness test
# also makes whitespace-only input fall back to the default dictionary.
dictionary = raw_input("> ").rstrip()
if dictionary == "":
    dictionary = "/Applications/Prosodylab-Aligner-v1/dictionary.txt"
print("""
Which set of .lab files are you using to calculate the
frequency? You can drag and drop it into the Terminal
window below.""")
labdir = raw_input("> ")
labdir = dirclean(labdir)
# Becomes True once the user has supplied a pronunciation for a word that is
# missing from the dictionary; the extended dictionary is then written out.
mod = False
# Open and parse the dictionary: each non-blank line becomes one entry whose
# first field is the word and whose remaining fields are its phones.
dlist = []
with codecs.open(dictionary, 'r', 'utf-8') as d:
    for line in d:
        # Remove the line ending ("\n" or "\r\n") and any trailing blanks.
        line = line.rstrip()
        if line == "":
            # A blank line carries no entry.
            continue
        # Discard empty fields caused by leading or repeated spaces so that
        # "" is never mistaken for a word or a phone.
        newline = [field for field in parse(line, " ") if field != ""]
        dlist.append(newline)
# Make phone-only dictionary
phonedict = [entry[1:] for entry in dlist]
# Get sorted list of unique phones
phonelist = sorted(set(phone for phones in phonedict for phone in phones))
# Make a counter list: one [phone, count] row per phone, with the running
# total kept in the last row.
countlist = [[phone, 0] for phone in phonelist]
countlist.append(['Total', 0])
# Make list of .lab files
lab_list = glob(labdir + '*.lab')

# Index the dictionary by word, keeping only the first pronunciation listed
# for each word, so that every lookup below is a single dict access instead of
# a scan of the whole dictionary.
first_entry = {}
for entry in dlist:
    # A dictionary line that ended in a space can leave an empty last field.
    if entry and entry[-1] == '':
        entry = entry[:-1]
    if entry and entry[0] not in first_entry:
        first_entry[entry[0]] = entry

# Map each phone to its [phone, count] row; the last row of countlist is the
# 'Total' row and is handled separately.
rows = dict((ppair[0], ppair) for ppair in countlist[:-1])

for lab_path in lab_list:
    # Read in text and split it into words on any run of whitespace, so that
    # double spaces, tabs and line breaks never produce empty "words" and an
    # empty file simply yields no words.
    with codecs.open(lab_path, 'r', 'utf-8') as f:
        txt = f.read()
    wordlist = txt.split()

    # Look every word occurrence up individually, so a word that is repeated
    # (even consecutively) contributes its phones once per occurrence.
    phones = []
    for word in wordlist:
        if word not in first_entry:
            # if a word is missing, ask for its pronunciation (only once)
            print("\nThe word %s in %s is missing from the dictionary." % (word, lab_path.replace(labdir, '')))
            print("Please provide its pronunciation below.")
            print("Each phoneme must be separated by a space.")
            phonestring = raw_input("> ")
            if isinstance(phonestring, bytes):
                # Python 2's raw_input returns a byte string; decode it so
                # the phones are text like the ones read from the dictionary.
                # NOTE(review): assumes a UTF-8 terminal -- confirm.
                phonestring = phonestring.decode('utf-8')
            # Make a new entry & remember it for later files and for the
            # dictionary that is written out at the end.
            new_entry = [word] + phonestring.split()
            first_entry[word] = new_entry
            dlist.append(new_entry)
            mod = True
        # read only the phones
        phones.extend(first_entry[word][1:])

    countlist[-1][1] = countlist[-1][1] + len(phones)
    # Count phones
    for phone in phones:
        if phone not in rows:
            # A phone introduced by a user-supplied pronunciation gets its own
            # row (inserted before 'Total') so the rows still add up to the
            # total.
            new_row = [phone, 0]
            countlist.insert(len(countlist) - 1, new_row)
            rows[phone] = new_row
        rows[phone][1] = rows[phone][1] + 1
# Store phonelist in a new file: one "phone<TAB>count" line per row of
# countlist.  The file is written as UTF-8 (the encoding used for the
# dictionary and the .lab files) so that non-ASCII phones can be written, and
# the `with` block closes it even if a write fails.
with codecs.open(labdir + "0_phone_count.txt", "w", "utf-8") as countfile:
    for pair in countlist:
        countfile.write(pair[0] + '\t' + str(pair[1]) + '\n')
# Store dictionary: only when pronunciations were added during this run.
if mod:
    dlist.sort()
    # Written as UTF-8, one entry per line with fields separated by single
    # spaces.  join() also copes with an empty entry (it becomes a blank
    # line), where building the line from None used to raise TypeError.
    with codecs.open(labdir + "0_new_dictionary.txt", "w", "utf-8") as newdict:
        for line in dlist:
            newdict.write(' '.join(line) + '\n')