-
Notifications
You must be signed in to change notification settings - Fork 9
/
parse_text.py
31 lines (28 loc) · 1.22 KB
/
parse_text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
def parse(filename):
    """Split a text file into a flat list of lowercase word and punctuation tokens.

    The file is read in full, lowercased, and every "--" is replaced by ";".
    The text is then scanned character by character:

    * a space or newline ends the current word;
    * the letters a-z extend the current word;
    * an apostrophe directly followed by 's', 't', 'm' or 'l' stays inside
      the word (alice's, don't, i'm, i'll);
    * any other punctuation mark from ``. ? , ' : ! ;`` ends the current
      word and is emitted as its own one-character token;
    * every other character (digits, tabs, quotes, ...) is dropped without
      ending the current word.

    Args:
        filename: Path of the text file to tokenize.

    Returns:
        The tokens in order of appearance; an empty list for an empty file.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    letters = frozenset("abcdefghijklmnopqrstuvwxyz")
    punctuation = frozenset(".?,':!;")
    # Letters that may follow an apostrophe inside a contraction/possessive.
    contraction_followers = frozenset("stml")

    # Load the text and convert it to lowercase; the context manager makes
    # sure the file handle is closed even if reading fails.
    with open(filename) as handle:
        raw_text = handle.read().lower().replace("--", ";")

    wordlist = []
    pending = []  # characters of the word currently being assembled

    for position, char in enumerate(raw_text):
        if char == " " or char == "\n":
            # Space or return -> store the word collected so far.
            if pending:
                wordlist.append("".join(pending))
                pending = []
        elif char in letters:
            pending.append(char)
        elif char in punctuation:
            # Slicing yields "" at the end of the text instead of raising
            # IndexError, and "" is never a member of the follower set.
            following = raw_text[position + 1:position + 2]
            if char == "'" and following in contraction_followers:
                # For I'll, don't, alice's, etc.: keep the apostrophe.
                pending.append(char)
            else:
                # Punctuation is a token of its own and terminates the word.
                if pending:
                    wordlist.append("".join(pending))
                    pending = []
                wordlist.append(char)

    # Flush the last word when the text does not end with a separator.
    if pending:
        wordlist.append("".join(pending))

    return wordlist