data_loader.py
import jsonlist

from vocab import Vocab

SENT_START = "<sentence_start>"
SENT_END = "<sentence_end>"
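# `vocab.Vocab` is not shown in this file. The loader only relies on two
# operations: Vocab.add(token) returning a stable integer id for the token,
# and len(vocab) giving the number of distinct tokens seen so far. A minimal
# stand-in matching that assumed interface (a sketch, not the repository's
# actual vocab.py) could look like:
#
#     class Vocab:
#         def __init__(self):
#             self._ids = {}
#
#         def add(self, token):
#             # First occurrence gets the next free integer id; later calls
#             # return the same id.
#             return self._ids.setdefault(token, len(self._ids))
#
#         def __len__(self):
#             return len(self._ids)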
def create_speechtexts():
    """Read debate_speech.jsonlist and return a list of cleaned speech texts."""
    infile = jsonlist.load_file('debate_speech.jsonlist')
    speechtexts = []
    for debate in infile:
        for speech in debate['speeches']:
            speechtexts.append(speech['text'])
    # Light tokenization: drop newlines and pad punctuation with spaces so
    # that str.split() later yields punctuation marks as separate tokens.
    for i in range(len(speechtexts)):
        speechtexts[i] = speechtexts[i].replace("\n", "")
        speechtexts[i] = speechtexts[i].replace(",", " ,")
        speechtexts[i] = speechtexts[i].replace(".", " . ")
        speechtexts[i] = speechtexts[i].replace("?", " ? ")
        speechtexts[i] = speechtexts[i].replace("--", " -- ")
        speechtexts[i] = speechtexts[i].replace(";", " ; ")
        # "\'" and "'" are the same Python string, so this replace is a no-op.
        speechtexts[i] = speechtexts[i].replace("\'", "'")
        speechtexts[i] = speechtexts[i].replace("(", "( ")
        speechtexts[i] = speechtexts[i].replace(")", " )")
        speechtexts[i] = speechtexts[i].replace("[", "[ ")
        speechtexts[i] = speechtexts[i].replace("]", " ]")
    return speechtexts
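# Expected layout of 'debate_speech.jsonlist', as inferred from the field
# accesses above (one JSON object per line; any additional fields are
# unknown here):
#
#     {"speeches": [{"text": "..."}, {"text": "..."}], ...}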
def create_sets(part):
    """Split the speeches 85% / 5% / 10% into train / valid / test and return one part."""
    speechtexts = create_speechtexts()
    if part == "train":
        return speechtexts[:int(0.85 * len(speechtexts))]
    elif part == "valid":
        return speechtexts[int(0.85 * len(speechtexts)):int(0.9 * len(speechtexts))]
    else:
        return speechtexts[int(0.9 * len(speechtexts)):]
def load(speech_data, index):
    """Convert cleaned speech texts into lists of vocabulary ids, one per sentence."""
    start = index.add(SENT_START)
    sentences = []
    for speech in speech_data:
        # Sentence boundaries were padded to " . " during preprocessing.
        for raw_sentence in speech.split(" . "):
            tokens = raw_sentence.split()
            if not tokens:
                continue
            # Wrap each sentence in start/end markers; tokens are lowercased.
            sentence = [start]
            sentence.extend(index.add(t.lower()) for t in tokens)
            sentence.append(index.add(SENT_END))
            sentences.append(sentence)
    return sentences
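# load() returns one list of integer ids per sentence, each wrapped in the
# start/end markers, e.g. (ids are illustrative only):
#
#     [[0, 17, 42, 5, 1],   # <sentence_start> ... <sentence_end>
#      [0, 23, 8, 1],
#      ...]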
def main():
    print("Debate Speech preprocessing and dataset statistics")
    index = Vocab()
    for part in ("train", "valid", "test"):
        print("Processing", part)
        # Note: create_sets() re-reads and re-cleans the corpus for each split.
        sentences = load(create_sets(part), index)
        print("Found", sum(len(s) for s in sentences),
              "tokens in", len(sentences), "sentences")
    print("Found in total", len(index), "unique tokens")


if __name__ == '__main__':
    main()