-
Notifications
You must be signed in to change notification settings - Fork 0
/
preprocessing.py
94 lines (76 loc) · 3.14 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import numpy as np
import re
# Importing our translations
# for example: "spa.txt" or "spa-eng/spa.txt"
data_path = "deu.txt"
# Defining lines as a list of each line
with open(data_path, 'r', encoding='utf-8') as f:
lines = f.read().split('\n')
# Building empty lists to hold sentences
input_docs = []
target_docs = []
# Building empty vocabulary sets
input_tokens = set()
target_tokens = set()
# Adjust the number of lines so that
# preprocessing doesn't take too long for you
for line in lines[:1001]:
# Input and target sentences are separated by tabs
input_doc, target_doc = line.split('\t')[:2]
# Appending each input sentence to input_docs
input_docs.append(input_doc)
target_doc = " ".join(re.findall(r"[\w']+|[^\s\w]", target_doc))
# Redefine target_doc below
# and append it to target_docs:
target_doc = '<START> ' + target_doc + ' <END>'
target_docs.append(target_doc)
# Now we split up each sentence into words
# and add each unique word to our vocabulary set
for token in re.findall(r"[\w']+|[^\s\w]", input_doc):
# print(token)
# Add your code here:
if token not in input_tokens:
input_tokens.add(token)
for token in target_doc.split():
# print(token)
# And here:
if token not in target_tokens:
target_tokens.add(token)
input_tokens = sorted(list(input_tokens))
target_tokens = sorted(list(target_tokens))
# Create num_encoder_tokens and num_decoder_tokens:
num_encoder_tokens = len(input_tokens)
num_decoder_tokens = len(target_tokens)
max_encoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", input_doc)) for input_doc in input_docs])
max_decoder_seq_length = max([len(re.findall(r"[\w']+|[^\s\w]", target_doc)) for target_doc in target_docs])
input_features_dict = dict(
[(token, i) for i, token in enumerate(input_tokens)])
target_features_dict = dict(
[(token, i) for i, token in enumerate(target_tokens)])
reverse_input_features_dict = dict(
(i, token) for token, i in input_features_dict.items())
reverse_target_features_dict = dict(
(i, token) for token, i in target_features_dict.items())
encoder_input_data = np.zeros(
(len(input_docs), max_encoder_seq_length, num_encoder_tokens),
dtype='float32')
decoder_input_data = np.zeros(
(len(input_docs), max_decoder_seq_length, num_decoder_tokens),
dtype='float32')
decoder_target_data = np.zeros(
(len(input_docs), max_decoder_seq_length, num_decoder_tokens),
dtype='float32')
for line, (input_doc, target_doc) in enumerate(zip(input_docs, target_docs)):
for timestep, token in enumerate(re.findall(r"[\w']+|[^\s\w]", input_doc)):
# Assign 1. for the current line, timestep, & word
# in encoder_input_data:
encoder_input_data[line, timestep, input_features_dict[token]] = 1.
for timestep, token in enumerate(target_doc.split()):
decoder_input_data[line, timestep, target_features_dict[token]] = 1.
if timestep > 0:
decoder_target_data[line, timestep - 1, target_features_dict[token]] = 1.
# print out those value here:
# print(list(input_features_dict.keys())[:51])
# print(list(reverse_target_features_dict.keys())[:51])
# print(reverse_target_features_dict[50])
# print(len(input_tokens))