word_util.py
import os

import jieba
import pandas as pd
class WordUtil:
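    ''' Word/index vocabulary helper for a question-answer corpus, segmented with jieba '''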
UNK_TAG = '<unk>'
START_TAG = '<s>'
END_TAG = '</s>'
UNK = 0
START = 1
END = 2
vocab_file_path = os.path.join('data', 'generated', 'vocab.txt')
csv_file_path = os.path.join('data', 'generated', 'data.csv')
def __init__(self, *word_files):
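        # word_files: the source corpora; generate_csv() assumes the first
        # file holds questions and the second holds answers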
self.word_files = word_files
self.dict = {
self.UNK_TAG: self.UNK,
self.START_TAG: self.START,
self.END_TAG: self.END
}
if os.path.exists(self.vocab_file_path):
self.load_dict()
# jieba.load_userdict(self.vocab_file_path)
else:
self.build_dict()
    def load_dict(self):
        ''' Restore the word-to-index mapping from a previously generated vocab file '''
        with open(self.vocab_file_path, 'r', encoding='utf-8') as f:
            for line in f:
                word = line.rstrip('\n')
                # The special tags already sit in the dict with fixed indices
                if word in (self.UNK_TAG, self.START_TAG, self.END_TAG):
                    continue
                self.dict[word] = len(self.dict)
def build_dict(self):
''' Build words dictionary from files '''
count = {}
for word_file in self.word_files:
with open(word_file, encoding='utf-8') as f:
content = f.read()
word_list = jieba.cut(content)
                # word_list = list(content)  # character-level alternative
                for w in word_list:
                    if w == '\n':
                        continue
                    count[w] = count.get(w, 0) + 1
words = sorted(count.items(), key=lambda x: x[1], reverse=True)
for k, _ in words:
self.dict[k] = len(self.dict)
    def word_to_index(self, word):
        return self.dict.get(word, self.UNK)
def index_to_word(self, index):
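        # Linear scan over the dict; fine for occasional decoding of a small vocab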
for k, v in self.dict.items():
if index == v:
return k
return self.UNK_TAG
def transform(self, sentence):
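        ''' Segment the sentence with jieba and map each word to its index '''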
index = []
for w in jieba.lcut(sentence):
if w == '\n':
continue
index.append(self.word_to_index(w))
return index
def reverse_transform(self, index):
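        ''' Map indices back to a sentence, dropping UNK/START/END tokens '''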
sentence = ''
for i in index:
            if i in (self.UNK, self.START, self.END):
continue
sentence += self.index_to_word(i)
return sentence
def get_index_array(self):
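        ''' Return one list of index sequences per corpus file, one sequence per line '''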
src_tar_index = []
for word_file in self.word_files:
with open(word_file, encoding='utf-8') as f:
index_list = []
                for line in f:
line_index = self.transform(line)
index_list.append(line_index)
src_tar_index.append(index_list)
return src_tar_index
    def generate_vocab(self):
        ''' Persist the dictionary, one word per line, in index order '''
        assert self.dict
        os.makedirs(os.path.dirname(self.vocab_file_path), exist_ok=True)
        with open(self.vocab_file_path, 'w', encoding='utf-8') as f:
            for w in self.dict.keys():
                f.write(w + '\n')
    def generate_csv(self):
        ''' Write question/answer pairs as space-separated index strings '''
        if os.path.exists(self.csv_file_path):
            return
        result = self.get_index_array()
        # Assumes exactly two corpus files: questions first, answers second
        question = [self._list_to_str(l) for l in result[0]]
        answer = [self._list_to_str(l) for l in result[1]]
        os.makedirs(os.path.dirname(self.csv_file_path), exist_ok=True)
        dataframe = pd.DataFrame({'question': question, 'answer': answer})
        dataframe.to_csv(self.csv_file_path, index=False, sep=',')
    def _list_to_str(self, li):
        return ' '.join(str(i) for i in li)
if __name__ == '__main__':
a = WordUtil('data/origin/question.txt', 'data/origin/answer.txt')
print(a.get_index_array()[0][0])
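
    # Illustrative usage beyond the original quick check (a sketch; assumes the
    # corpus files above exist): persist the vocab and CSV, then round-trip a
    # sample sentence through transform/reverse_transform.
    a.generate_vocab()
    a.generate_csv()
    print(a.reverse_transform(a.transform('你好')))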