This repository has been archived by the owner on Jul 27, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 17
/
id2text.py
73 lines (63 loc) · 2.2 KB
/
id2text.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
# Copyright 2016 Google Inc. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Convert text id and word id to texts."""
# -*- coding: utf-8 -*-
import argparse
def _load_id2word(dict_path):
    """Build the id -> word lookup table from a tab-separated dict file.

    Every non-blank line must contain at least three tab-separated columns.
    Column 0 is the word; columns 1 and 2 are combined into the lookup key
    ``'(<col1>+<col2>)'``.  All three columns are whitespace-stripped.

    Args:
      dict_path: path of the dict file to read.

    Returns:
      A dict mapping ``'(<col1>+<col2>)'`` keys to words.

    Raises:
      IndexError: if a non-blank line has fewer than three columns.
    """
    id2word = {}
    with open(dict_path) as dict_file:
        for line in dict_file:
            if not line.strip():
                # Tolerate blank lines (e.g. a stray newline at end of file)
                # instead of crashing on the column lookup below.
                continue
            cols = line.split('\t')
            key = '(%s+%s)' % (cols[1].strip(), cols[2].strip())
            id2word[key] = cols[0].strip()
    return id2word


def _is_word_id_field(key):
    """Return True if the value stored under `key` is a word-id sequence."""
    if key == 'background':
        return True
    return key.startswith('answer') and not key.endswith('score')


def _ids_to_text(value, id2word):
    """Translate whitespace-separated tokens of `value` into text.

    Tokens found in `id2word` are replaced by their word, the special token
    ``'@-@'`` becomes ``'-'``, and anything else is kept verbatim.  Every
    emitted word is followed by a single space, so a non-empty result always
    ends with a space.
    """
    words = []
    for token in value.split():
        if token in id2word:
            words.append(id2word[token])
        elif token == '@-@':
            words.append('-')
        else:
            words.append(token)
    return ''.join(word + ' ' for word in words)


def _convert_line(line, id2word):
    """Convert one tab-separated ``key=value`` line from ids to text.

    Only the ``background`` field and ``answer*`` fields whose key does not
    end in ``score`` are translated; every other field is copied unchanged.
    A field without ``=`` has no value to translate and is copied unchanged
    as well.

    Returns:
      The converted line without a trailing newline, stripped of leading and
      trailing whitespace.
    """
    fields = []
    for kv in line.strip().split('\t'):
        # Split on the first '=' only: the value itself may contain '='.
        key, sep, value = kv.partition('=')
        if not sep or not _is_word_id_field(key):
            fields.append(kv)
            continue
        sentence = _ids_to_text(value, id2word)
        if key != 'background':
            # Answer sentences are trimmed; the background sentence keeps
            # the trailing space produced by _ids_to_text.
            sentence = sentence.strip()
        fields.append('%s=%s' % (key, sentence))
    return '\t'.join(fields).strip()


def main():
    """Command-line entry point: convert an id kv file into a text kv file.

    Positional command-line arguments:
      dict_file:    tab-separated dict file (word, id part 1, id part 2).
      id_kv_file:   input file, one tab-separated ``key=value`` record per
                    line, with word ids in the background/answer fields.
      text_kv_file: output file; same records with word ids replaced by text.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('dict_file', help='the dict filename as input',
                        type=str)
    parser.add_argument('id_kv_file', help='the id kv filename as input',
                        type=str)
    parser.add_argument('text_kv_file', help='the text kv filename as output',
                        type=str)
    args = parser.parse_args()

    # print(...) with a single argument behaves the same on Python 2 and 3.
    print('Prepare the dict')
    id2word = _load_id2word(args.dict_file)

    print('Processing kv')
    # Context managers guarantee both files are flushed and closed, even if
    # a malformed record raises part-way through.
    with open(args.id_kv_file) as id_kv, \
            open(args.text_kv_file, 'w') as text_kv:
        for line in id_kv:
            text_kv.write('%s\n' % _convert_line(line, id2word))


if __name__ == '__main__':
    main()