-
Notifications
You must be signed in to change notification settings - Fork 0
/
get_hedge.py
60 lines (51 loc) · 1.99 KB
/
get_hedge.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
import argparse
import re, json
def section_to_clst(section):
clst = []
idx = 0
sec_len = 0
for sec in section:
c = []
for _ in sec:
c.append(idx)
idx += 1
clst.append(c)
sec_len += 1
return clst, idx, sec_len
def get_hedge(input_path, output_path, task):
file_path = '{}/{}.txt'.format(input_path, task)
save_path = '{}/{}.hedges.jsonl'.format(output_path, task)
fout = open(save_path, 'w')
count = 0
line_count = 0
sec_len_average = 0
with open(file_path, encoding="utf-8") as fin:
for line in fin:
data = json.loads(line)
article_id = data["article_id"]
article_text = data["article_text"]
if len(article_text) <= 1:
continue
else:
sec_clst, sen_num, sec_len = section_to_clst(data["sections"])
dataset = {}
dataset["id"] = article_id
dataset["hedges"] = sec_clst
dataset["length"] = sen_num
dataset["section_length"] = sec_len
fout.write(json.dumps(dataset) + '\n')
count += sen_num
sec_len_average += sec_len
line_count += 1
count = count/line_count
sec_len_average = sec_len_average/line_count
fin.close()
fout.close()
return count, sec_len_average
if __name__ == '__main__':
parser = argparse.ArgumentParser(description='Preprocessing dataset for hedges')
parser.add_argument('--input_path', type=str, default='dataset/arxiv-dataset', help='The dataset directory.')
parser.add_argument('--output_path', type=str, default='/home/zcl/dataset/preprocessed/pubmed', help='The dataset directory.')
parser.add_argument('--task', type=str, default='val', help='dataset [train|val|test]')
args = parser.parse_args()
average_len, sec_len_average = get_hedge(args.input_path, args.output_path, args.task)