-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_dataset_stats.py
39 lines (28 loc) · 1.16 KB
/
count_dataset_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
import json
from pathlib import Path
import statistics
# Locations of the two dataset versions; relative paths, so they are resolved
# against the current working directory.
path_orig = Path('original.json')
path_short = Path('short.json')

# Parse each file as UTF-8 encoded JSON (original first, then shortened).
data_orig = json.loads(path_orig.read_text(encoding='utf-8'))
data_short = json.loads(path_short.read_text(encoding='utf-8'))

# Accumulators for question and answer texts of each dataset version.
# They are handed to listing() below, which appends to them.
orig_q = []
orig_a = []
short_q = []
short_a = []
def listing(data, list_q=None, list_a=None):
    """Collect question texts and first-answer texts from a nested dataset.

    The dataset is walked as ``data['data'] -> item['paragraphs'] ->
    paragraph['qas']``. For every QA entry the ``'question'`` string and the
    ``'text'`` of its first answer are appended to the two output lists.

    QA entries whose ``'answers'`` field is missing or empty are skipped as a
    pair, so the two lists always stay aligned index-for-index instead of the
    function crashing on ``answers[0]``.

    Args:
        data: Parsed dataset dict with the nesting described above.
        list_q: Optional list to append question texts to (mutated in place).
            A new list is created when omitted.
        list_a: Optional list to append answer texts to (mutated in place).
            A new list is created when omitted.

    Returns:
        A ``(list_q, list_a)`` tuple; when lists were passed in, the very
        same objects are returned.
    """
    # Compare against None (not truthiness) so that a caller-supplied empty
    # list is still the object that gets filled and returned.
    if list_q is None:
        list_q = []
    if list_a is None:
        list_a = []

    for item in data['data']:
        for paragraph in item['paragraphs']:
            for qa in paragraph['qas']:
                answers = qa.get('answers')
                if not answers:
                    # No answer text to record; skip the whole pair to keep
                    # questions and answers parallel.
                    continue
                list_q.append(qa['question'])
                list_a.append(answers[0]['text'])
    return list_q, list_a
# Fill the question/answer lists for each dataset version. listing() returns
# the pair of lists it was given, so the names are rebound to its result.
orig_q, orig_a = listing(data=data_orig, list_q=orig_q, list_a=orig_a)
short_q, short_a = listing(data=data_short, list_q=short_q, list_a=short_a)
def avg_symbols(strings):
    """Return the average length, in characters, of the given strings.

    The mean is rounded with the built-in ``round()``, so the result is an
    ``int`` and exact halves go to the nearest even integer (2.5 -> 2).

    Args:
        strings: Iterable of strings (any iterable, not only a list).

    Returns:
        The rounded mean character count, or 0 when ``strings`` is empty
        (instead of raising ``ZeroDivisionError``).
    """
    # Materialise the lengths once so both the sum and the count are
    # available even if ``strings`` is a one-shot iterator.
    lengths = [len(text) for text in strings]
    if not lengths:
        return 0
    return round(sum(lengths) / len(lengths))
def avg_tokens(strings):
    """Return the average number of whitespace-separated tokens per string.

    Each string is split with ``str.split()`` (no arguments), so runs of
    whitespace count as one separator and an empty string has zero tokens.
    The mean is rounded with the built-in ``round()``, giving an ``int``.

    Args:
        strings: Iterable of strings.

    Returns:
        The rounded mean token count, or 0 when ``strings`` is empty
        (instead of raising ``statistics.StatisticsError``).
    """
    token_counts = [len(text.split()) for text in strings]
    if not token_counts:
        return 0
    return round(statistics.mean(token_counts))
# Print both answer statistics, each for the original and the shortened
# dataset. The second heading starts with a newline so that a blank line
# separates the two sections in the output.
for heading, metric in (
    ('Average number of symbols in answers', avg_symbols),
    ('\nAverage number of tokens in answers', avg_tokens),
):
    print(heading)
    print('Original dataset version', metric(orig_a))
    print('Shortened dataset version', metric(short_a))