-
Notifications
You must be signed in to change notification settings - Fork 477
/
stats.py
59 lines (53 loc) · 1.85 KB
/
stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
# -*- coding: utf-8 -*-
import json
def run_left_right_analysis(characters):
    """Print how often the phonetic component is the left or right half.

    Only entries that satisfy all of the following are counted:

    * the entry has both a 'decomposition' and an 'etymology' key;
    * ``etymology['type']`` is 'pictophonetic';
    * the decomposition is exactly three characters long and starts with
      U+2FF0 (the left-to-right ideographic description character), i.e.
      it is a two-component side-by-side split.

    For each counted entry, ``etymology.get('phonetic')`` is compared with
    the second (left) and third (right) decomposition characters.

    Args:
        characters: mapping whose values are per-character dicts.
            Presumably keyed by the character itself and loaded from the
            JSON dictionary file -- the keys are not used here.

    Returns:
        None. Writes one line ``(total, left, right): (T, L, R)`` to stdout.
    """
    # U+2FF0 IDEOGRAPHIC DESCRIPTION CHARACTER LEFT TO RIGHT.
    left_to_right = u'\u2ff0'

    total = 0
    right = 0
    left = 0
    # .values() exists on both Python 2 and 3; .itervalues() is 2-only.
    for data in characters.values():
        if 'decomposition' not in data or 'etymology' not in data:
            continue
        decomposition = data['decomposition']
        etymology = data['etymology']
        # .get(): an etymology without a 'type' is skipped like any other
        # non-pictophonetic entry instead of raising KeyError.
        if etymology.get('type') != 'pictophonetic':
            continue
        # Check the length first so an empty decomposition is skipped
        # rather than raising IndexError on decomposition[0].
        if len(decomposition) != 3 or decomposition[0] != left_to_right:
            continue
        total += 1
        # A missing phonetic yields None, which matches neither side; the
        # entry still counts towards the total.
        phonetic = etymology.get('phonetic')
        if phonetic == decomposition[1]:
            left += 1
        if phonetic == decomposition[2]:
            right += 1
    # Single pre-formatted argument: prints identically whether `print` is
    # the Python 2 statement or the Python 3 function.
    print('(total, left, right): %s' % ((total, left, right),))
def run_stroke_count_analysis(characters):
    """Print a comparison of phonetic vs. semantic component sizes.

    An entry is counted when its etymology names both a 'phonetic' and a
    'semantic' component and both components are themselves keys of
    ``characters``. For each counted entry the length of each component's
    'matches' list is taken as that component's stroke count (assumes
    'matches' holds one element per stroke -- TODO confirm against the
    dictionary format).

    Args:
        characters: mapping from a character to its data dict. Component
            entries referenced by a counted character must carry a
            'matches' key, otherwise KeyError is raised.

    Returns:
        None. Writes one line to stdout of the form
        ``(total, counts, phonetic_mean, semantic_mean): (T, [a, b, c], P, S)``
        where ``a``, ``b``, ``c`` are the fractions of counted entries whose
        phonetic component has fewer, equal, or more strokes than the
        semantic one, and ``P``/``S`` are the mean stroke counts. All
        fractions and means are reported as 0.0 when nothing was counted.
    """
    total = 0
    # Index 0: phonetic < semantic, 1: equal, 2: phonetic > semantic.
    counts = [0, 0, 0]
    phonetic_stroke_total = 0
    semantic_stroke_total = 0
    # .values() exists on both Python 2 and 3; .itervalues() is 2-only.
    for data in characters.values():
        # `or {}` also covers an explicit null etymology.
        etymology = data.get('etymology') or {}
        if 'phonetic' not in etymology or 'semantic' not in etymology:
            continue
        phonetic = etymology['phonetic']
        semantic = etymology['semantic']
        if phonetic not in characters or semantic not in characters:
            continue
        total += 1
        phonetic_strokes = len(characters[phonetic]['matches'])
        semantic_strokes = len(characters[semantic]['matches'])
        phonetic_stroke_total += phonetic_strokes
        semantic_stroke_total += semantic_strokes
        # Three-way comparison in {-1, 0, 1}; replaces cmp(), which no
        # longer exists in Python 3.
        ordering = ((phonetic_strokes > semantic_strokes)
                    - (phonetic_strokes < semantic_strokes))
        counts[ordering + 1] += 1

    def mean(value):
        # Guard the empty case so a dictionary with no qualifying entries
        # reports zeros instead of raising ZeroDivisionError.
        return 1.0 * value / total if total else 0.0

    # A list comprehension (not map()) keeps the printed value a list on
    # Python 3 as well.
    summary = (
        total,
        [mean(count) for count in counts],
        mean(phonetic_stroke_total),
        mean(semantic_stroke_total),
    )
    # Single pre-formatted argument: prints identically whether `print` is
    # the Python 2 statement or the Python 3 function.
    print('(total, counts, phonetic_mean, semantic_mean): %s' % (summary,))
def load_characters(path):
    """Load a JSON-lines dictionary file into a dict keyed by character.

    Each non-blank line of the file is parsed as one JSON object; the
    object is stored under the value of its 'character' key. A later line
    with the same 'character' replaces an earlier one.

    Args:
        path: path of the dictionary file, read as UTF-8 text.

    Returns:
        dict mapping each entry's 'character' value to the parsed object.

    Raises:
        ValueError: a non-blank line is not valid JSON.
        KeyError: a parsed object has no 'character' key.
    """
    # Local import keeps this block self-contained; io.open accepts an
    # encoding on both Python 2 and Python 3.
    import io

    characters = {}
    # Explicit UTF-8 so decoding does not depend on the platform locale.
    with io.open(path, encoding='utf-8') as f:
        # Iterate the file directly; xreadlines() was removed in Python 3.
        for line in f:
            # Strip before the emptiness test: a raw blank line is "\n",
            # which is truthy and would otherwise reach json.loads('').
            line = line.strip()
            if not line:
                continue
            data = json.loads(line)
            characters[data['character']] = data
    return characters


if __name__ == '__main__':
    characters = load_characters('dictionary.txt')
    run_left_right_analysis(characters)
    run_stroke_count_analysis(characters)