-
Notifications
You must be signed in to change notification settings - Fork 0
/
vframe_dict_generator.py
executable file
·129 lines (100 loc) · 5.48 KB
/
vframe_dict_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
#!/usr/bin/env python3
# -*- coding: utf-8, vim: expandtab:ts=4 -*-
"""
author: Noémi Vadász
last update: 2018.01.11.
VFrame dictionary compiler
input
verb+prev list https://github.com/kagnes/hungarian_verbal_complex
verb+inf list https://github.com/kagnes/infinitival_constructions
output
vframe_to_read human-readable form of dictionary
one verb per line
őriz ?, meg: X
? infinitival argument with no preverb
meg: ? infinitival argument with the preverb
meg: X no infinitival argument with the preverb
vframe_to_eval input for VFrame searcher https://github.com/ppke-nlpg/vframe
one verb per line
őriz X:?, meg:X
X:? infinitival argument with no preverb
meg:? infinitival argument with the preverb
meg:X no infinitival argument with the preverb
vframe_dict.py input for AnaGramma https://github.com/ppke-nlpg/TO_BE_RELEASED # TODO
python dictionary, one verb per line
'őriz#V': {'X#PreV': '?', 'meg#PreV': 'X'},
"""
from collections import defaultdict
from itertools import chain
import locale
locale.setlocale(locale.LC_ALL, 'hu_HU.utf8')
class VFrameDefaultDict(defaultdict):
def __str__(self):
return '{0}'.format(','.join(self._add_x(('{0}:{1}'.format(k, v)
for k, v in sorted(self.items(), key=lambda x: locale.strxfrm(str(x)))
if k != 'X'), repr_format='eval')))
def __repr__(self):
return '{0}}},'.format(', '.join(self._add_x(("{0}#PreV': {1}".format(repr(k)[:-1], repr(v))
for k, v in sorted(self.items(),
key=lambda x: locale.strxfrm(str(x)))
if k != 'X'), repr_format='dict')))
def toreadable(self):
return '{0}'.format(', '.join(self._add_x('{0}: {1}'.format(k, v)
for k, v in sorted(self.items(), key=lambda x: locale.strxfrm(str(x)))
if k != 'X')))
def _add_x(self, dict_iter, repr_format='read'):
x = self.get('X')
if x is not None:
if repr_format == 'dict':
return chain(["'X#PreV': '{0}'".format(x)], dict_iter)
elif repr_format == 'eval':
return chain(['X:{0}'.format(x)], dict_iter)
else:
return chain([x], dict_iter)
return dict_iter
def split_and_store_verb_w_prev(verb_w_prev, vframe, value):
prev, verb = verb_w_prev.split('+', maxsplit=1) # prev-verb separator: +
if '|' in verb:
first_verb, second_verb = verb.split('|', maxsplit=1)
inverted = second_verb + '|' + first_verb
vframe[first_verb][prev] = 'X'
vframe[second_verb][prev] = 'X'
vframe[inverted][prev] = 'X'
if '-' in prev: # to store 'be-be' type preverbs and 'be' too
vframe[verb][prev.split('-', maxsplit=1)[0]] = value
vframe[verb][prev] = value
def main():
# defaultdict collection for VFrame
vframe = VFrameDefaultDict(lambda: VFrameDefaultDict(str))
# read freqPrevFin.txt (https://github.com/kagnes/hungarian_verbal_complex)
with open('kagi_verbal_complex/freqPrevFin.txt', encoding='UTF-8') as prev_list:
for line in prev_list: # put instance to vframe structure with default None infinitival argument
split_and_store_verb_w_prev(line.strip().split(maxsplit=1)[1], vframe, 'X')
# read FinInf.txt (https://github.com/kagnes/infinitival_constructions)
with open('infinitival_constructions/FinInf.txt', encoding='UTF-8') as inf_list:
for line in inf_list:
infline = line.strip().split('\t', maxsplit=5)
if len(infline) == 6 and '%' in infline[5]: # manually corrected verb stem or preverb-verb segmentation
verb_w_prev = infline[5].lstrip('%')
else:
verb_w_prev = infline[1]
if '+' in verb_w_prev: # prev-verb separator: +
split_and_store_verb_w_prev(verb_w_prev, vframe, '?') # ? allows infinitival argument
else:
verb = verb_w_prev
vframe[verb]['X'] = '?'
# print results: input for evaluating VFrame algorithm and formatted input for AnaGramma VFrame module
with open('vframe_dict/vframe_dict.py', 'w', encoding='UTF-8') as vframe_dict, \
open('vframe_dict/vframe_to_eval', 'w', encoding='UTF-8') as vframe_to_eval, \
open('vframe_dict/vframe_to_read', 'w', encoding='UTF-8') as vframe_to_read:
print('#!/usr/bin/env python3',
'# -*- coding: utf-8, vim: expandtab:ts=4 -*-',
'',
'verb_prev_restrs_dict = {', sep='\n', file=vframe_dict)
for v, prevs in sorted(vframe.items(), key=lambda x: locale.strxfrm(str(x))):
print(" '{0}#V': {{".format(v), repr(prevs), sep='', file=vframe_dict)
print('{0}\t'.format(v), str(prevs), sep='', file=vframe_to_eval)
print('{0}\t'.format(v), prevs.toreadable(), sep='', file=vframe_to_read)
print('}', file=vframe_dict)
if __name__ == '__main__':
main()