-
Notifications
You must be signed in to change notification settings - Fork 0
/
lexccounter.py
executable file
·119 lines (105 loc) · 6.37 KB
/
lexccounter.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
#!/usr/bin/env python3
import argparse
import itertools
import logging
import re
import sys
import urllib.error
import urllib.request

from collections import defaultdict
def cleanLine(line):
    """Normalise one lexc line: strip whitespace, resolve %-escapes, drop '!' comments.

    NOTE(review): escapes are resolved BEFORE comment stripping, so an escaped
    bang ('%!') still starts a comment — confirm this ordering is intended.
    """
    text = line.strip()
    text = re.sub(r'%(.)', r'\1', text)   # '%X' -> 'X'
    text = re.sub(r'!.*$', '', text)      # remove trailing comment
    return text.strip()
def countStems(dictionary, uniqueOn='lemma+continuationLexicon', vanilla=False):
    """Count unique stems in a lexc dictionary reachable from the Root lexicon.

    Parameters:
        dictionary: full text of a lexc file.
        uniqueOn: uniqueness criterion for entries — one of
            'lemma+continuationLexicon', 'lemma+gloss', 'lemma+comment'.
        vanilla: when True, skip entries whose raw line contains "Use/MT".

    Returns the number of unique entries (also printed to stdout).
    Raises ValueError for an unrecognised uniqueOn value; exits the
    process with -1 when the dictionary has no Root lexicon.
    """
    logger = logging.getLogger("countStems")

    def _clean(raw):
        # Strip whitespace, resolve '%X' escapes, then drop '!' comments.
        return re.sub(r'!.*$', '', re.sub(r'%(.)', r'\1', raw.strip())).strip()

    if uniqueOn not in {'lemma+continuationLexicon', 'lemma+gloss', 'lemma+comment'}:
        raise ValueError('invalid unique criteria: ' + uniqueOn)

    currentLexicon = None
    # lexicon name -> ([pointers to other lexicons], {unique entries})
    lexicons = defaultdict(lambda: ([], set()))
    for lineNo, line in enumerate(dictionary.splitlines()):
        origLine, line = line, _clean(line)
        if line.startswith('LEXICON'):
            logger.info('Switching lexicon from %s (%s unique entries, %s pointers) to %s' % (currentLexicon, len(lexicons[currentLexicon][1]), len(lexicons[currentLexicon][0]), line.split()[1]))
            currentLexicon = line.split()[1]
        elif not line.startswith('!') and line and currentLexicon and not (origLine.find("Use/MT")+1 and vanilla):
            try:
                if len(re.findall(r'\s+', line)) >= 2:
                    if ':' in line:
                        # upper:lower entry, e.g. "lemma:stem Continuation ;"
                        split = re.findall(r'^(.+?):([^;]+);(?:\s+!\s+(.+))?', line)
                        if len(split):
                            split = split[0]
                            lemma = split[0].strip()
                            continuationLexicon = split[1].strip().split()[-1].split('-')
                            gloss = split[2].strip()
                            if uniqueOn == 'lemma+continuationLexicon':
                                lexicons[currentLexicon][1].add((lemma, frozenset(continuationLexicon)))
                            elif uniqueOn == 'lemma+gloss' or uniqueOn == 'lemma+comment':
                                lexicons[currentLexicon][1].add((lemma, gloss))
                            logger.debug('Parsed L%s (%s) as lemma: %s, continuations: %s, and gloss: %s' % (lineNo + 1, line, repr(lemma), repr(continuationLexicon), repr(gloss)))
                        else:
                            logger.warning('Failed to parse L%s: %s' % (lineNo + 1, line))
                    else:
                        # bare entry, e.g. "lemma Continuation ;"
                        split = line.split(';')[0].strip().split()
                        lemma = split[0]
                        continuationLexicon = split[1].strip().split('-')
                        gloss = line.split('!')[1].strip() if '!' in line else None
                        if uniqueOn == 'lemma+continuationLexicon':
                            lexicons[currentLexicon][1].add((lemma, frozenset(continuationLexicon)))
                        elif uniqueOn == 'lemma+gloss' or uniqueOn == 'lemma+comment':
                            # BUG FIX: this branch previously keyed on the
                            # continuation lexicon, unlike the colon branch
                            # above, so 'lemma+gloss' mode over-counted here.
                            lexicons[currentLexicon][1].add((lemma, gloss))
                        logger.debug('Parsed L%s (%s) as lemma: %s, continuations: %s, and gloss: %s' % (lineNo + 1, line, repr(lemma), repr(continuationLexicon), repr(gloss)))
                elif len(re.findall(r'\s+', line)) == 1:
                    # single-token line is a pointer to another lexicon
                    lexiconPointer = line.split(';')[0].strip()
                    if ' ' in lexiconPointer:
                        logger.warning('Failed to parse L%s: %s' % (lineNo + 1, line))
                    else:
                        lexicons[currentLexicon][0].append(lexiconPointer)
                else:
                    logger.warning('Failed to parse L%s: %s' % (lineNo + 1, line))
            except Exception:
                logger.warning('Failed to parse L%s: %s' % (lineNo + 1, line))
    logger.info('Switching lexicon from %s (%s unique entries, %s pointers) to %s' % (currentLexicon, len(lexicons[currentLexicon][1]), len(lexicons[currentLexicon][0]), 'END'))

    def getAllLexicons(rootLexicon):
        # Iterative traversal with a visited set: the previous recursive
        # version overflowed the stack on cyclic pointer graphs.
        seen, queue = set(), list(lexicons[rootLexicon][0])
        while queue:
            name = queue.pop()
            if name not in seen:
                seen.add(name)
                queue.extend(lexicons[name][0])
        return seen

    if 'Root' in lexicons:
        validLexicons = getAllLexicons('Root')
        logger.info('Counting from lexicons %s' % validLexicons)
    else:
        logger.critical('No Root lexicon found')
        sys.exit(-1)
    # NOTE(review): entries defined directly inside Root itself are not
    # counted (only lexicons Root points to) — matches original behaviour;
    # confirm that is intended.
    entries = set()
    for validLexicon in validLexicons:
        # was module-level logging.info; use the function's logger consistently
        logger.info('In lexicon %s referenced from ROOT, found %s entries.' % (validLexicon, len(lexicons[validLexicon][1])))
        entries.update(lexicons[validLexicon][1])
    print('Unique entries: %s' % len(entries))
    return len(entries)
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Count unique stems (on lemma and continuation lexicon) in a HFST morphological dictionary (lexc)")
    parser.add_argument('uri', help="uri to lexc file")
    parser.add_argument('-l', '--uniqLemmaGloss', help="count stems unique on lemma and gloss (comment)", action='store_true', default=False)
    parser.add_argument('-V', '--vanilla', help="count vanilla stems only", action='store_true', default=False)
    parser.add_argument('-v', '--verbose', help="show errors dictionary (verbose)", action='store_true', default=False)
    parser.add_argument('-vv', '--doubleVerbose', help="show progress through dictionary (verbose×2)", action='store_true', default=False)
    parser.add_argument('-vvv', '--tripleVerbose', help="show detailed progress through dictionary (verbose×3)", action='store_true', default=False)
    args = parser.parse_args()

    # The most verbose flag present wins; default shows only errors.
    if args.tripleVerbose:
        logging.basicConfig(level=logging.DEBUG)
    elif args.doubleVerbose:
        logging.basicConfig(level=logging.INFO)
    elif args.verbose:
        logging.basicConfig(level=logging.WARNING)
    else:
        logging.basicConfig(level=logging.ERROR)

    uniqueOn = 'lemma+gloss' if args.uniqLemmaGloss else 'lemma+continuationLexicon'

    # BUG FIX: was `'http' in args.uri`, which sent any local path merely
    # containing the substring "http" down the URL branch.
    if args.uri.startswith(('http://', 'https://')):
        try:
            with urllib.request.urlopen(args.uri) as response:
                dictionary = str(response.read(), 'utf-8')
        # URLError is the superclass of HTTPError, so DNS/connection
        # failures are now handled too instead of crashing with a traceback.
        except urllib.error.URLError:
            logging.critical('Dictionary %s not found' % args.uri)
            sys.exit(-1)
    else:
        try:
            with open(args.uri) as dictionaryFile:
                dictionary = dictionaryFile.read()
        except FileNotFoundError:
            logging.critical('Dictionary %s not found' % args.uri)
            sys.exit(-1)
    countStems(dictionary, uniqueOn=uniqueOn, vanilla=args.vanilla)