This repository has been archived by the owner on Nov 5, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 1
/
from_tei.py
151 lines (134 loc) · 5.43 KB
/
from_tei.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
import re
from typing import List
import lxml.etree as et
###########################################################
###########################################################
# namespaces
# TEI P5 namespace URI; used below to qualify element tags (e.g. {ns}w)
# and as the 'tei' prefix in xpath queries.
tei_ns = 'http://www.tei-c.org/ns/1.0'
# Standard xml: namespace (xml:id, xml:lang); not referenced in this
# chunk — presumably used elsewhere in the file. TODO confirm.
xml_ns = 'http://www.w3.org/XML/1998/namespace'
def get_file(tei: str):
    """Return the full text of the UTF-8 encoded TEI file at path *tei*."""
    with open(tei, encoding='utf-8') as handle:
        return handle.read()
###########################################################
###########################################################
# pre parse cleanup
def pre_parse_cleanup(text): #* PASSING
    """Normalise raw TEI text before XML parsing.

    Collapses whitespace, converts <supplied> tags to editorial square
    brackets, and strips line breaks plus markup the downstream parser
    should not see. Returns the cleaned string.
    """
    regex_substitutions = (
        (r' +|\t', ' '),               # collapse space runs / tabs to one space
        (r' *<supplied[^<>]*>', '['),  # opening <supplied ...> -> '['
        (r' *</supplied> *', ']'),     # closing </supplied> -> ']'
        (r'<lb[^<>]*>', ''),           # drop <lb/> line-break milestones
    )
    for pattern, replacement in regex_substitutions:
        text = re.sub(pattern, replacement, text)
    # plain literal removals need no regex
    for literal in ('\n', '<hi rend="overline">', '</hi>'):
        text = text.replace(literal, '')
    return text
def parse(text: str):
    """Parse *text* into an lxml element tree root.

    Uses a lenient parser (recover=True) so malformed TEI still yields a
    best-effort tree; blank text nodes are dropped.
    """
    lenient_parser = et.XMLParser(
        remove_blank_text=True,
        encoding='utf-8',
        recover=True,
    )
    return et.fromstring(text, lenient_parser)
###########################################################
###########################################################
# after parsed
def add_underdot_to_unclear_letters(root: et._Element): #* PASSING
    """Mark every letter inside an <unclear> element as uncertain.

    Appends U+0323 COMBINING DOT BELOW to each character of the text of
    every //tei:unclear element, mutating the tree in place.
    """
    unclear = root.xpath('//tei:unclear', namespaces={'tei': tei_ns})
    for u in unclear:
        if u.text is None:
            # Empty <unclear/>: log and skip. Previously the code logged
            # and then iterated over None, raising TypeError.
            print(f'NONE WORD!!!{et.tostring(u, encoding="unicode")=}')
            continue
        u.text = ''.join(f'{letter}\u0323' for letter in u.text)
def remove_unclear_tags(text: str): #* PASSING
    """Strip <unclear> and </unclear> tags from *text*, keeping their contents."""
    for tag_pattern in (r'<unclear[^<>]*>', r'</unclear[^<>]*>'):
        text = re.sub(tag_pattern, '', text)
    return text
def write_xml(root):
    """Serialise the tree containing *root* to 'output.xml' (UTF-8, pretty-printed)."""
    root.getroottree().write('output.xml', encoding='utf-8', pretty_print=True)
def handle_abbr(abbr: et._Element): #* PASSING
if not abbr.text and abbr.getchildren():
if abbr.getchildren()[0].text:
return abbr.getchildren()[0].text
elif abbr.text and not abbr.getchildren():
return abbr.text
def handle_app(app: et._Element, hand: str):
    """Collect the words of the reading inside <app> that belongs to *hand*.

    The firsthand is matched against rdg[@type='orig']; any other hand
    against rdg[@type='corr']. Word text is taken from the <w> element
    itself, from its first child, or via handle_abbr for <abbr> children.
    """
    rdg_type = 'orig' if hand == 'firsthand' else 'corr'
    words = []
    for rdg in app.getchildren():
        if rdg.get('type') != rdg_type or rdg.get('hand') != hand:
            continue
        for word in rdg.getchildren():
            if word.text:
                words.append(word.text)
                continue
            children = word.getchildren()
            if not children:
                continue
            first_child = children[0]
            if first_child.text:
                words.append(first_child.text)
            elif first_child.tag == '{http://www.tei-c.org/ns/1.0}abbr':
                expanded = handle_abbr(first_child)
                if expanded:
                    words.append(expanded)
    return words
def get_all_words_in_verse(verse: et._Element, hand: str) -> List[str]:
    """Return the words of *verse* for the given *hand*, in document order.

    Plain <w> elements contribute their text; <w> elements with children
    contribute child text (expanding <abbr> via handle_abbr); <app>
    elements are delegated to handle_app.
    """
    w_tag = f'{{{tei_ns}}}w'
    abbr_tag = f'{{{tei_ns}}}abbr'
    app_tag = f'{{{tei_ns}}}app'
    words: List[str] = []
    for elem in verse:
        if elem.tag == w_tag and not elem.getchildren() and elem.text:
            words.append(elem.text)
        elif elem.tag == w_tag and elem.getchildren():
            for child in elem:
                if child.tag == abbr_tag:
                    expanded = handle_abbr(child)
                    # handle_abbr returns None when no text is found;
                    # appending that None used to crash handle_lacunae
                    # downstream (re.search on None).
                    if expanded:
                        words.append(expanded)
                elif child.text:
                    words.append(child.text)
        elif elem.tag == app_tag:
            words += handle_app(elem, hand)
    return words
def handle_lacunae(words: List[str]) -> List[str]: ###* PASSING
    """Remove fully-supplied (lacunose) words, marking the gap on neighbours.

    A word is lacunose when it is entirely an editorial supplement, i.e.
    the whole word is bracketed: '[word]'. Each run of lacunose words is
    dropped; the real word before the run gets a trailing '___', and a run
    at the start of the verse puts a leading '___' on the first surviving
    word. The input list is modified in place (as before); the filtered
    list is returned.

    Fixes over the previous version:
    - None/empty entries are skipped instead of crashing in re.search;
    - a single-word lacunose verse no longer raises IndexError on words[1];
    - a run starting at index 0 no longer suffixes words[-1] (the LAST
      word) via the words[index-1] wraparound — the word after the run is
      prefixed instead.
    """
    lac_pattern = re.compile(r'\[[^\[\]]*\]')
    lac_indices = set()
    for i, word in enumerate(words):
        # falsy entries (None, '') can never be lacunose; skip them
        if word and lac_pattern.fullmatch(word):
            lac_indices.add(i)
    if lac_indices:
        # A lacuna run at the very start: prefix the first surviving word.
        if 0 in lac_indices:
            run_end = 0
            while run_end + 1 in lac_indices:
                run_end += 1
            if run_end + 1 < len(words):
                words[run_end + 1] = f'___{words[run_end + 1]}'
        # Every run preceded by a real word: suffix that word.
        for i in lac_indices:
            if i > 0 and i - 1 not in lac_indices:
                words[i - 1] = f'{words[i - 1]}___'
    return [w for i, w in enumerate(words) if i not in lac_indices]
def remove_duplicate_witnesses(witnesses: List[list]):
    """Keep only the first witness for each distinct word sequence.

    Witnesses are (hand, words) pairs; two witnesses are duplicates when
    their word lists compare equal. Order of first occurrence is kept.
    """
    seen_word_lists = []
    unique = []
    for witness in witnesses:
        if witness[1] in seen_word_lists:
            continue
        seen_word_lists.append(witness[1])
        unique.append(witness)
    return unique
def get_verse_as_tuple(verse: et._Element, hands: list = None) -> List[tuple]:
    """Extract each hand's reading of *verse* as (hand, words) tuples.

    Words are gathered per hand, lacunose words are removed via
    handle_lacunae, and witnesses whose word sequences are identical are
    collapsed to the first occurrence. Defaults to the firsthand only.

    The previous mutable default argument (['firsthand']) is replaced by
    a None sentinel — same behaviour for callers, without the shared
    default-list pitfall.
    """
    if hands is None:
        hands = ['firsthand']
    witnesses = []
    for hand in hands:
        words = handle_lacunae(get_all_words_in_verse(verse, hand))
        witnesses.append((hand, words))
    return remove_duplicate_witnesses(witnesses)