-
Notifications
You must be signed in to change notification settings - Fork 0
/
title_article.py
132 lines (121 loc) · 5.99 KB
/
title_article.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
#!/usr/bin/env LANG=en_UK.UTF-8 /usr/local/bin/python3
'''
Multiple language title article splitting script.
Takes two arguments: the full title, and the title language as an ISO 639-1 two-letter language code.
Handles title entries supplied in upper case or lower case.
Loads the TITLE_ARTICLES dictionary, which maps each language code to a list of the title articles for that language.
The script matches the title's leading article (if present) against the list for the supplied language;
if there is no match it returns the original title and an empty string.
Joanna White 2021
'''
# Dictionary of ISO 639-1 language codes and the title articles for each language.
# These contents may have first originated from AACR2 documentation, with additions from BFI staff through the years.
# The last character of each entry describes how the article joins the title:
#   trailing space      -> the article is a separate first word ("The Film")
#   trailing "-" or "'" -> the article is attached to the first word ("El-Film", "L'avventura")
TITLE_ARTICLES = {
    'af': ["Die ", "'N "],
    'sq': ["Nji ", "Një "],
    'ar': ["El-", "Ad-", "Ag-", "Ak-", "An-", "Ar-", "As-", "At-", "Az-"],
    'da': ["Den ", "Det ", "De ", "En ", "Et "],
    'nl': ["De ", "Het ", "'S ", "Een ", "Eene ", "'N "],
    'en': ["The ", "A ", "An "],
    'fr': ["Le ", "La ", "L'", "Les ", "Un ", "Une "],
    'de': ["Der ", "Die ", "Das ", "Ein ", "Eine "],
    'el': ["Ho ", "He ", "To ", "Hoi ", "Hai ", "Ta ", "Henas ", "Heis ", "Mia ", "Hena "],
    'he': ["Ha-", "Ho-"],
    'hu': ["A ", "Az ", "Egy "],
    'is': ["Hinn ", "Hin ", "Hid ", "Hinir ", "Hinar "],
    'it': ["Il ", "La ", "Lo ", "I ", "Gli ", "Gl'", "Le ", "L'", "Un ", "Uno ", "Una ", "Un'"],
    'nb': ["Den ", "Det ", "De ", "En ", "Et "],
    # "Den " added: it was previously only recognised through loose substring matching
    # against "Dent ". NOTE(review): "Dent " looks like a typo for "Den "; it is kept so
    # that no existing entry is removed - confirm and drop if it is not wanted.
    'nn': ["Den ", "Dent ", "Det ", "Dei ", "Ein ", "Ei ", "Eit "],
    'pt': ["O ", "A ", "Os ", "As ", "Um ", "Uma "],
    'ro': ["Un ", "Una ", "O "],
    'es': ["El ", "La ", "Lo ", "Los ", "Las ", "Un ", "Una "],
    'sv': ["Den ", "Det ", "De ", "En ", "Ett "],
    'tr': ["Bir "],
    'cy': ["Y ", "Yr "],
    'yi': ["Der ", "Di ", "Die ", "Dos ", "Das ", "A ", "An ", "Eyn ", "Eyne "],
}


def splitter(title_supplied, language):
    '''
    Split a leading title article from a title, for the supplied language.

    Arguments:
        title_supplied: the full title string. A title that is entirely upper
            case is converted with str.title() before any matching takes place.
        language: the language code used as a key into TITLE_ARTICLES. Case and
            surrounding whitespace are ignored. An empty or None value is
            treated as an unknown language.

    Returns a tuple (title, title_art):
        - when the title begins with an article listed for the language and some
          text follows it, title is that remaining text with its first character
          upper-cased, and title_art is the article as spelled in TITLE_ARTICLES
          without its trailing space or hyphen (a trailing apostrophe is kept),
          e.g. ("Lord of the Rings", "The") or ("Avventura", "L'");
        - otherwise title is title_supplied (after the upper-case handling
          described above) and title_art is an empty string.

    Articles are compared as whole units and case-insensitively: a
    space-terminated article must equal the complete first word, and a hyphen-
    or apostrophe-terminated article must be a prefix of the first word.
    '''
    # Manage title appearing all upper case
    if title_supplied.isupper():
        title_supplied = title_supplied.title()

    articles = TITLE_ARTICLES.get((language or '').strip().lower())
    words = title_supplied.split()
    # Unknown language, or nothing to split: return the title as it is
    if not articles or not words:
        return title_supplied, ''

    first_word = words[0]
    for article in articles:
        if article.endswith(' '):
            # Separately spaced article: it must be the whole of the first word
            if first_word.casefold() != article.rstrip().casefold():
                continue
            remainder = words[1:]
        else:
            # Attached article: slice the original word by the article's length
            # so that only the article itself is removed and any later hyphen or
            # apostrophe in the word is preserved
            if first_word[:len(article)].casefold() != article.casefold():
                continue
            remainder = [first_word[len(article):]] + words[1:]

        title = ' '.join(remainder).strip()
        if not title:
            # Nothing follows the article, so the "article" is the title itself
            continue

        title = title[0].upper() + title[1:]
        # Remaining title may still be all upper case when the article was not
        if title.isupper():
            title = title.title()
        return title, article.rstrip(' -')

    # Returns titles as they are where no article matches
    return title_supplied, ''