-
Notifications
You must be signed in to change notification settings - Fork 0
/
wikidata_page_props.py
48 lines (36 loc) · 1.57 KB
/
wikidata_page_props.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
import threading
from .exceptions import AlreadyLoadedException, NotYetLoadedException
from .in_memory_csv import InMemoryCsv
class WikidataPageProps:
    """Shared lookup table from Wikipedia page ids to Wikidata item ids.

    One instance is created from a page-props CSV through
    ``initialize_instance_from_csv`` and is afterwards retrieved through
    ``instance``. Both entry points run under a class-level lock.
    """

    # The shared instance; remains None until it has been initialized.
    __instance = None
    # Held while the shared instance is checked, created, or handed out.
    __instance_lock = threading.Lock()

    @staticmethod
    def initialize_instance_from_csv(csv):
        """Load *csv* and install the result as the shared instance.

        Args:
            csv: CSV source passed unchanged to ``InMemoryCsv.load``
                (presumably a path or file object -- TODO confirm against
                ``InMemoryCsv``). It is read with ``;`` as the delimiter
                and filtered through ``_row_filter``.

        Returns:
            The newly created shared ``WikidataPageProps`` instance.

        Raises:
            AlreadyLoadedException: If a shared instance already exists.
        """
        with WikidataPageProps.__instance_lock:
            # Explicit identity check: the guard must not depend on the
            # truthiness of the instance (a container-like object becomes
            # falsy when empty as soon as it defines ``__len__``).
            if WikidataPageProps.__instance is not None:
                raise AlreadyLoadedException()
            in_memory_csv = InMemoryCsv.load(
                csv,
                delimiter=";",
                row_filter=WikidataPageProps._row_filter,
            )
            WikidataPageProps.__instance = WikidataPageProps(in_memory_csv)
            return WikidataPageProps.__instance

    @staticmethod
    def _row_filter(row):
        """Return True if *row* has ``pp_propname == "wikibase_item"``."""
        # We only want to keep the rows that reference the Wikidata id.
        return row["pp_propname"] == "wikibase_item"

    @staticmethod
    def instance():
        """Return the shared instance.

        Raises:
            NotYetLoadedException: If no shared instance has been
                initialized yet.
        """
        with WikidataPageProps.__instance_lock:
            if WikidataPageProps.__instance is None:
                raise NotYetLoadedException()
            return WikidataPageProps.__instance

    def __init__(self, in_memory_csv):
        """Build the page-id -> Wikidata-id mapping from *in_memory_csv*.

        Args:
            in_memory_csv: Object whose ``rows()`` yields mappings with
                the keys ``pp_page`` and ``pp_value``.
        """
        self._mapping = {}
        self._build_mapping(in_memory_csv)

    def _build_mapping(self, in_memory_csv):
        """Add one entry to the mapping per row of *in_memory_csv*.

        ``pp_page`` is converted to ``int`` and used as the key;
        ``pp_value`` is parsed as a hex string and the resulting bytes are
        decoded as UTF-8 to give the value.
        """
        for row in in_memory_csv.rows():
            # bytes (rather than bytearray) is enough here: the buffer is
            # decoded immediately and never mutated.
            raw_value = bytes.fromhex(row["pp_value"])
            self._mapping[int(row["pp_page"])] = raw_value.decode("utf-8")

    def wikidata_id(self, wikipedia_page_id):
        """Return the Wikidata id stored for *wikipedia_page_id*.

        Keys are stored as ``int``, so the page id must be given as an
        ``int``.

        Raises:
            KeyError: If the page id is not present in the mapping.
        """
        return self._mapping[wikipedia_page_id]

    def __contains__(self, wikipedia_page_id):
        """Return True if *wikipedia_page_id* is present in the mapping."""
        return wikipedia_page_id in self._mapping