-
Notifications
You must be signed in to change notification settings - Fork 0
/
dbpedia.py
111 lines (88 loc) · 3.3 KB
/
dbpedia.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import glob, sys, os
import matplotlib.pyplot as plt
import numpy as np
def _shorten_term(term, prefixes):
    """Return *term* with a known namespace swapped for its short prefix.

    The first and last characters of *term* are dropped (the ``<`` and ``>``
    of an IRI reference) and the remainder is split at its last ``/``.  If
    the part before that slash is a key of *prefixes*, the result is
    ``prefix + "/" + local_name`` (e.g. ``":/Berlin"``); otherwise *term* is
    returned unchanged.
    """
    namespace, _, local_name = term[1:-1].rpartition("/")
    prefix = prefixes.get(namespace)
    if prefix is None:
        return term
    return prefix + "/" + local_name


def _parse_triple(line):
    """Split one triple line into raw ``(subject, predicate, object)`` terms.

    Returns ``None`` for blank lines, ``#`` comment lines and lines that do
    not carry three whitespace-separated fields.  The object keeps any
    embedded spaces (only the first two separators are split on) and the
    trailing ``.`` statement terminator is removed.
    """
    stripped = line.strip()
    if not stripped or stripped.startswith("#"):
        return None
    fields = stripped.split(None, 2)
    if len(fields) < 3:
        return None
    subject, predicate, obj = fields
    if obj.endswith("."):
        # Drop the statement terminator, not part of the object term.
        obj = obj[:-1].rstrip()
    if not obj:
        return None
    return subject, predicate, obj


def _intern_node(name, node_ids, node_names):
    """Return the id of node *name*, assigning the next free id if new."""
    node_id = node_ids.get(name)
    if node_id is None:
        # Ids are handed out sequentially from 0, so the current size of the
        # mapping is always the next unused id.
        node_id = len(node_ids)
        node_ids[name] = node_id
        node_names[node_id] = name
    return node_id


def _write_graph(path, node_ids, props, edges):
    """Write the collected nodes, properties and edges to *path*.

    Predicates and literal values are XML-escaped so that characters such as
    ``<``, ``>`` and ``&`` cannot break the markup.
    """
    # Imported locally so the block does not depend on a file-level import.
    from xml.sax.saxutils import escape, quoteattr

    with open(path, "w", encoding="utf-8") as out:
        out.write('<graph id="G" edgedefault="undirected">\n')
        # Nodes that carry at least one literal property.
        for name, pairs in props.items():
            out.write('\t<node id="%d">\n' % node_ids[name])
            for key, value in pairs:
                out.write(
                    '\t\t<data key=%s>%s</data>\n'
                    % (quoteattr(key), escape(value))
                )
            out.write('\t</node>\n')
        # Nodes without any property.
        for name, node_id in node_ids.items():
            if name not in props:
                out.write('\t<node id="%d"/>\n' % node_id)
        # Edge ids continue after the last node id.
        for edge_id, (source, label, target) in enumerate(
            edges, start=len(node_ids)
        ):
            out.write(
                '\t<edge id="%d" source="%d" target="%d">\n'
                % (edge_id, source, target)
            )
            out.write('\t\t<data key="label">%s</data>\n' % escape(label))
            out.write('\t</edge>\n')
        out.write("</graph>\n")


def build_graph(name_of_file):
    """Convert every ``z_ttl/*.ttl`` file under the current directory to a graph.

    Each input line is read as one ``subject predicate object .`` statement.
    Subjects become nodes.  An object written as an IRI reference (``<...>``)
    becomes a node connected to the subject by an edge labelled with the
    predicate; any other object is stored as a ``(predicate, value)`` property
    of the subject, with surrounding double quotes stripped from the value.
    Terms in the ``http://dbpedia.org/resource`` namespace are shortened with
    the ``:`` prefix.  Comment lines, blank lines and lines with fewer than
    three fields are skipped.

    Args:
        name_of_file: Path of the output file.  It receives a single
            ``<graph>`` element containing ``<node>`` and ``<edge>`` elements.

    Returns:
        None.  Progress information is printed to standard output.
    """
    prefixes = {"http://dbpedia.org/resource": ":"}
    node_ids = {}
    node_names = {}
    edges = []
    props = {}
    debug = False
    line_count = 0

    for path in glob.glob(os.path.join(os.getcwd(), "z_ttl", "*.ttl")):
        print("Processing file : %s" % os.path.basename(path))
        with open(path, "r", encoding="utf-8") as source:
            for line in source:
                line_count += 1
                if line_count % 10000 == 0:
                    # Shallow container sizes only; a rough progress signal.
                    print(
                        line_count,
                        sys.getsizeof(node_ids)
                        + sys.getsizeof(node_names)
                        + sys.getsizeof(edges)
                        + sys.getsizeof(props),
                    )

                triple = _parse_triple(line)
                if triple is None:
                    continue
                raw_sub, raw_pred, raw_obj = triple

                sub = _shorten_term(raw_sub, prefixes)
                pred = _shorten_term(raw_pred, prefixes)
                if debug:
                    print(sub, "$", pred, "$", raw_obj)

                sub_id = _intern_node(sub, node_ids, node_names)
                # Decide node-vs-literal on the raw term: after shortening, a
                # resource IRI no longer contains "http", and a typed literal
                # may contain it inside its datatype IRI.
                if raw_obj.startswith("<"):
                    obj = _shorten_term(raw_obj, prefixes)
                    obj_id = _intern_node(obj, node_ids, node_names)
                    edges.append((sub_id, pred, obj_id))
                else:
                    props.setdefault(sub, []).append((pred, raw_obj.strip('"')))

    _write_graph(name_of_file, node_ids, props, edges)
if __name__ == "__main__":
    # Script entry point: build the graph and store it in test.graphML.
    build_graph(name_of_file="test.graphML")
"""
m = []
for each in dic_props:
m.append(len(dic_props))
plt.hist(np.asarray(m))
plt.show()
"""