-
Notifications
You must be signed in to change notification settings - Fork 1
/
utils.py
248 lines (187 loc) · 9.41 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
import time
import logging
import networkx as nx
import numpy as np
# Module-level logger, named after this module per the standard logging convention.
logger = logging.getLogger(__name__)
# NOTE(review): forcing DEBUG at import time overrides any application-level
# logging configuration for this module — confirm this is intended.
logger.setLevel(logging.DEBUG)
def generate_candidates(X, strategy, target_rel, max_candidates, consolidate_sides=False, seed=0):
    """ Generate candidate statements from an existing knowledge graph using a defined strategy.

    Parameters
    ----------
    X: np.array, shape (n, 3)
        Triples from which to discover new facts.
    strategy: string
        The candidates generation strategy.
        - 'random_uniform' : generates N candidates (N <= max_candidates) based on a uniform random sampling of
          head and tail entities.
        - 'entity_frequency' : generates candidates by sampling entities weighted by their frequency in X
          (more frequent entities are more likely to be drawn).
        - 'graph_degree' : generates candidates by sampling entities weighted by their graph degree.
        - 'cluster_coefficient' : generates candidates by sampling entities weighted by their clustering coefficient.
        - 'cluster_triangles' : generates candidates by sampling entities weighted by their number of cluster triangles.
        - 'cluster_squares' : generates candidates by sampling entities weighted by their number of cluster squares.
    max_candidates: int or float
        The maximum numbers of candidates generated by 'strategy'.
        Can be an absolute number or a percentage [0,1].
        This does not guarantee the number of candidates generated.
    target_rel : str
        Target relation to focus on. The function will generate candidate
        statements only with this specific relation type.
    consolidate_sides: bool
        If True will generate candidate statements as a product of
        unique head and tail entities, otherwise will
        consider head and tail entities separately. Default: False.
    seed : int
        Seed to use for reproducible results.

    Returns
    -------
    X_candidates : ndarray, shape [n, 3]
        A list of candidate statements.

    Raises
    ------
    ValueError
        If `strategy` is unknown, or `max_candidates` is not a positive int/float.

    Examples
    --------
    >>> import numpy as np
    >>> from ampligraph.discovery.discovery import generate_candidates
    >>>
    >>> X = np.array([['a', 'y', 'b'],
    >>>               ['b', 'y', 'a'],
    >>>               ['a', 'y', 'c'],
    >>>               ['c', 'y', 'a'],
    >>>               ['a', 'y', 'd'],
    >>>               ['c', 'y', 'd'],
    >>>               ['b', 'y', 'c'],
    >>>               ['f', 'y', 'e']])
    >>> X_candidates = generate_candidates(X, strategy='graph_degree', target_rel='y', max_candidates=3)
    >>> ([['a', 'y', 'e'],
    >>>   ['f', 'y', 'a'],
    >>>   ['c', 'y', 'e']])
    """
    if strategy not in ['random_uniform', 'entity_frequency',
                        'graph_degree', 'cluster_coefficient',
                        'cluster_triangles', 'cluster_squares']:
        msg = '%s is not a valid candidate generation strategy.' % strategy
        raise ValueError(msg)

    if target_rel not in np.unique(X[:, 1]):
        # No error as may be case where target_rel is not in X
        msg = 'Target relation is not found in triples.'
        logger.warning(msg)

    if not isinstance(max_candidates, (float, int)):
        msg = 'Parameter max_candidates must be a float or int.'
        raise ValueError(msg)

    if max_candidates <= 0:
        msg = 'Parameter max_candidates must be a positive integer ' \
              'or float in range (0,1].'
        raise ValueError(msg)

    # A float is interpreted as a fraction of the number of input triples.
    if isinstance(max_candidates, float):
        max_candidates = int(max_candidates * len(X))

    def _filter_candidates(X_candidates, X, remove_reflexive=True):
        """ Inner function: drop candidates already present in X and,
        optionally, reflexive statements ['x', rel, 'x'].
        """
        X_candidates = _setdiff2d(X_candidates, X)
        if remove_reflexive:
            keep_idx = np.where(X_candidates[:, 0] != X_candidates[:, 2])
            X_candidates = X_candidates[keep_idx]
        return X_candidates

    def _normalize_weights(weights):
        """ Inner function: normalize to a probability vector; fall back to
        uniform when every weight is zero (e.g. clustering metrics on a
        triangle-free graph), which would otherwise yield NaN probabilities.
        """
        total = np.sum(weights)
        if total == 0:
            logger.warning('All sampling weights are zero; falling back to uniform sampling.')
            return np.full(len(weights), 1.0 / len(weights))
        return weights / total

    def _sample_candidates(e_s, e_o, e_s_weights=None, e_o_weights=None):
        """ Inner function: repeatedly sample head/tail entities, take their
        cartesian product with target_rel, filter, and accumulate up to
        max_candidates rows. `None` weights mean uniform sampling without
        replacement; otherwise weighted sampling with replacement.
        """
        # Take close to sqrt of max_candidates so that: len(meshgrid result) == max_candidates
        sample_size = int(np.sqrt(max_candidates) + 10)  # +10 to allow for reduction in sampled array due to filtering
        if e_s_weights is None:
            # Sampling without replacement cannot draw more than the pool size;
            # without this cap np.random.choice raises ValueError on small graphs.
            s_size = min(sample_size, len(e_s))
            o_size = min(sample_size, len(e_o))
        X_candidates = np.zeros([max_candidates, 3], dtype=object)  # Pre-allocate X_candidates array
        num_retries, max_retries = 0, 5  # Retry up to 5 times to reach max_candidates
        start_idx, end_idx = 0, 0
        while end_idx <= max_candidates - 1:
            if e_s_weights is None:
                sample_e_s = np.random.choice(e_s, size=s_size, replace=False)
                sample_e_o = np.random.choice(e_o, size=o_size, replace=False)
            else:
                sample_e_s = np.random.choice(e_s, size=sample_size, replace=True, p=e_s_weights)
                sample_e_o = np.random.choice(e_o, size=sample_size, replace=True, p=e_o_weights)
            gen_candidates = np.array(np.meshgrid(sample_e_s, target_rel, sample_e_o)).T.reshape(-1, 3)
            gen_candidates = _filter_candidates(gen_candidates, X)
            # Select either all of gen_candidates or just enough to fill X_candidates
            select_idx = min(len(gen_candidates), len(X_candidates) - start_idx)
            end_idx = start_idx + select_idx
            X_candidates[start_idx:end_idx, :] = gen_candidates[0:select_idx, :]
            start_idx = end_idx
            num_retries += 1
            if num_retries == max_retries:
                break
        # end_idx will equal max_candidates in most cases, but could be less
        return X_candidates[0:end_idx, :]

    # Set random seed
    np.random.seed(seed)

    # Get entities linked with this relation
    if consolidate_sides:
        e_s = np.unique(np.concatenate((X[:, 0], X[:, 2])))
        e_o = e_s
    else:
        e_s = np.unique(X[:, 0])
        e_o = np.unique(X[:, 2])

    logger.info('Generating candidates using {} strategy.'.format(strategy))

    if strategy == 'random_uniform':
        return _sample_candidates(e_s, e_o)

    if strategy == 'entity_frequency':
        # Per-entity occurrence counts; np.unique returns entities sorted, which
        # matches the ordering of e_s / e_o above.
        if consolidate_sides:
            e_s_counts = np.array(np.unique(X[:, [0, 2]], return_counts=True)).T
            e_o_counts = e_s_counts
        else:
            e_s_counts = np.array(np.unique(X[:, 0], return_counts=True)).T
            e_o_counts = np.array(np.unique(X[:, 2], return_counts=True)).T
        e_s_weights = _normalize_weights(e_s_counts[:, 1].astype(np.float64))
        e_o_weights = _normalize_weights(e_o_counts[:, 1].astype(np.float64))
    ## Customizable
    ## --------------------------------------------------------------------------------
    ## Add a new elif clause for a new strategy.
    ## --------------------------------------------------------------------------------
    else:  # 'graph_degree', 'cluster_coefficient', 'cluster_triangles', 'cluster_squares'
        # Create networkx graph (undirected; edge name carries the relation)
        G = nx.Graph()
        for row in X:
            G.add_nodes_from([row[0], row[2]])
            G.add_edge(row[0], row[2], name=row[1])
        # Calculate node metrics
        if strategy == 'graph_degree':
            C = dict(G.degree())
        elif strategy == 'cluster_coefficient':
            C = nx.algorithms.cluster.clustering(G)
        elif strategy == 'cluster_triangles':
            C = nx.algorithms.cluster.triangles(G)
        else:  # 'cluster_squares'
            C = nx.algorithms.cluster.square_clustering(G)
        e_s_weights = _normalize_weights(np.array([C[x] for x in e_s], dtype=np.float64))
        e_o_weights = _normalize_weights(np.array([C[x] for x in e_o], dtype=np.float64))

    return _sample_candidates(e_s, e_o, e_s_weights, e_o_weights)
def _setdiff2d(A, B):
    """ Utility function equivalent to numpy.setdiff1d on 2d arrays.

    Returns the rows of A that do not appear anywhere in B. Every occurrence
    of a matching row is removed (the previous cumsum-based implementation
    removed only the first duplicate of a row of A present in B); the order
    of the surviving rows of A is preserved.

    Parameters
    ----------
    A : ndarray, shape [n, m]
    B : ndarray, shape [k, m]

    Returns
    -------
    np.array, shape [j, m]
        Rows of A that are not in B.

    Raises
    ------
    RuntimeError
        If either input array is not 2-dimensional.
    """
    if len(A.shape) != 2 or len(B.shape) != 2:
        raise RuntimeError('Input arrays must be 2-dimensional.')
    # (n, k) boolean matrix: True where row i of A equals row j of B.
    row_matches = (A[:, None, :] == B[None, :, :]).all(axis=2)
    # Keep rows of A that match no row of B.
    return A[~row_matches.any(axis=1)]
def flatten_list(lst):
    """ Flatten one level of nesting: return a single list holding every
    element of every sub-iterable in `lst`, preserving order.
    """
    return [element for sub in lst for element in sub]