-
Notifications
You must be signed in to change notification settings - Fork 1
/
node.py
executable file
·250 lines (229 loc) · 12.5 KB
/
node.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
import numpy as np
from saxpy.znorm import znorm
from saxpy.sax import ts_to_string
from saxpy.alphabet import cuts_for_asize
from loguru import logger
from saxpy.paa import paa
class Node:
def __init__(self, level: int = 1, pattern_representation: str = "", label: str = "intermediate",
group: dict = None, parent=None, paa_value: int = 3):
self.level = level # number of different char for rappresentation
self.paa_value = paa_value
if pattern_representation == "":
pr = "a"*self.paa_value # using SAX
self.pattern_representation = pr
else:
self.pattern_representation = pattern_representation
self.members = list(group.keys()) # members time series contained in N
self.size = len(group) # numbers of time series contained
self.label = label # each node has tree possible labels: bad-leaf, good-leaf or intermediate
self.group = group # group obtained from k-anonymity top-down
self.child_node = list() # all childs node
# self.left = None # left child
# self.right = None # right child
self.parent = parent # parent
def start_splitting(self, p_value: int, max_level: int, good_leaf_nodes: list(), bad_leaf_nodes: list()):
"""
Splitting Node Naive algorithm (k, P) Anonymity
:param p_value:
:param max_level:
:param paa_value
:return:
"""
# logger.info("good_leaf_nodes: {}, bad_leaf_nodes: {}".format(len(good_leaf_nodes), len(bad_leaf_nodes)))
# Size of the node is < P
if self.size < p_value:
logger.info("size:{}, p_value:{} == bad-leaf".format(self.size, p_value))
self.label = "bad-leaf"
bad_leaf_nodes.append(self)
return
# Size of the node is == max_level
if self.level == max_level:
logger.info("size:{}, p_value:{} == good-leaf".format(self.size, p_value))
self.label = "good-leaf"
good_leaf_nodes.append(self)
return
# Size of the node is > P and < 2*P: maximize level without node splitting
if p_value <= self.size < 2*p_value:
logger.info("Maximize-level, size:{}, p_value:{} == good-leaf".format(self.size, p_value))
self.maximize_level_node(max_level)
self.label = "good-leaf"
good_leaf_nodes.append(self)
return
"""
Otherwise, we need to check if node N has to be split. The checking relies on a tentative split performed on N.
Suppose that, by increasing the level of N, N is tentatively split into a number of child nodes.
If all these child nodes contain fewer than P time series, no real split is performed and the original node N is
labeled as good-leaf and the recursion terminates on N. Otherwise, there must exist tentative child node(s)
whose size >= P, also called TG-node(s) (Tentative Good Nodes).
The rest children whose size < P are called TB-nodes (Tentative Bad Nodes), if any.
If the total number of records in all TB-nodes under N is no less than P, we merge them into a single tentative
node, denoted by childmerge, at the level of N.level. If the above tentative process produces nc tentative
child nodes (including TB and TG) and nc >= 2, N will really be split into nc children and then the node
splitting procedure will be recursively invoked on each of them
"""
tentative_child_node = dict()
temp_level = self.level + 1
for key, value in self.group.items():
# to reduce dimensionality
data = np.array(value)
data_znorm = znorm(data)
data_paa = paa(data_znorm, self.paa_value)
pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
if pr in tentative_child_node.keys():
tentative_child_node[pr].append(key)
else:
tentative_child_node[pr] = [key]
length_all_tentative_child = [len(x) for x in list(tentative_child_node.values())]
good_leaf = np.all(np.array(length_all_tentative_child) < p_value)
if good_leaf:
# If N cannot be split
logger.info("Good-leaf, all_tentative_child are < {}".format(p_value))
self.label = "good-leaf"
good_leaf_nodes.append(self)
return
else:
# N can be split
logger.info("N can be split")
logger.info("Compute tentative good nodes and tentative bad nodes")
# tentative good nodes
# index of nodes in tentative_child_node with more p_value
pr_keys = list(tentative_child_node.keys())
# get index tentative good node
pattern_representation_tg = list()
tg_nodes_index = list(np.where(np.array(length_all_tentative_child) >= p_value)[0])
# logger.info(pr_keys)
tg_nodes = list()
for index in tg_nodes_index:
keys_elements = tentative_child_node[pr_keys[index]]
dict_temp = dict()
for key in keys_elements:
dict_temp[key] = self.group[key]
tg_nodes.append(dict_temp)
pattern_representation_tg.append(pr_keys[index])
# tentative bad nodes
tb_nodes_index = list(np.where(np.array(length_all_tentative_child) < p_value)[0])
tb_nodes = list()
pattern_representation_tb = list()
for index in tb_nodes_index:
keys_elements = tentative_child_node[pr_keys[index]]
dict_temp = dict()
for key in keys_elements:
dict_temp[key] = self.group[key]
tb_nodes.append(dict_temp)
pattern_representation_tb.append(pr_keys[index])
total_size_tb_nodes = 0
for tb_node in tb_nodes:
total_size_tb_nodes += len(tb_node)
if total_size_tb_nodes >= p_value:
# Merge all bad nodes in a single node and label it as good-leaf
logger.info("Merge all bad nodes in a single node, and label it as good-leaf")
child_merge_node_group = dict()
for tb_node in tb_nodes:
for key, value in tb_node.items():
child_merge_node_group[key] = value
node_merge = Node(level=self.level, pattern_representation=self.pattern_representation,
label="good-leaf", group=child_merge_node_group, parent=self)
self.child_node.append(node_merge)
good_leaf_nodes.append(node_merge)
nc = len(tg_nodes) + len(tb_nodes)
logger.info("Split only tg_nodes {0}".format(len(tg_nodes)))
if nc >= 2:
# Split and invoke recursively the node splitting procedure
for index in range(0, len(tg_nodes)):
node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
label="intermediate", group=tg_nodes[index], parent=self)
self.child_node.append(node)
node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
else:
# Split without calling the node splitting procedure
for index in range(0, len(tg_nodes)):
node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
label="good-leaf", group=tg_nodes[index], parent=self)
self.child_node.append(node)
good_leaf_nodes.append(node)
else:
nc = len(tg_nodes) + len(tb_nodes)
logger.info("Label all tb_node {0} as bad-leaf and split only tg_nodes {1}".format(len(tb_nodes),len(tg_nodes)))
for index in range(0, len(tb_nodes)):
node = Node(level=self.level, pattern_representation=pattern_representation_tb[index], label="bad-leaf",
group=tb_nodes[index], parent=self)
self.child_node.append(node)
bad_leaf_nodes.append(node)
if nc >= 2:
for index in range(0, len(tg_nodes)):
node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
label="intermediate", group=tg_nodes[index], parent=self)
self.child_node.append(node)
node.start_splitting(p_value, max_level, good_leaf_nodes, bad_leaf_nodes)
else:
for index in range(0, len(tg_nodes)):
node = Node(level=self.level, pattern_representation=pattern_representation_tg[index],
label="good-leaf", group=tg_nodes[index], parent=self)
self.child_node.append(node)
good_leaf_nodes.append(node)
@staticmethod
def postprocessing(good_leaf_nodes, bad_leaf_nodes):
# count = sum(1 for a, b in zip(seq1, seq2) if a != b)
difference = float('inf')
for bad_leaf_node in bad_leaf_nodes:
pattern_representation_bad_node = bad_leaf_node.pattern_representation
choose_node = None
for index in range(0, len(good_leaf_nodes)):
pattern_representation_good_node = good_leaf_nodes[index].pattern_representation
difference_good_bad = sum(1 for a, b in zip(pattern_representation_good_node,
pattern_representation_bad_node) if a != b)
if difference_good_bad < difference:
choose_node = index
# choose_node contain good node with minimum difference between pattern representation
Node.add_row_to_node(good_leaf_nodes[choose_node], bad_leaf_node)
# delete all bad_leaf nodes
bad_leaf_nodes = list()
@staticmethod
def add_row_to_node(node_original, node_to_add):
"""
add node_to_add content to node_original
:param node_original:
:param node_to_add:
:return:
"""
for key, value in node_to_add.group.items():
node_original.group[key] = value
node_original.members = list(node_original.group.keys())
node_original.size = len(node_original.group)
def maximize_level_node(self, max_level):
"""
Try to maximaxe the level value
:param p_value:
:return:
"""
values_group = list(self.group.values())
original_level = self.level
equal = True
# Try to maximize the level computing the ts_to_string of the new level
# and compare it with the old one for each element in group
while equal and self.level < max_level:
temp_level = self.level + 1
data = np.array(values_group[0])
data_znorm = znorm(data)
data_paa = paa(data_znorm, self.paa_value)
pr = ts_to_string(data_paa, cuts_for_asize(temp_level))
# Compare pr for each element (first one already checked, because the initial pr is its)
for index in range(1, len(values_group)):
data = np.array(values_group[index])
data_znorm = znorm(data)
data_paa = paa(data_znorm, self.paa_value)
pr_2 = ts_to_string(data_paa, cuts_for_asize(temp_level))
if pr_2 != pr: # comparing new pr with the pr of original level
equal = False
if equal: # need to try another level or already at the max?
self.level = temp_level
# Check if something changes
if original_level != self.level:
logger.info("New level for node: {}".format(self.level))
data = np.array(values_group[0])
data_znorm = znorm(data)
data_paa = paa(data_znorm, self.paa_value)
self.pattern_representation = ts_to_string(data_paa, cuts_for_asize(self.level))
else:
logger.info("Can't split again, max level already reached")