# model.py (forked from PaddlePaddle/PaddleRec)
# -*- coding: utf-8 -*-
"""
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
import paddle
import paddle.fluid as fluid
from paddlerec.core.utils import envs
from paddlerec.core.model import ModelBase
class Model(ModelBase):
def __init__(self, config):
ModelBase.__init__(self, config)
def _init_hyper_parameters(self):
# tree meta hyper parameters
self.max_layers = envs.get_global_env("hyper_parameters.max_layers", 4)
self.node_nums = envs.get_global_env("hyper_parameters.node_nums", 26)
self.leaf_node_nums = envs.get_global_env(
"hyper_parameters.leaf_node_nums", 13)
self.output_positive = envs.get_global_env(
"hyper_parameters.output_positive", True)
self.layer_node_num_list = envs.get_global_env(
"hyper_parameters.layer_node_num_list", [2, 4, 7, 12])
self.child_nums = envs.get_global_env("hyper_parameters.child_nums", 2)
self.tree_layer_path = envs.get_global_env(
"hyper_parameters.tree.tree_layer_path", None)
        # model training hyper parameters
self.node_emb_size = envs.get_global_env(
"hyper_parameters.node_emb_size", 64)
self.input_emb_size = envs.get_global_env(
"hyper_parameters.input_emb_size", 768)
self.act = envs.get_global_env("hyper_parameters.act", "tanh")
self.neg_sampling_list = envs.get_global_env(
"hyper_parameters.neg_sampling_list", [1, 2, 3, 4])
        # model inference hyper parameters
        self.topK = envs.get_global_env("hyper_parameters.topK", 1)
self.batch_size = envs.get_global_env(
"dataset.dataset_infer.batch_size", 1)
def net(self, input, is_infer=False):
if not is_infer:
return self.train_net(input)
else:
return self.infer_net(input)
def train_net(self, input):
self.tdm_net(input)
self.create_info()
self.avg_loss()
self.metrics()
def infer_net(self, input):
self.create_first_layer()
self.tdm_infer_net(input)
def input_data(self, is_infer=False, **kwargs):
if not is_infer:
return self.train_input()
else:
return self.infer_input()
""" -------- Train network detail ------- """
def train_input(self):
input_emb = fluid.data(
name="input_emb",
shape=[None, self.input_emb_size],
dtype="float32", )
item_label = fluid.data(
name="item_label",
shape=[None, 1],
dtype="int64", )
return [input_emb, item_label]
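    # Note: input_emb is the user/query-side embedding produced by the dataset
    # reader with shape [batch_size, input_emb_size], and item_label is the
    # positive leaf item id ([batch_size, 1], int64) that seeds the tree-based
    # negative sampling in tdm_net below.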
def tdm_net(self, input):
"""
tdm训练网络的主要流程部分
"""
is_distributed = True if envs.get_trainer() == "CtrTrainer" else False
input_emb = input[0]
item_label = input[1]
        # Negative-sample on the given tree, seeded by the positive item samples
        # sample_nodes: the sampled node_ids, containing positive and negative samples
        # sample_label: the positive/negative label of each sampled node_id
        # sample_mask: labels that pad tensors to a uniform shape; a value of 0
        # marks a padded dummy node_id
if self.check_version():
with fluid.device_guard("cpu"):
sample_nodes, sample_label, sample_mask = fluid.contrib.layers.tdm_sampler(
x=item_label,
neg_samples_num_list=self.neg_sampling_list,
layer_node_num_list=self.layer_node_num_list,
leaf_node_num=self.leaf_node_nums,
tree_travel_attr=fluid.ParamAttr(name="TDM_Tree_Travel"),
tree_layer_attr=fluid.ParamAttr(name="TDM_Tree_Layer"),
output_positive=self.output_positive,
output_list=True,
seed=0,
tree_dtype='int64',
dtype='int64')
else:
sample_nodes, sample_label, sample_mask = fluid.contrib.layers.tdm_sampler(
x=item_label,
neg_samples_num_list=self.neg_sampling_list,
layer_node_num_list=self.layer_node_num_list,
leaf_node_num=self.leaf_node_nums,
tree_travel_attr=fluid.ParamAttr(name="TDM_Tree_Travel"),
tree_layer_attr=fluid.ParamAttr(name="TDM_Tree_Layer"),
output_positive=self.output_positive,
output_list=True,
seed=0,
tree_dtype='int64',
dtype='int64')
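        # With output_list=True, the sampler returns per-layer lists: for
        # layer i, sample_nodes[i] holds neg_sampling_list[i] negative node
        # ids plus one positive per input item (output_positive=True), which
        # matches the second dimension of the reshape below.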
        # Look up the embedding of every sampled node
sample_nodes_emb = [
fluid.embedding(
input=sample_nodes[i],
is_sparse=True,
size=[self.node_nums, self.node_emb_size],
param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
for i in range(self.max_layers)
]
        # Reshape for the per-layer (hierarchical) classifier training below
sample_nodes_emb = [
fluid.layers.reshape(sample_nodes_emb[i], [
-1, self.neg_sampling_list[i] + self.output_positive,
self.node_emb_size
]) for i in range(self.max_layers)
]
        # Transform the input_emb so its dimension matches node_emb
input_trans_emb = self.input_trans_layer(input_emb)
        # Main body of the classifier: each tree layer trains its own classifier
layer_classifier_res = self.classifier_layer(input_trans_emb,
sample_nodes_emb)
        # Final discriminative FC: the node classification results of all
        # layers are judged together against the same standard
        # Because the tree is very likely unbalanced and some items are not on
        # the last layer, this mechanism guarantees every item a chance to be recalled
tdm_fc = fluid.layers.fc(
input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
        # Flatten the logits so the loss of the whole network is computed together
tdm_fc_re = fluid.layers.reshape(tdm_fc, [-1, 2])
        # To weight the per-layer losses differently, skip the concat here:
        # each layer's loss can be computed separately and multiplied by its weight
sample_label = fluid.layers.concat(sample_label, axis=1)
labels_reshape = fluid.layers.reshape(sample_label, [-1, 1])
labels_reshape.stop_gradient = True
        # Compute the overall loss and obtain the softmax output
cost, softmax_prob = fluid.layers.softmax_with_cross_entropy(
logits=tdm_fc_re, label=labels_reshape, return_softmax=True)
        # Use the mask to filter out the loss of dummy (padded) nodes
sample_mask = fluid.layers.concat(sample_mask, axis=1)
mask_reshape = fluid.layers.reshape(sample_mask, [-1, 1])
mask_index = fluid.layers.where(mask_reshape != 0)
mask_index.stop_gradient = True
self.mask_cost = fluid.layers.gather_nd(cost, mask_index)
softmax_prob = fluid.layers.unsqueeze(input=softmax_prob, axes=[1])
self.mask_prob = fluid.layers.gather_nd(softmax_prob, mask_index)
self.mask_label = fluid.layers.gather_nd(labels_reshape, mask_index)
self._predict = self.mask_prob
def create_info(self):
fluid.default_startup_program().global_block().create_var(
name="TDM_Tree_Info",
dtype=fluid.core.VarDesc.VarType.INT32,
shape=[self.node_nums, 3 + self.child_nums],
persistable=True,
initializer=fluid.initializer.ConstantInitializer(0))
fluid.default_main_program().global_block().create_var(
name="TDM_Tree_Info",
dtype=fluid.core.VarDesc.VarType.INT32,
shape=[self.node_nums, 3 + self.child_nums],
persistable=True)
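    # Assumed row layout of TDM_Tree_Info (shape [node_nums, 3 + child_nums]):
    # [item_id, layer_id, parent_node_id, child_0, ..., child_{child_nums-1}].
    # This is consistent with tdm_infer_net below, which slices column 0 of a
    # gathered row to recover a node's item_id.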
def avg_loss(self):
avg_cost = fluid.layers.reduce_mean(self.mask_cost)
self._cost = avg_cost
def metrics(self):
auc, batch_auc, _ = fluid.layers.auc(input=self._predict,
label=self.mask_label,
num_thresholds=2**12,
slide_steps=20)
self._metrics["AUC"] = auc
self._metrics["BATCH_AUC"] = batch_auc
self._metrics["BATCH_LOSS"] = self._cost
def input_trans_layer(self, input_emb):
"""
输入侧训练组网
"""
# 将input映射到与node相同的维度
input_fc_out = fluid.layers.fc(
input=input_emb,
size=self.node_emb_size,
act=None,
param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
        # Map input_emb into a separate representation space for each tree layer
input_layer_fc_out = [
fluid.layers.fc(
input=input_fc_out,
size=self.node_emb_size,
act=self.act,
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(
name="trans.layer_fc.bias." + str(i)), )
for i in range(self.max_layers)
]
return input_layer_fc_out
    def _expand_layer(self, input_layer, node, layer_idx):
        # Expand the input so its count matches the nodes;
        # any other broadcast operation would work here as well.
        # Compatible with both the training and the inference network.
        input_layer_unsqueeze = fluid.layers.unsqueeze(
            input=input_layer, axes=[1])
        if not isinstance(node, list):
            input_layer_expand = fluid.layers.expand(
                input_layer_unsqueeze, expand_times=[1, node.shape[1], 1])
        else:
            input_layer_expand = fluid.layers.expand(
                input_layer_unsqueeze,
                expand_times=[1, node[layer_idx].shape[1], 1])
        return input_layer_expand
def classifier_layer(self, input, node):
        # Expand the input so its dimension matches the nodes
input_expand = [
self._expand_layer(input[i], node, i)
for i in range(self.max_layers)
]
        # Concat input_emb with node_emb and feed the classifier FC
input_node_concat = [
fluid.layers.concat(
input=[input_expand[i], node[i]], axis=2)
for i in range(self.max_layers)
]
hidden_states_fc = [
fluid.layers.fc(
input=input_node_concat[i],
size=self.node_emb_size,
num_flatten_dims=2,
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight." + str(i)),
bias_attr=fluid.ParamAttr(name="cls.concat_fc.bias." + str(i)))
for i in range(self.max_layers)
]
        # When the nodes of all layers are scored together in one loss, the
        # concat happens here: the classifier results are concatenated per
        # batch sample rather than per layer,
        # giving a shape like [batch_size, total_node_num, node_emb_size]
hidden_states_concat = fluid.layers.concat(hidden_states_fc, axis=1)
return hidden_states_concat
""" -------- Infer network detail ------- """
def infer_input(self):
input_emb = fluid.layers.data(
name="input_emb",
shape=[self.input_emb_size],
dtype="float32", )
return [input_emb]
    def get_layer_list(self):
        """Get the per-layer node lists from layer_list.txt"""
        layer_list = []
        with open(self.tree_layer_path, 'r') as fin:
            for line in fin:
                layer = [node for node in line.strip().split(',') if node]
                layer_list.append(layer)
        self.layer_list = layer_list
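        # A plausible layer_list.txt, assuming one comma-separated line of
        # node ids per tree layer (the actual ids depend on how the tree was
        # built; the default layer_node_num_list is [2, 4, 7, 12]):
        #   1,2
        #   3,4,5,6
        #   7,8,9,10,11,12,13
        #   ...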
def create_first_layer(self):
"""decide which layer to start infer"""
self.get_layer_list()
first_layer_id = 0
for idx, layer_node in enumerate(self.layer_node_num_list):
if layer_node >= self.topK:
first_layer_id = idx
break
first_layer_node = self.layer_list[first_layer_id]
self.first_layer_idx = first_layer_id
node_list = []
mask_list = []
        for node_id in first_layer_node:
            node_list.append(
                fluid.layers.fill_constant(
                    [self.batch_size, 1], value=int(node_id), dtype='int64'))
            mask_list.append(
                fluid.layers.fill_constant(
                    [self.batch_size, 1], value=0, dtype='int64'))
self.first_layer_node = fluid.layers.concat(node_list, axis=1)
self.first_layer_node_mask = fluid.layers.concat(mask_list, axis=1)
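        # first_layer_node is a [batch_size, num_start_nodes] int64 tensor of
        # start node ids; its mask is initialized to 0, i.e. the start nodes
        # are treated as non-leaf, so they are excluded from the final
        # leaf-only topK in tdm_infer_net.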
def tdm_infer_net(self, input):
"""
infer的主要流程
infer的基本逻辑是:从上层开始(具体层idx由树结构及TopK值决定)
1、依次通过每一层分类器,得到当前层输入的指定节点的prob
2、根据prob值大小,取topK的节点,取这些节点的孩子节点作为下一层的输入
3、循环1、2步骤,遍历完所有层,得到每一层筛选结果的集合
4、将筛选结果集合中的叶子节点,拿出来再做一次topK,得到最终的召回输出
"""
input_emb = input[0]
node_score = []
node_list = []
current_layer_node = self.first_layer_node
current_layer_node_mask = self.first_layer_node_mask
input_trans_emb = self.input_fc_infer(input_emb)
for layer_idx in range(self.first_layer_idx, self.max_layers):
            # Determine how many nodes the current layer needs to score
if layer_idx == self.first_layer_idx:
current_layer_node_num = self.first_layer_node.shape[1]
else:
current_layer_node_num = current_layer_node.shape[1] * \
current_layer_node.shape[2]
current_layer_node = fluid.layers.reshape(
current_layer_node, [-1, current_layer_node_num])
current_layer_node_mask = fluid.layers.reshape(
current_layer_node_mask, [-1, current_layer_node_num])
node_emb = fluid.embedding(
input=current_layer_node,
size=[self.node_nums, self.node_emb_size],
param_attr=fluid.ParamAttr(name="TDM_Tree_Emb"))
input_fc_out = self.layer_fc_infer(input_trans_emb, layer_idx)
            # Run the current layer's classifier
layer_classifier_res = self.classifier_layer_infer(
input_fc_out, node_emb, layer_idx)
            # Run the final discriminative classifier
tdm_fc = fluid.layers.fc(
input=layer_classifier_res,
size=2,
act=None,
num_flatten_dims=2,
param_attr=fluid.ParamAttr(name="tdm.cls_fc.weight"),
bias_attr=fluid.ParamAttr(name="tdm.cls_fc.bias"))
prob = fluid.layers.softmax(tdm_fc)
positive_prob = fluid.layers.slice(
prob, axes=[2], starts=[1], ends=[2])
prob_re = fluid.layers.reshape(positive_prob,
[-1, current_layer_node_num])
            # Filter out invalid nodes introduced by padding (node_id=0)
node_zero_mask = fluid.layers.cast(current_layer_node, 'bool')
node_zero_mask = fluid.layers.cast(node_zero_mask, 'float32')
prob_re = prob_re * node_zero_mask
            # Take the topK of the current layer's scores and keep both the
            # scores and the corresponding node_ids
k = self.topK
if current_layer_node_num < self.topK:
k = current_layer_node_num
_, topk_i = fluid.layers.topk(prob_re, k)
            # The index_sample op gathers values of a tensor at the given indices
            # For Paddle versions > 2.0 it is called as paddle.index_sample
top_node = fluid.contrib.layers.index_sample(current_layer_node,
topk_i)
            prob_re_mask = prob_re * current_layer_node_mask  # filter out non-leaf nodes
topk_value = fluid.contrib.layers.index_sample(prob_re_mask,
topk_i)
node_score.append(topk_value)
node_list.append(top_node)
            # Take the children of the current layer's topK nodes as the next layer's input
if layer_idx < self.max_layers - 1:
                # The tdm_child op returns each input node's children and a child_mask
                # child_mask is 1 if the child is a leaf node, otherwise 0
current_layer_node, current_layer_node_mask = \
fluid.contrib.layers.tdm_child(x=top_node,
node_nums=self.node_nums,
child_nums=self.child_nums,
param_attr=fluid.ParamAttr(
name="TDM_Tree_Info"),
dtype='int64')
total_node_score = fluid.layers.concat(node_score, axis=1)
total_node = fluid.layers.concat(node_list, axis=1)
        # Since the tree may be unbalanced, take the topK over the leaf nodes of all layers
res_score, res_i = fluid.layers.topk(total_node_score, self.topK)
res_layer_node = fluid.contrib.layers.index_sample(total_node, res_i)
res_node = fluid.layers.reshape(res_layer_node, [-1, self.topK, 1])
        # Use the TDM_Tree_Info table to map node_id back to item_id
tree_info = fluid.default_main_program().global_block().var(
"TDM_Tree_Info")
res_node_emb = fluid.layers.gather_nd(tree_info, res_node)
res_item = fluid.layers.slice(
res_node_emb, axes=[2], starts=[0], ends=[1])
self.res_item_re = fluid.layers.reshape(res_item, [-1, self.topK])
self._infer_results["item"] = self.res_item_re
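    # A worked example under the default hyper parameters: with
    # layer_node_num_list=[2, 4, 7, 12] and topK=1, inference starts at
    # layer 0 (2 >= 1); every step keeps the top-1 node and expands its
    # child_nums=2 children, so at most 2 nodes are scored per subsequent
    # layer before the final leaf-only topK.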
def input_fc_infer(self, input_emb):
"""
输入侧预测组网第一部分,将input转换为node同维度
"""
# 组网与训练时保持一致
input_fc_out = fluid.layers.fc(
input=input_emb,
size=self.node_emb_size,
act=None,
param_attr=fluid.ParamAttr(name="trans.input_fc.weight"),
bias_attr=fluid.ParamAttr(name="trans.input_fc.bias"), )
return input_fc_out
def layer_fc_infer(self, input_fc_out, layer_idx):
"""
输入侧预测组网第二部分,将input映射到不同层次的向量空间
"""
# 组网与训练保持一致,通过layer_idx指定不同层的FC
input_layer_fc_out = fluid.layers.fc(
input=input_fc_out,
size=self.node_emb_size,
act=self.act,
param_attr=fluid.ParamAttr(
name="trans.layer_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(
name="trans.layer_fc.bias." + str(layer_idx)), )
return input_layer_fc_out
def classifier_layer_infer(self, input, node, layer_idx):
        # A simplified classifier for the inference network; layer_idx selects
        # the classifier of the given layer
        # The input still needs to be expanded to match the node dimension
input_expand = self._expand_layer(input, node, layer_idx)
        # Same concat logic as the training network
input_node_concat = fluid.layers.concat(
input=[input_expand, node], axis=2)
        # The param_attr names select the FC parameters of the given layer
hidden_states_fc = fluid.layers.fc(
input=input_node_concat,
size=self.node_emb_size,
num_flatten_dims=2,
act=self.act,
param_attr=fluid.ParamAttr(
name="cls.concat_fc.weight." + str(layer_idx)),
bias_attr=fluid.ParamAttr(
name="cls.concat_fc.bias." + str(layer_idx)))
return hidden_states_fc
def check_version(self):
"""
Log error and exit when the installed version of paddlepaddle is
not satisfied.
"""
err = "TDM-GPU need Paddle version 1.8 or higher is required, " \
"or a suitable develop version is satisfied as well. \n" \
"Please make sure the version is good with your code." \
try:
fluid.require_version('1.8.0')
return True
except Exception as e:
print(err)
return False
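# How this model is typically driven (an assumption based on the PaddleRec
# 1.x workflow, not verified against this fork):
#   python -m paddlerec.run -m models/treebased/tdm/config.yaml
# where config.yaml carries the hyper_parameters block sketched in
# _init_hyper_parameters above.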