Commit
"add book recommender_system testing" (#5143)
* "add sequence conv layer"

* "add book recommender_system testing"

* "add training loop"

* "add sequence layer"

* "add recommender system training data"

* "fix conv2d layer bug"

* add sequence_conv_pool

* "fix input is Null"

* add networks

* "fix based comment"

* "add sum op layer"

* "merge layers"

* Update layers.py

* "fix input is NULL bug"

* "debug embedding table"

* "modify layers.py"

* "fix pool interface"

* "add export type to layers"

* "fix based on comment"

* "need lod info support in all operator"

* "remove accuracy layer"

* "tuning learning rate"

* "add sparse test"

* "add gpu test"

* Update test_recommender_system.py
dzhwinter authored Nov 2, 2017
1 parent f48159a commit 69011c1
Showing 3 changed files with 324 additions and 10 deletions.
20 changes: 10 additions & 10 deletions python/paddle/v2/framework/layers.py
@@ -197,19 +197,19 @@ def sums(input, program=None, init_program=None):
     return out


-def cos_sim(X, Y, program=None, init_program=None):
-    helper = LayerHelper('cos_sim', **locals())
-    out = helper.create_tmp_variable(dtype=helper.input_dtype("X"))
-    xnorm = helper.create_tmp_variable(dtype=helper.input_dtype("X"))
-    ynorm = helper.create_tmp_variable(dtype=helper.input_dtype("X"))
+def cos_sim(X, Y, **kwargs):
+    helper = LayerHelper('cos_sim', **kwargs)
+    out = helper.create_tmp_variable(dtype=X.data_type)
+    xnorm = helper.create_tmp_variable(dtype=X.data_type)
+    ynorm = helper.create_tmp_variable(dtype=X.data_type)
     helper.append_op(
         type='cos_sim',
         inputs={'X': [X],
                 'Y': [Y]},
         outputs={'Out': [out],
                  'XNorm': [xnorm],
                  'YNorm': [ynorm]})
-    return out, xnorm, ynorm
+    return out


 def cross_entropy(input, label, **kwargs):
@@ -265,7 +265,7 @@ def accuracy(input, label, k=1, **kwargs):
 def sequence_conv(input,
                   num_filters,
                   filter_size=3,
-                  stride=1,
+                  filter_stride=1,
                   padding=None,
                   bias_attr=None,
                   param_attr=None,
@@ -291,9 +291,9 @@ def sequence_conv(input,
         },
         outputs={"Out": pre_bias},
         attrs={
-            'context_stride': stride,
-            'context_start': 0,
-            'context_length': filter_size
+            'contextStride': filter_stride,
+            'contextStart': 0,
+            'contextLength': filter_size
         })
     pre_act = helper.append_bias_op(pre_bias)
     return helper.append_activation(pre_act)
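Note: taken together, the layers.py changes above mean cos_sim now accepts **kwargs instead of explicit program arguments and returns only the similarity output (XNorm and YNorm stay internal temporaries), while sequence_conv renames stride to filter_stride and switches the operator attributes to camelCase. A minimal usage sketch under those assumptions; usr_vec and mov_vec are hypothetical layer outputs, not names from this diff:

    # Hypothetical sketch of the updated cos_sim call.
    sim = layers.cos_sim(X=usr_vec, Y=mov_vec,
                         program=program, init_program=init_program)
    # Before this commit the call returned (out, xnorm, ynorm);
    # it now returns only the similarity tensor.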
1 change: 1 addition & 0 deletions python/paddle/v2/framework/nets.py
@@ -101,6 +101,7 @@ def __extend_list__(obj):
 def sequence_conv_pool(input,
                        num_filters,
                        filter_size,
+                       act="sigmoid",
                        pool_type="max",
                        program=None,
                        init_program=None):
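The new act argument makes the activation of sequence_conv_pool configurable, defaulting to "sigmoid". A sketch of a call that overrides it, assuming emb is an existing sequence of word embeddings (mirroring the test below):

    # Hypothetical sketch; emb is an assumed embedding sequence.
    conv_pool = nets.sequence_conv_pool(input=emb,
                                        num_filters=32,
                                        filter_size=3,
                                        act="tanh",  # overrides the new default
                                        pool_type="sum",
                                        program=program,
                                        init_program=init_program)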
313 changes: 313 additions & 0 deletions python/paddle/v2/framework/tests/test_recommender_system.py
@@ -0,0 +1,313 @@
import paddle.v2 as paddle
import paddle.v2.framework.layers as layers
import paddle.v2.framework.nets as nets
import paddle.v2.framework.core as core
import paddle.v2.framework.optimizer as optimizer

from paddle.v2.framework.framework import Program, g_program
from paddle.v2.framework.executor import Executor

import numpy as np

init_program = Program()
program = Program()
is_sparse = True
use_gpu = False
BATCH_SIZE = 256


def get_usr_combined_features():
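    # Build the user-side feature vector: embed user id, gender, age and
    # job id, project each embedding through a small fc layer, then
    # concatenate them and map to a 200-d representation.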
    # FIXME(dzh): the old API integer_value(10) may have done range checking;
    # currently we don't have a user-configurable check.

USR_DICT_SIZE = paddle.dataset.movielens.max_user_id() + 1

uid = layers.data(
name='user_id',
shape=[1],
data_type='int64',
program=program,
init_program=init_program)

usr_emb = layers.embedding(
input=uid,
data_type='float32',
size=[USR_DICT_SIZE, 32],
param_attr={'name': 'user_table'},
is_sparse=is_sparse,
program=program,
init_program=init_program)

usr_fc = layers.fc(input=usr_emb,
size=32,
program=program,
init_program=init_program)

USR_GENDER_DICT_SIZE = 2

usr_gender_id = layers.data(
name='gender_id',
shape=[1],
data_type='int64',
program=program,
init_program=init_program)

usr_gender_emb = layers.embedding(
input=usr_gender_id,
size=[USR_GENDER_DICT_SIZE, 16],
param_attr={'name': 'gender_table'},
is_sparse=is_sparse,
program=program,
init_program=init_program)

usr_gender_fc = layers.fc(input=usr_gender_emb,
size=16,
program=program,
init_program=init_program)

USR_AGE_DICT_SIZE = len(paddle.dataset.movielens.age_table)
usr_age_id = layers.data(
name='age_id',
shape=[1],
data_type="int64",
program=program,
init_program=init_program)

usr_age_emb = layers.embedding(
input=usr_age_id,
size=[USR_AGE_DICT_SIZE, 16],
is_sparse=is_sparse,
param_attr={'name': 'age_table'},
program=program,
init_program=init_program)

usr_age_fc = layers.fc(input=usr_age_emb,
size=16,
program=program,
init_program=init_program)

USR_JOB_DICT_SIZE = paddle.dataset.movielens.max_job_id() + 1
usr_job_id = layers.data(
name='job_id',
shape=[1],
data_type="int64",
program=program,
init_program=init_program)

usr_job_emb = layers.embedding(
input=usr_job_id,
size=[USR_JOB_DICT_SIZE, 16],
param_attr={'name': 'job_table'},
is_sparse=is_sparse,
program=program,
init_program=init_program)

usr_job_fc = layers.fc(input=usr_job_emb,
size=16,
program=program,
init_program=init_program)

concat_embed = layers.concat(
input=[usr_fc, usr_gender_fc, usr_age_fc, usr_job_fc],
axis=1,
program=program,
init_program=init_program)

usr_combined_features = layers.fc(input=concat_embed,
size=200,
act="tanh",
program=program,
init_program=init_program)

return usr_combined_features


def get_mov_combined_features():
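    # Build the movie-side feature vector: embed the movie id, sum-pool the
    # category embeddings, run a conv + pool over the title word sequence,
    # then concatenate and map to a 200-d representation.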

MOV_DICT_SIZE = paddle.dataset.movielens.max_movie_id() + 1

mov_id = layers.data(
name='movie_id',
shape=[1],
data_type='int64',
program=program,
init_program=init_program)

mov_emb = layers.embedding(
input=mov_id,
data_type='float32',
size=[MOV_DICT_SIZE, 32],
param_attr={'name': 'movie_table'},
is_sparse=is_sparse,
program=program,
init_program=init_program)

mov_fc = layers.fc(input=mov_emb,
size=32,
program=program,
init_program=init_program)

CATEGORY_DICT_SIZE = len(paddle.dataset.movielens.movie_categories())

category_id = layers.data(
name='category_id',
shape=[1],
data_type='int64',
program=program,
init_program=init_program)

mov_categories_emb = layers.embedding(
input=category_id,
size=[CATEGORY_DICT_SIZE, 32],
is_sparse=is_sparse,
program=program,
init_program=init_program)

mov_categories_hidden = layers.sequence_pool(
input=mov_categories_emb,
pool_type="sum",
program=program,
init_program=init_program)

MOV_TITLE_DICT_SIZE = len(paddle.dataset.movielens.get_movie_title_dict())

mov_title_id = layers.data(
name='movie_title',
shape=[1],
data_type='int64',
program=program,
init_program=init_program)

mov_title_emb = layers.embedding(
input=mov_title_id,
size=[MOV_TITLE_DICT_SIZE, 32],
is_sparse=is_sparse,
program=program,
init_program=init_program)

mov_title_conv = nets.sequence_conv_pool(
input=mov_title_emb,
num_filters=32,
filter_size=3,
act="tanh",
pool_type="sum",
program=program,
init_program=init_program)

concat_embed = layers.concat(
input=[mov_fc, mov_categories_hidden, mov_title_conv],
axis=1,
program=program,
init_program=init_program)

    # FIXME(dzh): need a tanh operator
mov_combined_features = layers.fc(input=concat_embed,
size=200,
act="tanh",
program=program,
init_program=init_program)

return mov_combined_features


def model():
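    # Predict the rating as the cosine similarity between the user and
    # movie feature vectors, trained with squared error against the score.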
usr_combined_features = get_usr_combined_features()
mov_combined_features = get_mov_combined_features()

    # score the (user, movie) pair by cosine similarity
inference = layers.cos_sim(
X=usr_combined_features,
Y=mov_combined_features,
program=program,
init_program=init_program)

label = layers.data(
name='score',
shape=[1],
data_type='float32',
program=program,
init_program=init_program)

square_cost = layers.square_error_cost(
input=inference,
label=label,
program=program,
init_program=init_program)

avg_cost = layers.mean(
x=square_cost, program=program, init_program=init_program)

return avg_cost


def main():
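    # Build the model, attach an SGD optimizer, run the init program once,
    # then loop over MovieLens minibatches until the cost drops below 5.0.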
cost = model()
sgd_optimizer = optimizer.SGDOptimizer(learning_rate=0.2)
opts = sgd_optimizer.minimize(cost)
block = program.block(0)

if use_gpu:
place = core.GPUPlace(0)
else:
place = core.CPUPlace()

exe = Executor(place)
exe.run(init_program, feed={}, fetch_list=[])

train_reader = paddle.batch(
paddle.reader.shuffle(
paddle.dataset.movielens.train(), buf_size=8192),
batch_size=BATCH_SIZE)

feeding = {
'user_id': 0,
'gender_id': 1,
'age_id': 2,
'job_id': 3,
'movie_id': 4,
'category_id': 5,
'movie_title': 6,
'score': 7
}

def func_feed(feeding, data):
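        # Convert one minibatch into LoDTensors. Dense slots (the id fields
        # and the score) become [N, 1] tensors; sequence slots (category_id
        # and movie_title) are concatenated into one flat tensor whose LoD
        # offsets mark where each sample's sub-sequence begins and ends.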
feed_tensors = {}
for (key, idx) in feeding.iteritems():
tensor = core.LoDTensor()
if key != "category_id" and key != "movie_title":
if key == "score":
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"float32")
else:
numpy_data = np.array(map(lambda x: x[idx], data)).astype(
"int64")
else:
numpy_data = map(lambda x: np.array(x[idx]).astype("int64"),
data)
lod_info = [len(item) for item in numpy_data]
offset = 0
lod = [offset]
for item in lod_info:
offset += item
lod.append(offset)
numpy_data = np.concatenate(numpy_data, axis=0)
tensor.set_lod([lod])

numpy_data = numpy_data.reshape([numpy_data.shape[0], 1])
tensor.set(numpy_data, place)
feed_tensors[key] = tensor
return feed_tensors

PASS_NUM = 100
for pass_id in range(PASS_NUM):
for data in train_reader():
outs = exe.run(program,
feed=func_feed(feeding, data),
fetch_list=[cost])
out = np.array(outs[0])
if out[0] < 5.0:
                # if the avg cost drops below 5.0, we consider the code correct.
exit(0)


main()
