Restructure the code of ltr #515

Merged · 2 commits · Dec 4, 2017
12 changes: 4 additions & 8 deletions ltr/README.md
@@ -96,17 +96,15 @@ $$\lambda _{i,j}=\frac{\partial C}{\partial s_{i}} = \frac{1}{2}(1-S_{i,j})-\frac{1}{1+e^{\sigma (s_{i}-s_{j})}}$$
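As a quick illustration of this pairwise gradient (an editorial sketch, not part of the diff), here it is in plain Python, assuming σ = 1 and S_ij = 1, i.e. document i is labeled more relevant than document j:

```python
import math

def ranknet_lambda(s_i, s_j, S_ij=1.0, sigma=1.0):
    # lambda_ij = 1/2 * (1 - S_ij) - 1 / (1 + exp(sigma * (s_i - s_j)))
    return 0.5 * (1.0 - S_ij) - 1.0 / (1.0 + math.exp(sigma * (s_i - s_j)))

# With S_ij = 1, the gradient magnitude is small when the pair is already
# ordered correctly and grows when the pair is mis-ordered.
print(ranknet_lambda(2.0, 1.0))  # correctly ordered: ~ -0.269
print(ranknet_lambda(1.0, 2.0))  # mis-ordered:       ~ -0.731
```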

To train the `RankNet` model, run the following command:
```bash
-python ranknet.py
+python train.py --model_type ranknet
```
The first run automatically downloads the data, trains the RankNet model, and saves the model parameters of each pass.

### RankNet Model Inference

To run inference with the trained `RankNet` model, execute:
```bash
-python ranknet.py \
-    --run_type infer \
-    --test_model_path models/ranknet_params_0.tar.gz
+python infer.py --model_type ranknet --test_model_path models/ranknet_params_0.tar.gz
```

This example provides both the training and the inference parts of the RankNet model. A trained model consists of two parts: the network topology (note that `rank_cost` is not part of the topology) and the model parameter file. Here inference reuses `half_ranknet`, the topology used in `ranknet` training, and loads the model parameters from external storage. The input to inference is a single document's feature vector, and the model outputs a relevance score; sorting the predicted scores yields the final document ranking.
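For illustration (not part of the diff), a minimal sketch of that final sorting step, with made-up document ids and scores standing in for the output of `paddle.infer`:

```python
# Hypothetical per-document scores for one query; values are illustrative.
doc_ids = ["d1", "d2", "d3"]
scores = [0.12, 0.87, 0.45]

# Sort by predicted relevance, highest score first, to obtain the ranking.
ranking = sorted(zip(doc_ids, scores), key=lambda pair: pair[1], reverse=True)
print(ranking)  # [('d2', 0.87), ('d3', 0.45), ('d1', 0.12)]
```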
@@ -193,7 +191,7 @@ $$\lambda _{i,j}=\frac{\partial C}{\partial s_{i}}=-\frac{\sigma }{1+e^{\sigma (s_{i}-s_{j})}}\left | \Delta NDCG \right |$$
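As with the RankNet formula above, a small editorial sketch of this NDCG-weighted gradient, assuming σ = 1 and illustrative |ΔNDCG| values:

```python
import math

def lambdarank_lambda(s_i, s_j, delta_ndcg, sigma=1.0):
    # lambda_ij = -sigma / (1 + exp(sigma * (s_i - s_j))) * |delta NDCG|
    return -sigma / (1.0 + math.exp(sigma * (s_i - s_j))) * abs(delta_ndcg)

# A mis-ordered pair whose swap changes NDCG a lot gets a proportionally
# larger gradient than a pair with little effect on NDCG.
print(lambdarank_lambda(1.0, 2.0, 0.6))  # ~ -0.439
print(lambdarank_lambda(1.0, 2.0, 0.1))  # ~ -0.073
```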

To train the `LambdaRank` model, run:
```bash
-python lambda_rank.py
+python train.py --model_type lambdarank
```
The first run automatically downloads the data, trains the LambdaRank model, and saves the model of each pass.

@@ -203,9 +201,7 @@ The LambdaRank inference process is the same as RankNet's: inference reuses the model topology defined in the code and loads the corresponding parameters from disk.

To run inference with the trained `LambdaRank` model, execute:
```bash
-python lambda_rank.py \
-    --run_type infer \
-    --test_model_path models/lambda_rank_params_0.tar.gz
+python infer.py --model_type lambdarank --test_model_path models/lambda_rank_params_0.tar.gz
```

## Custom LambdaRank Data
115 changes: 115 additions & 0 deletions ltr/infer.py
@@ -0,0 +1,115 @@
import os
import gzip
import logging
import functools
import argparse

import paddle.v2 as paddle

from ranknet import half_ranknet
from lambda_rank import lambda_rank

logger = logging.getLogger("paddle")


def ranknet_infer(input_dim, model_path):
    """
    RankNet model inference interface.
    """
    # Only half_ranknet is needed to predict a rank score,
    # which can then be used to sort the documents.
    output = half_ranknet("right", input_dim)
    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))

    # Load the documents of the same query; RankNet ranks these candidates.
    infer_query_id = []
    infer_data = []
    infer_doc_index = []  # collected for completeness; not used below

    # Convert to the mq2007 built-in plain-text format:
    # <query_id> <relevance_score> <feature_vector>
    plain_txt_test = functools.partial(
        paddle.dataset.mq2007.test, format="plain_txt")

    for query_id, relevance_score, feature_vector in plain_txt_test():
        infer_query_id.append(query_id)
        infer_data.append([feature_vector])

    # Predict a score for each document in infer_data, then re-sort the
    # documents by predicted score in descending order to build the ranking.
    scores = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for query_id, score in zip(infer_query_id, scores):
        print "query_id : ", query_id, " score : ", score


def lambda_rank_infer(input_dim, model_path):
    """
    LambdaRank model inference interface.
    """
    output = lambda_rank(input_dim, is_infer=True)
    parameters = paddle.parameters.Parameters.from_tar(gzip.open(model_path))

    infer_query_id = None
    infer_data = []
    infer_data_num = 1  # feed only the first query list to the model

    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    for label, querylist in fill_default_test():
        infer_data.append([querylist])
        if len(infer_data) == infer_data_num:
            break

    # Predict a score for each document in infer_data, then re-sort the
    # documents by predicted score in descending order to build the ranking.
    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predictions):
        print i, score


def parse_args():
    parser = argparse.ArgumentParser(
        description="PaddlePaddle learning to rank example.")
    parser.add_argument(
        "--model_type",
        type=str,
        help=("A flag indicating whether to run the RankNet or the LambdaRank "
              "model. Available options are: ranknet or lambdarank."),
        default="ranknet")
    parser.add_argument(
        "--use_gpu",
        type=bool,
        help="A flag indicating whether to use the GPU device in inference.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help="The number of threads to use.",
        default=1)
    parser.add_argument(
        "--test_model_path",
        type=str,
        required=True,
        help="The path of a trained model.")
    return parser.parse_args()


if __name__ == "__main__":
    args = parse_args()
    assert os.path.exists(args.test_model_path), (
        "The trained model does not exist. Please set a correct path.")

    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)

    # Training dataset: mq2007, input_dim = 46, dense format.
    input_dim = 46

    if args.model_type == "ranknet":
        ranknet_infer(input_dim, args.test_model_path)
    elif args.model_type == "lambdarank":
        lambda_rank_infer(input_dim, args.test_model_path)
    else:
        logger.fatal(("Invalid model_type. "
                      "Available options are: ranknet or lambdarank."))
157 changes: 11 additions & 146 deletions ltr/lambda_rank.py
@@ -1,32 +1,20 @@
import os
import sys
import gzip
import functools
import argparse
import logging
import numpy as np

"""
LambdaRank is a listwise rank model.
https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
"""
import paddle.v2 as paddle

logger = logging.getLogger("paddle")
logger.setLevel(logging.INFO)


def lambda_rank(input_dim, is_infer):
def lambda_rank(input_dim, is_infer=False):
    """
    LambdaRank is a listwise rank model, the input data and label
    must be sequences.
    The input data and label for LambdaRank must be sequences.

    https://papers.nips.cc/paper/2971-learning-to-rank-with-nonsmooth-cost-functions.pdf
    parameters :
      input_dim, one document's dense feature vector dimension

    The format of the dense_vector_sequence is as follows:
    [[f, ...], [f, ...], ...], f is a float or an int number
    """
    if not is_infer:
        label = paddle.layer.data("label",
                                  paddle.data_type.dense_vector_sequence(1))
    data = paddle.layer.data("data",
                             paddle.data_type.dense_vector_sequence(input_dim))

@@ -49,134 +37,11 @@ def lambda_rank(input_dim, is_infer):
        param_attr=paddle.attr.Param(initial_std=0.01))

    if not is_infer:
        # Define the cost layer.
        label = paddle.layer.data("label",
                                  paddle.data_type.dense_vector_sequence(1))

        cost = paddle.layer.lambda_cost(
            input=output, score=label, NDCG_num=6, max_sort_size=-1)
        return cost, output
        return cost
    return output


def lambda_rank_train(num_passes, model_save_dir):
    # The input for LambdaRank must be a sequence.
    fill_default_train = functools.partial(
        paddle.dataset.mq2007.train, format="listwise")
    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")

    train_reader = paddle.batch(
        paddle.reader.shuffle(fill_default_train, buf_size=100), batch_size=32)
    test_reader = paddle.batch(fill_default_test, batch_size=32)

    # Training dataset: mq2007, input_dim = 46, dense format.
    input_dim = 46
    cost, output = lambda_rank(input_dim, is_infer=False)
    parameters = paddle.parameters.create(cost)

    trainer = paddle.trainer.SGD(
        cost=cost,
        parameters=parameters,
        update_equation=paddle.optimizer.Adam(learning_rate=1e-4))

    # Define end batch and end pass event handler.
    def event_handler(event):
        if isinstance(event, paddle.event.EndIteration):
            logger.info("Pass %d Batch %d Cost %.9f" %
                        (event.pass_id, event.batch_id, event.cost))
        if isinstance(event, paddle.event.EndPass):
            result = trainer.test(reader=test_reader, feeding=feeding)
            logger.info("\nTest with Pass %d, %s" %
                        (event.pass_id, result.metrics))
            with gzip.open(
                    os.path.join(model_save_dir, "lambda_rank_params_%d.tar.gz"
                                 % (event.pass_id)), "w") as f:
                trainer.save_parameter_to_tar(f)

    feeding = {"label": 0, "data": 1}
    trainer.train(
        reader=train_reader,
        event_handler=event_handler,
        feeding=feeding,
        num_passes=num_passes)


def lambda_rank_infer(test_model_path):
    """LambdaRank model inference interface.

    Parameters:
        test_model_path : The path of the trained model.
    """
    logger.info("Begin to Infer...")
    input_dim = 46
    output = lambda_rank(input_dim, is_infer=True)
    parameters = paddle.parameters.Parameters.from_tar(
        gzip.open(test_model_path))

    infer_query_id = None
    infer_data = []
    infer_data_num = 1

    fill_default_test = functools.partial(
        paddle.dataset.mq2007.test, format="listwise")
    for label, querylist in fill_default_test():
        infer_data.append([querylist])
        if len(infer_data) == infer_data_num:
            break

    # Predict a score for each document in infer_data, then re-sort the
    # documents by predicted score in descending order to build the ranking.
    predictions = paddle.infer(
        output_layer=output, parameters=parameters, input=infer_data)
    for i, score in enumerate(predictions):
        print i, score


if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description="PaddlePaddle LambdaRank example.")
    parser.add_argument(
        "--run_type",
        type=str,
        help=("A flag indicating whether to run the training or the inference "
              "task. Available options are: train or infer."),
        default="train")
    parser.add_argument(
        "--num_passes",
        type=int,
        help="The number of passes to train the model.",
        default=10)
    parser.add_argument(
        "--use_gpu",
        type=bool,
        help="A flag indicating whether to use the GPU device in training.",
        default=False)
    parser.add_argument(
        "--trainer_count",
        type=int,
        help="The thread number used in training.",
        default=1)
    parser.add_argument(
        "--model_save_dir",
        type=str,
        required=False,
        help="The path to save the trained models.",
        default="models")
    parser.add_argument(
        "--test_model_path",
        type=str,
        required=False,
        help=("This parameter works only in the inference task to "
              "specify the path of a trained model."),
        default="")

    args = parser.parse_args()
    paddle.init(use_gpu=args.use_gpu, trainer_count=args.trainer_count)
    if args.run_type == "train":
        lambda_rank_train(args.num_passes, args.model_save_dir)
    elif args.run_type == "infer":
        assert os.path.exists(args.test_model_path), (
            "The trained model does not exist. Please set a correct path.")
        lambda_rank_infer(args.test_model_path)
    else:
        logger.fatal(("Invalid run_type. "
                      "Available options are: train or infer."))
38 changes: 0 additions & 38 deletions ltr/metrics.py

This file was deleted.
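The deleted metrics.py is not rendered in this view. For context, ranking examples like this one typically evaluate with NDCG (the quantity behind `NDCG_num` in `lambda_cost`); the following is an editorial sketch of that metric, an assumption about the flavor of what was removed rather than the file's actual contents:

```python
import math

def dcg(relevances):
    # Discounted cumulative gain: sum of rel_i / log2(i + 2) over positions i.
    return sum(rel / math.log(i + 2, 2) for i, rel in enumerate(relevances))

def ndcg(relevances, k=None):
    # Normalize DCG at cutoff k by the DCG of the ideal (descending) ordering.
    topk = relevances[:k] if k else relevances
    ideal = sorted(relevances, reverse=True)[:k] if k else sorted(
        relevances, reverse=True)
    ideal_dcg = dcg(ideal)
    return dcg(topk) / ideal_dcg if ideal_dcg > 0 else 0.0

print(ndcg([3, 2, 3, 0, 1, 2], k=6))  # ~ 0.96 for this toy relevance list
```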
