Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add OCR CTC model #596

Merged
merged 13 commits into from
Mar 7, 2018
4 changes: 4 additions & 0 deletions fluid/ocr_recognition/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# OCR Model

This model built with paddle fluid is still under active development and is not
the final version. We welcome feedbacks.
178 changes: 178 additions & 0 deletions fluid/ocr_recognition/crnn_ctc_model.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
import paddle.fluid as fluid


def conv_bn_pool(input,
group,
out_ch,
act="relu",
param=None,
bias=None,
param_0=None,
is_test=False):
tmp = input
for i in xrange(group):
tmp = fluid.layers.conv2d(
input=tmp,
num_filters=out_ch[i],
filter_size=3,
padding=1,
param_attr=param if param_0 is None else param_0,
act=None, # LinearActivation
use_cudnn=True)
tmp = fluid.layers.batch_norm(
input=tmp,
act=act,
param_attr=param,
bias_attr=bias,
is_test=is_test)
tmp = fluid.layers.pool2d(
input=tmp, pool_size=2, pool_type='max', pool_stride=2, use_cudnn=True)

return tmp


def ocr_convs(input,
num,
with_bn,
regularizer=None,
gradient_clip=None,
is_test=False):
assert (num % 4 == 0)

b = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
initializer=fluid.initializer.Normal(0.0, 0.0))
w0 = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
initializer=fluid.initializer.Normal(0.0, 0.0005))
w1 = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
initializer=fluid.initializer.Normal(0.0, 0.01))
tmp = input
tmp = conv_bn_pool(
tmp, 2, [16, 16], param=w1, bias=b, param_0=w0, is_test=is_test)

tmp = conv_bn_pool(tmp, 2, [32, 32], param=w1, bias=b, is_test=is_test)
tmp = conv_bn_pool(tmp, 2, [64, 64], param=w1, bias=b, is_test=is_test)
tmp = conv_bn_pool(tmp, 2, [128, 128], param=w1, bias=b, is_test=is_test)
return tmp


def encoder_net(images,
num_classes,
rnn_hidden_size=200,
regularizer=None,
gradient_clip=None,
is_test=False):
conv_features = ocr_convs(
images,
8,
True,
regularizer=regularizer,
gradient_clip=gradient_clip,
is_test=is_test)
sliced_feature = fluid.layers.im2sequence(
input=conv_features,
stride=[1, 1],
filter_size=[conv_features.shape[2], 1])

para_attr = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
initializer=fluid.initializer.Normal(0.0, 0.02))
bias_attr = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
initializer=fluid.initializer.Normal(0.0, 0.02),
learning_rate=2.0)
bias_attr_nobias = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
initializer=fluid.initializer.Normal(0.0, 0.02))

fc_1 = fluid.layers.fc(input=sliced_feature,
size=rnn_hidden_size * 3,
param_attr=para_attr,
bias_attr=bias_attr_nobias)
fc_2 = fluid.layers.fc(input=sliced_feature,
size=rnn_hidden_size * 3,
param_attr=para_attr,
bias_attr=bias_attr_nobias)

gru_forward = fluid.layers.dynamic_gru(
input=fc_1,
size=rnn_hidden_size,
param_attr=para_attr,
bias_attr=bias_attr,
candidate_activation='relu')
gru_backward = fluid.layers.dynamic_gru(
input=fc_2,
size=rnn_hidden_size,
is_reverse=True,
param_attr=para_attr,
bias_attr=bias_attr,
candidate_activation='relu')

w_attr = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
initializer=fluid.initializer.Normal(0.0, 0.02))
b_attr = fluid.ParamAttr(
regularizer=regularizer,
gradient_clip=gradient_clip,
initializer=fluid.initializer.Normal(0.0, 0.0))

fc_out = fluid.layers.fc(input=[gru_forward, gru_backward],
size=num_classes + 1,
param_attr=w_attr,
bias_attr=b_attr)

return fc_out


def ctc_train_net(images, label, args, num_classes):
regularizer = fluid.regularizer.L2Decay(args.l2)
gradient_clip = None
fc_out = encoder_net(
images,
num_classes,
regularizer=regularizer,
gradient_clip=gradient_clip)

cost = fluid.layers.warpctc(
input=fc_out, label=label, blank=num_classes, norm_by_times=True)
sum_cost = fluid.layers.reduce_sum(cost)

optimizer = fluid.optimizer.Momentum(
learning_rate=args.learning_rate, momentum=args.momentum)
optimizer.minimize(sum_cost)

decoded_out = fluid.layers.ctc_greedy_decoder(
input=fc_out, blank=num_classes)
casted_label = fluid.layers.cast(x=label, dtype='int64')
error_evaluator = fluid.evaluator.EditDistance(
input=decoded_out, label=casted_label)
return sum_cost, error_evaluator


def ctc_infer(images, num_classes):
fc_out = encoder_net(images, num_classes, is_test=True)
return fluid.layers.ctc_greedy_decoder(input=fc_out, blank=num_classes)


def ctc_eval(images, label, num_classes):
fc_out = encoder_net(images, num_classes, is_test=True)
decoded_out = fluid.layers.ctc_greedy_decoder(
input=fc_out, blank=num_classes)

casted_label = fluid.layers.cast(x=label, dtype='int64')
error_evaluator = fluid.evaluator.EditDistance(
input=decoded_out, label=casted_label)

cost = fluid.layers.warpctc(
input=fc_out, label=label, blank=num_classes, norm_by_times=True)

return error_evaluator, cost
35 changes: 32 additions & 3 deletions fluid/ocr_recognition/ctc_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@
from PIL import Image

from paddle.v2.image import load_image
import paddle.v2 as paddle

NUM_CLASSES = 10784
DATA_SHAPE = [1, 48, 512]


class DataGenerator(object):
Expand All @@ -15,10 +19,10 @@ def train_reader(self, img_root_dir, img_label_list, batchsize):
Reader interface for training.

:param img_root_dir: The root path of the image for training.
:type file_list: str
:type file_list: str

:param img_label_list: The path of the <image_name, label> file for training.
:type file_list: str
:type file_list: str

'''

Expand Down Expand Up @@ -76,7 +80,7 @@ def test_reader(self, img_root_dir, img_label_list):
Reader interface for inference.

:param img_root_dir: The root path of the images for training.
:type file_list: str
:type file_list: str

:param img_label_list: The path of the <image_name, label> file for testing.
:type file_list: list
Expand All @@ -95,3 +99,28 @@ def reader():
yield img, label

return reader


def num_classes():
return NUM_CLASSES


def data_shape():
return DATA_SHAPE


def train(batch_size):
generator = DataGenerator()
return generator.train_reader(
"/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/train_images/",
"/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/train.list",
batch_size)


def test(batch_size=1):
generator = DataGenerator()
return paddle.batch(
generator.test_reader(
"/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/test_images/",
"/home/disk1/wanghaoshuang/models/fluid/ocr_recognition/data/test.list"
), batch_size)
95 changes: 95 additions & 0 deletions fluid/ocr_recognition/ctc_train.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
"""Trainer for OCR CTC model."""
import paddle.v2 as paddle
import paddle.fluid as fluid
import dummy_reader
import ctc_reader
import argparse
from load_model import load_param
import functools
import sys
from utility import add_arguments, print_arguments, to_lodtensor, get_feeder_data
from crnn_ctc_model import ctc_train_net

parser = argparse.ArgumentParser(description=__doc__)
add_arg = functools.partial(add_arguments, argparser=parser)
# yapf: disable
add_arg('batch_size', int, 32, "Minibatch size.")
add_arg('pass_num', int, 100, "# of training epochs.")
add_arg('log_period', int, 1000, "Log period.")
add_arg('learning_rate', float, 1.0e-3, "Learning rate.")
add_arg('l2', float, 0.0004, "L2 regularizer.")
add_arg('max_clip', float, 10.0, "Max clip threshold.")
add_arg('min_clip', float, -10.0, "Min clip threshold.")
add_arg('momentum', float, 0.9, "Momentum.")
add_arg('rnn_hidden_size',int, 200, "Hidden size of rnn layers.")
add_arg('device', int, 0, "Device id.'-1' means running on CPU"
"while '0' means GPU-0.")
# yapf: disable

def load_parameter(place):
params = load_param('./name.map', './data/model/results_without_avg_window/pass-00000/')
for name in params:
# print "param: %s" % name
t = fluid.global_scope().find_var(name).get_tensor()
t.set(params[name], place)


def train(args, data_reader=dummy_reader):
"""OCR CTC training"""
num_classes = data_reader.num_classes()
data_shape = data_reader.data_shape()
# define network
images = fluid.layers.data(name='pixel', shape=data_shape, dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int32', lod_level=1)
sum_cost, error_evaluator = ctc_train_net(images, label, args, num_classes)
# data reader
train_reader = data_reader.train(args.batch_size)
test_reader = data_reader.test()
# prepare environment
place = fluid.CPUPlace()
if args.device >= 0:
place = fluid.CUDAPlace(args.device)
exe = fluid.Executor(place)
exe.run(fluid.default_startup_program())

#load_parameter(place)

inference_program = fluid.io.get_inference_program(error_evaluator)

for pass_id in range(args.pass_num):
error_evaluator.reset(exe)
batch_id = 1
total_loss = 0.0
total_seq_error = 0.0
# train a pass
for data in train_reader():
batch_loss, _, batch_seq_error = exe.run(
fluid.default_main_program(),
feed=get_feeder_data(data, place),
fetch_list=[sum_cost] + error_evaluator.metrics)
total_loss += batch_loss[0]
total_seq_error += batch_seq_error[0]
if batch_id % 10 == 1:
print '.',
sys.stdout.flush()
if batch_id % args.log_period == 1:
print "\nPass[%d]-batch[%d]; Avg Warp-CTC loss: %s; Avg seq error: %s." % (
pass_id, batch_id, total_loss / (batch_id * args.batch_size), total_seq_error / (batch_id * args.batch_size))
sys.stdout.flush()
batch_id += 1

# evaluate model on test data
error_evaluator.reset(exe)
for data in test_reader():
exe.run(inference_program, feed=get_feeder_data(data, place))
_, test_seq_error = error_evaluator.eval(exe)
print "\nEnd pass[%d]; Test seq error: %s.\n" % (
pass_id, str(test_seq_error[0]))

def main():
args = parser.parse_args()
print_arguments(args)
train(args, data_reader=ctc_reader)

if __name__ == "__main__":
main()
52 changes: 52 additions & 0 deletions fluid/ocr_recognition/dummy_reader.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
"""A dummy reader for test."""
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.

import numpy as np
import paddle.v2 as paddle

DATA_SHAPE = [1, 512, 512]
NUM_CLASSES = 20


def _read_creater(num_sample=1024, min_seq_len=1, max_seq_len=10):
def reader():
for i in range(num_sample):
sequence_len = np.random.randint(min_seq_len, max_seq_len)
x = np.random.uniform(0.1, 1, DATA_SHAPE).astype("float32")
y = np.random.randint(0, NUM_CLASSES + 1,
[sequence_len]).astype("int32")
yield x, y

return reader


def train(batch_size, num_sample=128):
"""Get train dataset reader."""
return paddle.batch(_read_creater(num_sample=num_sample), batch_size)


def test(batch_size=1, num_sample=16):
"""Get test dataset reader."""
return paddle.batch(_read_creater(num_sample=num_sample), batch_size)


def data_shape():
"""Get image shape in CHW order."""
return DATA_SHAPE


def num_classes():
"""Get number of total classes."""
return NUM_CLASSES
Loading