From b64aae7a61fcd2f0644e82fd875898ce2a6bb7e3 Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Tue, 29 Jan 2019 12:25:48 +0000 Subject: [PATCH 1/2] Add FSP distillation strategy. --- .../fluid/contrib/slim/core/compress_pass.py | 18 ++- .../slim/demo/distillation/compress.py | 91 ++++++++++++ .../slim/demo/distillation/config.yaml | 14 ++ .../contrib/slim/demo/distillation/resnet.py | 137 ++++++++++++++++++ .../distillation/distillation_strategy.py | 40 +++++ .../contrib/slim/distillation/distiller.py | 112 ++++++++++++++ .../paddle/fluid/contrib/slim/graph/graph.py | 22 +++ python/paddle/fluid/framework.py | 5 +- 8 files changed, 431 insertions(+), 8 deletions(-) create mode 100644 python/paddle/fluid/contrib/slim/demo/distillation/compress.py create mode 100644 python/paddle/fluid/contrib/slim/demo/distillation/config.yaml create mode 100644 python/paddle/fluid/contrib/slim/demo/distillation/resnet.py create mode 100644 python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py create mode 100644 python/paddle/fluid/contrib/slim/distillation/distiller.py diff --git a/python/paddle/fluid/contrib/slim/core/compress_pass.py b/python/paddle/fluid/contrib/slim/core/compress_pass.py index ed9fce12b6f7b..de14d7fcec430 100644 --- a/python/paddle/fluid/contrib/slim/core/compress_pass.py +++ b/python/paddle/fluid/contrib/slim/core/compress_pass.py @@ -13,6 +13,7 @@ # limitations under the License. from ....core import CPUPlace +from .... import io from ....data_feeder import DataFeeder from ..graph import get_executor, ImitationGraph from config import ConfigFactory @@ -41,6 +42,7 @@ def __init__(self, train_reader=None, eval_graph=None, eval_reader=None, + teacher_graphs=None, optimizer=None): # The total number of epoches to be trained. self.epoch = 0 @@ -57,6 +59,7 @@ def __init__(self, self.eval_graph = eval_graph self.eval_reader = eval_reader self.executor = None + self.teacher_graphs = teacher_graphs self.optimizer = optimizer def run_eval_graph(self): @@ -159,8 +162,9 @@ def config(self, config_file): def _load_checkpoint(self, context): if self.checkpoint: - exe = get_executor(context.train_graph, parallel=False) - fluid.io.load_persistables( + exe = get_executor( + context.train_graph, context.place, parallel=False) + io.load_persistables( exe.exe, self.checkpoint, main_program=context.train_graph.program) @@ -168,12 +172,13 @@ def _save_checkpoint(self, context): if context.epoch_id % 5 == 0 and self.model_save_dir: - model_path = os.path.join(self.model_save_dir, - str(context.epoch_id)) + model_path = os.path.join( + self.model_save_dir, + str(context.epoch_id) + "_" + str(context.batch_id)) if not os.path.isdir(model_path): os.makedirs(model_path) - exe = get_executor(context.train_graph, parallel=False) - fluid.io.save_persistables( + exe = get_executor(context.train_graph, context.place, False) + io.save_persistables( exe.exe, model_path, main_program=context.train_graph.program) print('Saved checkpoint to: {}'.format(model_path)) @@ -209,6 +214,7 @@ def run(self): train_reader=self.train_reader, eval_graph=self.eval_graph, eval_reader=self.eval_reader, + teacher_graphs=self.teacher_graphs, optimizer=self.optimizer) self._load_checkpoint(context) diff --git a/python/paddle/fluid/contrib/slim/demo/distillation/compress.py b/python/paddle/fluid/contrib/slim/demo/distillation/compress.py new file mode 100644 index 0000000000000..31c6ff98474c1 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/demo/distillation/compress.py @@ -0,0
+1,91 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import paddle.fluid as fluid +import paddle +import os +import sys +from resnet import * +from paddle.fluid.contrib.slim import CompressPass +from paddle.fluid.contrib.slim import build_compressor +from paddle.fluid.contrib.slim import ImitationGraph + + +class Model(object): + def __init__(self): + pass + + def compress(self): + + img = fluid.layers.data(name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + resnet50 = ResNet50() + predict = resnet50.net(img, class_dim=10) + eval_program = fluid.default_main_program().clone(for_test=False) + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + + with fluid.program_guard(main_program=eval_program): + acc = fluid.layers.accuracy(input=predict, label=label) + + optimizer = fluid.optimizer.SGD(0.001) + optimizer.minimize(avg_cost) + + place = fluid.CUDAPlace(0) + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=500), + batch_size=32) + eval_reader = paddle.batch(paddle.dataset.mnist.test(), batch_size=1) + + train_feed_list = {'img': img.name, 'label': label.name} + train_fetch_list = {'cost': avg_cost.name} + eval_feed_list = {'img': img.name, 'label': label.name} + eval_fetch_list = {'acc': acc.name} + + # define teacher program + teacher_program = fluid.Program() + startup_program = fluid.Program() + with fluid.program_guard(teacher_program, startup_program): + img = fluid.layers.data( + name='img', shape=[1, 28, 28], dtype='float32') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + resnet101 = ResNet101() + predict = resnet101.net(img, class_dim=10) + exe.run(startup_program) + + com_pass = CompressPass( + place, + fluid.global_scope(), + fluid.default_main_program(), + train_reader=train_reader, + train_feed_list=train_feed_list, + train_fetch_list=train_fetch_list, + eval_program=eval_program, + eval_reader=eval_reader, + eval_feed_list=eval_feed_list, + eval_fetch_list=eval_fetch_list, + teacher_programs=[teacher_program], + optimizer=optimizer) + com_pass.model_save_dir = './checkpoints' + com_pass.config('./config.yaml') + com_pass.run() + + +if __name__ == "__main__": + model = Model() + model.compress() diff --git a/python/paddle/fluid/contrib/slim/demo/distillation/config.yaml b/python/paddle/fluid/contrib/slim/demo/distillation/config.yaml new file mode 100644 index 0000000000000..78b0bc6cb0eb0 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/demo/distillation/config.yaml @@ -0,0 +1,14 @@ +version: 1.0 +distillers: + fsp_distiller: + class: 'FSPDistiller' +strategies: + fsp_distillation_strategy: + class: 'FSPDistillationStrategy' + distiller: 'fsp_distiller' + start_epoch: 0 + end_epoch: 10 +compress_pass: + epoch: 10 + strategies: + -
fsp_distillation_strategy diff --git a/python/paddle/fluid/contrib/slim/demo/distillation/resnet.py b/python/paddle/fluid/contrib/slim/demo/distillation/resnet.py new file mode 100644 index 0000000000000..26c6ca143cdd6 --- /dev/null +++ b/python/paddle/fluid/contrib/slim/demo/distillation/resnet.py @@ -0,0 +1,137 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function +import paddle +import paddle.fluid as fluid +import math + +__all__ = ["ResNet", "ResNet50", "ResNet101", "ResNet152"] + +train_parameters = { + "input_size": [3, 224, 224], + "input_mean": [0.485, 0.456, 0.406], + "input_std": [0.229, 0.224, 0.225], + "learning_strategy": { + "name": "piecewise_decay", + "batch_size": 256, + "epochs": [30, 60, 90], + "steps": [0.1, 0.01, 0.001, 0.0001] + } +} + + +class ResNet(): + def __init__(self, layers=50): + self.params = train_parameters + self.layers = layers + + def net(self, input, class_dim=1000): + layers = self.layers + supported_layers = [50, 101, 152] + assert layers in supported_layers, \ + "supported layers are {} but input layer is {}".format(supported_layers, layers) + + if layers == 50: + depth = [3, 4, 6, 3] + elif layers == 101: + depth = [3, 4, 23, 3] + elif layers == 152: + depth = [3, 8, 36, 3] + num_filters = [64, 128, 256, 512] + + conv = self.conv_bn_layer( + input=input, num_filters=64, filter_size=7, stride=2, act='relu') + conv = fluid.layers.pool2d( + input=conv, + pool_size=3, + pool_stride=2, + pool_padding=1, + pool_type='max') + + for block in range(len(depth)): + for i in range(depth[block]): + conv = self.bottleneck_block( + input=conv, + num_filters=num_filters[block], + stride=2 if i == 0 and block != 0 else 1) + + pool = fluid.layers.pool2d( + input=conv, pool_size=7, pool_type='avg', global_pooling=True) + stdv = 1.0 / math.sqrt(pool.shape[1] * 1.0) + out = fluid.layers.fc(input=pool, + size=class_dim, + act='softmax', + param_attr=fluid.param_attr.ParamAttr( + initializer=fluid.initializer.Uniform(-stdv, + stdv))) + return out + + def conv_bn_layer(self, + input, + num_filters, + filter_size, + stride=1, + groups=1, + act=None): + conv = fluid.layers.conv2d( + input=input, + num_filters=num_filters, + filter_size=filter_size, + stride=stride, + padding=(filter_size - 1) // 2, + groups=groups, + act=None, + bias_attr=False) + return fluid.layers.batch_norm(input=conv, act=act) + + def shortcut(self, input, ch_out, stride): + ch_in = input.shape[1] + if ch_in != ch_out or stride != 1: + return self.conv_bn_layer(input, ch_out, 1, stride) + else: + return input + + def bottleneck_block(self, input, num_filters, stride): + conv0 = self.conv_bn_layer( + input=input, num_filters=num_filters, filter_size=1, act='relu') + conv1 = self.conv_bn_layer( + input=conv0, + num_filters=num_filters, + filter_size=3, + stride=stride, + act='relu') + conv2 = self.conv_bn_layer( + input=conv1, 
num_filters=num_filters * 4, filter_size=1, act=None) + + short = self.shortcut(input, num_filters * 4, stride) + + return fluid.layers.elementwise_add(x=short, y=conv2, act='relu') + + +def ResNet50(): + model = ResNet(layers=50) + return model + + +def ResNet101(): + model = ResNet(layers=101) + return model + + +def ResNet152(): + model = ResNet(layers=152) + return model diff --git a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py new file mode 100644 index 0000000000000..a50c5edf073ac --- /dev/null +++ b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py @@ -0,0 +1,40 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from ..core.strategy import Strategy +from ....framework import Program, program_guard, Parameter +from .... import layers +import numpy as np +import copy +import re + +__all__ = ['FSPDistillationStrategy'] + + +class FSPDistillationStrategy(Strategy): + def __init__(self, distiller=None, start_epoch=0, end_epoch=10): + super(FSPDistillationStrategy, self).__init__(start_epoch, end_epoch) + self.distiller = distiller + self.train_graph_backup = None + + def on_compression_begin(self, context): + self.train_graph_backup = context.train_graph + + graph = self.distiller.distiller_graph(context.eval_graph, + context.teacher_graphs, + context.optimizer, context.place) + context.train_graph = graph + + def on_compression_end(self, context): + context.train_graph = self.train_graph_backup diff --git a/python/paddle/fluid/contrib/slim/distillation/distiller.py b/python/paddle/fluid/contrib/slim/distillation/distiller.py new file mode 100644 index 0000000000000..1a418b7b611cd --- /dev/null +++ b/python/paddle/fluid/contrib/slim/distillation/distiller.py @@ -0,0 +1,112 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .... import layers +from .... import Executor +from .... import Program +from .... 
import program_guard + +__all__ = ['FSPDistiller'] + + +class FSPDistiller(object): + def __init__(self): + self.student_pairs = None + self.teacher_pairs = None + + def _feature_map_pairs(self, graph): + pairs = [] + sizes = [] + pair = [] + pre_size = None + for op in graph.all_ops(): + if op.type == 'conv2d': + out_var_name = op.output('Output')[0] + feature_map_size = graph.get_var(out_var_name).shape[2:] + if feature_map_size != pre_size: + if len(pair) == 2 and pair[1] is not None: + pairs.append(pair) + sizes.append(pre_size) + pair = [out_var_name, None] + else: + pair[1] = out_var_name + pre_size = feature_map_size + if len(pair) == 2 and pair[1] is not None: + pairs.append(pair) + sizes.append(pre_size) + return pairs, sizes + + def distiller_graph(self, student, teachers, optimizer, place): + """ + Merge student and teacher into training graph. + """ + teacher = teachers[0] + for var in teacher.program.list_vars(): + var.stop_gradient = True + graph = student.clone() + graph.merge(teacher) + s_pairs, s_sizes = self._feature_map_pairs(student) + t_pairs, t_sizes = self._feature_map_pairs(teacher) + distiller_pass = FSPDistillerPass(s_pairs, s_sizes, t_pairs, t_sizes, + optimizer, place) + dis_graph = distiller_pass.apply(graph) + return dis_graph + + +class FSPDistillerPass(object): + def __init__(self, s_pairs, s_sizes, t_pairs, t_sizes, optimizer, place): + self.s_pairs = s_pairs + self.s_sizes = s_sizes + self.t_pairs = t_pairs + self.t_sizes = t_sizes + self.optimizer = optimizer + self.place = place + + def apply(self, graph): + ret_graph = graph.clone() + startup_program = Program() + with program_guard(ret_graph.program, startup_program): + losses = [] + for s_pair, t_pair in zip(self.s_pairs, self.t_pairs): + s_pair_start = ret_graph.get_var(s_pair[0]) + s_pair_end = ret_graph.get_var(s_pair[1]) + s_fsp_matrix = self.fsp_matrix(s_pair_start, s_pair_end) + t_pair_start = ret_graph.get_var(t_pair[0]) + t_pair_end = ret_graph.get_var(t_pair[1]) + t_fsp_matrix = self.fsp_matrix(t_pair_start, t_pair_end) + l2_loss = layers.mean( + layers.square(s_fsp_matrix - t_fsp_matrix)) + losses.append(l2_loss) + loss = layers.sum(losses) + self.optimizer.minimize(loss) + + exe = Executor(self.place) + exe.run(startup_program, scope=ret_graph.scope) + ret_graph.out_nodes['loss'] = loss.name + return ret_graph + + def fsp_matrix(self, fea_map_0, fea_map_1): + a_channel = fea_map_0.shape[1] + b_channel = fea_map_1.shape[1] + h = fea_map_0.shape[2] + w = fea_map_0.shape[3] + tmp_0 = layers.transpose(fea_map_0, perm=[0, 2, 3, 1]) + tmp_0 = layers.reshape(tmp_0, [-1, h * w, 1, a_channel]) + tmp_0 = layers.expand(tmp_0, expand_times=[1, 1, b_channel, 1]) + tmp_0 = layers.transpose(tmp_0, perm=[0, 1, 3, 2]) + + tmp_1 = layers.transpose(fea_map_1, perm=[0, 2, 3, 1]) + tmp_1 = layers.reshape(tmp_1, [-1, h * w, 1, b_channel]) + tmp_1 = layers.expand(tmp_1, expand_times=[1, 1, a_channel, 1]) + return layers.reduce_mean(tmp_0 * tmp_1, dim=1) diff --git a/python/paddle/fluid/contrib/slim/graph/graph.py b/python/paddle/fluid/contrib/slim/graph/graph.py index c726a7a590cc9..63fc5ac192410 100644 --- a/python/paddle/fluid/contrib/slim/graph/graph.py +++ b/python/paddle/fluid/contrib/slim/graph/graph.py @@ -251,6 +251,28 @@ def clone(self, for_test=False): self.program.clone(for_test), self.scope, copy.deepcopy(self.in_nodes), copy.deepcopy(self.out_nodes)) + def merge(self, graph): + for var in graph.program.list_vars(): + self.program.global_block()._clone_variable(var) + for op in graph.all_ops(): + inputs = {}
+ outputs = {} + attrs = {} + for input_name in op.input_names: + inputs[input_name] = [ + self.get_var(in_var_name) + for in_var_name in op.input(input_name) + ] + for output_name in op.output_names: + outputs[output_name] = [ + self.get_var(out_var_name) + for out_var_name in op.output(output_name) + ] + for attr_name in op.attr_names: + attrs[attr_name] = op.attr(attr_name) + self.program.global_block().append_op( + type=op.type, inputs=inputs, outputs=outputs, attrs=attrs) + def ops(self): return self.program.global_block().ops diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index 8d061f41f09a8..66281f7007a1a 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -1487,7 +1487,7 @@ def _clone_variable(self, var): shape=var.shape, dtype=var.dtype, type=var.type, - persistable=True, + persistable=var.persistable, is_data=var.is_data) else: ret_var = self.create_var( @@ -1496,7 +1496,8 @@ def _clone_variable(self, var): dtype=var.dtype, type=var.type, lod_level=var.lod_level, - persistable=True, + persistable=var.persistable, + stop_gradient=var.stop_gradient, is_data=var.is_data) return ret_var From 14c91292267bd9f1d741edde6bbe1b0b0dc5fd1c Mon Sep 17 00:00:00 2001 From: wanghaoshuang Date: Wed, 30 Jan 2019 05:24:34 +0000 Subject: [PATCH 2/2] refine code. --- .../distillation/distillation_strategy.py | 19 +++++----- .../contrib/slim/distillation/distiller.py | 35 ++++++++++++------- 2 files changed, 32 insertions(+), 22 deletions(-) diff --git a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py index a50c5edf073ac..3359a15f00361 100644 --- a/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py +++ b/python/paddle/fluid/contrib/slim/distillation/distillation_strategy.py @@ -28,13 +28,14 @@ def __init__(self, distiller=None, start_epoch=0, end_epoch=10): self.distiller = distiller self.train_graph_backup = None - def on_compression_begin(self, context): - self.train_graph_backup = context.train_graph + def on_epoch_begin(self, context): + if self.start_epoch == context.epoch_id: + self.train_graph_backup = context.train_graph + graph = self.distiller.distiller_graph( + context.eval_graph, context.teacher_graphs, context.optimizer, + context.place) + context.train_graph = graph - graph = self.distiller.distiller_graph(context.eval_graph, - context.teacher_graphs, - context.optimizer, context.place) - context.train_graph = graph - - def on_compression_end(self, context): - context.train_graph = self.train_graph_backup + def on_epoch_end(self, context): + if context.epoch_id == (self.end_epoch - 1): + context.train_graph = self.train_graph_backup diff --git a/python/paddle/fluid/contrib/slim/distillation/distiller.py b/python/paddle/fluid/contrib/slim/distillation/distiller.py index 1a418b7b611cd..e9a394de571d6 100644 --- a/python/paddle/fluid/contrib/slim/distillation/distiller.py +++ b/python/paddle/fluid/contrib/slim/distillation/distiller.py @@ -21,9 +21,9 @@ class FSPDistiller(object): - def __init__(self): - self.student_pairs = None - self.teacher_pairs = None + def __init__(self, student_pairs=None, teacher_pairs=None): + self.student_pairs = student_pairs + self.teacher_pairs = teacher_pairs def _feature_map_pairs(self, graph): pairs = [] @@ -49,27 +49,34 @@ def _feature_map_pairs(self, graph): def distiller_graph(self, student, teachers, optimizer, place): """ - Merge student and teacher into training graph. 
+ Generate distillation training graph. """ teacher = teachers[0] for var in teacher.program.list_vars(): var.stop_gradient = True + # step 1: merge student and teacher into one graph graph = student.clone() graph.merge(teacher) - s_pairs, s_sizes = self._feature_map_pairs(student) - t_pairs, t_sizes = self._feature_map_pairs(teacher) - distiller_pass = FSPDistillerPass(s_pairs, s_sizes, t_pairs, t_sizes, - optimizer, place) + if not self.student_pairs: + self.student_pairs, _ = self._feature_map_pairs(student) + if not self.teacher_pairs: + self.teacher_pairs, _ = self._feature_map_pairs(teacher) + # step 2: add FSP loss and backward ops + distiller_pass = FSPDistillerPass(self.student_pairs, + self.teacher_pairs, optimizer, place) dis_graph = distiller_pass.apply(graph) return dis_graph class FSPDistillerPass(object): + ''' + Convert a graph into an FSP distillation training graph + by adding FSP loss and backward operators. + ''' + - def __init__(self, s_pairs, s_sizes, t_pairs, t_sizes, optimizer, place): + def __init__(self, s_pairs, t_pairs, optimizer, place): self.s_pairs = s_pairs - self.s_sizes = s_sizes self.t_pairs = t_pairs - self.t_sizes = t_sizes self.optimizer = optimizer self.place = place @@ -81,10 +88,10 @@ def apply(self, graph): for s_pair, t_pair in zip(self.s_pairs, self.t_pairs): s_pair_start = ret_graph.get_var(s_pair[0]) s_pair_end = ret_graph.get_var(s_pair[1]) - s_fsp_matrix = self.fsp_matrix(s_pair_start, s_pair_end) + s_fsp_matrix = self._fsp_matrix(s_pair_start, s_pair_end) t_pair_start = ret_graph.get_var(t_pair[0]) t_pair_end = ret_graph.get_var(t_pair[1]) - t_fsp_matrix = self.fsp_matrix(t_pair_start, t_pair_end) + t_fsp_matrix = self._fsp_matrix(t_pair_start, t_pair_end) l2_loss = layers.mean( layers.square(s_fsp_matrix - t_fsp_matrix)) losses.append(l2_loss) @@ -92,8 +99,10 @@ self.optimizer.minimize(loss) exe = Executor(self.place) + # Init variables created when appending backward ops, such as the learning + # rate and accumulators in some optimizers. exe.run(startup_program, scope=ret_graph.scope) ret_graph.out_nodes['loss'] = loss.name return ret_graph - def fsp_matrix(self, fea_map_0, fea_map_1): + def _fsp_matrix(self, fea_map_0, fea_map_1):
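A note on how the FSP pairs are chosen: _feature_map_pairs walks the conv2d outputs of a graph in program order, groups consecutive outputs that share a spatial size, and keeps the first and last output of each group as one pair; a group with a single conv output contributes no pair. A self-contained sketch of that selection rule over (name, size) tuples (the names and the helper pick_pairs are invented for illustration, not part of the patch):

def pick_pairs(convs):
    # convs: [(var_name, spatial_size), ...] in topological order; this
    # mirrors FSPDistiller._feature_map_pairs without needing a graph.
    pairs, pair, pre_size = [], [None, None], None
    for name, size in convs:
        if size != pre_size:
            if pair[1] is not None:
                pairs.append(tuple(pair))
            pair = [name, None]
        else:
            pair[1] = name
        pre_size = size
    if pair[1] is not None:
        pairs.append(tuple(pair))
    return pairs

assert pick_pairs([('c0', (56, 56)), ('c1', (56, 56)), ('c2', (56, 56)),
                   ('c3', (28, 28)), ('c4', (28, 28)),
                   ('c5', (7, 7))]) == [('c0', 'c2'), ('c3', 'c4')]

FSPDistillerPass zips student and teacher pairs index by index, and zip silently truncates to the shorter list, so the two networks should yield matching pair counts; the demo's ResNet-50 student and ResNet-101 teacher do, because their conv outputs traverse the same sequence of spatial sizes and differ only in how many convs sit inside each group.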
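For reference, _fsp_matrix computes the Flow of Solution Procedure (FSP) matrix of Yim et al. (CVPR 2017): for two feature maps of shapes (N, C0, H, W) and (N, C1, H, W) at the same resolution, entry (i, j) is the spatial average of the product of channel i of the first map and channel j of the second, giving an (N, C0, C1) tensor. The transpose/reshape/expand sequence in the patch builds exactly that product. A minimal NumPy sketch of the same quantity (fsp_matrix_np is an illustrative name, not part of the patch):

import numpy as np

def fsp_matrix_np(fea_map_0, fea_map_1):
    # fea_map_0: (N, C0, H, W); fea_map_1: (N, C1, H, W); same H and W.
    n, c0, h, w = fea_map_0.shape
    c1 = fea_map_1.shape[1]
    a = fea_map_0.reshape(n, c0, h * w)
    b = fea_map_1.reshape(n, c1, h * w)
    # G[n, i, j] = (1 / (H * W)) * sum_p a[n, i, p] * b[n, j, p]
    return np.einsum('nip,njp->nij', a, b) / (h * w)

The loss assembled in FSPDistillerPass.apply is then, per (student, teacher) pair, the mean squared difference between the two FSP matrices, summed over all pairs, matching the layers.mean(layers.square(...)) and layers.sum(losses) calls above.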
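Finally, the epoch window in the demo config (start_epoch: 0, end_epoch: 10, compress_pass epoch: 10) drives FSPDistillationStrategy as follows; this is a schematic trace of the hooks, not program output:

# epoch 0, on_epoch_begin: back up context.train_graph, then replace it with
#     distiller.distiller_graph(context.eval_graph, context.teacher_graphs, ...)
# epochs 0-9: CompressPass trains the merged student+teacher graph,
#     minimizing the FSP loss exposed as out_nodes['loss']
# epoch 9, on_epoch_end (epoch_id == end_epoch - 1): restore the backed-up
#     train_graph, so any later epochs train the original objective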