diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index c917ca0ff4e08..9cf256fb6d533 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -227,8 +227,8 @@ function(cc_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     add_executable(${TARGET_NAME} ${cc_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
-    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main)
+    target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
     add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
 endfunction(cc_test)
@@ -288,8 +288,8 @@ function(nv_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
     cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS})
-    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
-    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main)
+    target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
+    add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags)
     add_test(${TARGET_NAME} ${TARGET_NAME})
   endif()
 endfunction(nv_test)
diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md
index b3330b0b59d65..e1d91c668e9c6 100644
--- a/doc/howto/optimization/cpu_profiling.md
+++ b/doc/howto/optimization/cpu_profiling.md
@@ -1,42 +1,52 @@
-此教程会介绍如何使用Python的cProfile包，与Python库yep，google perftools来运行性能分析(Profiling)与调优。
+This tutorial introduces techniques we used to profile and tune the
+CPU performance of PaddlePaddle.  We will use Python packages
+`cProfile` and `yep`, and Google `perftools`.
 
-运行性能分析可以让开发人员科学的，有条不紊的对程序进行性能优化。性能分析是性能调优的基础。因为在程序实际运行中，真正的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。
+Profiling is the process that reveals the performance bottlenecks,
+which could be very different from what's in the developers' mind.
+Performance tuning is to fix the bottlenecks. Performance optimization
+repeats the steps of profiling and tuning alternately.
 
-性能优化的步骤，通常是循环重复若干次『性能分析 --> 寻找瓶颈 ---> 调优瓶颈 --> 性能分析确认调优效果』。其中性能分析是性能调优的至关重要的量化指标。
+PaddlePaddle users program AI by calling the Python API, which calls
+into `libpaddle.so` written in C++.  In this tutorial, we focus on
+the profiling and tuning of
 
-Paddle提供了Python语言绑定。用户使用Python进行神经网络编程，训练，测试。Python解释器通过`pybind`和`swig`调用Paddle的动态链接库，进而调用Paddle C++部分的代码。所以Paddle的性能分析与调优分为两个部分:
+1. the Python code and
+1. the mixture of Python and C++ code.
 
-* Python代码的性能分析
-* Python与C++混合代码的性能分析
+## Profiling the Python Code
 
+### Generate the Performance Profiling File
 
-## Python代码的性能分析
-
-### 生成性能分析文件
-
-Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+We can use Python standard
+package, [`cProfile`](https://docs.python.org/2/library/profile.html),
+to generate a Python profiling file.  For example:
 
 ```bash
 python -m cProfile -o profile.out main.py
 ```
 
-其中`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印一些统计信息到`stdout`。这不方便我们进行后期处理(进行`sort`, `split`, `cut`等等)。
-
-### 查看性能分析文件
+where `main.py` is the program we are going to profile and `-o` specifies
+the output file.  Without `-o`, `cProfile` would print its report to
+standard output.
 
-当main.py运行完毕后，性能分析结果文件`profile.out`就生成出来了。我们可以使用[cprofilev](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来。
+### Look into the Profiling File
 
-使用`pip install cprofilev`安装`cprofilev`工具。安装完成后，使用如下命令开启HTTP服务
+`cProfile` generates `profile.out` after `main.py` completes. We can
+use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into
+the details:
 
 ```bash
 cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
 ```
 
-其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+where `-a` specifies the HTTP IP, `-p` specifies the port, `-f`
+specifies the profiling file, and `main.py` is the source file.
 
-访问对应网址，即可显示性能分析的结果。性能分析结果格式如下:
+Open a Web browser and point it to the local IP and the specified
+port; we will see output like the following:
 
-```text
+```
    ncalls  tottime  percall  cumtime  percall filename:lineno(function)
         1    0.284    0.284   29.514   29.514 main.py:1(<module>)
      4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
@@ -44,23 +54,23 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
         1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
 ```
 
-每一列的含义是:
+where each line corresponds to a Python function, and the meaning of
+each column is as follows:
 
-| 列名 | 含义 |
+| column | meaning |
 | --- | --- |
-| ncalls | 函数的调用次数 |
-| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
-| percall | tottime的每次调用平均时间 |
-| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
-| percall | cumtime的每次调用平均时间 |
-| filename:lineno(function) | 文件名, 行号，函数名 |
+| ncalls | the number of calls into a function |
+| tottime | the total execution time of the function, not including the
+ execution time of other functions called by the function |
+| percall | tottime divided by ncalls |
+| cumtime | the total execution time of the function, including the execution time of other functions being called |
+| percall | cumtime divided by ncalls |
+| filename:lineno(function) | where the function is defined |
 
+### Identify Performance Bottlenecks
 
-### 寻找性能瓶颈
-
-通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
-
-将性能分析结果按照tottime排序，效果如下:
+Usually, `tottime` and the related `percall` time are what we want to
+focus on. We can sort the above profiling file by `tottime`:
 
 ```text
      4696   12.040    0.003   12.040    0.003 {built-in method run}
@@ -68,12 +78,15 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
    107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
      4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
         1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
-
 ```
 
-可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+We can see that the most time-consuming function is the `built-in
+method run`, which is a C++ function in `libpaddle.so`.  We will
+explain how to profile C++ code in the next section.  For now,
+let's look into the fourth function `sync_with_cpp`, which is a
+Python function.  We can click it to understand more about it:
 
-```text
+```
 Called By:
 
    Ordered by: internal time
@@ -92,72 +105,93 @@ Called:
    List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
 ```
 
-通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+The lists of the callers of `sync_with_cpp` might help us understand
+how to improve the function definition.
 
+## Profiling Python and C++ Code
 
+### Generate the Profiling File
 
-## Python与C++混合代码的性能分析
+To profile a mixture of Python and C++ code, we can use a Python
+package, `yep`, that can work with Google's `perftools`, which is a
+commonly-used profiler for C/C++ code.
 
-### 生成性能分析文件
-
-C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
-
-使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+In Ubuntu systems, we can install `yep` and `perftools` by running the
+following commands:
 
 ```bash
+apt update
 apt install libgoogle-perftools-dev
 pip install yep
 ```
 
-安装完毕后，我们可以通过
+Then we can run the following command
 
 ```bash
 python -m yep -v main.py
 ```
 
-生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+to generate the profiling file.  The default filename is
+`main.py.prof`.
+
+Please be aware of the `-v` command line option, which prints the
+analysis results after generating the profiling file.  By taking a
+glance at the printed result, we'd know whether we stripped debug
+information from `libpaddle.so` at build time.  The following hints
+help make sure that the analysis results are readable:
 
-命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+1. Use GCC command line option `-g` when building `libpaddle.so` so as to
+   include the debug information.  The standard building system of
+   PaddlePaddle is CMake, so you might want to set
+   `CMAKE_BUILD_TYPE=RelWithDebInfo`.
 
-1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
-2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
-3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+1. Use GCC command line option `-O2` or `-O3` to generate optimized
+   binary code. It doesn't make sense to profile `libpaddle.so`
+   without optimization, because it would run slowly anyway.
 
-### 查看性能分析文件
+1. Profile the single-threaded binary file before the
+   multi-threaded version, because the latter often generates tangled
+   profiling analysis results.  You might want to set environment
+   variable `OMP_NUM_THREADS=1` to prevent OpenMP from automatically
+   starting multiple threads.
 
-在运行完性能分析后，会生成性能分析结果文件。我们可以使用[pprof](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+### Look into the Profiling File
 
-安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+The tool we use to look into the profiling file generated by
+`perftools` is [`pprof`](https://github.com/google/pprof), which
+provides a Web-based GUI like `cprofilev`.
+
+We can rely on the standard Go toolchain to retrieve the source code
+of `pprof` and build it:
 
 ```bash
 go get github.com/google/pprof
 ```
 
-进而我们可以使用如下命令开启一个HTTP服务:
+Then we can use it to examine `main.py.prof` generated in the previous
+section:
 
 ```bash
 pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
 ```
 
-这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
-
-访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+where `-http` specifies the IP and port of the HTTP service.
+Directing our Web browser to the service, we would see something like
+the following:
 
 ![result](./pprof_1.png)
 
+### Identifying the Performance Bottlenecks
 
-### 寻找性能瓶颈
-
-与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
-
-例如下图中，
+Similar to how we work with `cprofilev`, we'd focus on `tottime` and
+`cumtime`.
 
 ![kernel_perf](./pprof_2.png)
 
-在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
-
-在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
-
-## 总结
+We can see that the execution time of multiplication and the computing
+of the gradient of multiplication take 2% to 4% of the total running
+time, and `MomentumOp` takes about 17%. Obviously, we'd want to
+optimize `MomentumOp`.
 
-至此，两种性能分析的方式都介绍完毕了。希望通过这两种性能分析的方式，Paddle的开发人员和使用人员可以有次序的，科学的发现和解决性能问题。
+`pprof` would mark performance critical parts of the program in
+red. It's a good idea to follow the hint.
diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md
new file mode 100644
index 0000000000000..14eba0e2f34b1
--- /dev/null
+++ b/doc/howto/optimization/cpu_profiling_cn.md
@@ -0,0 +1,155 @@
+此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优（performance tuning）。
+
+Profiling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。
+
+PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分:
+
+* Python 代码的性能分析
+* Python 与 C++ 混合代码的性能分析
+
+
+## Python代码的性能分析
+
+### 生成性能分析文件
+
+Python标准库中提供了性能分析的工具包，[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下:
+
+```bash
+python -m cProfile -o profile.out main.py
+```
+
+其中 `main.py` 是我们要分析的程序，`-o`标识了一个输出的文件名，用来存储本次性能分析的结果。如果不指定这个文件，`cProfile`会打印到标准输出。
+
+### 查看性能分析文件
+
+`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务，将性能分析结果以网页的形式展示出来：
+
+```bash
+cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py
+```
+
+其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。
+
+用Web浏览器访问对应网址，即可显示性能分析的结果：
+
+```
+   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
+        1    0.284    0.284   29.514   29.514 main.py:1(<module>)
+     4696    0.128    0.000   15.748    0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run)
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+        1    0.144    0.144    6.534    6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14(<module>)
+```
+
+每一列的含义是:
+
+| 列名 | 含义 |
+| --- | --- |
+| ncalls | 函数的调用次数 |
+| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 |
+| percall | tottime的每次调用平均时间 |
+| cumtime | 函数总时间。包含这个函数调用其他函数的时间 |
+| percall | cumtime的每次调用平均时间 |
+| filename:lineno(function) | 文件名, 行号，函数名 |
+
+
+### 寻找性能瓶颈
+
+通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。
+
+将性能分析结果按照tottime排序，效果如下:
+
+```text
+     4696   12.040    0.003   12.040    0.003 {built-in method run}
+   300005    0.874    0.000    1.681    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader)
+   107991    0.676    0.000    1.519    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__)
+     4697    0.626    0.000    2.291    0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)
+        1    0.618    0.618    0.618    0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1(<module>)
+```
+
+可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长，每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息，了解其调用关系。
+
+```text
+Called By:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+
+Function                                                                                                 was called by...
+                                                                                                             ncalls  tottime  cumtime
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp)  <-    4697    0.626    2.291  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)
+/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp)  <-    4696    0.019    2.316  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone)
+                                                                                                                  1    0.000    0.001  /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward)
+
+
+Called:
+
+   Ordered by: internal time
+   List reduced from 4497 to 2 due to restriction <'sync_with_cpp'>
+```
+
+通常观察热点函数间的调用关系，和对应行的代码，就可以了解到问题代码在哪里。当我们做出性能修正后，再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。
+
+
+
+## Python与C++混合代码的性能分析
+
+### 生成性能分析文件
+
+C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析
+
+使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为
+
+```bash
+apt update
+apt install libgoogle-perftools-dev
+pip install yep
+```
+
+安装完毕后，我们可以通过
+
+```bash
+python -m yep -v main.py
+```
+
+生成性能分析文件。生成的性能分析文件为`main.py.prof`。
+
+命令行中的`-v`指定在生成性能分析文件之后，在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同，编译时可能会去掉调试信息，运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果，可以采取下面几点措施:
+
+1. 编译时指定`-g`生成调试信息。使用cmake的话，可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。
+2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。
+3. 运行性能分析的时候，先从单线程开始，再开启多线程，进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。
+
+### 查看性能分析文件
+
+在运行完性能分析后，会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意，这里使用了用`Go`语言重构后的`pprof`，因为这个工具具有web服务界面，且展示效果更好。
+
+安装`pprof`的命令和一般的`Go`程序是一样的，其命令如下:
+
+```bash
+go get github.com/google/pprof
+```
+
+进而我们可以使用如下命令开启一个HTTP服务:
+
+```bash
+pprof -http=0.0.0.0:3213 `which python`  ./main.py.prof
+```
+
+这行命令中，`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径，进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。
+
+访问对应的网址，我们可以查看性能分析的结果。结果如下图所示:
+
+![result](./pprof_1.png)
+
+
+### 寻找性能瓶颈
+
+与寻找Python代码的性能瓶颈类似，寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。
+
+例如下图中，
+
+![kernel_perf](./pprof_2.png)
+
+在一次训练中，乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然，`MomentumOp`的性能有问题。
+
+在`pprof`中，对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题，再检查其他部分的性能问题，可以更有次序的完成性能的优化。
diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc
index 48cd131550dea..02a825324328f 100644
--- a/paddle/framework/op_desc.cc
+++ b/paddle/framework/op_desc.cc
@@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext {
     PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR,
                       "The %d-th output of Output(%s) must be LoDTensor.", j,
                       out);
-    in_var->SetLoDLevel(out_var->GetLodLevel());
+    out_var->SetLoDLevel(in_var->GetLodLevel());
   }
   bool IsRuntime() const override;
 
diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc
index 5eb1c44eb6fc4..95cfe2525e3e7 100644
--- a/paddle/memory/memory.cc
+++ b/paddle/memory/memory.cc
@@ -81,18 +81,33 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) {
 }
 
 template <>
-void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
-  return GetGPUBuddyAllocator(place.device)->Alloc(size);
+size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
+  return GetGPUBuddyAllocator(place.device)->Used();
 }
 
 template <>
-void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
-  GetGPUBuddyAllocator(place.device)->Free(p);
+void* Alloc<platform::GPUPlace>(platform::GPUPlace place, size_t size) {
+  auto* buddy_allocator = GetGPUBuddyAllocator(place.device);
+  auto* ptr = buddy_allocator->Alloc(size);
+  if (ptr == nullptr) {  // allocation failed: log diagnostics, return nullptr
+    int cur_dev = platform::GetCurrentDeviceId();  // saved to restore below
+    platform::SetDeviceId(place.device);  // query the device we allocated on
+    size_t avail, total;
+    platform::GpuMemoryUsage(avail, total);
+    LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU "
+                 << place.device << ", available " << avail << " bytes";
+    LOG(WARNING) << "total " << total;
+    LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize();
+    LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize();
+    LOG(WARNING) << "GPU memory used: " << Used<platform::GPUPlace>(place);
+    platform::SetDeviceId(cur_dev);  // switch back to the caller's device
+  }
+  return ptr;
+}
 
 template <>
-size_t Used<platform::GPUPlace>(platform::GPUPlace place) {
-  return GetGPUBuddyAllocator(place.device)->Used();
+void Free<platform::GPUPlace>(platform::GPUPlace place, void* p) {
+  GetGPUBuddyAllocator(place.device)->Free(p);
 }
 
 #endif
diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/optimizer/parameter_optimizer_test.cc
index f29e531712064..83757a3917844 100644
--- a/paddle/optimizer/parameter_optimizer_test.cc
+++ b/paddle/optimizer/parameter_optimizer_test.cc
@@ -127,8 +127,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); }
 TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); }
 
 TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/optimizer/serialization_test.cc b/paddle/optimizer/serialization_test.cc
index 4c416f55ee0bd..940e941e9042d 100644
--- a/paddle/optimizer/serialization_test.cc
+++ b/paddle/optimizer/serialization_test.cc
@@ -46,8 +46,3 @@ TEST(TensorToProto, Case2) {
     EXPECT_EQ(t1[i], t[i]);
   }
 }
-
-int main(int argc, char** argv) {
-  testing::InitGoogleTest(&argc, argv);
-  return RUN_ALL_TESTS();
-}
diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt
index 4245df5ab72bf..2275c950ba13d 100644
--- a/paddle/testing/CMakeLists.txt
+++ b/paddle/testing/CMakeLists.txt
@@ -5,4 +5,6 @@ if(WITH_TESTING)
   add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies})
   add_library(paddle_test_util STATIC TestUtil.cpp)
   add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies})
+  add_library(paddle_gtest_main STATIC paddle_gtest_main.cc)
+  add_dependencies(paddle_gtest_main paddle_memory gtest gflags)
 endif()
diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc
new file mode 100644
index 0000000000000..a491322b7e533
--- /dev/null
+++ b/paddle/testing/paddle_gtest_main.cc
@@ -0,0 +1,39 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <cstring>
+#include "gflags/gflags.h"
+#include "gtest/gtest.h"
+#include "paddle/memory/memory.h"
+
+int main(int argc, char** argv) {
+  // gflags gets a private argv: program name plus one --tryfromenv flag.
+#ifdef PADDLE_WITH_CUDA
+  static char env_flags[] =
+      "--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory";
+#else
+  static char env_flags[] = "--tryfromenv=use_pinned_memory";
+#endif
+  std::vector<char*> new_argv = {argv[0], env_flags};
+  int new_argc = static_cast<int>(new_argv.size());
+  char** new_argv_address = new_argv.data();
+  google::ParseCommandLineFlags(&new_argc, &new_argv_address, false);
+  testing::InitGoogleTest(&argc, argv);
+  // NOTE(review): results unused; presumably warms up the allocators.
+  paddle::memory::Used(paddle::platform::CPUPlace());
+#ifdef PADDLE_WITH_CUDA
+  paddle::memory::Used(paddle::platform::GPUPlace(0));
+#endif
+  return RUN_ALL_TESTS();
+}
diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py
index 1c42e4d44f504..49c6d8983457f 100644
--- a/python/paddle/v2/fluid/framework.py
+++ b/python/paddle/v2/fluid/framework.py
@@ -3,10 +3,12 @@
 import numpy as np
 from . import core
 import proto.framework_pb2 as framework_pb2
+import contextlib
 
 __all__ = [
     'Block', 'Variable', 'Program', 'Operator', 'default_startup_program',
-    'default_main_program'
+    'default_main_program', 'program_guard', 'switch_startup_program',
+    'switch_main_program'
 ]
 
 
@@ -659,8 +661,83 @@ def __init__(self, block, shape, dtype, **kwargs):
 
 
 def default_startup_program():
+    """
+    Get default startup program. In startup program, Paddle will initialize
+    parameters, initialize nccl handle, etc.
+    
+    Returns:
+        Program: startup program
+    """
     return _startup_program_
 
 
 def default_main_program():
+    """
+    Get default main program. The main program is used for training or testing.
+    
+    Returns:
+        Program: main program
+    """
     return _main_program_
+
+
+def switch_main_program(program):
+    """
+    Install `program` as the default main program.
+
+    Args:
+        program(Program): The new main program
+
+    Returns:
+        Program: The previous main program
+    """
+    global _main_program_
+    # Swap in one step: the right-hand side is evaluated before assignment.
+    previous, _main_program_ = _main_program_, program
+    return previous
+
+
+def switch_startup_program(program):
+    """
+    Install `program` as the default startup program.
+
+    Args:
+        program(Program): The new startup program
+
+    Returns:
+        Program: The previous startup program
+    """
+    global _startup_program_
+    previous, _startup_program_ = _startup_program_, program
+    return previous
+
+
+@contextlib.contextmanager
+def program_guard(main_program, startup_program=None):
+    """
+    Switch default programs for a `with` block; restore them on exit.
+
+    Examples:
+        >>> with program_guard(Program()):
+        >>>   data = fluid.layers.data(...)
+        >>>   hidden = fluid.layers.fc(...)
+
+    Args:
+        main_program(Program): New main program inside `with` statement
+        startup_program(Program): New startup program inside `with` statement.
+            None means do not change startup program.
+    """
+    # Validate both arguments before switching so a TypeError changes nothing.
+    if not isinstance(main_program, Program):
+        raise TypeError("main_program should be Program")
+    if not (startup_program is None or isinstance(startup_program, Program)):
+        raise TypeError("startup_program should be Program")
+    main_program = switch_main_program(main_program)
+    if startup_program is not None:
+        startup_program = switch_startup_program(startup_program)
+    try:
+        yield
+    finally:
+        switch_main_program(main_program)
+        if startup_program is not None:
+            switch_startup_program(startup_program)
diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py
index b6906be60b8ff..33b0e54f42afc 100644
--- a/python/paddle/v2/fluid/tests/test_layers.py
+++ b/python/paddle/v2/fluid/tests/test_layers.py
@@ -1,192 +1,141 @@
+from __future__ import print_function
 import unittest
 
 import paddle.v2.fluid.layers as layers
 import paddle.v2.fluid.nets as nets
-from paddle.v2.fluid.framework import Program
+from paddle.v2.fluid.framework import Program, program_guard
 
 
 class TestBook(unittest.TestCase):
+    """
+    Smoke tests for fluid layers.
+
+    Every case builds a small network inside `program_guard`, so layers are
+    created without an explicit `main_program` argument, and then prints the
+    resulting program.
+    """
+
     def test_fit_a_line(self):
+        """One-unit fc regression with square-error cost and backward ops."""
         program = Program()
-        x = layers.data(
-            name='x', shape=[13], dtype='float32', main_program=program)
-        y_predict = layers.fc(input=x, size=1, act=None, main_program=program)
+        with program_guard(program, startup_program=Program()):
+            x = layers.data(name='x', shape=[13], dtype='float32')
+            y_predict = layers.fc(input=x, size=1, act=None)
+            y = layers.data(name='y', shape=[1], dtype='float32')
+            cost = layers.square_error_cost(input=y_predict, label=y)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+            program.append_backward(avg_cost)
 
-        y = layers.data(
-            name='y', shape=[1], dtype='float32', main_program=program)
-        cost = layers.square_error_cost(
-            input=y_predict, label=y, main_program=program)
-
-        avg_cost = layers.mean(x=cost, main_program=program)
-        self.assertIsNotNone(avg_cost)
-        program.append_backward(avg_cost)
-
-        print str(program)
+        print(str(program))
 
     def test_recognize_digits_mlp(self):
+        """Three fc layers (128, 64, 10) with a cross-entropy cost."""
         program = Program()
-
-        # Change g_program, so the rest layers use `g_program`
-        images = layers.data(
-            name='pixel', shape=[784], dtype='float32', main_program=program)
-        label = layers.data(
-            name='label', shape=[1], dtype='int32', main_program=program)
-        hidden1 = layers.fc(input=images,
-                            size=128,
-                            act='relu',
-                            main_program=program)
-        hidden2 = layers.fc(input=hidden1,
-                            size=64,
-                            act='relu',
-                            main_program=program)
-        predict = layers.fc(input=hidden2,
-                            size=10,
-                            act='softmax',
-                            main_program=program)
-        cost = layers.cross_entropy(
-            input=predict, label=label, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
-        self.assertIsNotNone(avg_cost)
-
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            hidden1 = layers.fc(input=images, size=128, act='relu')
+            hidden2 = layers.fc(input=hidden1, size=64, act='relu')
+            predict = layers.fc(input=hidden2, size=10, act='softmax')
+            cost = layers.cross_entropy(input=predict, label=label)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+
+        print(str(program))
 
     def test_simple_conv2d(self):
+        """A single conv2d layer over a 3x48x48 input."""
         program = Program()
-        images = layers.data(
-            name='pixel',
-            shape=[3, 48, 48],
-            dtype='int32',
-            main_program=program)
-        layers.conv2d(
-            input=images,
-            num_filters=3,
-            filter_size=[4, 4],
-            main_program=program)
-
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32')
+            layers.conv2d(input=images, num_filters=3, filter_size=[4, 4])
+
+        print(str(program))
 
     def test_conv2d_transpose(self):
+        """A single conv2d_transpose layer over a 3x2x2 input."""
         program = Program()
-        kwargs = {'main_program': program}
-        img = layers.data(
-            name='pixel', shape=[3, 2, 2], dtype='float32', **kwargs)
-        layers.conv2d_transpose(
-            input=img, num_filters=10, output_size=28, **kwargs)
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32')
+            layers.conv2d_transpose(input=img, num_filters=10, output_size=28)
+        print(str(program))
 
     def test_recognize_digits_conv(self):
+        """Two simple_img_conv_pool stages, a softmax fc, and backward ops."""
         program = Program()
-
-        images = layers.data(
-            name='pixel',
-            shape=[1, 28, 28],
-            dtype='float32',
-            main_program=program)
-        label = layers.data(
-            name='label', shape=[1], dtype='int32', main_program=program)
-        conv_pool_1 = nets.simple_img_conv_pool(
-            input=images,
-            filter_size=5,
-            num_filters=2,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            main_program=program)
-        conv_pool_2 = nets.simple_img_conv_pool(
-            input=conv_pool_1,
-            filter_size=5,
-            num_filters=4,
-            pool_size=2,
-            pool_stride=2,
-            act="relu",
-            main_program=program)
-
-        predict = layers.fc(input=conv_pool_2,
-                            size=10,
-                            act="softmax",
-                            main_program=program)
-        cost = layers.cross_entropy(
-            input=predict, label=label, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
-
-        program.append_backward(avg_cost)
-
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(
+                name='pixel', shape=[1, 28, 28], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            conv_pool_1 = nets.simple_img_conv_pool(
+                input=images,
+                filter_size=5,
+                num_filters=2,
+                pool_size=2,
+                pool_stride=2,
+                act="relu")
+            conv_pool_2 = nets.simple_img_conv_pool(
+                input=conv_pool_1,
+                filter_size=5,
+                num_filters=4,
+                pool_size=2,
+                pool_stride=2,
+                act="relu")
+
+            predict = layers.fc(input=conv_pool_2, size=10, act="softmax")
+            cost = layers.cross_entropy(input=predict, label=label)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+
+            program.append_backward(avg_cost)
+
+        print(str(program))
 
     def test_word_embedding(self):
+        """Four word embeddings concatenated and fed through two fc layers."""
         program = Program()
-        dict_size = 10000
-        embed_size = 32
-        first_word = layers.data(
-            name='firstw', shape=[1], dtype='int64', main_program=program)
-        second_word = layers.data(
-            name='secondw', shape=[1], dtype='int64', main_program=program)
-        third_word = layers.data(
-            name='thirdw', shape=[1], dtype='int64', main_program=program)
-        forth_word = layers.data(
-            name='forthw', shape=[1], dtype='int64', main_program=program)
-        next_word = layers.data(
-            name='nextw', shape=[1], dtype='int64', main_program=program)
-
-        embed_first = layers.embedding(
-            input=first_word,
-            size=[dict_size, embed_size],
-            dtype='float32',
-            param_attr='shared_w',
-            main_program=program)
-        embed_second = layers.embedding(
-            input=second_word,
-            size=[dict_size, embed_size],
-            dtype='float32',
-            param_attr='shared_w',
-            main_program=program)
-
-        embed_third = layers.embedding(
-            input=third_word,
-            size=[dict_size, embed_size],
-            dtype='float32',
-            param_attr='shared_w',
-            main_program=program)
-        embed_forth = layers.embedding(
-            input=forth_word,
-            size=[dict_size, embed_size],
-            dtype='float32',
-            param_attr='shared_w',
-            main_program=program)
-
-        concat_embed = layers.concat(
-            input=[embed_first, embed_second, embed_third, embed_forth],
-            axis=1,
-            main_program=program)
-
-        hidden1 = layers.fc(input=concat_embed,
-                            size=256,
-                            act='sigmoid',
-                            main_program=program)
-        predict_word = layers.fc(input=hidden1,
-                                 size=dict_size,
-                                 act='softmax',
-                                 main_program=program)
-        cost = layers.cross_entropy(
-            input=predict_word, label=next_word, main_program=program)
-        avg_cost = layers.mean(x=cost, main_program=program)
-        self.assertIsNotNone(avg_cost)
-
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            dict_size = 10000
+            embed_size = 32
+            first_word = layers.data(name='firstw', shape=[1], dtype='int64')
+            second_word = layers.data(name='secondw', shape=[1], dtype='int64')
+            third_word = layers.data(name='thirdw', shape=[1], dtype='int64')
+            forth_word = layers.data(name='forthw', shape=[1], dtype='int64')
+            next_word = layers.data(name='nextw', shape=[1], dtype='int64')
+
+            # Each context word is embedded with the same param_attr.
+            context_words = [first_word, second_word, third_word, forth_word]
+            embeds = [
+                layers.embedding(
+                    input=word,
+                    size=[dict_size, embed_size],
+                    dtype='float32',
+                    param_attr='shared_w') for word in context_words
+            ]
+            concat_embed = layers.concat(input=embeds, axis=1)
+
+            hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid')
+            predict_word = layers.fc(input=hidden1,
+                                     size=dict_size,
+                                     act='softmax')
+            cost = layers.cross_entropy(input=predict_word, label=next_word)
+            avg_cost = layers.mean(x=cost)
+            self.assertIsNotNone(avg_cost)
+
+        print(str(program))
 
     def test_linear_chain_crf(self):
+        """An fc layer feeding linear_chain_crf."""
         program = Program()
-
-        # Change g_program, so the rest layers use `g_program`
-        images = layers.data(
-            name='pixel', shape=[784], dtype='float32', main_program=program)
-        label = layers.data(
-            name='label', shape=[1], dtype='int32', main_program=program)
-        hidden = layers.fc(input=images, size=128, main_program=program)
-        crf = layers.linear_chain_crf(
-            input=hidden, label=label, main_program=program)
-
-        print str(program)
+        with program_guard(program, startup_program=Program()):
+            images = layers.data(name='pixel', shape=[784], dtype='float32')
+            label = layers.data(name='label', shape=[1], dtype='int32')
+            hidden = layers.fc(input=images, size=128)
+            crf = layers.linear_chain_crf(input=hidden, label=label)
+            self.assertIsNotNone(crf)
+
+        print(str(program))
 
 
 if __name__ == '__main__':