diff --git a/cmake/generic.cmake b/cmake/generic.cmake index c917ca0ff4e08..9cf256fb6d533 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -227,8 +227,8 @@ function(cc_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${cc_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) endif() endfunction(cc_test) @@ -288,8 +288,8 @@ function(nv_test TARGET_NAME) set(multiValueArgs SRCS DEPS) cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) cuda_add_executable(${TARGET_NAME} ${nv_test_SRCS}) - target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) - add_dependencies(${TARGET_NAME} ${nv_test_DEPS} gtest gtest_main) + target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) + add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main paddle_memory gtest gflags) add_test(${TARGET_NAME} ${TARGET_NAME}) endif() endfunction(nv_test) diff --git a/doc/howto/optimization/cpu_profiling.md b/doc/howto/optimization/cpu_profiling.md index b3330b0b59d65..e1d91c668e9c6 100644 --- a/doc/howto/optimization/cpu_profiling.md +++ b/doc/howto/optimization/cpu_profiling.md @@ -1,42 +1,52 @@ -此教程会介绍如何使用Python的cProfile包,与Python库yep,google perftools来运行性能分析(Profiling)与调优。 +This tutorial introduces techniques we used to profile and tune the +CPU performance of PaddlePaddle. We will use Python packages +`cProfile` and `yep`, and Google `perftools`. -运行性能分析可以让开发人员科学的,有条不紊的对程序进行性能优化。性能分析是性能调优的基础。因为在程序实际运行中,真正的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。 +Profiling is the process that reveals the performance bottlenecks, +which could be very different from what's in the developers' mind. +Performance tuning is to fix the bottlenecks. Performance optimization +repeats the steps of profiling and tuning alternatively. -性能优化的步骤,通常是循环重复若干次『性能分析 --> 寻找瓶颈 ---> 调优瓶颈 --> 性能分析确认调优效果』。其中性能分析是性能调优的至关重要的量化指标。 +PaddlePaddle users program AI by calling the Python API, which calls +into `libpaddle.so.` written in C++. In this tutorial, we focus on +the profiling and tuning of -Paddle提供了Python语言绑定。用户使用Python进行神经网络编程,训练,测试。Python解释器通过`pybind`和`swig`调用Paddle的动态链接库,进而调用Paddle C++部分的代码。所以Paddle的性能分析与调优分为两个部分: +1. the Python code and +1. the mixture of Python and C++ code. -* Python代码的性能分析 -* Python与C++混合代码的性能分析 +## Profiling the Python Code +### Generate the Performance Profiling File -## Python代码的性能分析 - -### 生成性能分析文件 - -Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: +We can use Python standard +package, [`cProfile`](https://docs.python.org/2/library/profile.html), +to generate Python profiling file. For example: ```bash python -m cProfile -o profile.out main.py ``` -其中`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印一些统计信息到`stdout`。这不方便我们进行后期处理(进行`sort`, `split`, `cut`等等)。 - -### 查看性能分析文件 +where `main.py` is the program we are going to profile, `-o` specifies +the output file. Without `-o`, `cProfile` would outputs to standard +output. -当main.py运行完毕后,性能分析结果文件`profile.out`就生成出来了。我们可以使用[cprofilev](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来。 +### Look into the Profiling File -使用`pip install cprofilev`安装`cprofilev`工具。安装完成后,使用如下命令开启HTTP服务 +`cProfile` generates `profile.out` after `main.py` completes. We can +use [`cprofilev`](https://github.com/ymichael/cprofilev) to look into +the details: ```bash cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py ``` -其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 +where `-a` specifies the HTTP IP, `-p` specifies the port, `-f` +specifies the profiling file, and `main.py` is the source file. -访问对应网址,即可显示性能分析的结果。性能分析结果格式如下: +Open the Web browser and points to the local IP and the specifies +port, we will see the output like the following: -```text +``` ncalls tottime percall cumtime percall filename:lineno(function) 1 0.284 0.284 29.514 29.514 main.py:1() 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) @@ -44,23 +54,23 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() ``` -每一列的含义是: +where each line corresponds to Python function, and the meaning of +each column is as follows: -| 列名 | 含义 | +| column | meaning | | --- | --- | -| ncalls | 函数的调用次数 | -| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | -| percall | tottime的每次调用平均时间 | -| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | -| percall | cumtime的每次调用平均时间 | -| filename:lineno(function) | 文件名, 行号,函数名 | +| ncalls | the number of calls into a function | +| tottime | the total execution time of the function, not including the + execution time of other functions called by the function | +| percall | tottime divided by ncalls | +| cumtime | the total execution time of the function, including the execution time of other functions being called | +| percall | cumtime divided by ncalls | +| filename:lineno(function) | where the function is defined | +### Identify Performance Bottlenecks -### 寻找性能瓶颈 - -通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 - -将性能分析结果按照tottime排序,效果如下: +Usually, `tottime` and the related `percall` time is what we want to +focus on. We can sort above profiling file by tottime: ```text 4696 12.040 0.003 12.040 0.003 {built-in method run} @@ -68,12 +78,15 @@ cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() - ``` -可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 +We can see that the most time-consuming function is the `built-in +method run`, which is a C++ function in `libpaddle.so`. We will +explain how to profile C++ code in the next section. At the right +moment, let's look into the third function `sync_with_cpp`, which is a +Python function. We can click it to understand more about it: -```text +``` Called By: Ordered by: internal time @@ -92,72 +105,93 @@ Called: List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> ``` -通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 +The lists of the callers of `sync_with_cpp` might help us understand +how to improve the function definition. +## Profiling Python and C++ Code +### Generate the Profiling File -## Python与C++混合代码的性能分析 +To profile a mixture of Python and C++ code, we can use a Python +package, `yep`, that can work with Google's `perftools`, which is a +commonly-used profiler for C/C++ code. -### 生成性能分析文件 - -C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析 - -使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为 +In Ubuntu systems, we can install `yep` and `perftools` by running the +following commands: ```bash +apt update apt install libgoogle-perftools-dev pip install yep ``` -安装完毕后,我们可以通过 +Then we can run the following command ```bash python -m yep -v main.py ``` -生成性能分析文件。生成的性能分析文件为`main.py.prof`。 +to generate the profiling file. The default filename is +`main.py.prof`. + +Please be aware of the `-v` command line option, which prints the +analysis results after generating the profiling file. By taking a +glance at the print result, we'd know that if we stripped debug +information from `libpaddle.so` at build time. The following hints +help make sure that the analysis results are readable: -命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: +1. Use GCC command line option `-g` when building `libpaddle.so` so to + include the debug information. The standard building system of + PaddlePaddle is CMake, so you might want to set + `CMAKE_BUILD_TYPE=RelWithDebInfo`. -1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 -2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 -3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 +1. Use GCC command line option `-O2` or `-O3` to generate optimized + binary code. It doesn't make sense to profile `libpaddle.so` + without optimization, because it would anyway run slowly. -### 查看性能分析文件 +1. Profiling the single-threaded binary file before the + multi-threading version, because the latter often generates tangled + profiling analysis result. You might want to set environment + variable `OMP_NUM_THREADS=1` to prevents OpenMP from automatically + starting multiple threads. -在运行完性能分析后,会生成性能分析结果文件。我们可以使用[pprof](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 +### Look into the Profiling File -安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: +The tool we used to look into the profiling file generated by +`perftools` is [`pprof`](https://github.com/google/pprof), which +provides a Web-based GUI like `cprofilev`. + +We can rely on the standard Go toolchain to retrieve the source code +of `pprof` and build it: ```bash go get github.com/google/pprof ``` -进而我们可以使用如下命令开启一个HTTP服务: +Then we can use it to profile `main.py.prof` generated in the previous +section: ```bash pprof -http=0.0.0.0:3213 `which python` ./main.py.prof ``` -这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 - -访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: +Where `-http` specifies the IP and port of the HTTP service. +Directing our Web browser to the service, we would see something like +the following: ![result](./pprof_1.png) +### Identifying the Performance Bottlenecks -### 寻找性能瓶颈 - -与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 - -例如下图中, +Similar to how we work with `cprofilev`, we'd focus on `tottime` and +`cumtime`. ![kernel_perf](./pprof_2.png) -在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 - -在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 - -## 总结 +We can see that the execution time of multiplication and the computing +of the gradient of multiplication takes 2% to 4% of the total running +time, and `MomentumOp` takes about 17%. Obviously, we'd want to +optimize `MomentumOp`. -至此,两种性能分析的方式都介绍完毕了。希望通过这两种性能分析的方式,Paddle的开发人员和使用人员可以有次序的,科学的发现和解决性能问题。 +`pprof` would mark performance critical parts of the program in +red. It's a good idea to follow the hint. diff --git a/doc/howto/optimization/cpu_profiling_cn.md b/doc/howto/optimization/cpu_profiling_cn.md new file mode 100644 index 0000000000000..14eba0e2f34b1 --- /dev/null +++ b/doc/howto/optimization/cpu_profiling_cn.md @@ -0,0 +1,155 @@ +此教程会介绍如何使用Python的cProfile包、Python库yep、Google perftools来进行性能分析 (profiling) 与调优(performance tuning)。 + +Profling 指发现性能瓶颈。系统中的瓶颈可能和程序员开发过程中想象的瓶颈相去甚远。Tuning 指消除瓶颈。性能优化的过程通常是不断重复地 profiling 和 tuning。 + +PaddlePaddle 用户一般通过调用 Python API 编写深度学习程序。大部分 Python API 调用用 C++ 写的 libpaddle.so。所以 PaddlePaddle 的性能分析与调优分为两个部分: + +* Python 代码的性能分析 +* Python 与 C++ 混合代码的性能分析 + + +## Python代码的性能分析 + +### 生成性能分析文件 + +Python标准库中提供了性能分析的工具包,[cProfile](https://docs.python.org/2/library/profile.html)。生成Python性能分析的命令如下: + +```bash +python -m cProfile -o profile.out main.py +``` + +其中 `main.py` 是我们要分析的程序,`-o`标识了一个输出的文件名,用来存储本次性能分析的结果。如果不指定这个文件,`cProfile`会打印到标准输出。 + +### 查看性能分析文件 + +`cProfile` 在main.py 运行完毕后输出`profile.out`。我们可以使用[`cprofilev`](https://github.com/ymichael/cprofilev)来查看性能分析结果。`cprofilev`是一个Python的第三方库。使用它会开启一个HTTP服务,将性能分析结果以网页的形式展示出来: + +```bash +cprofilev -a 0.0.0.0 -p 3214 -f profile.out main.py +``` + +其中`-a`标识HTTP服务绑定的IP。使用`0.0.0.0`允许外网访问这个HTTP服务。`-p`标识HTTP服务的端口。`-f`标识性能分析的结果文件。`main.py`标识被性能分析的源文件。 + +用Web浏览器访问对应网址,即可显示性能分析的结果: + +``` + ncalls tottime percall cumtime percall filename:lineno(function) + 1 0.284 0.284 29.514 29.514 main.py:1() + 4696 0.128 0.000 15.748 0.003 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/executor.py:20(run) + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 1 0.144 0.144 6.534 6.534 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/__init__.py:14() +``` + +每一列的含义是: + +| 列名 | 含义 | +| --- | --- | +| ncalls | 函数的调用次数 | +| tottime | 函数实际使用的总时间。该时间去除掉本函数调用其他函数的时间 | +| percall | tottime的每次调用平均时间 | +| cumtime | 函数总时间。包含这个函数调用其他函数的时间 | +| percall | cumtime的每次调用平均时间 | +| filename:lineno(function) | 文件名, 行号,函数名 | + + +### 寻找性能瓶颈 + +通常`tottime`和`cumtime`是寻找瓶颈的关键指标。这两个指标代表了某一个函数真实的运行时间。 + +将性能分析结果按照tottime排序,效果如下: + +```text + 4696 12.040 0.003 12.040 0.003 {built-in method run} + 300005 0.874 0.000 1.681 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/dataset/mnist.py:38(reader) + 107991 0.676 0.000 1.519 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:219(__init__) + 4697 0.626 0.000 2.291 0.000 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) + 1 0.618 0.618 0.618 0.618 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/__init__.py:1() +``` + +可以看到最耗时的函数是C++端的`run`函数。这需要联合我们第二节`Python`与`C++`混合代码的性能分析来进行调优。而`sync_with_cpp`函数的总共耗时很长,每次调用的耗时也很长。于是我们可以点击`sync_with_cpp`的详细信息,了解其调用关系。 + +```text +Called By: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> + +Function was called by... + ncalls tottime cumtime +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:428(sync_with_cpp) <- 4697 0.626 2.291 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) +/home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:562(sync_with_cpp) <- 4696 0.019 2.316 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:487(clone) + 1 0.000 0.001 /home/yuyang/perf_test/.env/lib/python2.7/site-packages/paddle/v2/fluid/framework.py:534(append_backward) + + +Called: + + Ordered by: internal time + List reduced from 4497 to 2 due to restriction <'sync_with_cpp'> +``` + +通常观察热点函数间的调用关系,和对应行的代码,就可以了解到问题代码在哪里。当我们做出性能修正后,再次进行性能分析(profiling)即可检查我们调优后的修正是否能够改善程序的性能。 + + + +## Python与C++混合代码的性能分析 + +### 生成性能分析文件 + +C++的性能分析工具非常多。常见的包括`gprof`, `valgrind`, `google-perftools`。但是调试Python中使用的动态链接库与直接调试原始二进制相比增加了很多复杂度。幸而Python的一个第三方库`yep`提供了方便的和`google-perftools`交互的方法。于是这里使用`yep`进行Python与C++混合代码的性能分析 + +使用`yep`前需要安装`google-perftools`与`yep`包。ubuntu下安装命令为 + +```bash +apt update +apt install libgoogle-perftools-dev +pip install yep +``` + +安装完毕后,我们可以通过 + +```bash +python -m yep -v main.py +``` + +生成性能分析文件。生成的性能分析文件为`main.py.prof`。 + +命令行中的`-v`指定在生成性能分析文件之后,在命令行显示分析结果。我们可以在命令行中简单的看一下生成效果。因为C++与Python不同,编译时可能会去掉调试信息,运行时也可能因为多线程产生混乱不可读的性能分析结果。为了生成更可读的性能分析结果,可以采取下面几点措施: + +1. 编译时指定`-g`生成调试信息。使用cmake的话,可以将CMAKE_BUILD_TYPE指定为`RelWithDebInfo`。 +2. 编译时一定要开启优化。单纯的`Debug`编译性能会和`-O2`或者`-O3`有非常大的差别。`Debug`模式下的性能测试是没有意义的。 +3. 运行性能分析的时候,先从单线程开始,再开启多线程,进而多机。毕竟单线程调试更容易。可以设置`OMP_NUM_THREADS=1`这个环境变量关闭openmp优化。 + +### 查看性能分析文件 + +在运行完性能分析后,会生成性能分析结果文件。我们可以使用[`pprof`](https://github.com/google/pprof)来显示性能分析结果。注意,这里使用了用`Go`语言重构后的`pprof`,因为这个工具具有web服务界面,且展示效果更好。 + +安装`pprof`的命令和一般的`Go`程序是一样的,其命令如下: + +```bash +go get github.com/google/pprof +``` + +进而我们可以使用如下命令开启一个HTTP服务: + +```bash +pprof -http=0.0.0.0:3213 `which python` ./main.py.prof +``` + +这行命令中,`-http`指开启HTTP服务。`which python`会产生当前Python二进制的完整路径,进而指定了Python可执行文件的路径。`./main.py.prof`输入了性能分析结果。 + +访问对应的网址,我们可以查看性能分析的结果。结果如下图所示: + +![result](./pprof_1.png) + + +### 寻找性能瓶颈 + +与寻找Python代码的性能瓶颈类似,寻找Python与C++混合代码的性能瓶颈也是要看`tottime`和`cumtime`。而`pprof`展示的调用图也可以帮助我们发现性能中的问题。 + +例如下图中, + +![kernel_perf](./pprof_2.png) + +在一次训练中,乘法和乘法梯度的计算占用2%-4%左右的计算时间。而`MomentumOp`占用了17%左右的计算时间。显然,`MomentumOp`的性能有问题。 + +在`pprof`中,对于性能的关键路径都做出了红色标记。先检查关键路径的性能问题,再检查其他部分的性能问题,可以更有次序的完成性能的优化。 diff --git a/paddle/framework/op_desc.cc b/paddle/framework/op_desc.cc index 48cd131550dea..02a825324328f 100644 --- a/paddle/framework/op_desc.cc +++ b/paddle/framework/op_desc.cc @@ -65,7 +65,7 @@ class CompileTimeInferShapeContext : public InferShapeContext { PADDLE_ENFORCE_EQ(in_var->GetType(), VarDesc::LOD_TENSOR, "The %d-th output of Output(%s) must be LoDTensor.", j, out); - in_var->SetLoDLevel(out_var->GetLodLevel()); + out_var->SetLoDLevel(in_var->GetLodLevel()); } bool IsRuntime() const override; diff --git a/paddle/memory/memory.cc b/paddle/memory/memory.cc index 5eb1c44eb6fc4..95cfe2525e3e7 100644 --- a/paddle/memory/memory.cc +++ b/paddle/memory/memory.cc @@ -81,18 +81,33 @@ BuddyAllocator* GetGPUBuddyAllocator(int gpu_id) { } template <> -void* Alloc(platform::GPUPlace place, size_t size) { - return GetGPUBuddyAllocator(place.device)->Alloc(size); +size_t Used(platform::GPUPlace place) { + return GetGPUBuddyAllocator(place.device)->Used(); } template <> -void Free(platform::GPUPlace place, void* p) { - GetGPUBuddyAllocator(place.device)->Free(p); +void* Alloc(platform::GPUPlace place, size_t size) { + auto* buddy_allocator = GetGPUBuddyAllocator(place.device); + auto* ptr = buddy_allocator->Alloc(size); + if (ptr == nullptr) { + int cur_dev = platform::GetCurrentDeviceId(); + platform::SetDeviceId(place.device); + size_t avail, total; + platform::GpuMemoryUsage(avail, total); + LOG(WARNING) << "Cannot allocate " << size << " bytes in GPU " + << place.device << ", available " << avail << " bytes"; + LOG(WARNING) << "total " << total; + LOG(WARNING) << "GpuMinChunkSize " << platform::GpuMinChunkSize(); + LOG(WARNING) << "GpuMaxChunkSize " << platform::GpuMaxChunkSize(); + LOG(WARNING) << "GPU memory used: " << Used(place); + platform::SetDeviceId(cur_dev); + } + return ptr; } template <> -size_t Used(platform::GPUPlace place) { - return GetGPUBuddyAllocator(place.device)->Used(); +void Free(platform::GPUPlace place, void* p) { + GetGPUBuddyAllocator(place.device)->Free(p); } #endif diff --git a/paddle/optimizer/parameter_optimizer_test.cc b/paddle/optimizer/parameter_optimizer_test.cc index f29e531712064..83757a3917844 100644 --- a/paddle/optimizer/parameter_optimizer_test.cc +++ b/paddle/optimizer/parameter_optimizer_test.cc @@ -127,8 +127,3 @@ TEST_F(OptimizerTest, TestGetWeight) { TestGetWeight(); } TEST_F(OptimizerTest, TestUpdate) { TestUpdate(); } TEST_F(OptimizerTest, TestCheckPoint) { TestCheckPoint(); } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/optimizer/serialization_test.cc b/paddle/optimizer/serialization_test.cc index 4c416f55ee0bd..940e941e9042d 100644 --- a/paddle/optimizer/serialization_test.cc +++ b/paddle/optimizer/serialization_test.cc @@ -46,8 +46,3 @@ TEST(TensorToProto, Case2) { EXPECT_EQ(t1[i], t[i]); } } - -int main(int argc, char** argv) { - testing::InitGoogleTest(&argc, argv); - return RUN_ALL_TESTS(); -} diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 4245df5ab72bf..2275c950ba13d 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -5,4 +5,6 @@ if(WITH_TESTING) add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies}) add_library(paddle_test_util STATIC TestUtil.cpp) add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) + add_library(paddle_gtest_main STATIC paddle_gtest_main.cc) + add_dependencies(paddle_gtest_main paddle_memory gtest gflags) endif() diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc new file mode 100644 index 0000000000000..a491322b7e533 --- /dev/null +++ b/paddle/testing/paddle_gtest_main.cc @@ -0,0 +1,39 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "gflags/gflags.h" +#include "gtest/gtest.h" +#include "paddle/memory/memory.h" + +int main(int argc, char** argv) { + std::vector new_argv; + std::string gflags_env; + new_argv.push_back(argv[0]); +#ifdef PADDLE_WITH_CUDA + new_argv.push_back( + strdup("--tryfromenv=fraction_of_gpu_memory_to_use,use_pinned_memory")); +#else + new_argv.push_back(strdup("--tryfromenv=use_pinned_memory")); +#endif + int new_argc = static_cast(new_argv.size()); + char** new_argv_address = new_argv.data(); + google::ParseCommandLineFlags(&new_argc, &new_argv_address, false); + testing::InitGoogleTest(&argc, argv); + paddle::memory::Used(paddle::platform::CPUPlace()); +#ifdef PADDLE_WITH_CUDA + paddle::memory::Used(paddle::platform::GPUPlace(0)); +#endif + return RUN_ALL_TESTS(); +} diff --git a/python/paddle/v2/fluid/framework.py b/python/paddle/v2/fluid/framework.py index 1c42e4d44f504..49c6d8983457f 100644 --- a/python/paddle/v2/fluid/framework.py +++ b/python/paddle/v2/fluid/framework.py @@ -3,10 +3,12 @@ import numpy as np from . import core import proto.framework_pb2 as framework_pb2 +import contextlib __all__ = [ 'Block', 'Variable', 'Program', 'Operator', 'default_startup_program', - 'default_main_program' + 'default_main_program', 'program_guard', 'switch_startup_program', + 'switch_main_program' ] @@ -659,8 +661,83 @@ def __init__(self, block, shape, dtype, **kwargs): def default_startup_program(): + """ + Get default startup program. In startup program, Paddle will initialize + parameters, initialize nccl handle, etc. + + Returns: + Program: startup program + """ return _startup_program_ def default_main_program(): + """ + Get default main program. The main program is used for training or testing. + + Returns: + Program: main program + """ return _main_program_ + + +def switch_main_program(program): + """ + Switch the main program to a new program. + + Args: + program(Program): The new main program + + Returns: + Program: The previous main program + """ + global _main_program_ + prev_program = _main_program_ + _main_program_ = program + return prev_program + + +def switch_startup_program(program): + """ + Switch the startup program to a new program + Args: + program(Program): The new startup program + + Returns: + Program: The previous startup program + """ + global _startup_program_ + prev_program = _startup_program_ + _startup_program_ = program + return prev_program + + +@contextlib.contextmanager +def program_guard(main_program, startup_program=None): + """ + Switch program with `with` statement + + Examples: + >>> with program_guard(Program()): + >>> data = fluid.layers.data(...) + >>> hidden = fluid.layers.fc(...) + + Args: + main_program(Program): New main program inside `with` statement + startup_program(Program): New startup program inside `with` statement. + None means do not change startup program. + + Returns: + None + """ + if not isinstance(main_program, Program): + raise TypeError("main_program should be Program") + main_program = switch_main_program(main_program) + if startup_program is not None: + if not isinstance(startup_program, Program): + raise TypeError("startup_program should be Program") + startup_program = switch_startup_program(startup_program) + yield + switch_main_program(main_program) + if startup_program is not None: + switch_startup_program(startup_program) diff --git a/python/paddle/v2/fluid/tests/test_layers.py b/python/paddle/v2/fluid/tests/test_layers.py index b6906be60b8ff..33b0e54f42afc 100644 --- a/python/paddle/v2/fluid/tests/test_layers.py +++ b/python/paddle/v2/fluid/tests/test_layers.py @@ -1,192 +1,141 @@ +from __future__ import print_function import unittest import paddle.v2.fluid.layers as layers import paddle.v2.fluid.nets as nets -from paddle.v2.fluid.framework import Program +from paddle.v2.fluid.framework import Program, program_guard class TestBook(unittest.TestCase): def test_fit_a_line(self): program = Program() - x = layers.data( - name='x', shape=[13], dtype='float32', main_program=program) - y_predict = layers.fc(input=x, size=1, act=None, main_program=program) + with program_guard(program, startup_program=Program()): + x = layers.data(name='x', shape=[13], dtype='float32') + y_predict = layers.fc(input=x, size=1, act=None) + y = layers.data(name='y', shape=[1], dtype='float32') + cost = layers.square_error_cost(input=y_predict, label=y) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + program.append_backward(avg_cost) - y = layers.data( - name='y', shape=[1], dtype='float32', main_program=program) - cost = layers.square_error_cost( - input=y_predict, label=y, main_program=program) - - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - program.append_backward(avg_cost) - - print str(program) + print(str(program)) def test_recognize_digits_mlp(self): program = Program() - - # Change g_program, so the rest layers use `g_program` - images = layers.data( - name='pixel', shape=[784], dtype='float32', main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - hidden1 = layers.fc(input=images, - size=128, - act='relu', - main_program=program) - hidden2 = layers.fc(input=hidden1, - size=64, - act='relu', - main_program=program) - predict = layers.fc(input=hidden2, - size=10, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + # Change g_program, so the rest layers use `g_program` + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden1 = layers.fc(input=images, size=128, act='relu') + hidden2 = layers.fc(input=hidden1, size=64, act='relu') + predict = layers.fc(input=hidden2, size=10, act='softmax') + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) def test_simple_conv2d(self): program = Program() - images = layers.data( - name='pixel', - shape=[3, 48, 48], - dtype='int32', - main_program=program) - layers.conv2d( - input=images, - num_filters=3, - filter_size=[4, 4], - main_program=program) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data(name='pixel', shape=[3, 48, 48], dtype='int32') + layers.conv2d(input=images, num_filters=3, filter_size=[4, 4]) + + print(str(program)) def test_conv2d_transpose(self): program = Program() - kwargs = {'main_program': program} - img = layers.data( - name='pixel', shape=[3, 2, 2], dtype='float32', **kwargs) - layers.conv2d_transpose( - input=img, num_filters=10, output_size=28, **kwargs) - print str(program) + with program_guard(program): + img = layers.data(name='pixel', shape=[3, 2, 2], dtype='float32') + layers.conv2d_transpose(input=img, num_filters=10, output_size=28) + print(str(program)) def test_recognize_digits_conv(self): program = Program() - - images = layers.data( - name='pixel', - shape=[1, 28, 28], - dtype='float32', - main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - conv_pool_1 = nets.simple_img_conv_pool( - input=images, - filter_size=5, - num_filters=2, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - conv_pool_2 = nets.simple_img_conv_pool( - input=conv_pool_1, - filter_size=5, - num_filters=4, - pool_size=2, - pool_stride=2, - act="relu", - main_program=program) - - predict = layers.fc(input=conv_pool_2, - size=10, - act="softmax", - main_program=program) - cost = layers.cross_entropy( - input=predict, label=label, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - - program.append_backward(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data( + name='pixel', shape=[1, 28, 28], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + conv_pool_1 = nets.simple_img_conv_pool( + input=images, + filter_size=5, + num_filters=2, + pool_size=2, + pool_stride=2, + act="relu") + conv_pool_2 = nets.simple_img_conv_pool( + input=conv_pool_1, + filter_size=5, + num_filters=4, + pool_size=2, + pool_stride=2, + act="relu") + + predict = layers.fc(input=conv_pool_2, size=10, act="softmax") + cost = layers.cross_entropy(input=predict, label=label) + avg_cost = layers.mean(x=cost) + + program.append_backward(avg_cost) + + print(str(program)) def test_word_embedding(self): program = Program() - dict_size = 10000 - embed_size = 32 - first_word = layers.data( - name='firstw', shape=[1], dtype='int64', main_program=program) - second_word = layers.data( - name='secondw', shape=[1], dtype='int64', main_program=program) - third_word = layers.data( - name='thirdw', shape=[1], dtype='int64', main_program=program) - forth_word = layers.data( - name='forthw', shape=[1], dtype='int64', main_program=program) - next_word = layers.data( - name='nextw', shape=[1], dtype='int64', main_program=program) - - embed_first = layers.embedding( - input=first_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - embed_second = layers.embedding( - input=second_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - - embed_third = layers.embedding( - input=third_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - embed_forth = layers.embedding( - input=forth_word, - size=[dict_size, embed_size], - dtype='float32', - param_attr='shared_w', - main_program=program) - - concat_embed = layers.concat( - input=[embed_first, embed_second, embed_third, embed_forth], - axis=1, - main_program=program) - - hidden1 = layers.fc(input=concat_embed, - size=256, - act='sigmoid', - main_program=program) - predict_word = layers.fc(input=hidden1, - size=dict_size, - act='softmax', - main_program=program) - cost = layers.cross_entropy( - input=predict_word, label=next_word, main_program=program) - avg_cost = layers.mean(x=cost, main_program=program) - self.assertIsNotNone(avg_cost) - - print str(program) + with program_guard(program, startup_program=Program()): + dict_size = 10000 + embed_size = 32 + first_word = layers.data(name='firstw', shape=[1], dtype='int64') + second_word = layers.data(name='secondw', shape=[1], dtype='int64') + third_word = layers.data(name='thirdw', shape=[1], dtype='int64') + forth_word = layers.data(name='forthw', shape=[1], dtype='int64') + next_word = layers.data(name='nextw', shape=[1], dtype='int64') + + embed_first = layers.embedding( + input=first_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_second = layers.embedding( + input=second_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + embed_third = layers.embedding( + input=third_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + embed_forth = layers.embedding( + input=forth_word, + size=[dict_size, embed_size], + dtype='float32', + param_attr='shared_w') + + concat_embed = layers.concat( + input=[embed_first, embed_second, embed_third, embed_forth], + axis=1) + + hidden1 = layers.fc(input=concat_embed, size=256, act='sigmoid') + predict_word = layers.fc(input=hidden1, + size=dict_size, + act='softmax') + cost = layers.cross_entropy(input=predict_word, label=next_word) + avg_cost = layers.mean(x=cost) + self.assertIsNotNone(avg_cost) + + print(str(program)) def test_linear_chain_crf(self): program = Program() - - # Change g_program, so the rest layers use `g_program` - images = layers.data( - name='pixel', shape=[784], dtype='float32', main_program=program) - label = layers.data( - name='label', shape=[1], dtype='int32', main_program=program) - hidden = layers.fc(input=images, size=128, main_program=program) - crf = layers.linear_chain_crf( - input=hidden, label=label, main_program=program) - - print str(program) + with program_guard(program, startup_program=Program()): + images = layers.data(name='pixel', shape=[784], dtype='float32') + label = layers.data(name='label', shape=[1], dtype='int32') + hidden = layers.fc(input=images, size=128) + crf = layers.linear_chain_crf(input=hidden, label=label) + self.assertNotEqual(crf, None) + + print(str(program)) if __name__ == '__main__':