From d26e4507dac94e0de3a24816541f06082770bc35 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Wed, 17 Oct 2018 18:38:58 +0800 Subject: [PATCH 001/684] init ctr data --- .../fluid/operators/math/jit_kernel_test.cc | 1 + paddle/fluid/operators/reader/CMakeLists.txt | 2 + .../operators/reader/create_ctr_reader_op.cc | 67 +++++++++++++++++++ paddle/fluid/operators/reader/ctr_reader.cc | 21 ++++++ paddle/fluid/operators/reader/ctr_reader.h | 51 ++++++++++++++ 5 files changed, 142 insertions(+) create mode 100644 paddle/fluid/operators/reader/create_ctr_reader_op.cc create mode 100644 paddle/fluid/operators/reader/ctr_reader.cc create mode 100644 paddle/fluid/operators/reader/ctr_reader.h diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 26590171bbeaa..7fdd1c6b76aeb 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -16,6 +16,7 @@ limitations under the License. */ #include #include // for exp #include // for memcpy +#include #include #include #include "gflags/gflags.h" diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 728197377df04..d4f1da69f0a6a 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -16,7 +16,9 @@ function(reader_library TARGET_NAME) endfunction() cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) +cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool) reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) +reader_library(create_ctr_reader_op SRCS create_ctr_reader_op.cc DEPS ctr_reader) reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) reader_library(create_shuffle_reader_op SRCS create_shuffle_reader_op.cc) reader_library(create_batch_reader_op SRCS create_batch_reader_op.cc) diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc new file mode 100644 index 0000000000000..e182521f9ab86 --- /dev/null +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -0,0 +1,67 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/ctr_reader.h" + +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" +#include "paddle/fluid/operators/reader/reader_op_registry.h" + +namespace paddle { +namespace operators { +namespace reader { + +class CreateCTRReaderOp : public framework::OperatorBase { + public: + using framework::OperatorBase::OperatorBase; + + private: + void RunImpl(const framework::Scope& scope, + const platform::Place& dev_place) const override { + auto* out = scope.FindVar(Output("Out")) + ->template GetMutable(); + if (out->Get() != nullptr) return; + + const std::string& queue_name = Input("blocking_queue"); + auto* queue_holder_var = scope.FindVar(queue_name); + PADDLE_ENFORCE_NOT_NULL( + queue_holder_var, + "No LoDTensorBlockingQueueHolder variable with name %s found", + queue_name); + auto* queue_holder = + queue_holder_var->template GetMutable(); + + out->Reset(std::make_shared(queue_holder->GetQueue())); + } +}; + +class CreateCTRReaderOpMaker : public FileReaderMakerBase { + protected: + void Apply() override { + AddInput("blocking_queue", + "Name of the `LoDTensorBlockingQueueHolder` variable"); + + AddComment(R"DOC( + Create CTRReader to support read ctr data with cpp. + )DOC"); + } +}; + +} // namespace reader +} // namespace operators +} // namespace paddle + +namespace reader = ::paddle::operators::reader; + +REGISTER_FILE_READER_OPERATOR(create_ctr_reader, reader::CreateCTRReaderOp, + reader::CreateCTRReaderOpMaker); diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc new file mode 100644 index 0000000000000..bcf49fc967cab --- /dev/null +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -0,0 +1,21 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reader/ctr_reader.h" + +namespace paddle { +namespace operators { +namespace reader {} // namespace reader +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h new file mode 100644 index 0000000000000..c3cf78e5f43aa --- /dev/null +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -0,0 +1,51 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" + +namespace paddle { +namespace operators { +namespace reader { + +class CTRReader : public framework::FileReader { + public: + explicit CTRReader(const std::shared_ptr& queue) + : framework::FileReader() { + PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + queue_ = queue; + } + + void ReadNext(std::vector* out) override { + bool success; + *out = queue_->Pop(&success); + if (!success) out->clear(); + } + + ~CTRReader() { queue_->Close(); } + + void Shutdown() override { queue_->Close(); } + + void Start() override { queue_->ReOpen(); } + + private: + std::shared_ptr queue_; +}; + +} // namespace reader +} // namespace operators +} // namespace paddle From 20f181cdc115cfa49e8b7614fe293535449a26f6 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 18 Oct 2018 15:13:53 +0800 Subject: [PATCH 002/684] init ctr_reader --- paddle/fluid/operators/reader/CMakeLists.txt | 2 +- .../operators/reader/create_ctr_reader_op.cc | 14 +- paddle/fluid/operators/reader/ctr_reader.cc | 131 +++++++++++++++++- paddle/fluid/operators/reader/ctr_reader.h | 44 +++++- paddle/fluid/pybind/CMakeLists.txt | 2 +- 5 files changed, 185 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index d4f1da69f0a6a..341aeda4e41a5 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -16,7 +16,7 @@ function(reader_library TARGET_NAME) endfunction() cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) -cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool) +cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool boost) reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) reader_library(create_ctr_reader_op SRCS create_ctr_reader_op.cc DEPS ctr_reader) reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index e182521f9ab86..58a465d87a8c0 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -41,7 +41,13 @@ class CreateCTRReaderOp : public framework::OperatorBase { auto* queue_holder = queue_holder_var->template GetMutable(); - out->Reset(std::make_shared(queue_holder->GetQueue())); + int thread_num = Attr("thread_num"); + std::vector slots = Attr>("slots"); + int batch_size = Attr("batch_size"); + std::vector file_list = + Attr>("file_list"); + out->Reset(std::make_shared(queue_holder->GetQueue(), batch_size, + thread_num, slots, file_list)); } }; @@ -50,6 +56,12 @@ class CreateCTRReaderOpMaker : public FileReaderMakerBase { void Apply() override { AddInput("blocking_queue", "Name of the `LoDTensorBlockingQueueHolder` variable"); + AddAttr("thread_num", "the thread num to read data"); + AddAttr("batch_size", "the batch size of read data"); + AddAttr>("file_list", + "The list of files that need to read"); + AddAttr>( + "slots", "the slots that should be extract from file"); AddComment(R"DOC( Create CTRReader to support read ctr data with cpp. diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index bcf49fc967cab..a4197a54349ee 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -14,8 +14,137 @@ #include "paddle/fluid/operators/reader/ctr_reader.h" +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include + namespace paddle { namespace operators { -namespace reader {} // namespace reader +namespace reader { + +static inline void string_split(const std::string& s, const char delimiter, + std::vector* output) { + size_t start = 0; + size_t end = s.find_first_of(delimiter); + + while (end <= std::string::npos) { + output->emplace_back(s.substr(start, end - start)); + if (end == std::string::npos) { + break; + } + start = end + 1; + end = s.find_first_of(delimiter, start); + } +} + +static inline void parse_line( + const std::string& line, const std::vector& slots, + int64_t* label, + std::unordered_map>* slots_to_data) { + std::vector ret; + string_split(line, ' ', &ret); + *label = std::stoi(ret[2]) > 0; + for (size_t i = 3; i < ret.size(); ++i) { + const std::string& item = ret[i]; + std::vector slot_and_feasign; + string_split(item, ':', &slot_and_feasign); + if (slot_and_feasign.size() == 2) { + const std::string& slot = slot_and_feasign[1]; + int64_t feasign = std::strtoll(slot_and_feasign[0].c_str(), NULL, 10); + (*slots_to_data)[slot_and_feasign[1]].push_back(feasign); + } + } +} + +// class Reader { +// public: +// virtual ~Reader() {} +// virtual bool HasNext() = 0; +// virtual void NextLine(std::string& line) = 0; +//}; + +class GzipReader { + public: + explicit GzipReader(const std::string& file_name) : instream_(&inbuf_) { + file_ = std::ifstream(file_name, std::ios_base::in | std::ios_base::binary); + inbuf_.push(boost::iostreams::gzip_decompressor()); + inbuf_.push(file_); + // Convert streambuf to istream + } + + ~GzipReader() { file_.close(); } + + bool HasNext() { return instream_.peek() != EOF; } + + void NextLine(std::string& line) { std::getline(instream_, line); } // NOLINT + + private: + boost::iostreams::filtering_streambuf inbuf_; + std::ifstream file_; + std::istream instream_; +}; + +class MultiGzipReader { + public: + explicit MultiGzipReader(const std::vector& file_list) { + for (auto& file : file_list) { + readers_.emplace_back(std::make_shared(file)); + } + } + + bool HasNext() { + if (current_reader_index_ >= readers_.size()) { + return false; + } + if (!readers_[current_reader_index_]->HasNext()) { + current_reader_index_++; + return HasNext(); + } + return true; + } + + void NextLine(std::string& line) { // NOLINT + readers_[current_reader_index_]->NextLine(line); + } + + private: + std::vector> readers_; + size_t current_reader_index_ = 0; +}; + +// void CTRReader::ReadThread( +// const std::vector &file_list, +// const std::vector& slots, +// int batch_size, +// std::shared_ptr& queue) {} + +void CTRReader::ReadThread(const std::vector& file_list, + const std::vector& slots, + int batch_size, + std::shared_ptr* queue) { + std::string line; + + // read all files + std::vector all_lines; + MultiGzipReader reader(file_list); + + for (int j = 0; j < all_lines.size(); ++j) { + std::unordered_map> slots_to_data; + int64_t label; + parse_line(all_lines[j], slots, &label, &slots_to_data); + } +} + +} // namespace reader } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index c3cf78e5f43aa..8a25993699213 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -14,8 +14,20 @@ #pragma once +#include +#include +#include +#include +#include +#include #include + +#include +#include +#include + #include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" namespace paddle { @@ -24,26 +36,50 @@ namespace reader { class CTRReader : public framework::FileReader { public: - explicit CTRReader(const std::shared_ptr& queue) + explicit CTRReader(const std::shared_ptr& queue, + int batch_size, int thread_num, + const std::vector& slots, + const std::vector& file_list) : framework::FileReader() { + thread_num_ = thread_num; + batch_size_ = batch_size; PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); queue_ = queue; + slots_ = slots; + file_list_ = file_list; } + ~CTRReader() { queue_->Close(); } + void ReadNext(std::vector* out) override { bool success; *out = queue_->Pop(&success); if (!success) out->clear(); } - ~CTRReader() { queue_->Close(); } - void Shutdown() override { queue_->Close(); } - void Start() override { queue_->ReOpen(); } + void Start() override { + queue_->ReOpen(); + for (int i = 0; i < thread_num_; i++) { + read_threads_.emplace_back( + new std::thread(std::bind(&CTRReader::ReadThread, this, file_list_, + slots_, batch_size_, queue_))); + } + } + + private: + void ReadThread(const std::vector& file_list, + const std::vector& slots, int batch_size, + std::shared_ptr* queue); private: std::shared_ptr queue_; + std::vector> read_threads_; + int thread_num_; + int batch_size_; + std::vector slots_; + std::vector file_list_; }; } // namespace reader diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index e7f634c4a622b..5ef519367427d 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,5 +1,5 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder boost) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) if(NOT WIN32) list(APPEND PYBIND_DEPS parallel_executor profiler) From a1e0f5abb71d1a2f24256db6ea29e7c9022706ba Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 18 Oct 2018 16:53:48 +0800 Subject: [PATCH 003/684] add gzstream.cmake --- CMakeLists.txt | 1 + cmake/external/gzstream.cmake | 47 ++++++++++++++++++++++ paddle/fluid/operators/reader/ctr_reader.h | 11 ++--- 3 files changed, 54 insertions(+), 5 deletions(-) create mode 100644 cmake/external/gzstream.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index df00e977ebb54..bb2ba1ea0c416 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -176,6 +176,7 @@ include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) +include(external/gzstream) if (NOT WIN32) # there is no official support of snappystream, warpctc, nccl, cupti in windows diff --git a/cmake/external/gzstream.cmake b/cmake/external/gzstream.cmake new file mode 100644 index 0000000000000..f0e3dd8c6aa11 --- /dev/null +++ b/cmake/external/gzstream.cmake @@ -0,0 +1,47 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IF(MOBILE_INFERENCE) + return() +ENDIF() + +include (ExternalProject) + +# NOTE: gzstream is needed when linking with ctr reader. + +SET(GZSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/gzstream) +SET(GZSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/gzstream) +SET(GZSTREAM_INCLUDE_DIR "${GZSTREAM_INSTALL_DIR}/include/" CACHE PATH "gzstream include directory." FORCE) + +ExternalProject_Add( + extern_gzstream + GIT_REPOSITORY "https://github.com/kanedo/gzstream.git" + GIT_TAG "" + PREFIX ${GZSTREAM_SOURCES_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_IN_SOURCE 1 + BUILD_COMMAND make -j8 + INSTALL_COMMAND mkdir -p ${GZSTREAM_INSTALL_DIR}/lib/ && mkdir -p ${GZSTREAM_INSTALL_DIR}/include/ + && cp ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/libgzstream.a ${GZSTREAM_INSTALL_DIR}/lib + && cp -r ${GZSTREAM_SOURCES_DIR}/src/extern_gzstream/gzstream.h ${GZSTREAM_INSTALL_DIR}/include +) + +ADD_LIBRARY(gzstream STATIC IMPORTED GLOBAL) +SET_PROPERTY(TARGET gzstream PROPERTY IMPORTED_LOCATION + "${GZSTREAM_INSTALL_DIR}/lib/libgzstream.a") + +include_directories(${GZSTREAM_INCLUDE_DIR}) +ADD_DEPENDENCIES(gzstream extern_gzstream) diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 8a25993699213..1ef6e6d551fb1 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -61,11 +61,12 @@ class CTRReader : public framework::FileReader { void Start() override { queue_->ReOpen(); - for (int i = 0; i < thread_num_; i++) { - read_threads_.emplace_back( - new std::thread(std::bind(&CTRReader::ReadThread, this, file_list_, - slots_, batch_size_, queue_))); - } + // for (int i = 0; i < thread_num_; i++) { + // read_threads_.emplace_back( + // new std::thread(std::bind(&CTRReader::ReadThread, this, + // file_list_, + // slots_, batch_size_, queue_))); + // } } private: From 0f3ece775d455fadb301c8d8609a424b4a4f508c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 18 Oct 2018 18:33:05 +0800 Subject: [PATCH 004/684] use gzstream --- paddle/fluid/operators/reader/CMakeLists.txt | 2 +- paddle/fluid/operators/reader/ctr_reader.cc | 46 +++++++------------- paddle/fluid/operators/reader/ctr_reader.h | 4 -- 3 files changed, 17 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 341aeda4e41a5..4ad376c6170b4 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -16,7 +16,7 @@ function(reader_library TARGET_NAME) endfunction() cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) -cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool boost) +cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool boost gzstream) reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) reader_library(create_ctr_reader_op SRCS create_ctr_reader_op.cc DEPS ctr_reader) reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index a4197a54349ee..8be9f68c9410a 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -14,6 +14,8 @@ #include "paddle/fluid/operators/reader/ctr_reader.h" +#include + #include #include #include @@ -24,10 +26,6 @@ #include #include -#include -#include -#include - namespace paddle { namespace operators { namespace reader { @@ -75,23 +73,19 @@ static inline void parse_line( class GzipReader { public: - explicit GzipReader(const std::string& file_name) : instream_(&inbuf_) { - file_ = std::ifstream(file_name, std::ios_base::in | std::ios_base::binary); - inbuf_.push(boost::iostreams::gzip_decompressor()); - inbuf_.push(file_); - // Convert streambuf to istream - } + explicit GzipReader(const std::string& file_name) + : gzstream_(file_name.c_str()) {} - ~GzipReader() { file_.close(); } + ~GzipReader() {} - bool HasNext() { return instream_.peek() != EOF; } + bool HasNext() { return gzstream_.peek() != EOF; } - void NextLine(std::string& line) { std::getline(instream_, line); } // NOLINT + void NextLine(std::string* line) { // NOLINT + std::getline(gzstream_, line); + } private: - boost::iostreams::filtering_streambuf inbuf_; - std::ifstream file_; - std::istream instream_; + igzstream gzstream_; }; class MultiGzipReader { @@ -113,8 +107,8 @@ class MultiGzipReader { return true; } - void NextLine(std::string& line) { // NOLINT - readers_[current_reader_index_]->NextLine(line); + void NextLine(std::string* line) { + readers_[current_reader_index_]->NextLine(*line); } private: @@ -122,12 +116,6 @@ class MultiGzipReader { size_t current_reader_index_ = 0; }; -// void CTRReader::ReadThread( -// const std::vector &file_list, -// const std::vector& slots, -// int batch_size, -// std::shared_ptr& queue) {} - void CTRReader::ReadThread(const std::vector& file_list, const std::vector& slots, int batch_size, @@ -135,14 +123,12 @@ void CTRReader::ReadThread(const std::vector& file_list, std::string line; // read all files - std::vector all_lines; MultiGzipReader reader(file_list); + reader.NextLine(&line); - for (int j = 0; j < all_lines.size(); ++j) { - std::unordered_map> slots_to_data; - int64_t label; - parse_line(all_lines[j], slots, &label, &slots_to_data); - } + std::unordered_map> slots_to_data; + int64_t label; + parse_line(line, slots, &label, &slots_to_data); } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 1ef6e6d551fb1..11eb4f97864a8 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -22,10 +22,6 @@ #include #include -#include -#include -#include - #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/threadpool.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" From 71c2ad412fe230cf8a7c6c231c889a7cd8232c0f Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 18 Oct 2018 19:41:03 +0800 Subject: [PATCH 005/684] complete read thread --- paddle/fluid/operators/reader/ctr_reader.cc | 59 +++++++++++++++++---- paddle/fluid/operators/reader/ctr_reader.h | 2 +- 2 files changed, 50 insertions(+), 11 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 8be9f68c9410a..7c83a7d62c51c 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -52,6 +52,7 @@ static inline void parse_line( std::vector ret; string_split(line, ' ', &ret); *label = std::stoi(ret[2]) > 0; + for (size_t i = 3; i < ret.size(); ++i) { const std::string& item = ret[i]; std::vector slot_and_feasign; @@ -62,6 +63,13 @@ static inline void parse_line( (*slots_to_data)[slot_and_feasign[1]].push_back(feasign); } } + + // NOTE:: if the slot has no value, then fill [0] as it's data. + for (auto& slot : slots) { + if (slots_to_data->find(slot) == slots_to_data->end()) { + (*slots_to_data)[slot].push_back(0); + } + } } // class Reader { @@ -80,9 +88,7 @@ class GzipReader { bool HasNext() { return gzstream_.peek() != EOF; } - void NextLine(std::string* line) { // NOLINT - std::getline(gzstream_, line); - } + void NextLine(std::string* line) { std::getline(gzstream_, *line); } private: igzstream gzstream_; @@ -108,7 +114,7 @@ class MultiGzipReader { } void NextLine(std::string* line) { - readers_[current_reader_index_]->NextLine(*line); + readers_[current_reader_index_]->NextLine(line); } private: @@ -119,16 +125,49 @@ class MultiGzipReader { void CTRReader::ReadThread(const std::vector& file_list, const std::vector& slots, int batch_size, - std::shared_ptr* queue) { + std::shared_ptr queue) { std::string line; + std::vector read_data; + + std::vector>> batch_data; + std::vector batch_label; - // read all files MultiGzipReader reader(file_list); - reader.NextLine(&line); + // read all files + for (int i = 0; i < batch_size; ++i) { + if (reader.HasNext()) { + reader.NextLine(&line); + std::unordered_map> slots_to_data; + int64_t label; + parse_line(line, slots, &label, &slots_to_data); + batch_data.push_back(slots_to_data); + batch_label.push_back(label); + } else { + break; + } + } - std::unordered_map> slots_to_data; - int64_t label; - parse_line(line, slots, &label, &slots_to_data); + std::vector lod_datas; + for (auto& slot : slots) { + for (auto& slots_to_data : batch_data) { + std::vector lod_data{0}; + std::vector batch_feasign; + + auto& feasign = slots_to_data[slot]; + + lod_data.push_back(lod_data.back() + feasign.size()); + batch_feasign.insert(feasign.end(), feasign.begin(), feasign.end()); + framework::LoDTensor lod_tensor; + framework::LoD lod{lod_data}; + lod_tensor.set_lod(lod); + int64_t* tensor_data = lod_tensor.mutable_data( + framework::make_ddim({1, static_cast(batch_feasign.size())}), + platform::CPUPlace()); + memcpy(tensor_data, batch_feasign.data(), batch_feasign.size()); + lod_datas.push_back(lod_tensor); + } + } + queue->Push(lod_datas); } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 11eb4f97864a8..41c520621e46b 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -68,7 +68,7 @@ class CTRReader : public framework::FileReader { private: void ReadThread(const std::vector& file_list, const std::vector& slots, int batch_size, - std::shared_ptr* queue); + std::shared_ptr queue); private: std::shared_ptr queue_; From a06173eedc86c1f6dba9660674f45665693d8606 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 18 Oct 2018 21:28:06 +0800 Subject: [PATCH 006/684] clean code --- paddle/fluid/operators/reader/ctr_reader.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 7c83a7d62c51c..6c24a1ce77a39 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -127,7 +127,6 @@ void CTRReader::ReadThread(const std::vector& file_list, int batch_size, std::shared_ptr queue) { std::string line; - std::vector read_data; std::vector>> batch_data; std::vector batch_label; From d981333e9443b721c172b0f7af077fa965c6ed14 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 18 Oct 2018 21:36:35 +0800 Subject: [PATCH 007/684] add a base class for reader --- paddle/fluid/operators/reader/ctr_reader.cc | 27 +++++++++++---------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 6c24a1ce77a39..da109733da812 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -72,29 +72,29 @@ static inline void parse_line( } } -// class Reader { -// public: -// virtual ~Reader() {} -// virtual bool HasNext() = 0; -// virtual void NextLine(std::string& line) = 0; -//}; - -class GzipReader { +class Reader { + public: + virtual ~Reader() {} + virtual bool HasNext() = 0; + virtual void NextLine(std::string* line) = 0; +}; + +class GzipReader : public Reader { public: explicit GzipReader(const std::string& file_name) : gzstream_(file_name.c_str()) {} ~GzipReader() {} - bool HasNext() { return gzstream_.peek() != EOF; } + bool HasNext() override { return gzstream_.peek() != EOF; } - void NextLine(std::string* line) { std::getline(gzstream_, *line); } + void NextLine(std::string* line) override { std::getline(gzstream_, *line); } private: igzstream gzstream_; }; -class MultiGzipReader { +class MultiGzipReader : public Reader { public: explicit MultiGzipReader(const std::vector& file_list) { for (auto& file : file_list) { @@ -102,7 +102,7 @@ class MultiGzipReader { } } - bool HasNext() { + bool HasNext() override { if (current_reader_index_ >= readers_.size()) { return false; } @@ -113,7 +113,7 @@ class MultiGzipReader { return true; } - void NextLine(std::string* line) { + void NextLine(std::string* line) override { readers_[current_reader_index_]->NextLine(line); } @@ -151,6 +151,7 @@ void CTRReader::ReadThread(const std::vector& file_list, for (auto& slots_to_data : batch_data) { std::vector lod_data{0}; std::vector batch_feasign; + std::vector batch_label; auto& feasign = slots_to_data[slot]; From 694e8945a298773eaab847aa704548c3d755c560 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 18 Oct 2018 21:54:14 +0800 Subject: [PATCH 008/684] add a base class for reader --- paddle/fluid/operators/reader/ctr_reader.cc | 54 +++++++++++++-------- 1 file changed, 34 insertions(+), 20 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index da109733da812..9742641297776 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -132,31 +132,36 @@ void CTRReader::ReadThread(const std::vector& file_list, std::vector batch_label; MultiGzipReader reader(file_list); - // read all files - for (int i = 0; i < batch_size; ++i) { - if (reader.HasNext()) { - reader.NextLine(&line); - std::unordered_map> slots_to_data; - int64_t label; - parse_line(line, slots, &label, &slots_to_data); - batch_data.push_back(slots_to_data); - batch_label.push_back(label); - } else { - break; + + while (reader.HasNext()) { + // read all files + for (int i = 0; i < batch_size; ++i) { + if (reader.HasNext()) { + reader.NextLine(&line); + std::unordered_map> slots_to_data; + int64_t label; + parse_line(line, slots, &label, &slots_to_data); + batch_data.push_back(slots_to_data); + batch_label.push_back(label); + } else { + break; + } } - } - std::vector lod_datas; - for (auto& slot : slots) { - for (auto& slots_to_data : batch_data) { + std::vector lod_datas; + + // first insert tensor for each slots + for (auto& slot : slots) { std::vector lod_data{0}; std::vector batch_feasign; - std::vector batch_label; - auto& feasign = slots_to_data[slot]; + for (size_t i = 0; i < batch_data.size(); ++i) { + auto& feasign = batch_data[i][slot]; + + lod_data.push_back(lod_data.back() + feasign.size()); + batch_feasign.insert(feasign.end(), feasign.begin(), feasign.end()); + } - lod_data.push_back(lod_data.back() + feasign.size()); - batch_feasign.insert(feasign.end(), feasign.begin(), feasign.end()); framework::LoDTensor lod_tensor; framework::LoD lod{lod_data}; lod_tensor.set_lod(lod); @@ -166,8 +171,17 @@ void CTRReader::ReadThread(const std::vector& file_list, memcpy(tensor_data, batch_feasign.data(), batch_feasign.size()); lod_datas.push_back(lod_tensor); } + + // insert label tensor + framework::LoDTensor label_tensor; + int64_t* label_tensor_data = label_tensor.mutable_data( + framework::make_ddim({1, static_cast(batch_label.size())}), + platform::CPUPlace()); + memcpy(label_tensor_data, batch_label.data(), batch_label.size()); + lod_datas.push_back(label_tensor); + + queue->Push(lod_datas); } - queue->Push(lod_datas); } } // namespace reader From 71cbc8bd24ffd853478323ac87eb2841d3521321 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 18 Oct 2018 22:58:45 +0800 Subject: [PATCH 009/684] optimize code --- paddle/fluid/operators/reader/ctr_reader.cc | 7 ++- paddle/fluid/operators/reader/ctr_reader.h | 53 +++++++++++++-------- 2 files changed, 37 insertions(+), 23 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 9742641297776..9849eb6aef5f0 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -122,10 +122,9 @@ class MultiGzipReader : public Reader { size_t current_reader_index_ = 0; }; -void CTRReader::ReadThread(const std::vector& file_list, - const std::vector& slots, - int batch_size, - std::shared_ptr queue) { +void ReadThread(const std::vector& file_list, + const std::vector& slots, int batch_size, + std::shared_ptr queue) { std::string line; std::vector>> batch_data; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 41c520621e46b..ef319c86326d7 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -30,19 +30,23 @@ namespace paddle { namespace operators { namespace reader { +void ReadThread(const std::vector& file_list, + const std::vector& slots, int batch_size, + std::shared_ptr queue); + class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, int batch_size, int thread_num, const std::vector& slots, const std::vector& file_list) - : framework::FileReader() { - thread_num_ = thread_num; - batch_size_ = batch_size; + : thread_num_(thread_num), + batch_size_(batch_size), + slots_(slots), + file_list_(file_list) { PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); queue_ = queue; - slots_ = slots; - file_list_ = file_list; + SplitFiles(); } ~CTRReader() { queue_->Close(); } @@ -53,30 +57,41 @@ class CTRReader : public framework::FileReader { if (!success) out->clear(); } - void Shutdown() override { queue_->Close(); } + void Shutdown() override { + VLOG(3) << "Shutdown reader"; + for (auto& read_thread : read_threads_) { + read_thread->join(); + } + read_threads_.clear(); + queue_->Close(); + } void Start() override { + VLOG(3) << "Start reader"; queue_->ReOpen(); - // for (int i = 0; i < thread_num_; i++) { - // read_threads_.emplace_back( - // new std::thread(std::bind(&CTRReader::ReadThread, this, - // file_list_, - // slots_, batch_size_, queue_))); - // } + for (int i = 0; i < file_groups_.size(); i++) { + read_threads_.emplace_back(new std::thread(std::bind( + &ReadThread, file_groups_[i], slots_, batch_size_, queue_))); + } } private: - void ReadThread(const std::vector& file_list, - const std::vector& slots, int batch_size, - std::shared_ptr queue); + void SplitFiles() { + file_groups_.resize(file_list_.size() > thread_num_ ? thread_num_ + : file_list_.size()); + for (int i = 0; i < file_list_.size(); ++i) { + file_groups_[i % thread_num_].push_back(file_list_[i]); + } + } private: + const int thread_num_; + const int batch_size_; + const std::vector slots_; + const std::vector file_list_; std::shared_ptr queue_; std::vector> read_threads_; - int thread_num_; - int batch_size_; - std::vector slots_; - std::vector file_list_; + std::vector> file_groups_; }; } // namespace reader From c8bd521045c2faf03e7bb9c1c454a4acb7306d0e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 18 Oct 2018 23:31:04 +0800 Subject: [PATCH 010/684] add reader thread status --- paddle/fluid/operators/reader/ctr_reader.cc | 5 ++++ paddle/fluid/operators/reader/ctr_reader.h | 27 +++++++++++++-------- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 9849eb6aef5f0..60e8d1250df66 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -124,7 +124,10 @@ class MultiGzipReader : public Reader { void ReadThread(const std::vector& file_list, const std::vector& slots, int batch_size, + int thread_id, std::vector* thread_status, std::shared_ptr queue) { + (*thread_status)[thread_id] = Running; + std::string line; std::vector>> batch_data; @@ -181,6 +184,8 @@ void ReadThread(const std::vector& file_list, queue->Push(lod_datas); } + + (*thread_status)[thread_id] = Stopped; } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index ef319c86326d7..1006ea96c9e3e 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -30,8 +30,11 @@ namespace paddle { namespace operators { namespace reader { +enum ReaderThreadStatus { Running, Stopped }; + void ReadThread(const std::vector& file_list, const std::vector& slots, int batch_size, + int thread_id, std::vector* thread_status, std::shared_ptr queue); class CTRReader : public framework::FileReader { @@ -40,13 +43,16 @@ class CTRReader : public framework::FileReader { int batch_size, int thread_num, const std::vector& slots, const std::vector& file_list) - : thread_num_(thread_num), - batch_size_(batch_size), - slots_(slots), - file_list_(file_list) { + : batch_size_(batch_size), slots_(slots), file_list_(file_list) { PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); + thread_num_ = + file_list_.size() > thread_num_ ? thread_num_ : file_list_.size(); queue_ = queue; SplitFiles(); + for (int i = 0; i < thread_num; ++i) { + read_thread_status_.push_back(Stopped); + } } ~CTRReader() { queue_->Close(); } @@ -69,28 +75,29 @@ class CTRReader : public framework::FileReader { void Start() override { VLOG(3) << "Start reader"; queue_->ReOpen(); - for (int i = 0; i < file_groups_.size(); i++) { - read_threads_.emplace_back(new std::thread(std::bind( - &ReadThread, file_groups_[i], slots_, batch_size_, queue_))); + for (int thread_id = 0; thread_id < file_groups_.size(); thread_id++) { + read_threads_.emplace_back(new std::thread( + std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, + thread_id, &read_thread_status_, queue_))); } } private: void SplitFiles() { - file_groups_.resize(file_list_.size() > thread_num_ ? thread_num_ - : file_list_.size()); + file_groups_.resize(thread_num_); for (int i = 0; i < file_list_.size(); ++i) { file_groups_[i % thread_num_].push_back(file_list_[i]); } } private: - const int thread_num_; + int thread_num_; const int batch_size_; const std::vector slots_; const std::vector file_list_; std::shared_ptr queue_; std::vector> read_threads_; + std::vector read_thread_status_; std::vector> file_groups_; }; From 803e2ed9f47302b84024af89fe0b50f5b24818ba Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 19 Oct 2018 11:34:33 +0800 Subject: [PATCH 011/684] add ctr_reader_test and fix bug --- paddle/fluid/operators/reader/CMakeLists.txt | 1 + paddle/fluid/operators/reader/ctr_reader.cc | 68 ++++++++++++++----- paddle/fluid/operators/reader/ctr_reader.h | 16 +++-- .../fluid/operators/reader/ctr_reader_test.cc | 45 ++++++++++++ 4 files changed, 108 insertions(+), 22 deletions(-) create mode 100644 paddle/fluid/operators/reader/ctr_reader_test.cc diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 4ad376c6170b4..2e019f3c1d8f0 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -17,6 +17,7 @@ endfunction() cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool boost gzstream) +cc_test(ctr_reader_test SRCS ctr_reader_test.cc DEPS ctr_reader) reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) reader_library(create_ctr_reader_op SRCS create_ctr_reader_op.cc DEPS ctr_reader) reader_library(create_random_data_generator_op SRCS create_random_data_generator_op.cc) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 60e8d1250df66..55e4975b3979d 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -46,32 +46,47 @@ static inline void string_split(const std::string& s, const char delimiter, } static inline void parse_line( - const std::string& line, const std::vector& slots, + const std::string& line, + const std::unordered_map& slot_to_index, int64_t* label, - std::unordered_map>* slots_to_data) { + std::unordered_map>* slot_to_data) { std::vector ret; string_split(line, ' ', &ret); *label = std::stoi(ret[2]) > 0; for (size_t i = 3; i < ret.size(); ++i) { const std::string& item = ret[i]; - std::vector slot_and_feasign; - string_split(item, ':', &slot_and_feasign); - if (slot_and_feasign.size() == 2) { - const std::string& slot = slot_and_feasign[1]; - int64_t feasign = std::strtoll(slot_and_feasign[0].c_str(), NULL, 10); - (*slots_to_data)[slot_and_feasign[1]].push_back(feasign); + std::vector feasign_and_slot; + string_split(item, ':', &feasign_and_slot); + auto& slot = feasign_and_slot[1]; + if (feasign_and_slot.size() == 2 && + slot_to_index.find(slot) != slot_to_index.end()) { + const std::string& slot = feasign_and_slot[1]; + int64_t feasign = std::strtoll(feasign_and_slot[0].c_str(), NULL, 10); + (*slot_to_data)[feasign_and_slot[1]].push_back(feasign); } } // NOTE:: if the slot has no value, then fill [0] as it's data. - for (auto& slot : slots) { - if (slots_to_data->find(slot) == slots_to_data->end()) { - (*slots_to_data)[slot].push_back(0); + for (auto& item : slot_to_index) { + if (slot_to_data->find(item.first) == slot_to_data->end()) { + (*slot_to_data)[item.first].push_back(0); } } } +static void print_map( + std::unordered_map>* map) { + for (auto it = map->begin(); it != map->end(); ++it) { + std::cout << it->first << " -> "; + std::cout << "["; + for (auto& i : it->second) { + std::cout << i << " "; + } + std::cout << "]\n"; + } +} + class Reader { public: virtual ~Reader() {} @@ -126,7 +141,14 @@ void ReadThread(const std::vector& file_list, const std::vector& slots, int batch_size, int thread_id, std::vector* thread_status, std::shared_ptr queue) { + VLOG(3) << "reader thread start! thread_id = " << thread_id; (*thread_status)[thread_id] = Running; + VLOG(3) << "set status to running"; + + std::unordered_map slot_to_index; + for (size_t i = 0; i < slots.size(); ++i) { + slot_to_index[slots[i]] = i; + } std::string line; @@ -135,21 +157,29 @@ void ReadThread(const std::vector& file_list, MultiGzipReader reader(file_list); + VLOG(3) << "reader inited"; + while (reader.HasNext()) { - // read all files + batch_data.clear(); + batch_label.clear(); + + // read batch_size data for (int i = 0; i < batch_size; ++i) { if (reader.HasNext()) { reader.NextLine(&line); - std::unordered_map> slots_to_data; + std::unordered_map> slot_to_data; int64_t label; - parse_line(line, slots, &label, &slots_to_data); - batch_data.push_back(slots_to_data); + parse_line(line, slot_to_index, &label, &slot_to_data); + batch_data.push_back(slot_to_data); batch_label.push_back(label); } else { break; } } + VLOG(3) << "read one batch, batch_size = " << batch_data.size(); + print_map(&batch_data[0]); + std::vector lod_datas; // first insert tensor for each slots @@ -159,9 +189,9 @@ void ReadThread(const std::vector& file_list, for (size_t i = 0; i < batch_data.size(); ++i) { auto& feasign = batch_data[i][slot]; - lod_data.push_back(lod_data.back() + feasign.size()); - batch_feasign.insert(feasign.end(), feasign.begin(), feasign.end()); + batch_feasign.insert(batch_feasign.end(), feasign.begin(), + feasign.end()); } framework::LoDTensor lod_tensor; @@ -174,6 +204,8 @@ void ReadThread(const std::vector& file_list, lod_datas.push_back(lod_tensor); } + VLOG(3) << "convert data to tensor"; + // insert label tensor framework::LoDTensor label_tensor; int64_t* label_tensor_data = label_tensor.mutable_data( @@ -182,10 +214,12 @@ void ReadThread(const std::vector& file_list, memcpy(label_tensor_data, batch_label.data(), batch_label.size()); lod_datas.push_back(label_tensor); + VLOG(3) << "push one data"; queue->Push(lod_datas); } (*thread_status)[thread_id] = Stopped; + VLOG(3) << "thread " << thread_id << " exited"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 1006ea96c9e3e..9469d86c6ab11 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -47,15 +47,15 @@ class CTRReader : public framework::FileReader { PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); thread_num_ = - file_list_.size() > thread_num_ ? thread_num_ : file_list_.size(); + file_list_.size() > thread_num ? thread_num : file_list_.size(); queue_ = queue; SplitFiles(); - for (int i = 0; i < thread_num; ++i) { + for (int i = 0; i < thread_num_; ++i) { read_thread_status_.push_back(Stopped); } } - ~CTRReader() { queue_->Close(); } + ~CTRReader() { Shutdown(); } void ReadNext(std::vector* out) override { bool success; @@ -74,8 +74,11 @@ class CTRReader : public framework::FileReader { void Start() override { VLOG(3) << "Start reader"; + PADDLE_ENFORCE_EQ(read_threads_.size(), 0, "read thread should be empty!"); queue_->ReOpen(); - for (int thread_id = 0; thread_id < file_groups_.size(); thread_id++) { + VLOG(3) << "reopen success"; + VLOG(3) << "thread_num " << thread_num_; + for (int thread_id = 0; thread_id < thread_num_; thread_id++) { read_threads_.emplace_back(new std::thread( std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, thread_id, &read_thread_status_, queue_))); @@ -86,7 +89,10 @@ class CTRReader : public framework::FileReader { void SplitFiles() { file_groups_.resize(thread_num_); for (int i = 0; i < file_list_.size(); ++i) { - file_groups_[i % thread_num_].push_back(file_list_[i]); + auto& file_name = file_list_[i]; + std::ifstream f(file_name.c_str()); + PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name); + file_groups_[i % thread_num_].push_back(file_name); } } diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc new file mode 100644 index 0000000000000..404da3c6cfa2a --- /dev/null +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/reader/blocking_queue.h" +#include "paddle/fluid/operators/reader/ctr_reader.h" + +using paddle::operators::reader::LoDTensorBlockingQueue; +using paddle::operators::reader::LoDTensorBlockingQueueHolder; +using paddle::operators::reader::CTRReader; + +TEST(CTR_READER, read_data) { + LoDTensorBlockingQueueHolder queue_holder; + int capacity = 64; + queue_holder.InitOnce(capacity, {}, false); + + std::shared_ptr queue = queue_holder.GetQueue(); + + int batch_size = 10; + int thread_num = 1; + std::vector slots = {"6003", "6004"}; + std::vector file_list = { + "/Users/qiaolongfei/project/gzip_test/part-00000-A.gz", + "/Users/qiaolongfei/project/gzip_test/part-00000-A.gz"}; + + CTRReader reader(queue, batch_size, thread_num, slots, file_list); + + reader.Start(); + // + // std::vector out; + // reader.ReadNext(&out); +} From dd2dfeb6247bc3c4a222012ce5a8030d4cdd3fa1 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 19 Oct 2018 13:37:16 +0800 Subject: [PATCH 012/684] add debug information --- paddle/fluid/operators/reader/ctr_reader.cc | 31 ++++-- .../fluid/operators/reader/ctr_reader_test.cc | 101 ++++++++++++++++-- 2 files changed, 116 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 55e4975b3979d..ca2f567e3715a 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -141,7 +141,12 @@ void ReadThread(const std::vector& file_list, const std::vector& slots, int batch_size, int thread_id, std::vector* thread_status, std::shared_ptr queue) { - VLOG(3) << "reader thread start! thread_id = " << thread_id; + VLOG(3) << "[" << thread_id << "]" + << " reader thread start! thread_id = " << thread_id; + for (auto& file : file_list) { + VLOG(3) << "[" << thread_id << "]" + << " file " << file; + } (*thread_status)[thread_id] = Running; VLOG(3) << "set status to running"; @@ -159,6 +164,10 @@ void ReadThread(const std::vector& file_list, VLOG(3) << "reader inited"; + clock_t t0 = clock(); + + int i = 0; + while (reader.HasNext()) { batch_data.clear(); batch_label.clear(); @@ -176,9 +185,7 @@ void ReadThread(const std::vector& file_list, break; } } - - VLOG(3) << "read one batch, batch_size = " << batch_data.size(); - print_map(&batch_data[0]); + // print_map(&batch_data[0]); std::vector lod_datas; @@ -204,8 +211,6 @@ void ReadThread(const std::vector& file_list, lod_datas.push_back(lod_tensor); } - VLOG(3) << "convert data to tensor"; - // insert label tensor framework::LoDTensor label_tensor; int64_t* label_tensor_data = label_tensor.mutable_data( @@ -214,8 +219,18 @@ void ReadThread(const std::vector& file_list, memcpy(label_tensor_data, batch_label.data(), batch_label.size()); lod_datas.push_back(label_tensor); - VLOG(3) << "push one data"; - queue->Push(lod_datas); + // queue->Push(lod_datas); + VLOG(4) << "push one data, queue_size=" << queue->Size(); + + if (i != 0 && i % 100 == 0) { + clock_t t1 = clock(); + float line_per_s = 100 * batch_size * static_cast(CLOCKS_PER_SEC) / + static_cast(t1 - t0); + VLOG(3) << "[" << thread_id << "]" + << " line_per_second = " << line_per_s; + t0 = t1; + } + i++; } (*thread_status)[thread_id] = Stopped; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 404da3c6cfa2a..142d04e315748 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -12,34 +12,119 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/fluid/operators/reader/ctr_reader.h" + +#include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/operators/reader/blocking_queue.h" -#include "paddle/fluid/operators/reader/ctr_reader.h" using paddle::operators::reader::LoDTensorBlockingQueue; using paddle::operators::reader::LoDTensorBlockingQueueHolder; using paddle::operators::reader::CTRReader; +using paddle::framework::LoDTensor; TEST(CTR_READER, read_data) { LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; - queue_holder.InitOnce(capacity, {}, false); + queue_holder.InitOnce(capacity, {}, true); std::shared_ptr queue = queue_holder.GetQueue(); int batch_size = 10; - int thread_num = 1; - std::vector slots = {"6003", "6004"}; + int thread_num = 2; + std::vector slots = { + "6002", "6003", "6004", "6005", "6006", "6007", "6008", "6009", "6010", + "6011", "6012", "6013", "6014", "6015", "6016", "6017", "6018", "6019", + "6020", "6021", "6023", "6024", "6025", "6026", "6027", "6028", "6029", + "6030", "6031", "6032", "6033", "6034", "6035", "6036", "6037", "6038", + "6039", "6040", "6041", "6042", "6043", "6044", "6045", "6046", "6047", + "6048", "6050", "6051", "6052", "6054", "6055", "6056", "6057", "6058", + "6059", "6060", "6061", "6062", "6063", "6064", "6065", "6066", "6067", + "6068", "6069", "6070", "6071", "6072", "6073", "6074", "6075", "6076", + "6077", "6078", "6079", "6080", "6081", "6082", "6083", "6084", "6085", + "6086", "6087", "6088", "6089", "6090", "6091", "6092", "6093", "6094", + "6095", "6096", "6097", "6098", "6099", "6100", "6101", "6102", "6103", + "6104", "6105", "6106", "6107", "6108", "6109", "6110", "6111", "6112", + "6113", "6114", "6115", "6116", "6117", "6118", "6119", "6120", "6121", + "6122", "6123", "6124", "6125", "6126", "6127", "6128", "6129", "6130", + "6131", "6132", "6133", "6134", "6135", "6136", "6137", "6138", "6139", + "6140", "6141", "6142", "6143", "6144", "6145", "6146", "6147", "6148", + "6149", "6150", "6151", "6152", "6153", "6155", "6156", "6157", "6158", + "6160", "6161", "6162", "6163", "6164", "6165", "6166", "6167", "6168", + "6169", "6170", "6171", "6172", "6173", "6174", "6175", "6176", "6177", + "6178", "6181", "6182", "6183", "6184", "6185", "6186", "6188", "6189", + "6190", "6191", "6192", "6194", "6195", "6196", "6197", "6198", "6199", + "6200", "6201", "6202", "6203", "6204", "6205", "6206", "6207", "6208", + "6209", "6210", "6211", "6212", "6213", "6214", "6215", "6216", "6217", + "6218", "6220", "6222", "6223", "6224", "6225", "6226", "6227", "6228", + "6229", "6230", "6231", "6232", "6233", "6234", "6235", "6236", "6237", + "6238", "6239", "6240", "6241", "6242", "6243", "6244", "6245", "6247", + "6248", "6250", "6251", "6253", "6254", "6255", "6256", "6257", "6258", + "6259", "6260", "6261", "6262", "6263", "6264", "6265", "6350", "6351", + "6352", "6353", "6354", "6355", "6356", "6738", "6739", "6740", "6741", + "6751", "6753", "6754", "6755", "6756", "6757", "6759", "6760", "6763", + "6764", "6765", "6766", "6767", "6768", "6769", "6770", "6806", "6807", + "6808", "6809", "6810", "6811", "6812", "6813", "6814", "6815", "6816", + "6817", "6818", "6819", "6820", "6821", "6822", "6823", "6824", "6825", + "6826", "6827", "6828", "6829", "6830", "6831", "6832", "6833", "6834", + "6835", "6836", "6837", "6838", "6839", "6840", "6841", "6842", "6843", + "6844", "6845", "6846", "6847", "6848", "6849", "6850", "6851", "6852", + "6853", "6854", "6855", "6856", "6857", "6858", "6859", "6860", "6861", + "6862", "6863", "6864", "6865", "6866", "6867", "6868", "6869", "6870", + "6871", "6872", "6873", "6874", "6875", "6876", "6877", "6878", "6879", + "6880", "6881", "6882", "6883", "6884", "6885", "6886", "6887", "6888", + "6889", "6890", "6891", "6892", "6893", "6894", "6895", "6896", "6897", + "6898", "6899", "6900", "6901", "6902", "6903", "6904", "6905", "6906", + "6907", "6908", "6909", "6910", "6911", "6912", "6913", "6914", "6915", + "6916", "6917", "6918", "6919", "6920", "6921", "6922", "6923", "6924", + "6925", "6926", "6927", "6928", "6929", "6930", "6931", "6932", "6933", + "6934", "6935", "6936", "6937", "6938", "6939", "6940", "6941", "6942", + "6943", "6944", "6945", "6946", "6947", "6948", "6949", "6950", "6951", + "6952", "6953", "6954", "6955", "6956", "6957", "6958", "6959", "6960", + "6961", "6962", "6963", "7001", "7002", "7003", "7004", "7005", "7006", + "7007", "7008", "7009", "7010", "7011", "7012", "7013", "7014", "7015", + "7016", "7017", "7018", "7019", "7020", "7021", "7022", "7023", "7024", + "7025", "7026", "7027", "7028", "7029", "7030", "7031", "7032", "7033", + "7034", "7035", "7036", "7037", "7038", "7039", "7040", "7041", "7042", + "7043", "7044", "7045", "7046", "7047", "7048", "7049", "7050", "7051", + "7052", "7053", "7054", "7055", "7056", "7057", "7058", "7060", "7062", + "7063", "7064", "7065", "7066", "7067", "7068", "7069", "7070", "7071", + "7072", "7073", "7074", "7075", "7076", "7077", "7078", "7079", "7080", + "7081", "7082", "7083", "7084", "7085", "7086", "7087", "7088", "7089", + "7090", "7091", "7092", "7093", "7094", "7095", "7096", "7097", "7098", + "7099", "7100", "7101", "7102", "7103", "7104", "7105", "7106", "7107", + "7108", "7109", "7110", "7120", "7122", "7123", "7124", "7125", "7126", + "7127", "7128", "7129", "7131", "7133", "7134", "7135", "7136", "7137", + "7138", "7139", "7140", "7141", "7142", "7143", "7144", "7145", "7146", + "7147", "7148", "7149", "7150", "7151", "7152", "7153", "7154", "7155", + "7156", "7157", "7158", "7159", "7160", "7161", "7162", "7163", "7164", + "7165", "7166", "7167", "7168", "7169", "7170", "7171", "7172", "7173", + "7174", "7175", "7176", "7177", "7178", "7179", "7180", "7181", "7182", + "7183", "7184", "7185", "7186", "7187", "7500", "7501", "7502", "7503", + "7504", "7505", "7506", "7507", "7508", "7509", "7510", "7511", "7512", + "7513", "7514", "7515", "7516", "7517", "7750"}; std::vector file_list = { "/Users/qiaolongfei/project/gzip_test/part-00000-A.gz", - "/Users/qiaolongfei/project/gzip_test/part-00000-A.gz"}; + "/Users/qiaolongfei/project/gzip_test/part-00001-A.gz", + "/Users/qiaolongfei/project/gzip_test/part-00002-A.gz"}; CTRReader reader(queue, batch_size, thread_num, slots, file_list); reader.Start(); - // - // std::vector out; - // reader.ReadNext(&out); + + std::cout << "start to reader data" << std::endl; + std::vector out; + int read_batch = 1000; + clock_t t0 = clock(); + for (int i = 0; i < read_batch; ++i) { + reader.ReadNext(&out); + } + clock_t t1 = clock(); + float line_per_s = read_batch * batch_size * + static_cast(CLOCKS_PER_SEC) / + static_cast(t1 - t0); + VLOG(3) << "line_per_second = " << line_per_s; } From 92cbaa41eb0e97579befa15951a777f5f67cbaec Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 19 Oct 2018 22:29:48 +0800 Subject: [PATCH 013/684] add GetTimeInSec --- cmake/external/gzstream.cmake | 2 +- paddle/fluid/operators/reader/CMakeLists.txt | 2 +- paddle/fluid/operators/reader/ctr_reader.cc | 13 +++++-------- paddle/fluid/operators/reader/ctr_reader.h | 13 ++++++++++++- paddle/fluid/operators/reader/ctr_reader_test.cc | 16 ++++++++-------- 5 files changed, 27 insertions(+), 19 deletions(-) diff --git a/cmake/external/gzstream.cmake b/cmake/external/gzstream.cmake index f0e3dd8c6aa11..e8a7de27f1ab1 100644 --- a/cmake/external/gzstream.cmake +++ b/cmake/external/gzstream.cmake @@ -44,4 +44,4 @@ SET_PROPERTY(TARGET gzstream PROPERTY IMPORTED_LOCATION "${GZSTREAM_INSTALL_DIR}/lib/libgzstream.a") include_directories(${GZSTREAM_INCLUDE_DIR}) -ADD_DEPENDENCIES(gzstream extern_gzstream) +ADD_DEPENDENCIES(gzstream extern_gzstream zlib) diff --git a/paddle/fluid/operators/reader/CMakeLists.txt b/paddle/fluid/operators/reader/CMakeLists.txt index 2e019f3c1d8f0..1514f6566a8cd 100644 --- a/paddle/fluid/operators/reader/CMakeLists.txt +++ b/paddle/fluid/operators/reader/CMakeLists.txt @@ -16,7 +16,7 @@ function(reader_library TARGET_NAME) endfunction() cc_library(buffered_reader SRCS buffered_reader.cc DEPS reader simple_threadpool) -cc_library(ctr_reader SRCS ctr_reader.cc DEPS reader simple_threadpool boost gzstream) +cc_library(ctr_reader SRCS ctr_reader.cc DEPS gzstream reader zlib) cc_test(ctr_reader_test SRCS ctr_reader_test.cc DEPS ctr_reader) reader_library(open_files_op SRCS open_files_op.cc DEPS buffered_reader) reader_library(create_ctr_reader_op SRCS create_ctr_reader_op.cc DEPS ctr_reader) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index ca2f567e3715a..26092c17e4724 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -58,10 +58,8 @@ static inline void parse_line( const std::string& item = ret[i]; std::vector feasign_and_slot; string_split(item, ':', &feasign_and_slot); - auto& slot = feasign_and_slot[1]; if (feasign_and_slot.size() == 2 && - slot_to_index.find(slot) != slot_to_index.end()) { - const std::string& slot = feasign_and_slot[1]; + slot_to_index.find(feasign_and_slot[1]) != slot_to_index.end()) { int64_t feasign = std::strtoll(feasign_and_slot[0].c_str(), NULL, 10); (*slot_to_data)[feasign_and_slot[1]].push_back(feasign); } @@ -164,7 +162,7 @@ void ReadThread(const std::vector& file_list, VLOG(3) << "reader inited"; - clock_t t0 = clock(); + uint64_t t0 = GetTimeInSec(); int i = 0; @@ -219,13 +217,12 @@ void ReadThread(const std::vector& file_list, memcpy(label_tensor_data, batch_label.data(), batch_label.size()); lod_datas.push_back(label_tensor); - // queue->Push(lod_datas); + queue->Push(lod_datas); VLOG(4) << "push one data, queue_size=" << queue->Size(); if (i != 0 && i % 100 == 0) { - clock_t t1 = clock(); - float line_per_s = 100 * batch_size * static_cast(CLOCKS_PER_SEC) / - static_cast(t1 - t0); + uint64_t t1 = GetTimeInSec(); + float line_per_s = 100 * batch_size / static_cast(t1 - t0); VLOG(3) << "[" << thread_id << "]" << " line_per_second = " << line_per_s; t0 = t1; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 9469d86c6ab11..32dfed8264861 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -14,6 +14,8 @@ #pragma once +#include + #include #include #include @@ -37,6 +39,15 @@ void ReadThread(const std::vector& file_list, int thread_id, std::vector* thread_status, std::shared_ptr queue); +inline uint64_t GetTimeInSec() { + using clock = std::conditional::type; + return std::chrono::duration_cast( + clock::now().time_since_epoch()) + .count(); +} + class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, @@ -88,7 +99,7 @@ class CTRReader : public framework::FileReader { private: void SplitFiles() { file_groups_.resize(thread_num_); - for (int i = 0; i < file_list_.size(); ++i) { + for (size_t i = 0; i < file_list_.size(); ++i) { auto& file_name = file_list_[i]; std::ifstream f(file_name.c_str()); PADDLE_ENFORCE(f.good(), "file %s not exist!", file_name); diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 142d04e315748..6ca0b26a0d71a 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -25,16 +25,17 @@ using paddle::operators::reader::LoDTensorBlockingQueue; using paddle::operators::reader::LoDTensorBlockingQueueHolder; using paddle::operators::reader::CTRReader; using paddle::framework::LoDTensor; +using paddle::operators::reader::GetTimeInSec; TEST(CTR_READER, read_data) { LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; - queue_holder.InitOnce(capacity, {}, true); + queue_holder.InitOnce(capacity, {}, false); std::shared_ptr queue = queue_holder.GetQueue(); int batch_size = 10; - int thread_num = 2; + int thread_num = 4; std::vector slots = { "6002", "6003", "6004", "6005", "6006", "6007", "6008", "6009", "6010", "6011", "6012", "6013", "6014", "6015", "6016", "6017", "6018", "6019", @@ -109,7 +110,8 @@ TEST(CTR_READER, read_data) { std::vector file_list = { "/Users/qiaolongfei/project/gzip_test/part-00000-A.gz", "/Users/qiaolongfei/project/gzip_test/part-00001-A.gz", - "/Users/qiaolongfei/project/gzip_test/part-00002-A.gz"}; + "/Users/qiaolongfei/project/gzip_test/part-00002-A.gz", + "/Users/qiaolongfei/project/gzip_test/part-00003-A.gz"}; CTRReader reader(queue, batch_size, thread_num, slots, file_list); @@ -118,13 +120,11 @@ TEST(CTR_READER, read_data) { std::cout << "start to reader data" << std::endl; std::vector out; int read_batch = 1000; - clock_t t0 = clock(); + uint64_t t0 = GetTimeInSec(); for (int i = 0; i < read_batch; ++i) { reader.ReadNext(&out); } - clock_t t1 = clock(); - float line_per_s = read_batch * batch_size * - static_cast(CLOCKS_PER_SEC) / - static_cast(t1 - t0); + uint64_t t1 = GetTimeInSec(); + float line_per_s = read_batch * batch_size / static_cast(t1 - t0); VLOG(3) << "line_per_second = " << line_per_s; } From 044d2e20bfc14d5e7699337f2ae145c0e7047cdd Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sat, 20 Oct 2018 21:32:45 +0800 Subject: [PATCH 014/684] update test method --- paddle/fluid/operators/reader/ctr_reader.cc | 2 +- paddle/fluid/operators/reader/ctr_reader.h | 2 +- paddle/fluid/operators/reader/ctr_reader_test.cc | 13 ++++++++----- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 26092c17e4724..cb86f4c613cf2 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -222,7 +222,7 @@ void ReadThread(const std::vector& file_list, if (i != 0 && i % 100 == 0) { uint64_t t1 = GetTimeInSec(); - float line_per_s = 100 * batch_size / static_cast(t1 - t0); + float line_per_s = 100 * batch_size * 1000000 / (t1 - t0); VLOG(3) << "[" << thread_id << "]" << " line_per_second = " << line_per_s; t0 = t1; diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 32dfed8264861..89f63364c8d3d 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -43,7 +43,7 @@ inline uint64_t GetTimeInSec() { using clock = std::conditional::type; - return std::chrono::duration_cast( + return std::chrono::duration_cast( clock::now().time_since_epoch()) .count(); } diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 6ca0b26a0d71a..51fbdf2d0794d 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -35,7 +35,7 @@ TEST(CTR_READER, read_data) { std::shared_ptr queue = queue_holder.GetQueue(); int batch_size = 10; - int thread_num = 4; + int thread_num = 3; std::vector slots = { "6002", "6003", "6004", "6005", "6006", "6007", "6008", "6009", "6010", "6011", "6012", "6013", "6014", "6015", "6016", "6017", "6018", "6019", @@ -119,12 +119,15 @@ TEST(CTR_READER, read_data) { std::cout << "start to reader data" << std::endl; std::vector out; - int read_batch = 1000; + int read_batch = 10000; uint64_t t0 = GetTimeInSec(); for (int i = 0; i < read_batch; ++i) { reader.ReadNext(&out); + if (i != 0 && i % 100 == 0) { + uint64_t t1 = GetTimeInSec(); + float line_per_s = 100 * batch_size * 1000000 / (t1 - t0); + VLOG(3) << "line_per_second = " << line_per_s; + t0 = GetTimeInSec(); + } } - uint64_t t1 = GetTimeInSec(); - float line_per_s = read_batch * batch_size / static_cast(t1 - t0); - VLOG(3) << "line_per_second = " << line_per_s; } From 5c65eff6ef3faed880d356a94c4c914a21dd9a35 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 21 Oct 2018 20:46:03 +0800 Subject: [PATCH 015/684] update test for ctr data --- paddle/fluid/operators/reader/ctr_reader.cc | 9 +- paddle/fluid/operators/reader/ctr_reader.h | 6 +- .../fluid/operators/reader/ctr_reader_test.cc | 174 +++++++++--------- 3 files changed, 96 insertions(+), 93 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index cb86f4c613cf2..47f2c56c64afe 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -168,7 +168,10 @@ void ReadThread(const std::vector& file_list, while (reader.HasNext()) { batch_data.clear(); + batch_data.reserve(batch_size); + batch_label.clear(); + batch_label.reserve(batch_size); // read batch_size data for (int i = 0; i < batch_size; ++i) { @@ -205,7 +208,8 @@ void ReadThread(const std::vector& file_list, int64_t* tensor_data = lod_tensor.mutable_data( framework::make_ddim({1, static_cast(batch_feasign.size())}), platform::CPUPlace()); - memcpy(tensor_data, batch_feasign.data(), batch_feasign.size()); + memcpy(tensor_data, batch_feasign.data(), + batch_feasign.size() * sizeof(int64_t)); lod_datas.push_back(lod_tensor); } @@ -214,7 +218,8 @@ void ReadThread(const std::vector& file_list, int64_t* label_tensor_data = label_tensor.mutable_data( framework::make_ddim({1, static_cast(batch_label.size())}), platform::CPUPlace()); - memcpy(label_tensor_data, batch_label.data(), batch_label.size()); + memcpy(label_tensor_data, batch_label.data(), + batch_label.size() * sizeof(int64_t)); lod_datas.push_back(label_tensor); queue->Push(lod_datas); diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 89f63364c8d3d..d87f81402fcae 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -55,13 +55,14 @@ class CTRReader : public framework::FileReader { const std::vector& slots, const std::vector& file_list) : batch_size_(batch_size), slots_(slots), file_list_(file_list) { + PADDLE_ENFORCE_GT(thread_num, 0, "thread num should be larger then 0!"); PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); PADDLE_ENFORCE_GT(file_list.size(), 0, "file list should not be empty"); thread_num_ = file_list_.size() > thread_num ? thread_num : file_list_.size(); queue_ = queue; SplitFiles(); - for (int i = 0; i < thread_num_; ++i) { + for (size_t i = 0; i < thread_num_; ++i) { read_thread_status_.push_back(Stopped); } } @@ -76,6 +77,7 @@ class CTRReader : public framework::FileReader { void Shutdown() override { VLOG(3) << "Shutdown reader"; + // shutdown should stop all the reader thread for (auto& read_thread : read_threads_) { read_thread->join(); } @@ -108,7 +110,7 @@ class CTRReader : public framework::FileReader { } private: - int thread_num_; + size_t thread_num_; const int batch_size_; const std::vector slots_; const std::vector file_list_; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 51fbdf2d0794d..a73d54385e665 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -14,8 +14,15 @@ #include "paddle/fluid/operators/reader/ctr_reader.h" +#include #include +#include +#include +#include +#include +#include + #include "gtest/gtest.h" #include "paddle/fluid/framework/lod_tensor.h" @@ -25,109 +32,98 @@ using paddle::operators::reader::LoDTensorBlockingQueue; using paddle::operators::reader::LoDTensorBlockingQueueHolder; using paddle::operators::reader::CTRReader; using paddle::framework::LoDTensor; -using paddle::operators::reader::GetTimeInSec; +using paddle::framework::LoD; +using paddle::platform::CPUPlace; + +static void generatedata(const std::vector& data, + const std::string& file_name) { + std::ifstream in(file_name.c_str()); + if (in.good()) { + VLOG(3) << "file " << file_name << " exist, delete it first!"; + remove(file_name.c_str()); + } else { + in.close(); + } + + ogzstream out(file_name.c_str()); + PADDLE_ENFORCE(out.good(), "open file %s failed!", file_name); + for (auto& c : data) { + out << c; + } + out.close(); + PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name); +} TEST(CTR_READER, read_data) { + const std::vector ctr_data = { + "aaaa 1 0 0:6002 1:6003 2:6004 3:6005 4:6006 -1\n", + "bbbb 1 0 5:6003 6:6003 7:6003 8:6004 9:6004 -1\n", + "cccc 1 1 10:6002 11:6002 12:6002 13:6002 14:6002 -2\n", + "dddd 1 0 15:6003 16:6003 17:6003 18:6003 19:6004 -3\n", + "1111 1 1 20:6001 21:6001 22:6001 23:6001 24:6001 12\n", + "2222 1 1 25:6004 26:6004 27:6004 28:6005 29:6005 aa\n", + "3333 1 0 30:6002 31:6003 32:6004 33:6004 34:6005 er\n", + "eeee 1 1 35:6003 36:6003 37:6005 38:6005 39:6005 dd\n", + "ffff 1 1 40:6002 41:6003 42:6004 43:6004 44:6005 66\n", + "gggg 1 1 46:6006 45:6006 47:6003 48:6003 49:6003 ba\n", + }; + std::string gz_file_name = "test_ctr_reader_data.gz"; + generatedata(ctr_data, gz_file_name); + + std::vector label_value = {0, 0, 1, 0, 1, 1, 0, 1, 1, 1}; + + std::vector>> data_slot_6002{ + {{{0, 1, 2}}, {0, 0}}, + {{{0, 5, 6}}, {10, 11, 12, 13, 14, 0}}, + {{{0, 1, 2}}, {0, 0}}, + {{{0, 1, 2}}, {30, 0}}, + {{{0, 1, 2}}, {40, 0}}}; + std::vector>> data_slot_6003{ + {{{0, 1, 4}}, {1, 5, 6, 7}}, + {{{0, 1, 5}}, {0, 15, 16, 17, 18}}, + {{{0, 1, 2}}, {0, 0}}, + {{{0, 1, 3}}, {31, 35, 36}}, + {{{0, 1, 4}}, {41, 47, 48, 49}}}; + LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; queue_holder.InitOnce(capacity, {}, false); std::shared_ptr queue = queue_holder.GetQueue(); - int batch_size = 10; - int thread_num = 3; - std::vector slots = { - "6002", "6003", "6004", "6005", "6006", "6007", "6008", "6009", "6010", - "6011", "6012", "6013", "6014", "6015", "6016", "6017", "6018", "6019", - "6020", "6021", "6023", "6024", "6025", "6026", "6027", "6028", "6029", - "6030", "6031", "6032", "6033", "6034", "6035", "6036", "6037", "6038", - "6039", "6040", "6041", "6042", "6043", "6044", "6045", "6046", "6047", - "6048", "6050", "6051", "6052", "6054", "6055", "6056", "6057", "6058", - "6059", "6060", "6061", "6062", "6063", "6064", "6065", "6066", "6067", - "6068", "6069", "6070", "6071", "6072", "6073", "6074", "6075", "6076", - "6077", "6078", "6079", "6080", "6081", "6082", "6083", "6084", "6085", - "6086", "6087", "6088", "6089", "6090", "6091", "6092", "6093", "6094", - "6095", "6096", "6097", "6098", "6099", "6100", "6101", "6102", "6103", - "6104", "6105", "6106", "6107", "6108", "6109", "6110", "6111", "6112", - "6113", "6114", "6115", "6116", "6117", "6118", "6119", "6120", "6121", - "6122", "6123", "6124", "6125", "6126", "6127", "6128", "6129", "6130", - "6131", "6132", "6133", "6134", "6135", "6136", "6137", "6138", "6139", - "6140", "6141", "6142", "6143", "6144", "6145", "6146", "6147", "6148", - "6149", "6150", "6151", "6152", "6153", "6155", "6156", "6157", "6158", - "6160", "6161", "6162", "6163", "6164", "6165", "6166", "6167", "6168", - "6169", "6170", "6171", "6172", "6173", "6174", "6175", "6176", "6177", - "6178", "6181", "6182", "6183", "6184", "6185", "6186", "6188", "6189", - "6190", "6191", "6192", "6194", "6195", "6196", "6197", "6198", "6199", - "6200", "6201", "6202", "6203", "6204", "6205", "6206", "6207", "6208", - "6209", "6210", "6211", "6212", "6213", "6214", "6215", "6216", "6217", - "6218", "6220", "6222", "6223", "6224", "6225", "6226", "6227", "6228", - "6229", "6230", "6231", "6232", "6233", "6234", "6235", "6236", "6237", - "6238", "6239", "6240", "6241", "6242", "6243", "6244", "6245", "6247", - "6248", "6250", "6251", "6253", "6254", "6255", "6256", "6257", "6258", - "6259", "6260", "6261", "6262", "6263", "6264", "6265", "6350", "6351", - "6352", "6353", "6354", "6355", "6356", "6738", "6739", "6740", "6741", - "6751", "6753", "6754", "6755", "6756", "6757", "6759", "6760", "6763", - "6764", "6765", "6766", "6767", "6768", "6769", "6770", "6806", "6807", - "6808", "6809", "6810", "6811", "6812", "6813", "6814", "6815", "6816", - "6817", "6818", "6819", "6820", "6821", "6822", "6823", "6824", "6825", - "6826", "6827", "6828", "6829", "6830", "6831", "6832", "6833", "6834", - "6835", "6836", "6837", "6838", "6839", "6840", "6841", "6842", "6843", - "6844", "6845", "6846", "6847", "6848", "6849", "6850", "6851", "6852", - "6853", "6854", "6855", "6856", "6857", "6858", "6859", "6860", "6861", - "6862", "6863", "6864", "6865", "6866", "6867", "6868", "6869", "6870", - "6871", "6872", "6873", "6874", "6875", "6876", "6877", "6878", "6879", - "6880", "6881", "6882", "6883", "6884", "6885", "6886", "6887", "6888", - "6889", "6890", "6891", "6892", "6893", "6894", "6895", "6896", "6897", - "6898", "6899", "6900", "6901", "6902", "6903", "6904", "6905", "6906", - "6907", "6908", "6909", "6910", "6911", "6912", "6913", "6914", "6915", - "6916", "6917", "6918", "6919", "6920", "6921", "6922", "6923", "6924", - "6925", "6926", "6927", "6928", "6929", "6930", "6931", "6932", "6933", - "6934", "6935", "6936", "6937", "6938", "6939", "6940", "6941", "6942", - "6943", "6944", "6945", "6946", "6947", "6948", "6949", "6950", "6951", - "6952", "6953", "6954", "6955", "6956", "6957", "6958", "6959", "6960", - "6961", "6962", "6963", "7001", "7002", "7003", "7004", "7005", "7006", - "7007", "7008", "7009", "7010", "7011", "7012", "7013", "7014", "7015", - "7016", "7017", "7018", "7019", "7020", "7021", "7022", "7023", "7024", - "7025", "7026", "7027", "7028", "7029", "7030", "7031", "7032", "7033", - "7034", "7035", "7036", "7037", "7038", "7039", "7040", "7041", "7042", - "7043", "7044", "7045", "7046", "7047", "7048", "7049", "7050", "7051", - "7052", "7053", "7054", "7055", "7056", "7057", "7058", "7060", "7062", - "7063", "7064", "7065", "7066", "7067", "7068", "7069", "7070", "7071", - "7072", "7073", "7074", "7075", "7076", "7077", "7078", "7079", "7080", - "7081", "7082", "7083", "7084", "7085", "7086", "7087", "7088", "7089", - "7090", "7091", "7092", "7093", "7094", "7095", "7096", "7097", "7098", - "7099", "7100", "7101", "7102", "7103", "7104", "7105", "7106", "7107", - "7108", "7109", "7110", "7120", "7122", "7123", "7124", "7125", "7126", - "7127", "7128", "7129", "7131", "7133", "7134", "7135", "7136", "7137", - "7138", "7139", "7140", "7141", "7142", "7143", "7144", "7145", "7146", - "7147", "7148", "7149", "7150", "7151", "7152", "7153", "7154", "7155", - "7156", "7157", "7158", "7159", "7160", "7161", "7162", "7163", "7164", - "7165", "7166", "7167", "7168", "7169", "7170", "7171", "7172", "7173", - "7174", "7175", "7176", "7177", "7178", "7179", "7180", "7181", "7182", - "7183", "7184", "7185", "7186", "7187", "7500", "7501", "7502", "7503", - "7504", "7505", "7506", "7507", "7508", "7509", "7510", "7511", "7512", - "7513", "7514", "7515", "7516", "7517", "7750"}; - std::vector file_list = { - "/Users/qiaolongfei/project/gzip_test/part-00000-A.gz", - "/Users/qiaolongfei/project/gzip_test/part-00001-A.gz", - "/Users/qiaolongfei/project/gzip_test/part-00002-A.gz", - "/Users/qiaolongfei/project/gzip_test/part-00003-A.gz"}; + int batch_size = 2; + int thread_num = 1; + std::vector slots = {"6002", "6003"}; + std::vector file_list; + for (int i = 0; i < thread_num; ++i) { + file_list.push_back(gz_file_name); + } CTRReader reader(queue, batch_size, thread_num, slots, file_list); reader.Start(); - std::cout << "start to reader data" << std::endl; - std::vector out; - int read_batch = 10000; - uint64_t t0 = GetTimeInSec(); - for (int i = 0; i < read_batch; ++i) { + size_t batch_num = std::ceil(ctr_data.size() / batch_size) * thread_num; + + for (size_t i = 0; i < batch_num; ++i) { + std::vector out; reader.ReadNext(&out); - if (i != 0 && i % 100 == 0) { - uint64_t t1 = GetTimeInSec(); - float line_per_s = 100 * batch_size * 1000000 / (t1 - t0); - VLOG(3) << "line_per_second = " << line_per_s; - t0 = GetTimeInSec(); + ASSERT_EQ(out.size(), slots.size() + 1); + auto& label_tensor = out.back(); + ASSERT_EQ(label_tensor.dims(), + paddle::framework::make_ddim({1, batch_size})); + for (size_t j = 0; j < batch_size && i * batch_num + j < ctr_data.size(); + ++j) { + auto& label = label_tensor.data()[j]; + ASSERT_TRUE(label == 0 || label == 1); + ASSERT_EQ(label, label_value[i * batch_size + j]); } + auto& tensor_6002 = out[0]; + ASSERT_EQ(std::get<0>(data_slot_6002[i]), tensor_6002.lod()); + ASSERT_EQ(std::memcmp(std::get<1>(data_slot_6002[i]).data(), + tensor_6002.data(), + tensor_6002.dims()[1] * sizeof(int64_t)), + 0); } + ASSERT_EQ(queue->Size(), 0); } From e67783375d31f7bcd1f5ce2af12dc56cafdb5783 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 21 Oct 2018 21:06:18 +0800 Subject: [PATCH 016/684] code clean --- paddle/fluid/operators/reader/ctr_reader.cc | 28 +----------------- paddle/fluid/operators/reader/ctr_reader.h | 9 ------ .../fluid/operators/reader/ctr_reader_test.cc | 29 ++++++++++--------- 3 files changed, 16 insertions(+), 50 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 47f2c56c64afe..0002e80a30640 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -73,18 +73,6 @@ static inline void parse_line( } } -static void print_map( - std::unordered_map>* map) { - for (auto it = map->begin(); it != map->end(); ++it) { - std::cout << it->first << " -> "; - std::cout << "["; - for (auto& i : it->second) { - std::cout << i << " "; - } - std::cout << "]\n"; - } -} - class Reader { public: virtual ~Reader() {} @@ -162,10 +150,6 @@ void ReadThread(const std::vector& file_list, VLOG(3) << "reader inited"; - uint64_t t0 = GetTimeInSec(); - - int i = 0; - while (reader.HasNext()) { batch_data.clear(); batch_data.reserve(batch_size); @@ -186,7 +170,6 @@ void ReadThread(const std::vector& file_list, break; } } - // print_map(&batch_data[0]); std::vector lod_datas; @@ -224,19 +207,10 @@ void ReadThread(const std::vector& file_list, queue->Push(lod_datas); VLOG(4) << "push one data, queue_size=" << queue->Size(); - - if (i != 0 && i % 100 == 0) { - uint64_t t1 = GetTimeInSec(); - float line_per_s = 100 * batch_size * 1000000 / (t1 - t0); - VLOG(3) << "[" << thread_id << "]" - << " line_per_second = " << line_per_s; - t0 = t1; - } - i++; } (*thread_status)[thread_id] = Stopped; - VLOG(3) << "thread " << thread_id << " exited"; + VLOG(3) << "set status to stopped, thread " << thread_id << " exited"; } } // namespace reader diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index d87f81402fcae..244a5e2e7755d 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -39,15 +39,6 @@ void ReadThread(const std::vector& file_list, int thread_id, std::vector* thread_status, std::shared_ptr queue); -inline uint64_t GetTimeInSec() { - using clock = std::conditional::type; - return std::chrono::duration_cast( - clock::now().time_since_epoch()) - .count(); -} - class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index a73d54385e665..0b8a053a86df0 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -33,6 +33,7 @@ using paddle::operators::reader::LoDTensorBlockingQueueHolder; using paddle::operators::reader::CTRReader; using paddle::framework::LoDTensor; using paddle::framework::LoD; +using paddle::framework::DDim; using paddle::platform::CPUPlace; static void generatedata(const std::vector& data, @@ -73,17 +74,17 @@ TEST(CTR_READER, read_data) { std::vector label_value = {0, 0, 1, 0, 1, 1, 0, 1, 1, 1}; std::vector>> data_slot_6002{ - {{{0, 1, 2}}, {0, 0}}, - {{{0, 5, 6}}, {10, 11, 12, 13, 14, 0}}, - {{{0, 1, 2}}, {0, 0}}, - {{{0, 1, 2}}, {30, 0}}, - {{{0, 1, 2}}, {40, 0}}}; + {{{0, 1, 2, 7}}, {0, 0, 10, 11, 12, 13, 14}}, + {{{0, 1, 2, 3}}, {0, 0, 0}}, + {{{0, 1, 2, 3}}, {30, 0, 40}}, + {{{0, 1}}, {0}}}; std::vector>> data_slot_6003{ - {{{0, 1, 4}}, {1, 5, 6, 7}}, - {{{0, 1, 5}}, {0, 15, 16, 17, 18}}, - {{{0, 1, 2}}, {0, 0}}, - {{{0, 1, 3}}, {31, 35, 36}}, - {{{0, 1, 4}}, {41, 47, 48, 49}}}; + {{{0, 1, 4, 5}}, {1, 5, 6, 7, 0}}, + {{{0, 4, 5, 6}}, {15, 16, 17, 18, 0, 0}}, + {{{0, 1, 3, 4}}, {31, 35, 36, 41}}, + {{{0, 3}}, {47, 48, 49}}}; + + std::vector label_dims = {{1, 3}, {1, 3}, {1, 3}, {1, 1}}; LoDTensorBlockingQueueHolder queue_holder; int capacity = 64; @@ -91,7 +92,7 @@ TEST(CTR_READER, read_data) { std::shared_ptr queue = queue_holder.GetQueue(); - int batch_size = 2; + int batch_size = 3; int thread_num = 1; std::vector slots = {"6002", "6003"}; std::vector file_list; @@ -103,15 +104,15 @@ TEST(CTR_READER, read_data) { reader.Start(); - size_t batch_num = std::ceil(ctr_data.size() / batch_size) * thread_num; + size_t batch_num = + std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; for (size_t i = 0; i < batch_num; ++i) { std::vector out; reader.ReadNext(&out); ASSERT_EQ(out.size(), slots.size() + 1); auto& label_tensor = out.back(); - ASSERT_EQ(label_tensor.dims(), - paddle::framework::make_ddim({1, batch_size})); + ASSERT_EQ(label_tensor.dims(), label_dims[i]); for (size_t j = 0; j < batch_size && i * batch_num + j < ctr_data.size(); ++j) { auto& label = label_tensor.data()[j]; From 4051fb36b55357fb4c5587aa9436651e4db34db8 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 21 Oct 2018 21:54:47 +0800 Subject: [PATCH 017/684] add monitor thread --- paddle/fluid/operators/reader/ctr_reader.cc | 20 +++++++++++++++++++ paddle/fluid/operators/reader/ctr_reader.h | 19 +++++++++++++++++- .../fluid/operators/reader/ctr_reader_test.cc | 9 ++++++++- 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 0002e80a30640..3156070e2c4b7 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -123,6 +123,26 @@ class MultiGzipReader : public Reader { size_t current_reader_index_ = 0; }; +void MonitorThread(std::vector* thread_status, + std::shared_ptr queue) { + VLOG(3) << "monitor thread in"; + bool reader_thread_is_running = true; + while (reader_thread_is_running) { + VLOG(3) << "reader_thread_is_running"; + reader_thread_is_running = false; + for (size_t i = 0; i < (*thread_status).size(); ++i) { + if ((*thread_status)[i] == Running) { + VLOG(3) << "reader is running!"; + reader_thread_is_running = true; + } + } + std::this_thread::sleep_for(std::chrono::milliseconds(1000)); + } + VLOG(3) << "all reader thread is stopped, push empty data into queue"; + queue->Push({}); + VLOG(3) << "monitor thread exited"; +} + void ReadThread(const std::vector& file_list, const std::vector& slots, int batch_size, int thread_id, std::vector* thread_status, diff --git a/paddle/fluid/operators/reader/ctr_reader.h b/paddle/fluid/operators/reader/ctr_reader.h index 244a5e2e7755d..9b2a11bae12d2 100644 --- a/paddle/fluid/operators/reader/ctr_reader.h +++ b/paddle/fluid/operators/reader/ctr_reader.h @@ -16,6 +16,7 @@ #include +#include // NOLINT #include #include #include @@ -39,6 +40,11 @@ void ReadThread(const std::vector& file_list, int thread_id, std::vector* thread_status, std::shared_ptr queue); +// monitor all running thread, if they are all stopped, +// then push an empty data into LoDTensorBlockingQueue +void MonitorThread(std::vector* thread_status, + std::shared_ptr queue); + class CTRReader : public framework::FileReader { public: explicit CTRReader(const std::shared_ptr& queue, @@ -58,7 +64,7 @@ class CTRReader : public framework::FileReader { } } - ~CTRReader() { Shutdown(); } + ~CTRReader() {} void ReadNext(std::vector* out) override { bool success; @@ -68,12 +74,19 @@ class CTRReader : public framework::FileReader { void Shutdown() override { VLOG(3) << "Shutdown reader"; + if (status_ == ReaderStatus::kStopped) { + return; + } // shutdown should stop all the reader thread for (auto& read_thread : read_threads_) { read_thread->join(); } + monitor_thread_->join(); + read_threads_.clear(); + monitor_thread_.reset(nullptr); queue_->Close(); + status_ = ReaderStatus::kStopped; } void Start() override { @@ -87,6 +100,9 @@ class CTRReader : public framework::FileReader { std::bind(&ReadThread, file_groups_[thread_id], slots_, batch_size_, thread_id, &read_thread_status_, queue_))); } + monitor_thread_.reset(new std::thread( + std::bind(&MonitorThread, &read_thread_status_, queue_))); + status_ = ReaderStatus::kRunning; } private: @@ -107,6 +123,7 @@ class CTRReader : public framework::FileReader { const std::vector file_list_; std::shared_ptr queue_; std::vector> read_threads_; + std::unique_ptr monitor_thread_; std::vector read_thread_status_; std::vector> file_groups_; }; diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 0b8a053a86df0..190182f45c593 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -107,8 +107,8 @@ TEST(CTR_READER, read_data) { size_t batch_num = std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; + std::vector out; for (size_t i = 0; i < batch_num; ++i) { - std::vector out; reader.ReadNext(&out); ASSERT_EQ(out.size(), slots.size() + 1); auto& label_tensor = out.back(); @@ -126,5 +126,12 @@ TEST(CTR_READER, read_data) { tensor_6002.dims()[1] * sizeof(int64_t)), 0); } + reader.ReadNext(&out); + ASSERT_EQ(out.size(), 0); ASSERT_EQ(queue->Size(), 0); + reader.Shutdown(); + + reader.Start(); + reader.Shutdown(); + ASSERT_EQ(queue->Size(), 5); } From d37b9797ece7d3c4dfa9e2af4138294d51da361e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 21 Oct 2018 22:04:45 +0800 Subject: [PATCH 018/684] update test --- .../fluid/operators/reader/ctr_reader_test.cc | 60 +++++++++++-------- 1 file changed, 36 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/reader/ctr_reader_test.cc b/paddle/fluid/operators/reader/ctr_reader_test.cc index 190182f45c593..731122e3c16ce 100644 --- a/paddle/fluid/operators/reader/ctr_reader_test.cc +++ b/paddle/fluid/operators/reader/ctr_reader_test.cc @@ -55,6 +55,38 @@ static void generatedata(const std::vector& data, PADDLE_ENFORCE(out.good(), "save file %s failed!", file_name); } +static inline void check_all_data( + const std::vector& ctr_data, + const std::vector& slots, const std::vector& label_dims, + const std::vector& label_value, + const std::vector>>& data_slot_6002, + const std::vector>>& data_slot_6003, + size_t batch_num, size_t batch_size, + std::shared_ptr queue, CTRReader* reader) { + std::vector out; + for (size_t i = 0; i < batch_num; ++i) { + reader->ReadNext(&out); + ASSERT_EQ(out.size(), slots.size() + 1); + auto& label_tensor = out.back(); + ASSERT_EQ(label_tensor.dims(), label_dims[i]); + for (size_t j = 0; j < batch_size && i * batch_num + j < ctr_data.size(); + ++j) { + auto& label = label_tensor.data()[j]; + ASSERT_TRUE(label == 0 || label == 1); + ASSERT_EQ(label, label_value[i * batch_size + j]); + } + auto& tensor_6002 = out[0]; + ASSERT_EQ(std::get<0>(data_slot_6002[i]), tensor_6002.lod()); + ASSERT_EQ(std::memcmp(std::get<1>(data_slot_6002[i]).data(), + tensor_6002.data(), + tensor_6002.dims()[1] * sizeof(int64_t)), + 0); + } + reader->ReadNext(&out); + ASSERT_EQ(out.size(), 0); + ASSERT_EQ(queue->Size(), 0); +} + TEST(CTR_READER, read_data) { const std::vector ctr_data = { "aaaa 1 0 0:6002 1:6003 2:6004 3:6005 4:6006 -1\n", @@ -103,35 +135,15 @@ TEST(CTR_READER, read_data) { CTRReader reader(queue, batch_size, thread_num, slots, file_list); reader.Start(); - size_t batch_num = std::ceil(static_cast(ctr_data.size()) / batch_size) * thread_num; + check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, + data_slot_6003, batch_num, batch_size, queue, &reader); - std::vector out; - for (size_t i = 0; i < batch_num; ++i) { - reader.ReadNext(&out); - ASSERT_EQ(out.size(), slots.size() + 1); - auto& label_tensor = out.back(); - ASSERT_EQ(label_tensor.dims(), label_dims[i]); - for (size_t j = 0; j < batch_size && i * batch_num + j < ctr_data.size(); - ++j) { - auto& label = label_tensor.data()[j]; - ASSERT_TRUE(label == 0 || label == 1); - ASSERT_EQ(label, label_value[i * batch_size + j]); - } - auto& tensor_6002 = out[0]; - ASSERT_EQ(std::get<0>(data_slot_6002[i]), tensor_6002.lod()); - ASSERT_EQ(std::memcmp(std::get<1>(data_slot_6002[i]).data(), - tensor_6002.data(), - tensor_6002.dims()[1] * sizeof(int64_t)), - 0); - } - reader.ReadNext(&out); - ASSERT_EQ(out.size(), 0); - ASSERT_EQ(queue->Size(), 0); reader.Shutdown(); reader.Start(); + check_all_data(ctr_data, slots, label_dims, label_value, data_slot_6002, + data_slot_6003, batch_num, batch_size, queue, &reader); reader.Shutdown(); - ASSERT_EQ(queue->Size(), 5); } From 40d65a136968dc7d100e926509c491569b73fe0e Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 21 Oct 2018 22:07:11 +0800 Subject: [PATCH 019/684] optimize code --- paddle/fluid/operators/reader/ctr_reader.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/reader/ctr_reader.cc b/paddle/fluid/operators/reader/ctr_reader.cc index 3156070e2c4b7..60d7742bce4c3 100644 --- a/paddle/fluid/operators/reader/ctr_reader.cc +++ b/paddle/fluid/operators/reader/ctr_reader.cc @@ -218,7 +218,7 @@ void ReadThread(const std::vector& file_list, // insert label tensor framework::LoDTensor label_tensor; - int64_t* label_tensor_data = label_tensor.mutable_data( + auto* label_tensor_data = label_tensor.mutable_data( framework::make_ddim({1, static_cast(batch_label.size())}), platform::CPUPlace()); memcpy(label_tensor_data, batch_label.data(), From aff54ef735852eeedeafda3d9a5b3b75a5c3e99c Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Sun, 21 Oct 2018 23:26:42 +0800 Subject: [PATCH 020/684] add ctr data --- .../paddle/fluid/contrib/reader/ctr_reader.py | 123 ++++++++++++++++++ 1 file changed, 123 insertions(+) create mode 100644 python/paddle/fluid/contrib/reader/ctr_reader.py diff --git a/python/paddle/fluid/contrib/reader/ctr_reader.py b/python/paddle/fluid/contrib/reader/ctr_reader.py new file mode 100644 index 0000000000000..b8449e8d84867 --- /dev/null +++ b/python/paddle/fluid/contrib/reader/ctr_reader.py @@ -0,0 +1,123 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from paddle.fluid import core +from paddle.fluid.executor import global_scope +from paddle.fluid.framework import default_main_program, \ + default_startup_program, Variable +from paddle.fluid.unique_name import generate as unique_name + + +def monkey_patch_reader_methods(reader): + def __get_reader__(): + scope = global_scope() + var = scope.find_var(reader.name) + return var.get_reader() + + def reset(): + return __get_reader__().reset() + + reader.reset = reset + reader.stop_gradient = True + reader.persistable = True + return reader + + +def _copy_reader_var_(block, var): + new_var = block.create_var(name=var.name, type=core.VarDesc.VarType.READER) + new_var.desc.set_shapes(var.desc.shapes()) + new_var.desc.set_dtypes(var.desc.dtypes()) + new_var.persistable = True + return new_var + + +def ctr_reader(feed_data, + capacity, + thread_num, + batch_size, + file_list, + slots, + name=None): + """ + Create a CTR reader for data feeding in Python + + This layer returns a Reader Variable. + The Reader provides :code:`decorate_paddle_reader()` and + :code:`decorate_tensor_provider()` to set a Python generator as the data + source in Python side. When :code:`Executor::Run()` is invoked in C++ + side, the data from the generator would be read automatically. Unlike + :code:`DataFeeder.feed()`, the data reading process and + :code:`Executor::Run()` process can run in parallel using + :code:`py_reader`. The :code:`start()` method of the Reader should be + called when each pass begins, while the :code:`reset()` method should be + called when the pass ends and :code:`fluid.core.EOFException` raises. + Note that :code:`Program.clone()` method cannot clone :code:`py_reader`. + + Args: + capacity(int): The buffer capacity maintained by :code:`py_reader`. + thread_num(list|tuple): List of tuples which declaring data shapes. + batch_size(list|tuple): List of strs which declaring data type. + file_list(list|tuple): List of ints which declaring data lod_level. + slots(bool): Whether use double buffer or not. + name(basestring): The prefix Python queue name and Reader name. None will + be generated automatically. + + Returns: + Variable: A Reader from which we can get feeding data. + + Examples: + + 1. The basic usage of :code:`py_reader` is as follows: + """ + if name is None: + queue_name = unique_name('lod_tensor_blocking_queue') + reader_name = unique_name('create_ctr_reader') + else: + queue_name = "_".join([name, "queue"]) + reader_name = "_".join([name, "reader"]) + + var = global_scope().var(queue_name) + feed_queue = core.init_lod_tensor_blocking_queue(var, capacity, shapes) + + startup_blk = default_startup_program().current_block() + reader_var = startup_blk.create_var(name=reader_name) + startup_blk.append_op( + type='create_ctr_reader', + inputs={'blocking_queue': [queue_name]}, + outputs={'Out': [reader_var]}, + attrs={ + 'thread_num': thread_num, + 'batch_size': batch_size, + 'file_list': file_list, + 'slots': slots, + }) + + reader_var.persistable = True + + main_prog_reader_var = _copy_reader_var_( + default_main_program().current_block(), reader_var) + + reader = monkey_patch_reader_methods(main_prog_reader_var) + + # monkey patch py_reader special methods + reader.queue = feed_queue + reader.exited = False + + main_blk = default_main_program().current_block() + main_blk.append_op( + type='read', inputs={'Reader': [reader]}, outputs={'Out': feed_data}) + + return reader From c8801e100f04fb6ad4d35a5635cbc316fead80d1 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Sat, 10 Nov 2018 10:55:07 +0000 Subject: [PATCH 021/684] grad diff problem to be fixed and need api spec change to be done --- paddle/fluid/framework/selected_rows.h | 3 +- .../operators/hierarchical_sigmoid_op.cc | 11 +- .../fluid/operators/hierarchical_sigmoid_op.h | 55 ++++++-- .../fluid/operators/math/matrix_bit_code.cc | 49 ++++---- paddle/fluid/operators/math/matrix_bit_code.h | 119 ++++++++++++++++-- python/paddle/fluid/layers/nn.py | 23 +++- .../paddle/fluid/tests/unittests/op_test.py | 7 +- .../fluid/tests/unittests/test_hsigmoid_op.py | 117 +++++++++++++++-- 8 files changed, 324 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index daf5e95304fb8..4d728ae54ae6b 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -133,7 +133,8 @@ class SelectedRows { // SelectedRows are simply concated when adding together. Until a // SelectedRows add a Tensor, will the duplicate rows be handled. Vector rows_; - std::unordered_map id_to_index_; + std::unordered_map + id_to_index_; // should not be used when ids has duplicate member std::unique_ptr value_{nullptr}; int64_t height_; std::unique_ptr rwlock_{nullptr}; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index dadd054b9a6f8..49a17416c84ac 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -91,10 +91,19 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("W", "(Tensor, required), The parameters of hierarchical " "sigmoid operator, each of them is a 2-D tensor, the shape is" - "[num_classes - 1, D]."); + "[K, D]. Which K is the num of non-leaf node in Path Tree"); AddInput("Label", "(Tensor, required), The labels of training data. It's a" "tensor with shape [N, 1]."); + AddInput("PTable", + "(Tensor, optional), The Path Table from root to current word" + "it should have shape like [N, L], L is the length of the Path") + .AsDispensable(); + AddInput("PCode", + "(Tensor, optional), The Code on each Node of the Path from root " + "to current word" + "it should have shape like [N, L], L is the length of the Path") + .AsDispensable(); AddInput("Bias", "(Tensor, optional), The bias is a tensor with shape" "[1, num_classes - 1]."); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 64096a717b12e..2d500a03df87f 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/clip_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_bit_code.h" @@ -34,12 +35,21 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); auto* w = ctx.Input("W"); + auto* path = ctx.Input("PTable"); + auto* code = ctx.Input("PCode"); auto* label = ctx.Input("Label"); auto* bias = ctx.Input("Bias"); auto* out = ctx.Output("Out"); auto* pre_out = ctx.Output("PreOut"); size_t num_classes = static_cast(ctx.Attr("num_classes")); - int64_t code_length = math::FindLastSet(num_classes - 1); + bool is_custom = false; + if (path) { + is_custom = true; + } else { + is_custom = false; + } + int64_t code_length = + path ? path->dims()[1] : math::FindLastSet(num_classes - 1); int64_t batch_size = in->dims()[0]; framework::Tensor sum; auto& dev_ctx = ctx.template device_context(); @@ -52,7 +62,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { zero(dev_ctx, pre_out, static_cast(0.0)); auto& place = *ctx.template device_context().eigen_device(); math::RowwiseSum row_sum; - math::MatrixBitCodeFunctor bit_code(num_classes, label->data()); + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor(num_classes, + label->data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor(path, code, + label->data())); + } std::vector sum_dims({batch_size, 1UL}); sum.mutable_data(framework::make_ddim(sum_dims), ctx.GetPlace()); @@ -60,15 +78,15 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); auto out_mat = framework::EigenVector::Flatten(*out); if (bias) { - bit_code.Add(pre_out, *bias); + bit_code->Add(pre_out, *bias); } - bit_code.Mul(pre_out, *w, *in); + bit_code->Mul(pre_out, *w, *in); // clip to [-40, 40] Transform trans; trans(ctx.template device_context(), pre_out_data, pre_out_data + pre_out->numel(), pre_out_data, ClipFunctor(static_cast(-40.0), static_cast(40.0))); - bit_code.Sum(*pre_out, out, static_cast(-1)); + bit_code->Sum(*pre_out, out, static_cast(-1)); // use softrelu to calculate cross entropy pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); row_sum(dev_ctx, *pre_out, &sum); @@ -86,6 +104,8 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* in = ctx.Input("X"); auto* w = ctx.Input("W"); + auto* path = ctx.Input("PTable"); + auto* code = ctx.Input("PCode"); auto* in_grad = ctx.Output(framework::GradVarName("X")); auto* w_grad = ctx.Output(framework::GradVarName("W")); auto* bias_grad = @@ -105,7 +125,22 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { zero(dev_ctx, w_grad, static_cast(0.0)); size_t num_classes = static_cast(ctx.Attr("num_classes")); - math::MatrixBitCodeFunctor bit_code(num_classes, label->data()); + + bool is_custom = false; + if (path) { + is_custom = true; + } else { + is_custom = false; + } + + std::unique_ptr> bit_code; + if (!is_custom) { + bit_code.reset(new math::MatrixBitCodeFunctor(num_classes, + label->data())); + } else { + bit_code.reset(new math::MatrixBitCodeFunctor(path, code, + label->data())); + } auto& place = *ctx.template device_context().eigen_device(); auto pre_out_mat = EigenMatrix::From(*pre_out); @@ -116,7 +151,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { // softrelu derivative pre_out_grad_mat.device(place) = static_cast(1.0) - static_cast(1.0) / pre_out_mat.exp(); - bit_code.Sub(&pre_out_grad); // the gradient of clip(w * x + b) + bit_code->Sub(&pre_out_grad); // the gradient of clip(w * x + b) pre_out_grad_mat.device(place) = pre_out_grad_mat * out_grad_mat.broadcast(bcast); // TODO(guosheng): multiply pre_out_grad with subgradient of clipping to @@ -124,10 +159,10 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { if (bias_grad) { bias_grad->mutable_data(ctx.GetPlace()); zero(dev_ctx, bias_grad, static_cast(0.0)); - bit_code.AddGrad(pre_out_grad, bias_grad); + bit_code->AddGrad(pre_out_grad, bias_grad); } - bit_code.MulGradWeight(pre_out_grad, w_grad, *in); - bit_code.MulGradError(pre_out_grad, *w, in_grad); + bit_code->MulGradWeight(pre_out_grad, w_grad, *in); + bit_code->MulGradError(pre_out_grad, *w, in_grad); } }; diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 1e56e297396c6..88279f8d8a781 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -21,14 +21,13 @@ namespace math { template void MatrixBitCodeFunctor::Add(framework::Tensor* tmat, const framework::Tensor& vec) { - SimpleCodeTable code_table(num_classes_); size_t batch_size = tmat->dims()[0]; size_t width = tmat->dims()[1]; for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = code->calc_index(j); tmat->data()[i * width + j] += vec.data()[index]; } } @@ -37,14 +36,13 @@ void MatrixBitCodeFunctor::Add(framework::Tensor* tmat, template void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, framework::Tensor* vec) { - SimpleCodeTable code_table(num_classes_); size_t batch_size = tmat.dims()[0]; size_t width = tmat.dims()[1]; for (size_t i = 0; i < batch_size; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = code->calc_index(j); vec->data()[index] += tmat.data()[i * width + j]; } } @@ -53,15 +51,14 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, template void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat.dims()[0]; size_t o_width = tmat.dims()[1]; for (size_t i = 0; i < num_samples; ++i) { T sm = static_cast(0.0); - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - if (code.calc_bit(j)) { + if (code->calc_bit(j)) { // calc_bit starts from right most bit, while data in tmat[i] is in the // reverse order. sm += tmat.data()[i * o_width + j]; @@ -75,7 +72,6 @@ template void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, const framework::Tensor& weight, const framework::Tensor& input) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat->dims()[0]; size_t tmat_width = tmat->dims()[1]; size_t input_width = input.dims()[1]; @@ -84,10 +80,10 @@ void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, auto weight_value = weight.data(); auto input_value = input.data(); for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = code->calc_index(j); T sum = static_cast(0.0); for (size_t k = 0; k < input_width; ++k) { sum += weight_value[weight_width * index + k] * @@ -102,7 +98,6 @@ template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, const framework::Tensor& input) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -111,10 +106,10 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto weight_value = weight->data(); auto input_value = input.data(); for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = code->calc_index(j); for (size_t k = 0; k < input_width; ++k) { weight_value[weight_width * index + k] += @@ -128,7 +123,6 @@ template void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, const framework::Tensor& weight, framework::Tensor* input) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat.dims()[0]; size_t tmat_width = tmat.dims()[1]; size_t input_width = input->dims()[1]; @@ -138,10 +132,10 @@ void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, auto input_value = input->data(); for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - size_t index = code.calc_index(j); + size_t index = code->calc_index(j); for (size_t k = 0; k < input_width; ++k) { input_value[input_width * i + k] += @@ -154,14 +148,13 @@ void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, template void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { - SimpleCodeTable code_table(num_classes_); size_t num_samples = tmat->dims()[0]; size_t o_width = tmat->dims()[1]; for (size_t i = 0; i < num_samples; ++i) { - auto code = code_table(static_cast(ids_[i])); - int code_length = code.get_length(); + auto code = code_table->get_code(i); + int code_length = code->get_length(); for (int j = 0; j < code_length; ++j) { - if (code.calc_bit(j)) { + if (code->calc_bit(j)) { tmat->data()[i * o_width + j] -= 1; } } diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 07854c83584f9..f03c8d3689c8e 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -93,9 +93,27 @@ inline int clz(const T& value) { inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } #endif // !_WIN32 } +// set a code interface to create multiple code +class Code { + public: + virtual ~Code() {} + virtual size_t calc_index(int bit) const = 0; + virtual bool calc_bit(int bit) const = 0; + virtual int get_length() const = 0; +}; +// set a CodeTable interface to create multiple code table +class CodeTable { + public: + virtual std::unique_ptr get_code(int64_t code) const = 0; + virtual size_t size() const = 0; + virtual int get_max_code_length() const = 0; + virtual ~CodeTable() {} +}; -struct SimpleCode { - SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {} +class SimpleCode : public Code { + public: + SimpleCode(size_t code, size_t num_classes, const int64_t* ids) + : c_(static_cast(ids[code]) + num_classes) {} /** * Here the id of root shoud be 1 rather than 0, thus the encoding of class c * is `c + num_classes` and all siblings can get the same weight indice using @@ -105,31 +123,111 @@ struct SimpleCode { * Binary classification path is the suffixes of encoding, thus leave out the * left most bit in calc_bit. */ - inline size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; } - inline bool calc_bit(int bit) const { return c_ & (1 << bit); } - inline int get_length() const { return FindLastSet(c_) - 1; } + size_t calc_index(int bit) const { return (c_ >> (bit + 1)) - 1; } + bool calc_bit(int bit) const { return c_ & (1 << bit); } + int get_length() const { return FindLastSet(c_) - 1; } private: size_t c_; }; -struct SimpleCodeTable { - explicit SimpleCodeTable(size_t num_classes) : num_classes_(num_classes) {} - SimpleCode operator()(size_t code) const { - return SimpleCode(code, num_classes_); +template +class CustomCode : public Code { + public: + CustomCode(const framework::Tensor* ptable, const framework::Tensor* pcode, + const int64_t* ids, const int index) + : ptable_(ptable), pcode_(pcode), ids_(ids), index_(index) {} + /** + * Here the id of root shoud be 1 rather than 0, thus the encoding of class c + * is `c + num_classes` and all siblings can get the same weight indice using + * prefixes. + * Weight index is the prefixes of encoding, thus leave out the right most + * bit in calc_index. + * Binary classification path is the suffixes of encoding, thus leave out the + * left most bit in calc_bit. + */ + size_t calc_index(int bit) const { + return ptable_ + ->data()[index_ * static_cast(ptable_->dims()[1]) + bit]; + } + bool calc_bit(int bit) const { + return pcode_ + ->data()[index_ * static_cast(ptable_->dims()[1]) + bit]; + } + int get_length() const { + int length = 0; + + for (int i = 0; i < ptable_->dims()[1]; i++) { + if (ptable_->data()[index_ * static_cast(ptable_->dims()[1]) + + i] != -1) { + length++; + } else { + return length; + } + } + return length; + } + + private: + const framework::Tensor* ptable_; + const framework::Tensor* pcode_; + const int64_t* ids_; + const int index_; +}; + +class SimpleCodeTable : public CodeTable { + public: + explicit SimpleCodeTable(size_t num_classes, const int64_t* ids) + : num_classes_(num_classes), ids_(ids) {} + std::unique_ptr get_code(int64_t code) const { + std::unique_ptr coder(new SimpleCode(code, num_classes_, ids_)); + return coder; } size_t size() const { return num_classes_; } int get_max_code_length() const { return FindLastSet(num_classes_ - 1); } private: size_t num_classes_; + const int64_t* ids_; +}; + +template +class CustomCodeTable : public CodeTable { + public: + explicit CustomCodeTable(const framework::Tensor* ptable, + const framework::Tensor* pcode, const int64_t* ids) + : ptable_(ptable), pcode_(pcode), ids_(ids) {} + + std::unique_ptr get_code(int64_t code) const { + std::unique_ptr coder(new CustomCode(ptable_, pcode_, ids_, code)); + return coder; + } + + size_t size() const { return static_cast(ptable_->dims()[1]); } + int get_max_code_length() const { + return static_cast(ptable_->dims()[1]); + } + + private: + const framework::Tensor* ptable_; + const framework::Tensor* pcode_; + const int64_t* ids_; }; template class MatrixBitCodeFunctor { public: explicit MatrixBitCodeFunctor(size_t num_classes, const int64_t* ids) - : num_classes_(num_classes), ids_(ids) {} + : num_classes_(num_classes), + ids_(ids), + code_table(new SimpleCodeTable(num_classes, ids)) {} + + explicit MatrixBitCodeFunctor(const framework::Tensor* ptable, + const framework::Tensor* pcode, + const int64_t* ids) + : num_classes_(static_cast(ptable->dims()[1])), + ids_(ids), + code_table(new CustomCodeTable(ptable, pcode, ids)) {} /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ @@ -168,6 +266,7 @@ class MatrixBitCodeFunctor { size_t num_classes_; const int64_t* ids_; + std::unique_ptr code_table; }; } // namespace math } // namespace operators diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 110e6d5ab236a..d3ee80ad529b3 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4349,6 +4349,8 @@ def nce(input, def hsigmoid(input, label, num_classes, + ptabl=None, + pcode=None, param_attr=None, bias_attr=None, name=None): @@ -4372,6 +4374,12 @@ def hsigmoid(input, label (Variable): The tensor variable contains labels of training data. It's a tensor with shape is :math:`[N \\times 1]`. num_classes: (int), The number of classes, must not be less than 2. + ptable: (Variable|None) this variable can store each batch of samples' path to root, + it should be in leaf -> root order + ptable should have the same shape with pcode, and for each sample i ptable[i] indicates a np.array like + structure and each element in this array is indexes in parent nodes' Weight Matrix. + pcode: (Variable|None) this variable can store each batch of samples' code, + each code consist with every code of parent nodes. it should be in leaf -> root order param_attr (ParamAttr|None): The parameter attribute for learnable parameters/weights of hsigmoid. If it is set to None or one attribute of ParamAttr, hsigmoid will create ParamAttr as param_attr. If the Initializer of the param_attr @@ -4403,12 +4411,25 @@ def hsigmoid(input, dim = input.shape[1] if num_classes < 2: raise ValueError("num_classes must not be less than 2.") + if (ptable is not None) and (pcode is None): + raise ValueError("pcode should not be None when ptable has been set") + elif (ptable is None) and (pcode is not None): + raise ValueError("ptable should not be None when pcode has been set") + else: + pass + weights = helper.create_parameter( attr=helper.param_attr, shape=[num_classes - 1, dim], is_bias=False, dtype=input.dtype) - inputs = {"X": input, "W": weights, "Label": label} + inputs = { + "X": input, + "W": weights, + "PTable": ptable, + "PCode": pcode, + "Label": label + } if helper.bias_attr: bias = helper.create_parameter( attr=helper.bias_attr, diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index e97643cddef22..fb521e86a3189 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -138,8 +138,11 @@ def setUpClass(cls): cls.dtype = "float32" cls.outputs = {} - np.random.seed(123) - random.seed(124) + # np.random.seed(123) + # random.seed(124) + + np.random.seed(190) + random.seed(200) @classmethod def tearDownClass(cls): diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 6948ae30023a7..4beeed01311bc 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -40,6 +40,29 @@ def cal_bit(self, bit): return self.c & (1 << bit) +class CodeTableWithCustomTree(object): + def __init__(self, ptable, pcode, index): + self.ptable_ = ptable + self.pcode_ = pcode + self.index_ = index + + def cal_index(self, bit): + return self.ptable_[self.index_][bit] + + def get_length(self): + length = 0 + for ele in self.ptable_[self.index_]: + + if ele >= 0: + length = length + 1 + else: + return length + return length + + def cal_bit(self, bit): + return self.pcode_[self.index_][bit] + + def hsigmoid(x, w, label, bias, num_classes): batch_size = x.shape[0] code_length = find_latest_set(num_classes - 1) @@ -48,10 +71,12 @@ def hsigmoid(x, w, label, bias, num_classes): pre_sum = np.zeros((batch_size, 1)) out = np.zeros((batch_size, 1)).astype("float32") for i in range(batch_size): + #print("\n leaf {leaf}: \n".format(leaf = label[i])) code_table = CodeTable(num_classes, label[i]) length = code_table.get_length() for j in range(length): idx = code_table.cal_index(j) + #print("index {index} ".format(index = j)) pre_output[i][j] += bias[0][idx] for i in range(batch_size): code_table = CodeTable(num_classes, label[i]) @@ -63,10 +88,12 @@ def hsigmoid(x, w, label, bias, num_classes): pre_output = np.clip(pre_output, -40.0, 40.0) # out(i, 0) = \sum_j bit(i, j) * preout(i, j) for i in range(batch_size): + #print("\n leaf {leaf}: \n".format(leaf = label[i])) code_table = CodeTable(num_classes, label[i]) length = code_table.get_length() sum = 0.0 for j in range(length): + #print("bit {bit} ".format(bit = code_table.cal_bit(j))) if code_table.cal_bit(j): sum += pre_output[i][j] out[i] = -1.0 * sum @@ -77,25 +104,101 @@ def hsigmoid(x, w, label, bias, num_classes): return pre_output, out -class TestHSigmoidOp(OpTest): +def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes): + batch_size = x.shape[0] + code_length = len(ptable[0]) + code_table = [0 for _ in range(code_length)] + pre_output = np.zeros((batch_size, code_length)) + pre_sum = np.zeros((batch_size, 1)) + out = np.zeros((batch_size, 1)).astype("float32") + for i in range(batch_size): + code_table = CodeTableWithCustomTree(ptable, pcode, i) + length = code_table.get_length() + for j in range(length): + idx = code_table.cal_index(j) + pre_output[i][j] += bias[0][idx] + for i in range(batch_size): + code_table = CodeTableWithCustomTree(ptable, pcode, i) + length = code_table.get_length() + for j in range(length): + idx = code_table.cal_index(j) + pre_output[i][j] += np.dot(w[idx], x[i]) + # clip[-40.0, 40.0] + pre_output = np.clip(pre_output, -40.0, 40.0) + # out(i, 0) = \sum_j bit(i, j) * preout(i, j) + for i in range(batch_size): + code_table = CodeTableWithCustomTree(ptable, pcode, i) + length = code_table.get_length() + sum = 0.0 + for j in range(length): + if code_table.cal_bit(j): + sum += pre_output[i][j] + out[i] = -1.0 * sum + # soft relu + pre_output = np.log(1 + np.exp(pre_output)) + pre_sum = pre_output.sum(1).reshape((batch_size, 1)) + out += pre_sum + return pre_output, out + + +# class TestHSigmoidOp(OpTest): +# def setUp(self): +# self.op_type = "hierarchical_sigmoid" +# num_classes = 6 +# feature_size = 8 +# batch_size = 7 +# x = np.random.random((batch_size, feature_size)).astype("float32") +# w = np.random.random((num_classes - 1, feature_size)).astype("float32") +# label = np.random.randint(0, num_classes, (batch_size, 1)) +# bias = np.random.random((1, num_classes - 1)).astype("float32") +# self.attrs = {'num_classes': num_classes} +# self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} +# pre_output, out = hsigmoid(x, w, label, bias, num_classes) +# self.outputs = {'PreOut': pre_output, 'Out': out} + +# def test_check_output(self): +# self.check_output() + +# def test_check_grad(self): +# self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) + + +class TestHSigmoidOpWithCostumTree(OpTest): def setUp(self): self.op_type = "hierarchical_sigmoid" - num_classes = 6 + num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample feature_size = 8 batch_size = 4 - x = np.random.random((batch_size, feature_size)).astype("float32") - w = np.random.random((num_classes - 1, feature_size)).astype("float32") - label = np.random.randint(0, num_classes, (batch_size, 1)) + x = np.random.random((batch_size, feature_size)).astype("float32") * 10 + w = np.random.random( + (num_classes - 1, feature_size)).astype("float32") * 10 + label = np.array([0, 1, 4, 5]) + ptable = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, + -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( + 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store bias = np.random.random((1, num_classes - 1)).astype("float32") self.attrs = {'num_classes': num_classes} - self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} - pre_output, out = hsigmoid(x, w, label, bias, num_classes) + self.inputs = { + 'X': x, + 'W': w, + 'PTable': ptable, + 'PCode': pcode, + 'Label': label, + 'Bias': bias + } + pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label, + bias, num_classes) self.outputs = {'PreOut': pre_output, 'Out': out} def test_check_output(self): + print("checking output in CostumTree") self.check_output() def test_check_grad(self): + print("checking outputGrad in CostumTree") self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) From 32e05b01f294b8ea5d742294fc8b4f4e69985f0a Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Mon, 12 Nov 2018 11:36:48 +0000 Subject: [PATCH 022/684] test=develop --- .../fluid/operators/hierarchical_sigmoid_op.h | 9 ++++ paddle/fluid/operators/math/matrix_bit_code.h | 2 +- .../paddle/fluid/tests/unittests/op_test.py | 7 +-- .../fluid/tests/unittests/test_hsigmoid_op.py | 53 ++++++++++--------- 4 files changed, 40 insertions(+), 31 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 2d500a03df87f..90bdb47311fce 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -86,6 +86,7 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { trans(ctx.template device_context(), pre_out_data, pre_out_data + pre_out->numel(), pre_out_data, ClipFunctor(static_cast(-40.0), static_cast(40.0))); + pre_out_mat = -1 * pre_out_mat; bit_code->Sum(*pre_out, out, static_cast(-1)); // use softrelu to calculate cross entropy pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); @@ -146,6 +147,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { auto pre_out_mat = EigenMatrix::From(*pre_out); auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); auto out_grad_mat = EigenMatrix::From(*out_grad); + Eigen::array bcast({{1, static_cast(pre_out_grad.dims()[1])}}); // softrelu derivative @@ -160,9 +162,16 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { bias_grad->mutable_data(ctx.GetPlace()); zero(dev_ctx, bias_grad, static_cast(0.0)); bit_code->AddGrad(pre_out_grad, bias_grad); + auto bias_grad_mat = EigenMatrix::From(*bias_grad); + bias_grad_mat = -1 * bias_grad_mat; } bit_code->MulGradWeight(pre_out_grad, w_grad, *in); bit_code->MulGradError(pre_out_grad, *w, in_grad); + auto w_grad_mat = EigenMatrix::From(*w_grad); + auto in_grad_mat = EigenMatrix::From(*in_grad); + + w_grad_mat = -1 * w_grad_mat; + in_grad_mat = -1 * in_grad_mat; } }; diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index f03c8d3689c8e..1e2abd1e697a2 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -157,7 +157,7 @@ class CustomCode : public Code { int get_length() const { int length = 0; - for (int i = 0; i < ptable_->dims()[1]; i++) { + for (int i = 0; i < static_cast(ptable_->dims()[1]); i++) { if (ptable_->data()[index_ * static_cast(ptable_->dims()[1]) + i] != -1) { length++; diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index fb521e86a3189..e97643cddef22 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -138,11 +138,8 @@ def setUpClass(cls): cls.dtype = "float32" cls.outputs = {} - # np.random.seed(123) - # random.seed(124) - - np.random.seed(190) - random.seed(200) + np.random.seed(123) + random.seed(124) @classmethod def tearDownClass(cls): diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 4beeed01311bc..0a16f5a39c596 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -17,6 +17,9 @@ import unittest import numpy as np import math +# import paddle.fluid as fluid +# import paddle.fluid.core as core +# from op_builder import OpBuilder from op_test import OpTest np.random.seed(100) @@ -51,7 +54,7 @@ def cal_index(self, bit): def get_length(self): length = 0 - for ele in self.ptable_[self.index_]: + for ele in self.ptable_[self.index_]: # find the first -1 to stop trace if ele >= 0: length = length + 1 @@ -71,12 +74,10 @@ def hsigmoid(x, w, label, bias, num_classes): pre_sum = np.zeros((batch_size, 1)) out = np.zeros((batch_size, 1)).astype("float32") for i in range(batch_size): - #print("\n leaf {leaf}: \n".format(leaf = label[i])) code_table = CodeTable(num_classes, label[i]) length = code_table.get_length() for j in range(length): idx = code_table.cal_index(j) - #print("index {index} ".format(index = j)) pre_output[i][j] += bias[0][idx] for i in range(batch_size): code_table = CodeTable(num_classes, label[i]) @@ -87,13 +88,12 @@ def hsigmoid(x, w, label, bias, num_classes): # clip[-40.0, 40.0] pre_output = np.clip(pre_output, -40.0, 40.0) # out(i, 0) = \sum_j bit(i, j) * preout(i, j) + pre_output = -1 * pre_output for i in range(batch_size): - #print("\n leaf {leaf}: \n".format(leaf = label[i])) code_table = CodeTable(num_classes, label[i]) length = code_table.get_length() sum = 0.0 for j in range(length): - #print("bit {bit} ".format(bit = code_table.cal_bit(j))) if code_table.cal_bit(j): sum += pre_output[i][j] out[i] = -1.0 * sum @@ -108,6 +108,7 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes): batch_size = x.shape[0] code_length = len(ptable[0]) code_table = [0 for _ in range(code_length)] + # init pre_out with shape [N, code_length] pre_output = np.zeros((batch_size, code_length)) pre_sum = np.zeros((batch_size, 1)) out = np.zeros((batch_size, 1)).astype("float32") @@ -125,6 +126,7 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes): pre_output[i][j] += np.dot(w[idx], x[i]) # clip[-40.0, 40.0] pre_output = np.clip(pre_output, -40.0, 40.0) + pre_output = -1 * pre_output # out(i, 0) = \sum_j bit(i, j) * preout(i, j) for i in range(batch_size): code_table = CodeTableWithCustomTree(ptable, pcode, i) @@ -141,26 +143,27 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes): return pre_output, out -# class TestHSigmoidOp(OpTest): -# def setUp(self): -# self.op_type = "hierarchical_sigmoid" -# num_classes = 6 -# feature_size = 8 -# batch_size = 7 -# x = np.random.random((batch_size, feature_size)).astype("float32") -# w = np.random.random((num_classes - 1, feature_size)).astype("float32") -# label = np.random.randint(0, num_classes, (batch_size, 1)) -# bias = np.random.random((1, num_classes - 1)).astype("float32") -# self.attrs = {'num_classes': num_classes} -# self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} -# pre_output, out = hsigmoid(x, w, label, bias, num_classes) -# self.outputs = {'PreOut': pre_output, 'Out': out} +class TestHSigmoidOp(OpTest): + def setUp(self): + self.op_type = "hierarchical_sigmoid" + num_classes = 6 + feature_size = 8 + batch_size = 4 + x = np.random.random((batch_size, feature_size)).astype("float32") * 2 + w = np.random.random( + (num_classes - 1, feature_size)).astype("float32") * 2 + label = np.random.randint(0, num_classes, (batch_size, 1)) + bias = np.random.random((1, num_classes - 1)).astype("float32") + self.attrs = {'num_classes': num_classes} + self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} + pre_output, out = hsigmoid(x, w, label, bias, num_classes) + self.outputs = {'PreOut': pre_output, 'Out': out} -# def test_check_output(self): -# self.check_output() + def test_check_output(self): + self.check_output() -# def test_check_grad(self): -# self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) + def test_check_grad(self): + self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) class TestHSigmoidOpWithCostumTree(OpTest): @@ -169,9 +172,9 @@ def setUp(self): num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample feature_size = 8 batch_size = 4 - x = np.random.random((batch_size, feature_size)).astype("float32") * 10 + x = np.random.random((batch_size, feature_size)).astype("float32") * 2 w = np.random.random( - (num_classes - 1, feature_size)).astype("float32") * 10 + (num_classes - 1, feature_size)).astype("float32") * 2 label = np.array([0, 1, 4, 5]) ptable = np.array( [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), From b8ff0972b63238dbc0fb853615967f8e339a30b7 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Mon, 12 Nov 2018 12:05:31 +0000 Subject: [PATCH 023/684] test=develop --- paddle/fluid/operators/hierarchical_sigmoid_op.h | 8 -------- python/paddle/fluid/tests/unittests/test_hsigmoid_op.py | 2 -- 2 files changed, 10 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 90bdb47311fce..df4f5f561a270 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -86,7 +86,6 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { trans(ctx.template device_context(), pre_out_data, pre_out_data + pre_out->numel(), pre_out_data, ClipFunctor(static_cast(-40.0), static_cast(40.0))); - pre_out_mat = -1 * pre_out_mat; bit_code->Sum(*pre_out, out, static_cast(-1)); // use softrelu to calculate cross entropy pre_out_mat.device(place) = (static_cast(1.0) + pre_out_mat.exp()).log(); @@ -162,16 +161,9 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { bias_grad->mutable_data(ctx.GetPlace()); zero(dev_ctx, bias_grad, static_cast(0.0)); bit_code->AddGrad(pre_out_grad, bias_grad); - auto bias_grad_mat = EigenMatrix::From(*bias_grad); - bias_grad_mat = -1 * bias_grad_mat; } bit_code->MulGradWeight(pre_out_grad, w_grad, *in); bit_code->MulGradError(pre_out_grad, *w, in_grad); - auto w_grad_mat = EigenMatrix::From(*w_grad); - auto in_grad_mat = EigenMatrix::From(*in_grad); - - w_grad_mat = -1 * w_grad_mat; - in_grad_mat = -1 * in_grad_mat; } }; diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 0a16f5a39c596..6152b96912da0 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -88,7 +88,6 @@ def hsigmoid(x, w, label, bias, num_classes): # clip[-40.0, 40.0] pre_output = np.clip(pre_output, -40.0, 40.0) # out(i, 0) = \sum_j bit(i, j) * preout(i, j) - pre_output = -1 * pre_output for i in range(batch_size): code_table = CodeTable(num_classes, label[i]) length = code_table.get_length() @@ -126,7 +125,6 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes): pre_output[i][j] += np.dot(w[idx], x[i]) # clip[-40.0, 40.0] pre_output = np.clip(pre_output, -40.0, 40.0) - pre_output = -1 * pre_output # out(i, 0) = \sum_j bit(i, j) * preout(i, j) for i in range(batch_size): code_table = CodeTableWithCustomTree(ptable, pcode, i) From 5d0b568ecb58d479619c5a2295d65b7f677d4648 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 6 Nov 2018 18:42:19 +0800 Subject: [PATCH 024/684] Add YOLOv3 loss operator. test=develop --- paddle/fluid/operators/yolov3_loss_op.cc | 130 +++++++++ paddle/fluid/operators/yolov3_loss_op.cu | 23 ++ paddle/fluid/operators/yolov3_loss_op.h | 340 +++++++++++++++++++++++ 3 files changed, 493 insertions(+) create mode 100644 paddle/fluid/operators/yolov3_loss_op.cc create mode 100644 paddle/fluid/operators/yolov3_loss_op.cu create mode 100644 paddle/fluid/operators/yolov3_loss_op.h diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc new file mode 100644 index 0000000000000..b4c6a185e2b41 --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -0,0 +1,130 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include "paddle/fluid/operators/yolov3_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class Yolov3LossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTBox"), + "Input(GTBox) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of Yolov3LossOp should not be null."); + + // PADDLE_ENFORCE(ctx->HasAttr("img_height"), + // "Attr(img_height) of Yolov3LossOp should not be null. "); + // PADDLE_ENFORCE(ctx->HasAttr("anchors"), + // "Attr(anchor) of Yolov3LossOp should not be null.") + // PADDLE_ENFORCE(ctx->HasAttr("class_num"), + // "Attr(class_num) of Yolov3LossOp should not be null."); + // PADDLE_ENFORCE(ctx->HasAttr( + // "ignore_thresh", + // "Attr(ignore_thresh) of Yolov3LossOp should not be null.")); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_gt = ctx->GetInputDim("GTBox"); + auto img_height = ctx->Attrs().Get("img_height"); + auto anchors = ctx->Attrs().Get>("anchors"); + auto box_num = ctx->Attrs().Get("box_num"); + auto class_num = ctx->Attrs().Get("class_num"); + PADDLE_ENFORCE_GT(img_height, 0, + "Attr(img_height) value should be greater then 0"); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater then 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + PADDLE_ENFORCE_GT(box_num, 0, + "Attr(box_num) should be an integer greater then 0."); + PADDLE_ENFORCE_GT(class_num, 0, + "Attr(class_num) should be an integer greater then 0."); + PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); + + std::vector dim_out({dim_x[0], 1}); + ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } +}; + +class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of bilinear interpolation, " + "This is a 4-D tensor with shape of [N, C, H, W]"); + AddOutput("Out", + "The output yolo loss tensor, " + "This is a 2-D tensor with shape of [N, 1]"); + + AddAttr("box_num", "The number of boxes generated in each grid."); + AddAttr("class_num", "The number of classes to predict."); + AddComment(R"DOC( + This operator generate yolov3 loss by given predict result and ground + truth boxes. + )DOC"); + } +}; + +class Yolov3LossOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + protected: + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + } + + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); +REGISTER_OP_CPU_KERNEL( + yolov3_loss, + ops::Yolov3LossKernel); +REGISTER_OP_CPU_KERNEL( + yolov3_loss_grad, + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.cu b/paddle/fluid/operators/yolov3_loss_op.cu new file mode 100644 index 0000000000000..48f997456ac48 --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.cu @@ -0,0 +1,23 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#define EIGEN_USE_GPU + +#include "paddle/fluid/operators/yolov3_loss_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + yolov3_loss, + ops::Yolov3LossOpKernel); +REGISTER_OP_CUDA_KERNEL( + yolov3_loss_grad, + ops::Yolov3LossGradOpKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h new file mode 100644 index 0000000000000..7950390567bd9 --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -0,0 +1,340 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; +template +using EigenVector = framework::EigenVector; + +using Array2 = Eigen::DSizes; +using Array4 = Eigen::DSizes; + +template +static inline bool isZero(T x) { + return abs(x) < 1e-6; +} + +template +static inline T sigmod(T x) { + return 1.0 / (exp(-1.0 * x) + 1.0); +} + +template +static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + auto result = ((x_t - y_t) * mask_t).pow(2).sum().eval(); + return result(0); +} + +template +static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + auto result = + ((y_t * (x_t.log()) + (1.0 - y_t) * ((1.0 - x_t).log())) * mask_t) + .sum() + .eval(); + return result; +} + +template +static inline T CalcCEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); +} + +template +static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, + Tensor* pred_confs, Tensor* pred_classes, + Tensor* pred_x, Tensor* pred_y, Tensor* pred_w, + Tensor* pred_h, std::vector anchors, + const int class_num, const int stride) { + const int n = input.dims()[0]; + const int c = input.dims()[1]; + const int h = input.dims()[2]; + const int w = input.dims()[3]; + const int anchor_num = anchors.size() / 2; + const int box_attr_num = 5 + class_num; + + auto input_t = EigenTensor::From(input); + auto pred_boxes_t = EigenTensor::From(*pred_boxes); + auto pred_confs_t = EigenTensor::From(*pred_confs); + auto pred_classes_t = EigenTensor::From(*pred_classes); + auto pred_x_t = EigenTensor::From(*pred_x); + auto pred_y_t = EigenTensor::From(*pred_y); + auto pred_w_t = EigenTensor::From(*pred_w); + auto pred_h_t = EigenTensor::From(*pred_h); + + for (int i = 0; i < n; i++) { + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + float an_w = anchors[an_idx * 2] / stride; + float an_h = anchors[an_idx * 2 + 1] / stride; + + for (int j = 0; j < h; j++) { + for (int k = 0; k < w; k++) { + pred_x_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx, j, k)); + pred_y_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx + 1, j, k)); + pred_w_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx + 2, j, k)); + pred_h_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx + 3, j, k)); + + pred_boxes_t(i, an_idx, j, k, 0) = pred_x_t(i, an_idx, j, k) + k; + pred_boxes_t(i, an_idx, j, k, 1) = pred_y_t(i, an_idx, j, k) + j; + pred_boxes_t(i, an_idx, j, k, 2) = + exp(pred_w_t(i, an_idx, j, k)) * an_w; + pred_boxes_t(i, an_idx, j, k, 3) = + exp(pred_h_t(i, an_idx, j, k)) * an_h; + + pred_confs_t(i, an_idx, j, k) = + sigmod(input_t(i, box_attr_num * an_idx + 4, j, k)); + + for (int c = 0; c < class_num; c++) { + pred_classes_t(i, an_idx, j, k, c) = + sigmod(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); + } + } + } + } + } +} + +template +static T CalcBoxIoU(std::vector box1, std::vector box2, + bool center_mode) { + T b1_x1, b1_x2, b1_y1, b1_y2; + T b2_x1, b2_x2, b2_y1, b2_y2; + if (center_mode) { + b1_x1 = box1[0] - box1[2] / 2; + b1_x2 = box1[0] + box1[2] / 2; + b1_y1 = box1[1] - box1[3] / 2; + b1_y2 = box1[1] + box1[3] / 2; + b2_x1 = box2[0] - box2[2] / 2; + b2_x2 = box2[0] + box2[2] / 2; + b2_y1 = box2[1] - box2[3] / 2; + b2_y2 = box2[1] + box2[3] / 2; + } else { + b1_x1 = box1[0]; + b1_x2 = box1[1]; + b1_y1 = box1[2]; + b1_y2 = box1[3]; + b2_x1 = box2[0]; + b2_x2 = box2[0]; + b2_y1 = box2[1]; + b2_y2 = box2[1]; + } + T b1_area = (b1_x2 - b1_x1 + 1.0) * (b1_y2 - b1_y1 + 1.0); + T b2_area = (b2_x2 - b2_x1 + 1.0) * (b2_y2 - b2_y1 + 1.0); + + T inter_rect_x1 = std::max(b1_x1, b2_x1); + T inter_rect_y1 = std::max(b1_y1, b2_y1); + T inter_rect_x2 = std::min(b1_x2, b2_x2); + T inter_rect_y2 = std::min(b1_y2, b2_y2); + T inter_area = std::max(inter_rect_x2 - inter_rect_x1 + 1.0, 0.0) * + std::max(inter_rect_y2 - inter_rect_y1 + 1.0, 0.0); + + return inter_area / (b1_area + b2_area - inter_area + 1e-16); +} + +template +static inline int GetPredLabel(const Tensor& pred_classes, int n, + int best_an_index, int gj, int gi) { + auto pred_classes_t = EigenTensor::From(pred_classes); + T score = 0.0; + int label = -1; + for (int i = 0; i < pred_classes.dims()[4]; i++) { + if (pred_classes_t(n, best_an_index, gj, gi, i) > score) { + score = pred_classes_t(n, best_an_index, gj, gi, i); + label = i; + } + } + return label; +} + +template +static void CalcPredBoxWithGTBox( + const Tensor& pred_boxes, const Tensor& pred_confs, + const Tensor& pred_classes, const Tensor& gt_boxes, + std::vector anchors, const float ignore_thresh, const int img_height, + int* gt_num, int* correct_num, Tensor* mask_true, Tensor* mask_false, + Tensor* tx, Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, + Tensor* tclass) { + const int n = gt_boxes.dims()[0]; + const int b = gt_boxes.dims()[1]; + const int grid_size = pred_boxes.dims()[1]; + const int anchor_num = anchors.size() / 2; + auto pred_boxes_t = EigenTensor::From(pred_boxes); + auto pred_confs_t = EigenTensor::From(pred_confs); + auto pred_classes_t = EigenTensor::From(pred_classes); + auto gt_boxes_t = EigenTensor::From(gt_boxes); + auto mask_true_t = EigenTensor::From(*mask_true).setConstant(0.0); + auto mask_false_t = EigenTensor::From(*mask_false).setConstant(1.0); + auto tx_t = EigenTensor::From(*tx).setConstant(0.0); + auto ty_t = EigenTensor::From(*ty).setConstant(0.0); + auto tw_t = EigenTensor::From(*tw).setConstant(0.0); + auto th_t = EigenTensor::From(*th).setConstant(0.0); + auto tconf_t = EigenTensor::From(*tconf).setConstant(0.0); + auto tclass_t = EigenTensor::From(*tclass).setConstant(0.0); + + *gt_num = 0; + *correct_num = 0; + for (int i = 0; i < n; i++) { + for (int j = 0; j < b; j++) { + if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && + isZero(gt_boxes_t(i, j, 2)) && isZero(gt_boxes_t(i, j, 3))) { + continue; + } + + *(gt_num)++; + int gt_label = gt_boxes_t(i, j, 0); + T gx = gt_boxes_t(i, j, 1); + T gy = gt_boxes_t(i, j, 2); + T gw = gt_boxes_t(i, j, 3); + T gh = gt_boxes_t(i, j, 4); + int gi = static_cast(gx); + int gj = static_cast(gy); + + T max_iou = static_cast(-1); + T iou; + int best_an_index = -1; + std::vector gt_box({0, 0, gw, gh}); + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), + static_cast(anchors[2 * an_idx + 1])}); + iou = CalcBoxIoU(gt_box, anchor_shape, false); + if (iou > max_iou) { + max_iou = iou; + best_an_index = an_idx; + } + if (iou > ignore_thresh) { + mask_false_t(b, an_idx, gj, gi) = 0; + } + } + mask_true_t(b, best_an_index, gj, gi) = 1; + mask_false_t(b, best_an_index, gj, gi) = 1; + tx_t(i, best_an_index, gj, gi) = gx - gi; + ty_t(i, best_an_index, gj, gi) = gy - gj; + tw_t(i, best_an_index, gj, gi) = + log(gw / anchors[2 * best_an_index] + 1e-16); + th_t(i, best_an_index, gj, gi) = + log(gh / anchors[2 * best_an_index + 1] + 1e-16); + tclass_t(b, best_an_index, gj, gi, gt_label) = 1; + tconf_t(b, best_an_index, gj, gi) = 1; + + std::vector pred_box({ + pred_boxes_t(i, best_an_index, gj, gi, 0), + pred_boxes_t(i, best_an_index, gj, gi, 1), + pred_boxes_t(i, best_an_index, gj, gi, 2), + pred_boxes_t(i, best_an_index, gj, gi, 3), + }); + gt_box[0] = gx; + gt_box[1] = gy; + iou = CalcBoxIoU(gt_box, pred_box, true); + int pred_label = GetPredLabel(pred_classes, i, best_an_index, gj, gi); + T score = pred_confs_t(i, best_an_index, gj, gi); + if (iou > 0.5 && pred_label == gt_label && score > 0.5) { + (*correct_num)++; + } + } + } + mask_false_t = mask_true_t - mask_false_t; +} + +template +class Yolov3LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_boxes = ctx.Input("GTBox"); + auto* output = ctx.Output("Out"); + int img_height = ctx.Attr("img_height"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + const float stride = static_cast(img_height) / h; + + Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_boxes, pred_confs, pred_classes; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_boxes.mutable_data({n, an_num, h, w, 4}, ctx.GetPlace()); + pred_confs.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_classes.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_boxes, &pred_confs, &pred_classes, &pred_x, + &pred_y, &pred_w, &pred_h, anchors, class_num, stride); + + Tensor mask_true, mask_false; + Tensor tx, ty, tw, th, tconf, tclass; + mask_true.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + mask_false.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + int gt_num = 0; + int correct_num = 0; + CalcPredBoxWithGTBox(pred_boxes, pred_confs, pred_classes, *gt_boxes, + anchors, ignore_thresh, img_height, >_num, + &correct_num, &mask_true, &mask_false, &tx, &ty, + &tw, &th, &tconf, &tclass); + + T loss_x = CalcMSEWithMask(pred_x, tx, mask_true); + T loss_y = CalcMSEWithMask(pred_y, ty, mask_true); + T loss_w = CalcMSEWithMask(pred_w, tw, mask_true); + T loss_h = CalcMSEWithMask(pred_h, th, mask_true); + T loss_conf_true = CalcBCEWithMask(pred_confs, tconf, mask_true); + T loss_conf_false = CalcBCEWithMask(pred_confs, tconf, mask_false); + // T loss_class = CalcCEWithMask() + } +}; + +template +class Yolov3LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* d_input_t = ctx.Output(framework::GradVarName("X")); + auto* d_output_t = ctx.Input(framework::GradVarName("Out")); + } +}; + +} // namespace operators +} // namespace paddle From 77c1328fa749c900c7e12bd6b9d70e84b91d5f49 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sat, 10 Nov 2018 23:32:11 +0800 Subject: [PATCH 025/684] add CPU kernel forward --- paddle/fluid/operators/yolov3_loss_op.cc | 60 ++++--- paddle/fluid/operators/yolov3_loss_op.h | 215 ++++++++++------------- 2 files changed, 127 insertions(+), 148 deletions(-) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index b4c6a185e2b41..9ed7e13dc7b03 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -27,18 +27,8 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Input(X) of Yolov3LossOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("GTBox"), "Input(GTBox) of Yolov3LossOp should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "Output(Out) of Yolov3LossOp should not be null."); - - // PADDLE_ENFORCE(ctx->HasAttr("img_height"), - // "Attr(img_height) of Yolov3LossOp should not be null. "); - // PADDLE_ENFORCE(ctx->HasAttr("anchors"), - // "Attr(anchor) of Yolov3LossOp should not be null.") - // PADDLE_ENFORCE(ctx->HasAttr("class_num"), - // "Attr(class_num) of Yolov3LossOp should not be null."); - // PADDLE_ENFORCE(ctx->HasAttr( - // "ignore_thresh", - // "Attr(ignore_thresh) of Yolov3LossOp should not be null.")); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of Yolov3LossOp should not be null."); auto dim_x = ctx->GetInputDim("X"); auto dim_gt = ctx->GetInputDim("GTBox"); @@ -46,6 +36,14 @@ class Yolov3LossOp : public framework::OperatorWithKernel { auto anchors = ctx->Attrs().Get>("anchors"); auto box_num = ctx->Attrs().Get("box_num"); auto class_num = ctx->Attrs().Get("class_num"); + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], + "Input(X) dim[3] and dim[4] should be euqal."); + PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); PADDLE_ENFORCE_GT(img_height, 0, "Attr(img_height) value should be greater then 0"); PADDLE_ENFORCE_GT(anchors.size(), 0, @@ -56,14 +54,9 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Attr(box_num) should be an integer greater then 0."); PADDLE_ENFORCE_GT(class_num, 0, "Attr(class_num) should be an integer greater then 0."); - PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), - "Input(X) dim[1] should be equal to (anchor_number * (5 " - "+ class_num))."); - PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); - PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); - std::vector dim_out({dim_x[0], 1}); - ctx->SetOutputDim("Out", framework::make_ddim(dim_out)); + std::vector dim_out({1}); + ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); } protected: @@ -80,12 +73,31 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of [N, C, H, W]"); - AddOutput("Out", - "The output yolo loss tensor, " - "This is a 2-D tensor with shape of [N, 1]"); + AddInput( + "GTBox", + "The input tensor of ground truth boxes, " + "This is a 3-D tensor with shape of [N, max_box_num, 5 + class_num], " + "max_box_num is the max number of boxes in each image, " + "class_num is the number of classes in data set. " + "In the third dimention, stores x, y, w, h, confidence, classes " + "one-hot key. " + "x, y is the center cordinate of boxes and w, h is the width and " + "height, " + "and all of them should be divided by input image height to scale to " + "[0, 1]."); + AddOutput("Loss", + "The output yolov3 loss tensor, " + "This is a 1-D tensor with shape of [1]"); AddAttr("box_num", "The number of boxes generated in each grid."); AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair."); + AddAttr("img_height", + "The input image height after crop of yolov3 network."); + AddAttr("ignore_thresh", + "The ignore threshold to ignore confidence loss."); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground truth boxes. @@ -100,8 +112,8 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); - PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), - "Input(Out@GRAD) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null"); auto dim_x = ctx->GetInputDim("X"); if (ctx->HasOutput(framework::GradVarName("X"))) { ctx->SetOutputDim(framework::GradVarName("X"), dim_x); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index 7950390567bd9..a796a578098f2 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -44,8 +44,16 @@ static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, auto x_t = EigenVector::Flatten(x); auto y_t = EigenVector::Flatten(y); auto mask_t = EigenVector::Flatten(mask); - auto result = ((x_t - y_t) * mask_t).pow(2).sum().eval(); - return result(0); + + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += pow(x_t(i) - y_t(i), 2); + points += 1; + } + } + return (error_sum / points); } template @@ -55,27 +63,24 @@ static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, auto y_t = EigenVector::Flatten(y); auto mask_t = EigenVector::Flatten(mask); - auto result = - ((y_t * (x_t.log()) + (1.0 - y_t) * ((1.0 - x_t).log())) * mask_t) - .sum() - .eval(); - return result; -} - -template -static inline T CalcCEWithMask(const Tensor& x, const Tensor& y, - const Tensor& mask) { - auto x_t = EigenVector::Flatten(x); - auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += + -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i))); + points += 1; + } + } + return (error_sum / points); } template -static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, - Tensor* pred_confs, Tensor* pred_classes, - Tensor* pred_x, Tensor* pred_y, Tensor* pred_w, - Tensor* pred_h, std::vector anchors, - const int class_num, const int stride) { +static void CalcPredResult(const Tensor& input, Tensor* pred_confs, + Tensor* pred_classes, Tensor* pred_x, Tensor* pred_y, + Tensor* pred_w, Tensor* pred_h, + std::vector anchors, const int class_num, + const int stride) { const int n = input.dims()[0]; const int c = input.dims()[1]; const int h = input.dims()[2]; @@ -84,7 +89,7 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, const int box_attr_num = 5 + class_num; auto input_t = EigenTensor::From(input); - auto pred_boxes_t = EigenTensor::From(*pred_boxes); + // auto pred_boxes_t = EigenTensor::From(*pred_boxes); auto pred_confs_t = EigenTensor::From(*pred_confs); auto pred_classes_t = EigenTensor::From(*pred_classes); auto pred_x_t = EigenTensor::From(*pred_x); @@ -104,16 +109,16 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, pred_y_t(i, an_idx, j, k) = sigmod(input_t(i, box_attr_num * an_idx + 1, j, k)); pred_w_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx + 2, j, k)); + input_t(i, box_attr_num * an_idx + 2, j, k); pred_h_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx + 3, j, k)); + input_t(i, box_attr_num * an_idx + 3, j, k); - pred_boxes_t(i, an_idx, j, k, 0) = pred_x_t(i, an_idx, j, k) + k; - pred_boxes_t(i, an_idx, j, k, 1) = pred_y_t(i, an_idx, j, k) + j; - pred_boxes_t(i, an_idx, j, k, 2) = - exp(pred_w_t(i, an_idx, j, k)) * an_w; - pred_boxes_t(i, an_idx, j, k, 3) = - exp(pred_h_t(i, an_idx, j, k)) * an_h; + // pred_boxes_t(i, an_idx, j, k, 0) = pred_x_t(i, an_idx, j, k) + k; + // pred_boxes_t(i, an_idx, j, k, 1) = pred_y_t(i, an_idx, j, k) + j; + // pred_boxes_t(i, an_idx, j, k, 2) = + // exp(pred_w_t(i, an_idx, j, k)) * an_w; + // pred_boxes_t(i, an_idx, j, k, 3) = + // exp(pred_h_t(i, an_idx, j, k)) * an_h; pred_confs_t(i, an_idx, j, k) = sigmod(input_t(i, box_attr_num * an_idx + 4, j, k)); @@ -129,40 +134,27 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_boxes, } template -static T CalcBoxIoU(std::vector box1, std::vector box2, - bool center_mode) { - T b1_x1, b1_x2, b1_y1, b1_y2; - T b2_x1, b2_x2, b2_y1, b2_y2; - if (center_mode) { - b1_x1 = box1[0] - box1[2] / 2; - b1_x2 = box1[0] + box1[2] / 2; - b1_y1 = box1[1] - box1[3] / 2; - b1_y2 = box1[1] + box1[3] / 2; - b2_x1 = box2[0] - box2[2] / 2; - b2_x2 = box2[0] + box2[2] / 2; - b2_y1 = box2[1] - box2[3] / 2; - b2_y2 = box2[1] + box2[3] / 2; - } else { - b1_x1 = box1[0]; - b1_x2 = box1[1]; - b1_y1 = box1[2]; - b1_y2 = box1[3]; - b2_x1 = box2[0]; - b2_x2 = box2[0]; - b2_y1 = box2[1]; - b2_y2 = box2[1]; - } - T b1_area = (b1_x2 - b1_x1 + 1.0) * (b1_y2 - b1_y1 + 1.0); - T b2_area = (b2_x2 - b2_x1 + 1.0) * (b2_y2 - b2_y1 + 1.0); +static T CalcBoxIoU(std::vector box1, std::vector box2) { + T b1_x1 = box1[0] - box1[2] / 2; + T b1_x2 = box1[0] + box1[2] / 2; + T b1_y1 = box1[1] - box1[3] / 2; + T b1_y2 = box1[1] + box1[3] / 2; + T b2_x1 = box2[0] - box2[2] / 2; + T b2_x2 = box2[0] + box2[2] / 2; + T b2_y1 = box2[1] - box2[3] / 2; + T b2_y2 = box2[1] + box2[3] / 2; + + T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1); + T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1); T inter_rect_x1 = std::max(b1_x1, b2_x1); T inter_rect_y1 = std::max(b1_y1, b2_y1); T inter_rect_x2 = std::min(b1_x2, b2_x2); T inter_rect_y2 = std::min(b1_y2, b2_y2); - T inter_area = std::max(inter_rect_x2 - inter_rect_x1 + 1.0, 0.0) * - std::max(inter_rect_y2 - inter_rect_y1 + 1.0, 0.0); + T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast(0.0)) * + std::max(inter_rect_y2 - inter_rect_y1, static_cast(0.0)); - return inter_area / (b1_area + b2_area - inter_area + 1e-16); + return inter_area / (b1_area + b2_area - inter_area); } template @@ -181,23 +173,18 @@ static inline int GetPredLabel(const Tensor& pred_classes, int n, } template -static void CalcPredBoxWithGTBox( - const Tensor& pred_boxes, const Tensor& pred_confs, - const Tensor& pred_classes, const Tensor& gt_boxes, - std::vector anchors, const float ignore_thresh, const int img_height, - int* gt_num, int* correct_num, Tensor* mask_true, Tensor* mask_false, - Tensor* tx, Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, - Tensor* tclass) { +static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, + std::vector anchors, const int img_height, + const int grid_size, Tensor* obj_mask, + Tensor* noobj_mask, Tensor* tx, Tensor* ty, + Tensor* tw, Tensor* th, Tensor* tconf, + Tensor* tclass) { const int n = gt_boxes.dims()[0]; const int b = gt_boxes.dims()[1]; - const int grid_size = pred_boxes.dims()[1]; const int anchor_num = anchors.size() / 2; - auto pred_boxes_t = EigenTensor::From(pred_boxes); - auto pred_confs_t = EigenTensor::From(pred_confs); - auto pred_classes_t = EigenTensor::From(pred_classes); auto gt_boxes_t = EigenTensor::From(gt_boxes); - auto mask_true_t = EigenTensor::From(*mask_true).setConstant(0.0); - auto mask_false_t = EigenTensor::From(*mask_false).setConstant(1.0); + auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0); + auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); auto tx_t = EigenTensor::From(*tx).setConstant(0.0); auto ty_t = EigenTensor::From(*ty).setConstant(0.0); auto tw_t = EigenTensor::From(*tw).setConstant(0.0); @@ -205,8 +192,6 @@ static void CalcPredBoxWithGTBox( auto tconf_t = EigenTensor::From(*tconf).setConstant(0.0); auto tclass_t = EigenTensor::From(*tclass).setConstant(0.0); - *gt_num = 0; - *correct_num = 0; for (int i = 0; i < n; i++) { for (int j = 0; j < b; j++) { if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && @@ -214,12 +199,11 @@ static void CalcPredBoxWithGTBox( continue; } - *(gt_num)++; int gt_label = gt_boxes_t(i, j, 0); - T gx = gt_boxes_t(i, j, 1); - T gy = gt_boxes_t(i, j, 2); - T gw = gt_boxes_t(i, j, 3); - T gh = gt_boxes_t(i, j, 4); + T gx = gt_boxes_t(i, j, 1) * grid_size; + T gy = gt_boxes_t(i, j, 2) * grid_size; + T gw = gt_boxes_t(i, j, 3) * grid_size; + T gh = gt_boxes_t(i, j, 4) * grid_size; int gi = static_cast(gx); int gj = static_cast(gy); @@ -230,43 +214,26 @@ static void CalcPredBoxWithGTBox( for (int an_idx = 0; an_idx < anchor_num; an_idx++) { std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), static_cast(anchors[2 * an_idx + 1])}); - iou = CalcBoxIoU(gt_box, anchor_shape, false); + iou = CalcBoxIoU(gt_box, anchor_shape); if (iou > max_iou) { max_iou = iou; best_an_index = an_idx; } if (iou > ignore_thresh) { - mask_false_t(b, an_idx, gj, gi) = 0; + noobj_mask_t(b, an_idx, gj, gi) = 0; } } - mask_true_t(b, best_an_index, gj, gi) = 1; - mask_false_t(b, best_an_index, gj, gi) = 1; + obj_mask_t(b, best_an_index, gj, gi) = 1; + noobj_mask_t(b, best_an_index, gj, gi) = 1; tx_t(i, best_an_index, gj, gi) = gx - gi; ty_t(i, best_an_index, gj, gi) = gy - gj; - tw_t(i, best_an_index, gj, gi) = - log(gw / anchors[2 * best_an_index] + 1e-16); - th_t(i, best_an_index, gj, gi) = - log(gh / anchors[2 * best_an_index + 1] + 1e-16); + tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); + th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); tclass_t(b, best_an_index, gj, gi, gt_label) = 1; tconf_t(b, best_an_index, gj, gi) = 1; - - std::vector pred_box({ - pred_boxes_t(i, best_an_index, gj, gi, 0), - pred_boxes_t(i, best_an_index, gj, gi, 1), - pred_boxes_t(i, best_an_index, gj, gi, 2), - pred_boxes_t(i, best_an_index, gj, gi, 3), - }); - gt_box[0] = gx; - gt_box[1] = gy; - iou = CalcBoxIoU(gt_box, pred_box, true); - int pred_label = GetPredLabel(pred_classes, i, best_an_index, gj, gi); - T score = pred_confs_t(i, best_an_index, gj, gi); - if (iou > 0.5 && pred_label == gt_label && score > 0.5) { - (*correct_num)++; - } } } - mask_false_t = mask_true_t - mask_false_t; + noobj_mask_t = noobj_mask_t - obj_mask_t; } template @@ -275,7 +242,7 @@ class Yolov3LossKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); auto* gt_boxes = ctx.Input("GTBox"); - auto* output = ctx.Output("Out"); + auto* loss = ctx.Output("Loss"); int img_height = ctx.Attr("img_height"); auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); @@ -286,44 +253,44 @@ class Yolov3LossKernel : public framework::OpKernel { const int h = input->dims()[2]; const int w = input->dims()[3]; const int an_num = anchors.size() / 2; - const float stride = static_cast(img_height) / h; + const T stride = static_cast(img_height) / h; Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_boxes, pred_confs, pred_classes; + Tensor pred_confs, pred_classes; pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_boxes.mutable_data({n, an_num, h, w, 4}, ctx.GetPlace()); pred_confs.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_classes.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - CalcPredResult(*input, &pred_boxes, &pred_confs, &pred_classes, &pred_x, - &pred_y, &pred_w, &pred_h, anchors, class_num, stride); + CalcPredResult(*input, &pred_confs, &pred_classes, &pred_x, &pred_y, + &pred_w, &pred_h, anchors, class_num, stride); - Tensor mask_true, mask_false; + Tensor obj_mask, noobj_mask; Tensor tx, ty, tw, th, tconf, tclass; - mask_true.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - mask_false.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - int gt_num = 0; - int correct_num = 0; - CalcPredBoxWithGTBox(pred_boxes, pred_confs, pred_classes, *gt_boxes, - anchors, ignore_thresh, img_height, >_num, - &correct_num, &mask_true, &mask_false, &tx, &ty, - &tw, &th, &tconf, &tclass); - - T loss_x = CalcMSEWithMask(pred_x, tx, mask_true); - T loss_y = CalcMSEWithMask(pred_y, ty, mask_true); - T loss_w = CalcMSEWithMask(pred_w, tw, mask_true); - T loss_h = CalcMSEWithMask(pred_h, th, mask_true); - T loss_conf_true = CalcBCEWithMask(pred_confs, tconf, mask_true); - T loss_conf_false = CalcBCEWithMask(pred_confs, tconf, mask_false); - // T loss_class = CalcCEWithMask() + PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, img_height, h, + &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, + &tclass); + + T loss_x = CalcMSEWithMask(pred_x, tx, obj_mask); + T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); + T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); + T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); + T loss_conf_true = CalcBCEWithMask(pred_confs, tconf, obj_mask); + T loss_conf_false = CalcBCEWithMask(pred_confs, tconf, noobj_mask); + T loss_class = CalcBCEWithMask(pred_classes, tclass, obj_mask); + + auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); + loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_true + + loss_conf_false + loss_class; } }; From 36c46152e140adab7e74eaeee9dbeccb65fc5633 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Sun, 11 Nov 2018 23:52:36 +0800 Subject: [PATCH 026/684] Add unittest for yolov3_loss. test=develop --- paddle/fluid/operators/yolov3_loss_op.cc | 25 +-- paddle/fluid/operators/yolov3_loss_op.h | 67 +++--- python/paddle/fluid/layers/nn.py | 28 +++ .../tests/unittests/test_yolov3_loss_op.py | 194 ++++++++++++++++++ 4 files changed, 273 insertions(+), 41 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index 9ed7e13dc7b03..7369ce31e8c9e 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -34,7 +34,6 @@ class Yolov3LossOp : public framework::OperatorWithKernel { auto dim_gt = ctx->GetInputDim("GTBox"); auto img_height = ctx->Attrs().Get("img_height"); auto anchors = ctx->Attrs().Get>("anchors"); - auto box_num = ctx->Attrs().Get("box_num"); auto class_num = ctx->Attrs().Get("class_num"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], @@ -50,8 +49,6 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Attr(anchors) length should be greater then 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, "Attr(anchors) length should be even integer."); - PADDLE_ENFORCE_GT(box_num, 0, - "Attr(box_num) should be an integer greater then 0."); PADDLE_ENFORCE_GT(class_num, 0, "Attr(class_num) should be an integer greater then 0."); @@ -73,23 +70,19 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddInput("X", "The input tensor of bilinear interpolation, " "This is a 4-D tensor with shape of [N, C, H, W]"); - AddInput( - "GTBox", - "The input tensor of ground truth boxes, " - "This is a 3-D tensor with shape of [N, max_box_num, 5 + class_num], " - "max_box_num is the max number of boxes in each image, " - "class_num is the number of classes in data set. " - "In the third dimention, stores x, y, w, h, confidence, classes " - "one-hot key. " - "x, y is the center cordinate of boxes and w, h is the width and " - "height, " - "and all of them should be divided by input image height to scale to " - "[0, 1]."); + AddInput("GTBox", + "The input tensor of ground truth boxes, " + "This is a 3-D tensor with shape of [N, max_box_num, 5], " + "max_box_num is the max number of boxes in each image, " + "In the third dimention, stores label, x, y, w, h, " + "label is an integer to specify box class, x, y is the " + "center cordinate of boxes and w, h is the width and height" + "and x, y, w, h should be divided by input image height to " + "scale to [0, 1]."); AddOutput("Loss", "The output yolov3 loss tensor, " "This is a 1-D tensor with shape of [1]"); - AddAttr("box_num", "The number of boxes generated in each grid."); AddAttr("class_num", "The number of classes to predict."); AddAttr>("anchors", "The anchor width and height, " diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index a796a578098f2..426e0688ab63a 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -25,8 +25,7 @@ template using EigenVector = framework::EigenVector; -using Array2 = Eigen::DSizes; -using Array4 = Eigen::DSizes; +using Array5 = Eigen::DSizes; template static inline bool isZero(T x) { @@ -43,7 +42,7 @@ static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, const Tensor& mask) { auto x_t = EigenVector::Flatten(x); auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); + auto mask_t = EigenVector::Flatten(mask); T error_sum = 0.0; T points = 0.0; @@ -61,7 +60,7 @@ static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, const Tensor& mask) { auto x_t = EigenVector::Flatten(x); auto y_t = EigenVector::Flatten(y); - auto mask_t = EigenVector::Flatten(mask); + auto mask_t = EigenVector::Flatten(mask); T error_sum = 0.0; T points = 0.0; @@ -89,7 +88,6 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_confs, const int box_attr_num = 5 + class_num; auto input_t = EigenTensor::From(input); - // auto pred_boxes_t = EigenTensor::From(*pred_boxes); auto pred_confs_t = EigenTensor::From(*pred_confs); auto pred_classes_t = EigenTensor::From(*pred_classes); auto pred_x_t = EigenTensor::From(*pred_x); @@ -113,13 +111,6 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_confs, pred_h_t(i, an_idx, j, k) = input_t(i, box_attr_num * an_idx + 3, j, k); - // pred_boxes_t(i, an_idx, j, k, 0) = pred_x_t(i, an_idx, j, k) + k; - // pred_boxes_t(i, an_idx, j, k, 1) = pred_y_t(i, an_idx, j, k) + j; - // pred_boxes_t(i, an_idx, j, k, 2) = - // exp(pred_w_t(i, an_idx, j, k)) * an_w; - // pred_boxes_t(i, an_idx, j, k, 3) = - // exp(pred_h_t(i, an_idx, j, k)) * an_h; - pred_confs_t(i, an_idx, j, k) = sigmod(input_t(i, box_attr_num * an_idx + 4, j, k)); @@ -199,7 +190,7 @@ static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, continue; } - int gt_label = gt_boxes_t(i, j, 0); + int gt_label = static_cast(gt_boxes_t(i, j, 0)); T gx = gt_boxes_t(i, j, 1) * grid_size; T gy = gt_boxes_t(i, j, 2) * grid_size; T gw = gt_boxes_t(i, j, 3) * grid_size; @@ -207,7 +198,7 @@ static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, int gi = static_cast(gx); int gj = static_cast(gy); - T max_iou = static_cast(-1); + T max_iou = static_cast(0); T iou; int best_an_index = -1; std::vector gt_box({0, 0, gw, gh}); @@ -220,20 +211,33 @@ static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, best_an_index = an_idx; } if (iou > ignore_thresh) { - noobj_mask_t(b, an_idx, gj, gi) = 0; + noobj_mask_t(i, an_idx, gj, gi) = 0; } } - obj_mask_t(b, best_an_index, gj, gi) = 1; - noobj_mask_t(b, best_an_index, gj, gi) = 1; + obj_mask_t(i, best_an_index, gj, gi) = 1; + noobj_mask_t(i, best_an_index, gj, gi) = 0; tx_t(i, best_an_index, gj, gi) = gx - gi; ty_t(i, best_an_index, gj, gi) = gy - gj; tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); - tclass_t(b, best_an_index, gj, gi, gt_label) = 1; - tconf_t(b, best_an_index, gj, gi) = 1; + tclass_t(i, best_an_index, gj, gi, gt_label) = 1; + tconf_t(i, best_an_index, gj, gi) = 1; } } - noobj_mask_t = noobj_mask_t - obj_mask_t; +} + +static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand, + const Tensor& obj_mask) { + const int n = obj_mask_expand->dims()[0]; + const int an_num = obj_mask_expand->dims()[1]; + const int h = obj_mask_expand->dims()[2]; + const int w = obj_mask_expand->dims()[3]; + const int class_num = obj_mask_expand->dims()[4]; + auto obj_mask_expand_t = EigenTensor::From(*obj_mask_expand); + auto obj_mask_t = EigenTensor::From(obj_mask); + + obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1)) + .broadcast(Array5(1, 1, 1, 1, class_num)); } template @@ -280,17 +284,30 @@ class Yolov3LossKernel : public framework::OpKernel { &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + T loss_x = CalcMSEWithMask(pred_x, tx, obj_mask); T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); - T loss_conf_true = CalcBCEWithMask(pred_confs, tconf, obj_mask); - T loss_conf_false = CalcBCEWithMask(pred_confs, tconf, noobj_mask); - T loss_class = CalcBCEWithMask(pred_classes, tclass, obj_mask); + T loss_conf_obj = CalcBCEWithMask(pred_confs, tconf, obj_mask); + T loss_conf_noobj = CalcBCEWithMask(pred_confs, tconf, noobj_mask); + T loss_class = CalcBCEWithMask(pred_classes, tclass, obj_mask_expand); + + // LOG(ERROR) << "loss_x: " << loss_x; + // LOG(ERROR) << "loss_y: " << loss_y; + // LOG(ERROR) << "loss_w: " << loss_w; + // LOG(ERROR) << "loss_h: " << loss_h; + // LOG(ERROR) << "loss_conf_obj: " << loss_conf_obj; + // LOG(ERROR) << "loss_conf_noobj: " << loss_conf_noobj; + // LOG(ERROR) << "loss_class: " << loss_class; auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); - loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_true + - loss_conf_false + loss_class; + loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_obj + + loss_conf_noobj + loss_class; } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d3623464e9974..1ee7198f29260 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -164,6 +164,7 @@ 'hash', 'grid_sampler', 'log_loss', + 'yolov3_loss', 'add_position_encoding', 'bilinear_tensor_product', ] @@ -8243,6 +8244,33 @@ def log_loss(input, label, epsilon=1e-4, name=None): return loss +def yolov3_loss(x, gtbox, img_height, anchors, ignore_thresh, name=None): + """ + **YOLOv3 Loss Layer** + + This layer + """ + helper = LayerHelper('yolov3_loss', **locals()) + + if name is None: + loss = helper.create_variable_for_type_inference(dtype=x.dtype) + else: + loss = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + helper.append_op( + type='yolov3_loss', + inputs={'X': x, + "GTBox": gtbox}, + outputs={'Loss': loss}, + attrs={ + "img_height": img_height, + "anchors": anchors, + "ignore_thresh": ignore_thresh, + }) + return loss + + def add_position_encoding(input, alpha, beta, name=None): """ **Add Position Encoding Layer** diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py new file mode 100644 index 0000000000000..f5b15efb27ff7 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -0,0 +1,194 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest +import numpy as np +from op_test import OpTest + + +def sigmoid(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def mse(x, y, num): + return ((y - x)**2).sum() / num + + +def bce(x, y, mask): + x = x.reshape((-1)) + y = y.reshape((-1)) + mask = mask.reshape((-1)) + + error_sum = 0.0 + count = 0 + for i in range(x.shape[0]): + if mask[i] > 0: + error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i]) + count += 1 + return error_sum / (-1.0 * count) + + +def box_iou(box1, box2): + b1_x1 = box1[0] - box1[2] / 2 + b1_x2 = box1[0] + box1[2] / 2 + b1_y1 = box1[1] - box1[3] / 2 + b1_y2 = box1[1] + box1[3] / 2 + b2_x1 = box2[0] - box2[2] / 2 + b2_x2 = box2[0] + box2[2] / 2 + b2_y1 = box2[1] - box2[3] / 2 + b2_y2 = box2[1] + box2[3] / 2 + + b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) + b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + + inter_rect_x1 = max(b1_x1, b2_x1) + inter_rect_y1 = max(b1_y1, b2_y1) + inter_rect_x2 = min(b1_x2, b2_x2) + inter_rect_y2 = min(b1_y2, b2_y2) + inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max( + inter_rect_y2 - inter_rect_y1, 0) + + return inter_area / (b1_area + b2_area + inter_area) + + +def build_target(gtboxs, attrs, grid_size): + n, b, _ = gtboxs.shape + ignore_thresh = attrs["ignore_thresh"] + img_height = attrs["img_height"] + anchors = attrs["anchors"] + class_num = attrs["class_num"] + an_num = len(anchors) / 2 + obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32') + tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tcls = np.zeros( + (n, an_num, grid_size, grid_size, class_num)).astype('float32') + + for i in range(n): + for j in range(b): + if gtboxs[i, j, :].sum() == 0: + continue + + gt_label = int(gtboxs[i, j, 0]) + gx = gtboxs[i, j, 1] * grid_size + gy = gtboxs[i, j, 2] * grid_size + gw = gtboxs[i, j, 3] * grid_size + gh = gtboxs[i, j, 4] * grid_size + + gi = int(gx) + gj = int(gy) + + gtbox = [0, 0, gw, gh] + max_iou = 0 + for k in range(an_num): + anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]] + iou = box_iou(gtbox, anchor_box) + if iou > max_iou: + max_iou = iou + best_an_index = k + if iou > ignore_thresh: + noobj_mask[i, best_an_index, gj, gi] = 0 + + obj_mask[i, best_an_index, gj, gi] = 1 + noobj_mask[i, best_an_index, gj, gi] = 0 + tx[i, best_an_index, gj, gi] = gx - gi + ty[i, best_an_index, gj, gi] = gy - gj + tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 * + best_an_index]) + th[i, best_an_index, gj, gi] = np.log( + gh / anchors[2 * best_an_index + 1]) + tconf[i, best_an_index, gj, gi] = 1 + tcls[i, best_an_index, gj, gi, gt_label] = 1 + + return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask) + + +def YoloV3Loss(x, gtbox, attrs): + n, c, h, w = x.shape + an_num = len(attrs['anchors']) / 2 + class_num = attrs["class_num"] + x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) + pred_x = sigmoid(x[:, :, :, :, 0]) + pred_y = sigmoid(x[:, :, :, :, 1]) + pred_w = x[:, :, :, :, 2] + pred_h = x[:, :, :, :, 3] + pred_conf = sigmoid(x[:, :, :, :, 4]) + pred_cls = sigmoid(x[:, :, :, :, 5:]) + + tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target( + gtbox, attrs, x.shape[2]) + + obj_mask_expand = np.tile( + np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num']))) + loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum()) + loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum()) + loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum()) + loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum()) + loss_conf_obj = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) + loss_conf_noobj = bce(pred_conf * noobj_mask, tconf * noobj_mask, + noobj_mask) + loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, + obj_mask_expand) + # print "loss_x: ", loss_x + # print "loss_y: ", loss_y + # print "loss_w: ", loss_w + # print "loss_h: ", loss_h + # print "loss_conf_obj: ", loss_conf_obj + # print "loss_conf_noobj: ", loss_conf_noobj + # print "loss_class: ", loss_class + + return loss_x + loss_y + loss_w + loss_h + loss_conf_obj + loss_conf_noobj + loss_class + + +class TestYolov3LossOp(OpTest): + def setUp(self): + self.initTestCase() + self.op_type = 'yolov3_loss' + x = np.random.random(size=self.x_shape).astype('float32') + gtbox = np.random.random(size=self.gtbox_shape).astype('float32') + gtbox[:, :, 0] = np.random.randint(0, self.class_num, + self.gtbox_shape[:2]) + + self.attrs = { + "img_height": self.img_height, + "anchors": self.anchors, + "class_num": self.class_num, + "ignore_thresh": self.ignore_thresh, + } + + self.inputs = {'X': x, 'GTBox': gtbox} + self.outputs = {'Loss': np.array([YoloV3Loss(x, gtbox, self.attrs)])} + print self.outputs + + def test_check_output(self): + self.check_output(atol=1e-3) + + # def test_check_grad_normal(self): + # self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61) + + def initTestCase(self): + self.img_height = 608 + self.anchors = [10, 13, 16, 30, 33, 23] + self.class_num = 10 + self.ignore_thresh = 0.5 + self.x_shape = (5, len(self.anchors) / 2 * (5 + self.class_num), 7, 7) + self.gtbox_shape = (5, 10, 5) + + +if __name__ == "__main__": + unittest.main() From f4be1d99d0a9c334d6b4ee8d6c557ea0d936f58a Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 13 Nov 2018 06:19:26 +0000 Subject: [PATCH 027/684] polish code and test --- .../operators/hierarchical_sigmoid_op.cc | 2 +- python/paddle/fluid/layers/nn.py | 66 +++++++++++++------ .../fluid/tests/unittests/test_layers.py | 17 +++++ 3 files changed, 63 insertions(+), 22 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 49a17416c84ac..8d4e0556dd6a7 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -115,7 +115,7 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); - AddAttr("num_classes", "(int, required), The number of classes") + AddAttr("num_classes", "(int, optional), The number of classes") .SetDefault(2); AddComment(R"DOC( The hierarchical sigmoid operator organize the classes into a binary tree. diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index d3ee80ad529b3..835ec4506a944 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4348,12 +4348,14 @@ def nce(input, def hsigmoid(input, label, - num_classes, - ptabl=None, + num_classes=None, + non_leaf_num=None, + ptable=None, pcode=None, param_attr=None, bias_attr=None, - name=None): + name=None, + is_costum=False): """ The hierarchical sigmoid operator is used to accelerate the training process of language model. This operator organizes the classes into a @@ -4373,7 +4375,8 @@ def hsigmoid(input, and :math:`D` is the feature size. label (Variable): The tensor variable contains labels of training data. It's a tensor with shape is :math:`[N \\times 1]`. - num_classes: (int), The number of classes, must not be less than 2. + num_classes: (int), The number of classes, must not be less than 2. with default tree this has to be set + non_leaf_num: this defines the number of non-leaf nodes in costumed tree ptable: (Variable|None) this variable can store each batch of samples' path to root, it should be in leaf -> root order ptable should have the same shape with pcode, and for each sample i ptable[i] indicates a np.array like @@ -4409,20 +4412,33 @@ def hsigmoid(input, out = helper.create_variable_for_type_inference(dtype) pre_out = helper.create_variable_for_type_inference(dtype) dim = input.shape[1] - if num_classes < 2: - raise ValueError("num_classes must not be less than 2.") - if (ptable is not None) and (pcode is None): - raise ValueError("pcode should not be None when ptable has been set") - elif (ptable is None) and (pcode is not None): - raise ValueError("ptable should not be None when pcode has been set") + if ((num_classes < 2) or (num_classes is None)) and (not is_costum): + raise ValueError( + "num_classes must not be less than 2 with default tree") + + if (is_costum) and (pcode is None): + raise ValueError("pcode should not be None with costum tree") + elif (is_costum) and (ptable is None): + raise ValueError("ptable should not be None with costum tree") + elif (is_costum) and (non_leaf_num is None): + raise ValueError("non_leaf_num should not be None with costum tree") else: pass - weights = helper.create_parameter( - attr=helper.param_attr, - shape=[num_classes - 1, dim], - is_bias=False, - dtype=input.dtype) + weights = None + + if not is_costum: + weights = helper.create_parameter( + attr=helper.param_attr, + shape=[num_classes - 1, dim], + is_bias=False, + dtype=input.dtype) + else: + weights = helper.create_parameter( + attr=helper.param_attr, + shape=[non_leaf_num, dim], + is_bias=False, + dtype=input.dtype) inputs = { "X": input, "W": weights, @@ -4431,12 +4447,20 @@ def hsigmoid(input, "Label": label } if helper.bias_attr: - bias = helper.create_parameter( - attr=helper.bias_attr, - shape=[1, num_classes - 1], - is_bias=True, - dtype=input.dtype) - inputs['Bias'] = bias + if not is_costum: + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=[1, num_classes - 1], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = bias + else: + bias = helper.create_parameter( + attr=helper.bias_attr, + shape=[1, non_leaf_num], + is_bias=True, + dtype=input.dtype) + inputs['Bias'] = bias helper.append_op( type="hierarchical_sigmoid", inputs=inputs, diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 50de468dba803..b067e6213c8e5 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -185,6 +185,23 @@ def test_hsigmoid(self): input=x, label=y, num_classes=2)) print(str(program)) + program2 = Program() + + with program_guard(program2): + x2 = layers.data(name='x2', shape=[4, 8], dtype='float32') + y2 = layers.data(name='y2', shape=[4], dtype='int64') + ptable = layers.data(name='ptable', shape=[4, 6], dtype='int64') + pcode = layers.data(name='pcode', shape=[4, 6], dtype='int64') + self.assertIsNotNone( + layers.hsigmoid( + input=x2, + label=y2, + non_leaf_num=6, + ptable=ptable, + pcode=pcode, + is_costum=True)) + print(str(program2)) + def test_sequence_expand(self): program = Program() with program_guard(program): From 30332ad91d6c69b841d7ead0bb000b5964287a7b Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 13 Nov 2018 06:34:56 +0000 Subject: [PATCH 028/684] test=develop --- python/paddle/fluid/tests/unittests/test_layers.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index b067e6213c8e5..4379aeb9933d7 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -185,8 +185,8 @@ def test_hsigmoid(self): input=x, label=y, num_classes=2)) print(str(program)) + # test hsigmod with custom tree structure program2 = Program() - with program_guard(program2): x2 = layers.data(name='x2', shape=[4, 8], dtype='float32') y2 = layers.data(name='y2', shape=[4], dtype='int64') From db06568e693a724b5578ab6c77d9db833d253f18 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 13 Nov 2018 08:26:13 +0000 Subject: [PATCH 029/684] test=develop --- paddle/fluid/API.spec | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 3bbe7c2b8cd60..d64939413b369 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -98,7 +98,7 @@ paddle.fluid.layers.sequence_reshape ArgSpec(args=['input', 'new_dim'], varargs= paddle.fluid.layers.transpose ArgSpec(args=['x', 'perm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.im2sequence ArgSpec(args=['input', 'filter_size', 'stride', 'padding', 'input_image_size', 'out_stride', 'name'], varargs=None, keywords=None, defaults=(1, 1, 0, None, 1, None)) paddle.fluid.layers.nce ArgSpec(args=['input', 'label', 'num_total_classes', 'sample_weight', 'param_attr', 'bias_attr', 'num_neg_samples', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None)) -paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) +paddle.fluid.layers.hsigmoid ArgSpec(args=['input', 'label', 'num_classes', 'non_leaf_num', 'ptable', 'pcode', 'param_attr', 'bias_attr', 'name', 'is_costum'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, False)) paddle.fluid.layers.beam_search ArgSpec(args=['pre_ids', 'pre_scores', 'ids', 'scores', 'beam_size', 'end_id', 'level', 'name'], varargs=None, keywords=None, defaults=(0, None)) paddle.fluid.layers.row_conv ArgSpec(args=['input', 'future_context_size', 'param_attr', 'act'], varargs=None, keywords=None, defaults=(None, None)) paddle.fluid.layers.multiplex ArgSpec(args=['inputs', 'index'], varargs=None, keywords=None, defaults=None) From a0284f6fbcb4888e1653b7f094db615f1437943c Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Mon, 12 Nov 2018 21:13:25 +0800 Subject: [PATCH 030/684] Add backward CPU kernel. test=develop --- paddle/fluid/API.spec | 1 + paddle/fluid/operators/yolov3_loss_op.cc | 64 ++++- paddle/fluid/operators/yolov3_loss_op.cu | 4 +- paddle/fluid/operators/yolov3_loss_op.h | 256 +++++++++++++----- python/paddle/fluid/layers/nn.py | 49 +++- .../fluid/tests/unittests/test_layers.py | 9 + .../tests/unittests/test_yolov3_loss_op.py | 42 +-- 7 files changed, 327 insertions(+), 98 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index de32a5d5a297b..8344a913e9bf8 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -183,6 +183,7 @@ paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', ' paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'anchors', 'class_num', 'ignore_thresh', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index 7369ce31e8c9e..cf25e995054dc 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -20,8 +20,6 @@ using framework::Tensor; class Yolov3LossOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) of Yolov3LossOp should not be null."); @@ -32,7 +30,6 @@ class Yolov3LossOp : public framework::OperatorWithKernel { auto dim_x = ctx->GetInputDim("X"); auto dim_gt = ctx->GetInputDim("GTBox"); - auto img_height = ctx->Attrs().Get("img_height"); auto anchors = ctx->Attrs().Get>("anchors"); auto class_num = ctx->Attrs().Get("class_num"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); @@ -43,8 +40,6 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "+ class_num))."); PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); - PADDLE_ENFORCE_GT(img_height, 0, - "Attr(img_height) value should be greater then 0"); PADDLE_ENFORCE_GT(anchors.size(), 0, "Attr(anchors) length should be greater then 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, @@ -87,13 +82,43 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>("anchors", "The anchor width and height, " "it will be parsed pair by pair."); - AddAttr("img_height", - "The input image height after crop of yolov3 network."); AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss."); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground truth boxes. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, specify the grid size, each grid point predict given + number boxes, this given number is specified by anchors, it should be + half anchors length, which following will be represented as S. In the + second dimention(the channel dimention), C should be S * (class_num + 5), + class_num is the box categoriy number of source dataset(such as coco), + so in the second dimention, stores 4 box location coordinates x, y, w, h + and confidence score of the box and class one-hot key of each anchor box. + + While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions + correspnd to: + + $$ + b_x = \sigma(t_x) + c_x + b_y = \sigma(t_y) + c_y + b_w = p_w e^{t_w} + b_h = p_h e^{t_h} + $$ + + While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ + is specified by anchors. + + As for confidence score, it is the logistic regression value of IoU between + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger then ignore + thresh, the confidence score loss of this anchor box will be ignored. + + Therefore, the yolov3 loss consist of three major parts, box location loss, + confidence score loss, and classification loss. The MSE loss is used for + box location, and binary cross entropy loss is used for confidence score + loss and classification loss. )DOC"); } }; @@ -101,8 +126,6 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { class Yolov3LossOpGrad : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - - protected: void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), @@ -113,6 +136,7 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { } } + protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -120,12 +144,32 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { } }; +class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("yolov3_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("GTBox", Input("GTBox")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("GTBox"), {}); + return std::unique_ptr(op); + } +}; + } // namespace operators } // namespace paddle namespace ops = paddle::operators; REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, - paddle::framework::DefaultGradOpDescMaker); + ops::Yolov3LossGradMaker); REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); REGISTER_OP_CPU_KERNEL( yolov3_loss, diff --git a/paddle/fluid/operators/yolov3_loss_op.cu b/paddle/fluid/operators/yolov3_loss_op.cu index 48f997456ac48..f901b10d38e48 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cu +++ b/paddle/fluid/operators/yolov3_loss_op.cu @@ -17,7 +17,7 @@ namespace ops = paddle::operators; REGISTER_OP_CUDA_KERNEL( yolov3_loss, - ops::Yolov3LossOpKernel); + ops::Yolov3LossKernel); REGISTER_OP_CUDA_KERNEL( yolov3_loss_grad, - ops::Yolov3LossGradOpKernel); + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index 426e0688ab63a..a2ed4440a74fe 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -33,10 +33,22 @@ static inline bool isZero(T x) { } template -static inline T sigmod(T x) { +static inline T sigmoid(T x) { return 1.0 / (exp(-1.0 * x) + 1.0); } +template +static inline T CalcMaskPointNum(const Tensor& mask) { + auto mask_t = EigenVector::Flatten(mask); + T count = 0.0; + for (int i = 0; i < mask_t.dimensions()[0]; i++) { + if (mask_t(i)) { + count += 1.0; + } + } + return count; +} + template static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, const Tensor& mask) { @@ -55,6 +67,21 @@ static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, return (error_sum / points); } +template +static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y, + const Tensor& mask, T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf; + } + } +} + template static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, const Tensor& mask) { @@ -75,21 +102,34 @@ static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, } template -static void CalcPredResult(const Tensor& input, Tensor* pred_confs, - Tensor* pred_classes, Tensor* pred_x, Tensor* pred_y, - Tensor* pred_w, Tensor* pred_h, - std::vector anchors, const int class_num, - const int stride) { +static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x, + const Tensor& y, const Tensor& mask, + T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf; + } + } +} + +template +static void CalcPredResult(const Tensor& input, Tensor* pred_conf, + Tensor* pred_class, Tensor* pred_x, Tensor* pred_y, + Tensor* pred_w, Tensor* pred_h, const int anchor_num, + const int class_num) { const int n = input.dims()[0]; - const int c = input.dims()[1]; const int h = input.dims()[2]; const int w = input.dims()[3]; - const int anchor_num = anchors.size() / 2; const int box_attr_num = 5 + class_num; auto input_t = EigenTensor::From(input); - auto pred_confs_t = EigenTensor::From(*pred_confs); - auto pred_classes_t = EigenTensor::From(*pred_classes); + auto pred_conf_t = EigenTensor::From(*pred_conf); + auto pred_class_t = EigenTensor::From(*pred_class); auto pred_x_t = EigenTensor::From(*pred_x); auto pred_y_t = EigenTensor::From(*pred_y); auto pred_w_t = EigenTensor::From(*pred_w); @@ -97,26 +137,23 @@ static void CalcPredResult(const Tensor& input, Tensor* pred_confs, for (int i = 0; i < n; i++) { for (int an_idx = 0; an_idx < anchor_num; an_idx++) { - float an_w = anchors[an_idx * 2] / stride; - float an_h = anchors[an_idx * 2 + 1] / stride; - for (int j = 0; j < h; j++) { for (int k = 0; k < w; k++) { pred_x_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx, j, k)); + sigmoid(input_t(i, box_attr_num * an_idx, j, k)); pred_y_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx + 1, j, k)); + sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k)); pred_w_t(i, an_idx, j, k) = input_t(i, box_attr_num * an_idx + 2, j, k); pred_h_t(i, an_idx, j, k) = input_t(i, box_attr_num * an_idx + 3, j, k); - pred_confs_t(i, an_idx, j, k) = - sigmod(input_t(i, box_attr_num * an_idx + 4, j, k)); + pred_conf_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k)); for (int c = 0; c < class_num; c++) { - pred_classes_t(i, an_idx, j, k, c) = - sigmod(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); + pred_class_t(i, an_idx, j, k, c) = + sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); } } } @@ -148,27 +185,11 @@ static T CalcBoxIoU(std::vector box1, std::vector box2) { return inter_area / (b1_area + b2_area - inter_area); } -template -static inline int GetPredLabel(const Tensor& pred_classes, int n, - int best_an_index, int gj, int gi) { - auto pred_classes_t = EigenTensor::From(pred_classes); - T score = 0.0; - int label = -1; - for (int i = 0; i < pred_classes.dims()[4]; i++) { - if (pred_classes_t(n, best_an_index, gj, gi, i) > score) { - score = pred_classes_t(n, best_an_index, gj, gi, i); - label = i; - } - } - return label; -} - template static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, - std::vector anchors, const int img_height, - const int grid_size, Tensor* obj_mask, - Tensor* noobj_mask, Tensor* tx, Tensor* ty, - Tensor* tw, Tensor* th, Tensor* tconf, + std::vector anchors, const int grid_size, + Tensor* obj_mask, Tensor* noobj_mask, Tensor* tx, + Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, Tensor* tclass) { const int n = gt_boxes.dims()[0]; const int b = gt_boxes.dims()[1]; @@ -240,6 +261,61 @@ static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand, .broadcast(Array5(1, 1, 1, 1, class_num)); } +template +static void AddAllGradToInputGrad( + Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y, + const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, + const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, + const Tensor& grad_conf_obj, const Tensor& grad_conf_noobj, + const Tensor& grad_class, const int class_num) { + const int n = pred_x.dims()[0]; + const int an_num = pred_x.dims()[1]; + const int h = pred_x.dims()[2]; + const int w = pred_x.dims()[3]; + const int attr_num = class_num + 5; + auto grad_t = EigenTensor::From(*grad).setConstant(0.0); + auto pred_x_t = EigenTensor::From(pred_x); + auto pred_y_t = EigenTensor::From(pred_y); + auto pred_conf_t = EigenTensor::From(pred_conf); + auto pred_class_t = EigenTensor::From(pred_class); + auto grad_x_t = EigenTensor::From(grad_x); + auto grad_y_t = EigenTensor::From(grad_y); + auto grad_w_t = EigenTensor::From(grad_w); + auto grad_h_t = EigenTensor::From(grad_h); + auto grad_conf_obj_t = EigenTensor::From(grad_conf_obj); + auto grad_conf_noobj_t = EigenTensor::From(grad_conf_noobj); + auto grad_class_t = EigenTensor::From(grad_class); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + grad_t(i, j * attr_num, k, l) = grad_x_t(i, j, k, l) * + pred_x_t(i, j, k, l) * + (1.0 - pred_x_t(i, j, k, l)) * loss; + grad_t(i, j * attr_num + 1, k, l) = + grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * + (1.0 - pred_y_t(i, j, k, l)) * loss; + grad_t(i, j * attr_num + 2, k, l) = grad_w_t(i, j, k, l) * loss; + grad_t(i, j * attr_num + 3, k, l) = grad_h_t(i, j, k, l) * loss; + grad_t(i, j * attr_num + 4, k, l) = + grad_conf_obj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss; + grad_t(i, j * attr_num + 4, k, l) += + grad_conf_noobj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss; + + for (int c = 0; c < class_num; c++) { + grad_t(i, j * attr_num + 5 + c, k, l) = + grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * + (1.0 - pred_class_t(i, j, k, l, c)) * loss; + } + } + } + } + } +} + template class Yolov3LossKernel : public framework::OpKernel { public: @@ -247,28 +323,25 @@ class Yolov3LossKernel : public framework::OpKernel { auto* input = ctx.Input("X"); auto* gt_boxes = ctx.Input("GTBox"); auto* loss = ctx.Output("Loss"); - int img_height = ctx.Attr("img_height"); auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); const int n = input->dims()[0]; - const int c = input->dims()[1]; const int h = input->dims()[2]; const int w = input->dims()[3]; const int an_num = anchors.size() / 2; - const T stride = static_cast(img_height) / h; Tensor pred_x, pred_y, pred_w, pred_h; - Tensor pred_confs, pred_classes; + Tensor pred_conf, pred_class; pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_confs.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - pred_classes.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - CalcPredResult(*input, &pred_confs, &pred_classes, &pred_x, &pred_y, - &pred_w, &pred_h, anchors, class_num, stride); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); Tensor obj_mask, noobj_mask; Tensor tx, ty, tw, th, tconf, tclass; @@ -280,9 +353,8 @@ class Yolov3LossKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, img_height, h, - &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, - &tclass); + PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, @@ -293,17 +365,9 @@ class Yolov3LossKernel : public framework::OpKernel { T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); - T loss_conf_obj = CalcBCEWithMask(pred_confs, tconf, obj_mask); - T loss_conf_noobj = CalcBCEWithMask(pred_confs, tconf, noobj_mask); - T loss_class = CalcBCEWithMask(pred_classes, tclass, obj_mask_expand); - - // LOG(ERROR) << "loss_x: " << loss_x; - // LOG(ERROR) << "loss_y: " << loss_y; - // LOG(ERROR) << "loss_w: " << loss_w; - // LOG(ERROR) << "loss_h: " << loss_h; - // LOG(ERROR) << "loss_conf_obj: " << loss_conf_obj; - // LOG(ERROR) << "loss_conf_noobj: " << loss_conf_noobj; - // LOG(ERROR) << "loss_class: " << loss_class; + T loss_conf_obj = CalcBCEWithMask(pred_conf, tconf, obj_mask); + T loss_conf_noobj = CalcBCEWithMask(pred_conf, tconf, noobj_mask); + T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_obj + @@ -315,8 +379,76 @@ template class Yolov3LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* d_input_t = ctx.Output(framework::GradVarName("X")); - auto* d_output_t = ctx.Input(framework::GradVarName("Out")); + auto* input = ctx.Input("X"); + auto* gt_boxes = ctx.Input("GTBox"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Loss")); + const T loss = output_grad->data()[0]; + + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + + Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_conf, pred_class; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); + + Tensor obj_mask, noobj_mask; + Tensor tx, ty, tw, th, tconf, tclass; + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + + Tensor grad_x, grad_y, grad_w, grad_h; + Tensor grad_conf_obj, grad_conf_noobj, grad_class; + grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_obj.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_noobj.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + T obj_mf = CalcMaskPointNum(obj_mask); + T noobj_mf = CalcMaskPointNum(noobj_mask); + T obj_expand_mf = CalcMaskPointNum(obj_mask_expand); + CalcMSEGradWithMask(&grad_x, pred_x, tx, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_y, pred_y, ty, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_w, pred_w, tw, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_h, pred_h, th, obj_mask, obj_mf); + CalcBCEGradWithMask(&grad_conf_obj, pred_conf, tconf, obj_mask, obj_mf); + CalcBCEGradWithMask(&grad_conf_noobj, pred_conf, tconf, noobj_mask, + noobj_mf); + CalcBCEGradWithMask(&grad_class, pred_class, tclass, obj_mask_expand, + obj_expand_mf); + + input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + AddAllGradToInputGrad( + input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, + grad_w, grad_h, grad_conf_obj, grad_conf_noobj, grad_class, class_num); } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1ee7198f29260..a4efb166826d5 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -8244,14 +8244,55 @@ def log_loss(input, label, epsilon=1e-4, name=None): return loss -def yolov3_loss(x, gtbox, img_height, anchors, ignore_thresh, name=None): +@templatedoc(op_type="yolov3_loss") +def yolov3_loss(x, gtbox, anchors, class_num, ignore_thresh, name=None): """ - **YOLOv3 Loss Layer** + ${comment} + + Args: + x (Variable): ${x_comment} + gtbox (Variable): groud truth boxes, shoulb be in shape of [N, B, 5], + in the third dimenstion, class_id, x, y, w, h should + be stored and x, y, w, h should be relative valud of + input image. + anchors (list|tuple): ${anchors_comment} + class_num (int): ${class_num_comment} + ignore_thresh (float): ${ignore_thresh_comment} + name (string): the name of yolov3 loss - This layer + Returns: + Variable: A 1-D tensor with shape [1], the value of yolov3 loss + + Raises: + TypeError: Input x of yolov3_loss must be Variable + TypeError: Input gtbox of yolov3_loss must be Variable" + TypeError: Attr anchors of yolov3_loss must be list or tuple + TypeError: Attr class_num of yolov3_loss must be an integer + TypeError: Attr ignore_thresh of yolov3_loss must be a float number + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10, 255, 13, 13], dtype='float32') + gtbox = fluid.layers.data(name='gtbox', shape=[10, 6, 5], dtype='float32') + anchors = [10, 13, 16, 30, 33, 23] + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 + anchors=anchors, ignore_thresh=0.5) """ helper = LayerHelper('yolov3_loss', **locals()) + if not isinstance(x, Variable): + raise TypeError("Input x of yolov3_loss must be Variable") + if not isinstance(gtbox, Variable): + raise TypeError("Input gtbox of yolov3_loss must be Variable") + if not isinstance(anchors, list) and not isinstance(anchors, tuple): + raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + if not isinstance(class_num, int): + raise TypeError("Attr class_num of yolov3_loss must be an integer") + if not isinstance(ignore_thresh, float): + raise TypeError( + "Attr ignore_thresh of yolov3_loss must be a float number") + if name is None: loss = helper.create_variable_for_type_inference(dtype=x.dtype) else: @@ -8264,8 +8305,8 @@ def yolov3_loss(x, gtbox, img_height, anchors, ignore_thresh, name=None): "GTBox": gtbox}, outputs={'Loss': loss}, attrs={ - "img_height": img_height, "anchors": anchors, + "class_num": class_num, "ignore_thresh": ignore_thresh, }) return loss diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index f48d9c84f9c10..dd02968c30fc3 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -911,6 +911,15 @@ def test_affine_grid(self): self.assertIsNotNone(data_1) print(str(program)) + def test_yolov3_loss(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') + gtbox = layers.data(name='gtbox', shape=[10, 5], dtype='float32') + loss = layers.yolov3_loss(x, gtbox, [10, 13, 30, 13], 10, 0.5) + + self.assertIsNotNone(loss) + def test_bilinear_tensor_product_layer(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index f5b15efb27ff7..4562f8bd4962e 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -12,10 +12,14 @@ # See the License for the specific language governing permissions and # limitations under the License. +from __future__ import division + import unittest import numpy as np from op_test import OpTest +from paddle.fluid import core + def sigmoid(x): return 1.0 / (1.0 + np.exp(-1.0 * x)) @@ -65,10 +69,9 @@ def box_iou(box1, box2): def build_target(gtboxs, attrs, grid_size): n, b, _ = gtboxs.shape ignore_thresh = attrs["ignore_thresh"] - img_height = attrs["img_height"] anchors = attrs["anchors"] class_num = attrs["class_num"] - an_num = len(anchors) / 2 + an_num = len(anchors) // 2 obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') noobj_mask = np.ones((n, an_num, grid_size, grid_size)).astype('float32') tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') @@ -120,7 +123,7 @@ def build_target(gtboxs, attrs, grid_size): def YoloV3Loss(x, gtbox, attrs): n, c, h, w = x.shape - an_num = len(attrs['anchors']) / 2 + an_num = len(attrs['anchors']) // 2 class_num = attrs["class_num"] x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) pred_x = sigmoid(x[:, :, :, :, 0]) @@ -144,13 +147,6 @@ def YoloV3Loss(x, gtbox, attrs): noobj_mask) loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, obj_mask_expand) - # print "loss_x: ", loss_x - # print "loss_y: ", loss_y - # print "loss_w: ", loss_w - # print "loss_h: ", loss_h - # print "loss_conf_obj: ", loss_conf_obj - # print "loss_conf_noobj: ", loss_conf_noobj - # print "loss_class: ", loss_class return loss_x + loss_y + loss_w + loss_h + loss_conf_obj + loss_conf_noobj + loss_class @@ -165,29 +161,35 @@ def setUp(self): self.gtbox_shape[:2]) self.attrs = { - "img_height": self.img_height, "anchors": self.anchors, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, } self.inputs = {'X': x, 'GTBox': gtbox} - self.outputs = {'Loss': np.array([YoloV3Loss(x, gtbox, self.attrs)])} - print self.outputs + self.outputs = { + 'Loss': + np.array([YoloV3Loss(x, gtbox, self.attrs)]).astype('float32') + } def test_check_output(self): - self.check_output(atol=1e-3) + place = core.CPUPlace() + self.check_output_with_place(place, atol=1e-3) - # def test_check_grad_normal(self): - # self.check_grad(['X', 'Grid'], 'Output', max_relative_error=0.61) + def test_check_grad_ignore_gtbox(self): + place = core.CPUPlace() + self.check_grad_with_place( + place, ['X'], + 'Loss', + no_grad_set=set("GTBox"), + max_relative_error=0.1) def initTestCase(self): - self.img_height = 608 - self.anchors = [10, 13, 16, 30, 33, 23] + self.anchors = [10, 13, 12, 12] self.class_num = 10 self.ignore_thresh = 0.5 - self.x_shape = (5, len(self.anchors) / 2 * (5 + self.class_num), 7, 7) - self.gtbox_shape = (5, 10, 5) + self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) + self.gtbox_shape = (5, 5, 5) if __name__ == "__main__": From 99d1446a8ba3bddf899026a030ed6ab2f44a6531 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 14 Nov 2018 05:49:51 +0000 Subject: [PATCH 031/684] test=develop --- python/paddle/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 835ec4506a944..4472f20409f8b 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4412,7 +4412,7 @@ def hsigmoid(input, out = helper.create_variable_for_type_inference(dtype) pre_out = helper.create_variable_for_type_inference(dtype) dim = input.shape[1] - if ((num_classes < 2) or (num_classes is None)) and (not is_costum): + if ((num_classes is None) or (num_classes < 2)) and (not is_costum): raise ValueError( "num_classes must not be less than 2 with default tree") From a507845a7735af6552f035f27902d2758bd36bcb Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 14 Nov 2018 06:13:41 +0000 Subject: [PATCH 032/684] test=develop --- paddle/fluid/operators/math/matrix_bit_code.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 1e2abd1e697a2..39c3b1520b41e 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -159,7 +159,7 @@ class CustomCode : public Code { for (int i = 0; i < static_cast(ptable_->dims()[1]); i++) { if (ptable_->data()[index_ * static_cast(ptable_->dims()[1]) + - i] != -1) { + i] >= 0) { length++; } else { return length; From 2faa2b4048d14e24acd3f8a3f8c55c2f492d0285 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Tue, 13 Nov 2018 20:08:54 +0800 Subject: [PATCH 033/684] remove cu file. test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/yolov3_loss_op.cc | 36 ++++++- paddle/fluid/operators/yolov3_loss_op.cu | 23 ----- paddle/fluid/operators/yolov3_loss_op.h | 43 +++++--- python/paddle/fluid/layers/detection.py | 98 +++++++++++++++++++ python/paddle/fluid/layers/nn.py | 69 ------------- .../tests/unittests/test_yolov3_loss_op.py | 23 ++++- 7 files changed, 182 insertions(+), 112 deletions(-) delete mode 100644 paddle/fluid/operators/yolov3_loss_op.cu diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 8344a913e9bf8..7e0d5e60887b6 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -183,7 +183,6 @@ paddle.fluid.layers.similarity_focus ArgSpec(args=['input', 'axis', 'indexes', ' paddle.fluid.layers.hash ArgSpec(args=['input', 'hash_size', 'num_hash', 'name'], varargs=None, keywords=None, defaults=(1, None)) paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'anchors', 'class_num', 'ignore_thresh', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) @@ -289,6 +288,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'anchors', 'class_num', 'ignore_thresh', 'lambda_xy', 'lambda_wh', 'lambda_conf_obj', 'lambda_conf_noobj', 'lambda_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index cf25e995054dc..f6c134e1b4df8 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -55,7 +55,8 @@ class Yolov3LossOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); } }; @@ -63,8 +64,11 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "The input tensor of bilinear interpolation, " - "This is a 4-D tensor with shape of [N, C, H, W]"); + "The input tensor of YOLO v3 loss operator, " + "This is a 4-D tensor with shape of [N, C, H, W]." + "H and W should be same, and the second dimention(C) stores" + "box locations, confidence score and classification one-hot" + "key of each anchor box"); AddInput("GTBox", "The input tensor of ground truth boxes, " "This is a 3-D tensor with shape of [N, max_box_num, 5], " @@ -84,6 +88,20 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "it will be parsed pair by pair."); AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss."); + AddAttr("lambda_xy", "The weight of x, y location loss.") + .SetDefault(1.0); + AddAttr("lambda_wh", "The weight of w, h location loss.") + .SetDefault(1.0); + AddAttr( + "lambda_conf_obj", + "The weight of confidence score loss in locations with target object.") + .SetDefault(1.0); + AddAttr("lambda_conf_noobj", + "The weight of confidence score loss in locations without " + "target object.") + .SetDefault(1.0); + AddAttr("lambda_class", "The weight of classification loss.") + .SetDefault(1.0); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground truth boxes. @@ -119,6 +137,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { confidence score loss, and classification loss. The MSE loss is used for box location, and binary cross entropy loss is used for confidence score loss and classification loss. + + Final loss will be represented as follow. + + $$ + loss = \lambda_{xy} * loss_{xy} + \lambda_{wh} * loss_{wh} + + \lambda_{conf_obj} * loss_{conf_obj} + + \lambda_{conf_noobj} * loss_{conf_noobj} + + \lambda_{class} * loss_{class} + $$ )DOC"); } }; @@ -140,7 +167,8 @@ class Yolov3LossOpGrad : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.cu b/paddle/fluid/operators/yolov3_loss_op.cu deleted file mode 100644 index f901b10d38e48..0000000000000 --- a/paddle/fluid/operators/yolov3_loss_op.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve. - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - http://www.apache.org/licenses/LICENSE-2.0 - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. */ - -#define EIGEN_USE_GPU - -#include "paddle/fluid/operators/yolov3_loss_op.h" -#include "paddle/fluid/platform/cuda_primitives.h" - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - yolov3_loss, - ops::Yolov3LossKernel); -REGISTER_OP_CUDA_KERNEL( - yolov3_loss_grad, - ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index a2ed4440a74fe..f4ede9258977b 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -267,7 +267,9 @@ static void AddAllGradToInputGrad( const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, const Tensor& grad_conf_obj, const Tensor& grad_conf_noobj, - const Tensor& grad_class, const int class_num) { + const Tensor& grad_class, const int class_num, const float lambda_xy, + const float lambda_wh, const float lambda_conf_obj, + const float lambda_conf_noobj, const float lambda_class) { const int n = pred_x.dims()[0]; const int an_num = pred_x.dims()[1]; const int h = pred_x.dims()[2]; @@ -290,25 +292,27 @@ static void AddAllGradToInputGrad( for (int j = 0; j < an_num; j++) { for (int k = 0; k < h; k++) { for (int l = 0; l < w; l++) { - grad_t(i, j * attr_num, k, l) = grad_x_t(i, j, k, l) * - pred_x_t(i, j, k, l) * - (1.0 - pred_x_t(i, j, k, l)) * loss; + grad_t(i, j * attr_num, k, l) = + grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) * + (1.0 - pred_x_t(i, j, k, l)) * loss * lambda_xy; grad_t(i, j * attr_num + 1, k, l) = grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * - (1.0 - pred_y_t(i, j, k, l)) * loss; - grad_t(i, j * attr_num + 2, k, l) = grad_w_t(i, j, k, l) * loss; - grad_t(i, j * attr_num + 3, k, l) = grad_h_t(i, j, k, l) * loss; + (1.0 - pred_y_t(i, j, k, l)) * loss * lambda_xy; + grad_t(i, j * attr_num + 2, k, l) = + grad_w_t(i, j, k, l) * loss * lambda_wh; + grad_t(i, j * attr_num + 3, k, l) = + grad_h_t(i, j, k, l) * loss * lambda_wh; grad_t(i, j * attr_num + 4, k, l) = grad_conf_obj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss; + (1.0 - pred_conf_t(i, j, k, l)) * loss * lambda_conf_obj; grad_t(i, j * attr_num + 4, k, l) += grad_conf_noobj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss; + (1.0 - pred_conf_t(i, j, k, l)) * loss * lambda_conf_noobj; for (int c = 0; c < class_num; c++) { grad_t(i, j * attr_num + 5 + c, k, l) = grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * - (1.0 - pred_class_t(i, j, k, l, c)) * loss; + (1.0 - pred_class_t(i, j, k, l, c)) * loss * lambda_class; } } } @@ -326,6 +330,11 @@ class Yolov3LossKernel : public framework::OpKernel { auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); + float lambda_xy = ctx.Attr("lambda_xy"); + float lambda_wh = ctx.Attr("lambda_wh"); + float lambda_conf_obj = ctx.Attr("lambda_conf_obj"); + float lambda_conf_noobj = ctx.Attr("lambda_conf_noobj"); + float lambda_class = ctx.Attr("lambda_class"); const int n = input->dims()[0]; const int h = input->dims()[2]; @@ -370,8 +379,10 @@ class Yolov3LossKernel : public framework::OpKernel { T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); - loss_data[0] = loss_x + loss_y + loss_w + loss_h + loss_conf_obj + - loss_conf_noobj + loss_class; + loss_data[0] = + lambda_xy * (loss_x + loss_y) + lambda_wh * (loss_w + loss_h) + + lambda_conf_obj * loss_conf_obj + lambda_conf_noobj * loss_conf_noobj + + lambda_class * loss_class; } }; @@ -387,6 +398,11 @@ class Yolov3LossGradKernel : public framework::OpKernel { auto* input_grad = ctx.Output(framework::GradVarName("X")); auto* output_grad = ctx.Input(framework::GradVarName("Loss")); const T loss = output_grad->data()[0]; + float lambda_xy = ctx.Attr("lambda_xy"); + float lambda_wh = ctx.Attr("lambda_wh"); + float lambda_conf_obj = ctx.Attr("lambda_conf_obj"); + float lambda_conf_noobj = ctx.Attr("lambda_conf_noobj"); + float lambda_class = ctx.Attr("lambda_class"); const int n = input->dims()[0]; const int c = input->dims()[1]; @@ -448,7 +464,8 @@ class Yolov3LossGradKernel : public framework::OpKernel { input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); AddAllGradToInputGrad( input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, - grad_w, grad_h, grad_conf_obj, grad_conf_noobj, grad_class, class_num); + grad_w, grad_h, grad_conf_obj, grad_conf_noobj, grad_class, class_num, + lambda_xy, lambda_wh, lambda_conf_obj, lambda_conf_noobj, lambda_class); } }; diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 4ac94981a7a47..2bb9514803e97 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -20,6 +20,7 @@ from .layer_function_generator import generate_layer_fn from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper +from ..framework import Variable from . import tensor from . import nn from . import ops @@ -45,6 +46,7 @@ 'iou_similarity', 'box_coder', 'polygon_box_transform', + 'yolov3_loss', ] @@ -404,6 +406,102 @@ def polygon_box_transform(input, name=None): return output +@templatedoc(op_type="yolov3_loss") +def yolov3_loss(x, + gtbox, + anchors, + class_num, + ignore_thresh, + lambda_xy=None, + lambda_wh=None, + lambda_conf_obj=None, + lambda_conf_noobj=None, + lambda_class=None, + name=None): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + gtbox (Variable): groud truth boxes, shoulb be in shape of [N, B, 5], + in the third dimenstion, class_id, x, y, w, h should + be stored and x, y, w, h should be relative valud of + input image. + anchors (list|tuple): ${anchors_comment} + class_num (int): ${class_num_comment} + ignore_thresh (float): ${ignore_thresh_comment} + lambda_xy (float|None): ${lambda_xy_comment} + lambda_wh (float|None): ${lambda_wh_comment} + lambda_conf_obj (float|None): ${lambda_conf_obj_comment} + lambda_conf_noobj (float|None): ${lambda_conf_noobj_comment} + lambda_class (float|None): ${lambda_class_comment} + name (string): the name of yolov3 loss + + Returns: + Variable: A 1-D tensor with shape [1], the value of yolov3 loss + + Raises: + TypeError: Input x of yolov3_loss must be Variable + TypeError: Input gtbox of yolov3_loss must be Variable" + TypeError: Attr anchors of yolov3_loss must be list or tuple + TypeError: Attr class_num of yolov3_loss must be an integer + TypeError: Attr ignore_thresh of yolov3_loss must be a float number + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[10, 255, 13, 13], dtype='float32') + gtbox = fluid.layers.data(name='gtbox', shape=[10, 6, 5], dtype='float32') + anchors = [10, 13, 16, 30, 33, 23] + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 + anchors=anchors, ignore_thresh=0.5) + """ + helper = LayerHelper('yolov3_loss', **locals()) + + if not isinstance(x, Variable): + raise TypeError("Input x of yolov3_loss must be Variable") + if not isinstance(gtbox, Variable): + raise TypeError("Input gtbox of yolov3_loss must be Variable") + if not isinstance(anchors, list) and not isinstance(anchors, tuple): + raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + if not isinstance(class_num, int): + raise TypeError("Attr class_num of yolov3_loss must be an integer") + if not isinstance(ignore_thresh, float): + raise TypeError( + "Attr ignore_thresh of yolov3_loss must be a float number") + + if name is None: + loss = helper.create_variable_for_type_inference(dtype=x.dtype) + else: + loss = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + attrs = { + "anchors": anchors, + "class_num": class_num, + "ignore_thresh": ignore_thresh, + } + + if lambda_xy is not None and isinstance(lambda_xy, float): + self.attrs['lambda_xy'] = lambda_xy + if lambda_wh is not None and isinstance(lambda_wh, float): + self.attrs['lambda_wh'] = lambda_wh + if lambda_conf_obj is not None and isinstance(lambda_conf_obj, float): + self.attrs['lambda_conf_obj'] = lambda_conf_obj + if lambda_conf_noobj is not None and isinstance(lambda_conf_noobj, float): + self.attrs['lambda_conf_noobj'] = lambda_conf_noobj + if lambda_class is not None and isinstance(lambda_class, float): + self.attrs['lambda_class'] = lambda_class + + helper.append_op( + type='yolov3_loss', + inputs={'X': x, + "GTBox": gtbox}, + outputs={'Loss': loss}, + attrs=attrs) + return loss + + @templatedoc() def detection_map(detect_res, label, diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index a4efb166826d5..d3623464e9974 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -164,7 +164,6 @@ 'hash', 'grid_sampler', 'log_loss', - 'yolov3_loss', 'add_position_encoding', 'bilinear_tensor_product', ] @@ -8244,74 +8243,6 @@ def log_loss(input, label, epsilon=1e-4, name=None): return loss -@templatedoc(op_type="yolov3_loss") -def yolov3_loss(x, gtbox, anchors, class_num, ignore_thresh, name=None): - """ - ${comment} - - Args: - x (Variable): ${x_comment} - gtbox (Variable): groud truth boxes, shoulb be in shape of [N, B, 5], - in the third dimenstion, class_id, x, y, w, h should - be stored and x, y, w, h should be relative valud of - input image. - anchors (list|tuple): ${anchors_comment} - class_num (int): ${class_num_comment} - ignore_thresh (float): ${ignore_thresh_comment} - name (string): the name of yolov3 loss - - Returns: - Variable: A 1-D tensor with shape [1], the value of yolov3 loss - - Raises: - TypeError: Input x of yolov3_loss must be Variable - TypeError: Input gtbox of yolov3_loss must be Variable" - TypeError: Attr anchors of yolov3_loss must be list or tuple - TypeError: Attr class_num of yolov3_loss must be an integer - TypeError: Attr ignore_thresh of yolov3_loss must be a float number - - Examples: - .. code-block:: python - - x = fluid.layers.data(name='x', shape=[10, 255, 13, 13], dtype='float32') - gtbox = fluid.layers.data(name='gtbox', shape=[10, 6, 5], dtype='float32') - anchors = [10, 13, 16, 30, 33, 23] - loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 - anchors=anchors, ignore_thresh=0.5) - """ - helper = LayerHelper('yolov3_loss', **locals()) - - if not isinstance(x, Variable): - raise TypeError("Input x of yolov3_loss must be Variable") - if not isinstance(gtbox, Variable): - raise TypeError("Input gtbox of yolov3_loss must be Variable") - if not isinstance(anchors, list) and not isinstance(anchors, tuple): - raise TypeError("Attr anchors of yolov3_loss must be list or tuple") - if not isinstance(class_num, int): - raise TypeError("Attr class_num of yolov3_loss must be an integer") - if not isinstance(ignore_thresh, float): - raise TypeError( - "Attr ignore_thresh of yolov3_loss must be a float number") - - if name is None: - loss = helper.create_variable_for_type_inference(dtype=x.dtype) - else: - loss = helper.create_variable( - name=name, dtype=x.dtype, persistable=False) - - helper.append_op( - type='yolov3_loss', - inputs={'X': x, - "GTBox": gtbox}, - outputs={'Loss': loss}, - attrs={ - "anchors": anchors, - "class_num": class_num, - "ignore_thresh": ignore_thresh, - }) - return loss - - def add_position_encoding(input, alpha, beta, name=None): """ **Add Position Encoding Layer** diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 4562f8bd4962e..3b6d58563f4b0 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -148,11 +148,20 @@ def YoloV3Loss(x, gtbox, attrs): loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, obj_mask_expand) - return loss_x + loss_y + loss_w + loss_h + loss_conf_obj + loss_conf_noobj + loss_class + return attrs['lambda_xy'] * (loss_x + loss_y) \ + + attrs['lambda_wh'] * (loss_w + loss_h) \ + + attrs['lambda_conf_obj'] * loss_conf_obj \ + + attrs['lambda_conf_noobj'] * loss_conf_noobj \ + + attrs['lambda_class'] * loss_class class TestYolov3LossOp(OpTest): def setUp(self): + self.lambda_xy = 1.0 + self.lambda_wh = 1.0 + self.lambda_conf_obj = 1.0 + self.lambda_conf_noobj = 1.0 + self.lambda_class = 1.0 self.initTestCase() self.op_type = 'yolov3_loss' x = np.random.random(size=self.x_shape).astype('float32') @@ -164,6 +173,11 @@ def setUp(self): "anchors": self.anchors, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, + "lambda_xy": self.lambda_xy, + "lambda_wh": self.lambda_wh, + "lambda_conf_obj": self.lambda_conf_obj, + "lambda_conf_noobj": self.lambda_conf_noobj, + "lambda_class": self.lambda_class, } self.inputs = {'X': x, 'GTBox': gtbox} @@ -182,7 +196,7 @@ def test_check_grad_ignore_gtbox(self): place, ['X'], 'Loss', no_grad_set=set("GTBox"), - max_relative_error=0.1) + max_relative_error=0.06) def initTestCase(self): self.anchors = [10, 13, 12, 12] @@ -190,6 +204,11 @@ def initTestCase(self): self.ignore_thresh = 0.5 self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) self.gtbox_shape = (5, 5, 5) + self.lambda_xy = 2.5 + self.lambda_wh = 0.8 + self.lambda_conf_obj = 1.5 + self.lambda_conf_noobj = 0.5 + self.lambda_class = 1.2 if __name__ == "__main__": From ba9ff508e8339319c926b105e9ffb32f7332977a Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 15 Nov 2018 08:43:36 +0000 Subject: [PATCH 034/684] temp fix --- .../fluid/operators/math/matrix_bit_code.cc | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 88279f8d8a781..090c0cca36607 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -119,6 +119,33 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, } } +// template +// void MatrixBitCodeFunctor::MulGradSparseWeight(const framework::Tensor& +// tmat, +// framework::SelectedRows* weight, +// const framework::Tensor& input) { +// size_t num_samples = tmat.dims()[0]; +// size_t input_width = input.dims()[1]; +// size_t tmat_width = tmat.dims()[1]; +// size_t weight_width = weight->dims()[1]; +// auto tmat_value = tmat.data(); +// auto weight_value = weight->data(); +// auto input_value = input.data(); +// for (size_t i = 0; i < num_samples; ++i) { +// auto code = code_table->get_code(i); +// int code_length = code->get_length(); +// for (int j = 0; j < code_length; ++j) { +// // size_t index = code->calc_index(j); + +// for (size_t k = 0; k < input_width; ++k) { +// weight_value[j * weight_width + k] += +// tmat_value[i * tmat_width + j] * input_value[input_width * i + +// k]; +// } +// } +// } +// } + template void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, const framework::Tensor& weight, From 95d5060dddcbfd0eff8cb50d542f5adb6899b6b6 Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 15 Nov 2018 18:57:49 +0800 Subject: [PATCH 035/684] fix abs -> fabs error. test=develop --- paddle/fluid/operators/yolov3_loss_op.h | 13 +++++++------ .../fluid/tests/unittests/test_yolov3_loss_op.py | 14 +++++++------- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index f4ede9258977b..608ef3f94bdc7 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -29,7 +29,7 @@ using Array5 = Eigen::DSizes; template static inline bool isZero(T x) { - return abs(x) < 1e-6; + return fabs(x) < 1e-6; } template @@ -186,7 +186,7 @@ static T CalcBoxIoU(std::vector box1, std::vector box2) { } template -static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, +static void PreProcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, std::vector anchors, const int grid_size, Tensor* obj_mask, Tensor* noobj_mask, Tensor* tx, Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, @@ -206,8 +206,9 @@ static void PrePorcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, for (int i = 0; i < n; i++) { for (int j = 0; j < b; j++) { - if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && - isZero(gt_boxes_t(i, j, 2)) && isZero(gt_boxes_t(i, j, 3))) { + if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && + isZero(gt_boxes_t(i, j, 2)) && isZero(gt_boxes_t(i, j, 3)) && + isZero(gt_boxes_t(i, j, 4))) { continue; } @@ -362,7 +363,7 @@ class Yolov3LossKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + PreProcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; @@ -431,7 +432,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PrePorcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + PreProcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 3b6d58563f4b0..03a64055f0bdc 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -190,13 +190,13 @@ def test_check_output(self): place = core.CPUPlace() self.check_output_with_place(place, atol=1e-3) - def test_check_grad_ignore_gtbox(self): - place = core.CPUPlace() - self.check_grad_with_place( - place, ['X'], - 'Loss', - no_grad_set=set("GTBox"), - max_relative_error=0.06) + # def test_check_grad_ignore_gtbox(self): + # place = core.CPUPlace() + # self.check_grad_with_place( + # place, ['X'], + # 'Loss', + # no_grad_set=set("GTBox"), + # max_relative_error=0.06) def initTestCase(self): self.anchors = [10, 13, 12, 12] From d1429ac4a55a2f6cbaeaf1cca572601e5d344667 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 15 Nov 2018 19:46:22 +0800 Subject: [PATCH 036/684] add recordio support --- CMakeLists.txt | 6 +- cmake/external/eigen.cmake | 10 +-- cmake/external/gflags.cmake | 5 +- cmake/external/glog.cmake | 3 +- cmake/external/gtest.cmake | 5 +- cmake/external/protobuf.cmake | 5 +- cmake/external/snappy.cmake | 12 +++- cmake/external/snappystream.cmake | 61 +++++++++++-------- cmake/external/zlib.cmake | 5 +- paddle/fluid/CMakeLists.txt | 6 +- paddle/fluid/framework/CMakeLists.txt | 6 +- paddle/fluid/operators/CMakeLists.txt | 8 +-- .../operators/reader/create_py_reader_op.cc | 2 +- paddle/fluid/operators/roi_align_op.cc | 6 +- paddle/fluid/operators/roi_pool_op.cc | 6 +- paddle/fluid/operators/space_to_depth_op.cc | 2 +- paddle/fluid/platform/port.h | 10 +-- paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/pybind.cc | 9 +-- python/paddle/fluid/layers/nn.py | 6 ++ 20 files changed, 97 insertions(+), 81 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 996a79fbbc300..d6e7b88f86027 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,11 +190,11 @@ include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) include(external/xxhash) # download xxhash - -if (NOT WIN32) -# there is no official support of snappystream, warpctc, nccl, cupti in windows include(external/snappy) # download snappy include(external/snappystream) # download snappystream + +if (NOT WIN32) +# there is no official support of warpctc, nccl, cupti in windows include(external/warpctc) # download, build, install warpctc include(cupti) endif (NOT WIN32) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 573ad5e5f06a9..98079678ae5fd 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -16,8 +16,9 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 +# GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" +# GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -29,10 +30,11 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" +# GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen - GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c +# GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} DOWNLOAD_NAME "eigen" UPDATE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 4e98e4bf889bc..7c062d682ceea 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,8 +28,9 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a +# GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gflags.git" +# GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index 8cd0455c16bf8..a3f3c6adf304e 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,13 +34,14 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() + SET(GLOG_REPOSITORY "http://admin@172.20.90.14:8080/r/glog.git") ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY ${GLOG_REPOSITORY} - GIT_TAG ${GLOG_TAG} + # GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index d335298742c73..da539d52bd4c4 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,8 +43,9 @@ IF(WITH_TESTING) extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} - GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_TAG "release-1.8.0" + # GIT_REPOSITORY "https://github.com/google/googletest.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gtest.git" +# GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index e1e619e572b05..94d8ac30cc5f7 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -202,8 +202,9 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") ENDIF() - SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") - SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") + SET(PROTOBUF_REPO http://admin@172.20.90.14:8080/r/protobuf.git) IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/snappy.cmake b/cmake/external/snappy.cmake index af09ed4d5d6e2..b30403d2d81ce 100644 --- a/cmake/external/snappy.cmake +++ b/cmake/external/snappy.cmake @@ -24,7 +24,11 @@ set(SNAPPY_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy) set(SNAPPY_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy) set(SNAPPY_INCLUDE_DIR "${SNAPPY_INSTALL_DIR}/include" CACHE PATH "snappy include directory." FORCE) -set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +if (WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/snappy.lib") +else(WIN32) + set(SNAPPY_LIBRARIES "${SNAPPY_INSTALL_DIR}/lib/libsnappy.a") +endif (WIN32) ExternalProject_Add( extern_snappy @@ -34,8 +38,12 @@ ExternalProject_Add( UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib -DCMAKE_POSITION_INDEPENDENT_CODE=ON diff --git a/cmake/external/snappystream.cmake b/cmake/external/snappystream.cmake index 6df636d7fa875..1ec79462c14e4 100644 --- a/cmake/external/snappystream.cmake +++ b/cmake/external/snappystream.cmake @@ -18,36 +18,45 @@ ENDIF() include (ExternalProject) -# NOTE: snappy is needed when linking with recordio - set(SNAPPYSTREAM_SOURCES_DIR ${THIRD_PARTY_PATH}/snappy_stream) set(SNAPPYSTREAM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/snappy_stream) set(SNAPPYSTREAM_INCLUDE_DIR "${SNAPPYSTREAM_INSTALL_DIR}/include" CACHE PATH "snappy stream include directory." FORCE) -set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") - -ExternalProject_Add( - extern_snappystream - GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git" - GIT_TAG "0.2.8" - PREFIX ${SNAPPYSTREAM_SOURCES_DIR} - UPDATE_COMMAND "" - CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} - -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} - -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} - -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} - -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib - -DCMAKE_POSITION_INDEPENDENT_CODE=ON - -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} - -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR} - ${EXTERNAL_OPTIONAL_ARGS} - CMAKE_CACHE_ARGS - -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR} - -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib - -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} - DEPENDS snappy -) +if(WIN32) + # Fix me, VS2015 come without VLA support + set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/snappystream.lib") + MESSAGE(WARNING, "In windows, snappystream has no compile support for windows, + please build it manually and put it at " ${SNAPPYSTREAM_INSTALL_DIR}) +else(WIN32) + set(SNAPPYSTREAM_LIBRARIES "${SNAPPYSTREAM_INSTALL_DIR}/lib/libsnappystream.a") + + ExternalProject_Add( + extern_snappystream + GIT_REPOSITORY "https://github.com/hoxnox/snappystream.git" + GIT_TAG "0.2.8" + PREFIX ${SNAPPYSTREAM_SOURCES_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} + -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} + -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} + -DCMAKE_INSTALL_PREFIX=${SNAPPY_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR=${SNAPPY_INSTALL_DIR}/lib + -DCMAKE_POSITION_INDEPENDENT_CODE=ON + -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} + -DSNAPPY_ROOT=${SNAPPY_INSTALL_DIR} + ${EXTERNAL_OPTIONAL_ARGS} + CMAKE_CACHE_ARGS + -DCMAKE_INSTALL_PREFIX:PATH=${SNAPPYSTREAM_INSTALL_DIR} + -DCMAKE_INSTALL_LIBDIR:PATH=${SNAPPYSTREAM_INSTALL_DIR}/lib + -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} + DEPENDS snappy + ) +endif(WIN32) add_library(snappystream STATIC IMPORTED GLOBAL) set_property(TARGET snappystream PROPERTY IMPORTED_LOCATION ${SNAPPYSTREAM_LIBRARIES}) diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index c3d73235453c8..456f26385c4f2 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -31,8 +31,9 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} - GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_TAG "v1.2.8" + # GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/zlib.git" +# GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index abadda3adb00e..6b526f0103ad3 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -3,13 +3,9 @@ add_subdirectory(platform) add_subdirectory(framework) add_subdirectory(operators) add_subdirectory(string) - -add_subdirectory(pybind) -if (NOT WIN32) add_subdirectory(recordio) -endif(NOT WIN32) +add_subdirectory(pybind) # NOTE: please add subdirectory inference at last. add_subdirectory(inference) - add_subdirectory(train) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cb9057672cc2c..42af482f852a1 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -68,11 +68,7 @@ if(WITH_GPU) else() cc_test(mixed_vector_test SRCS mixed_vector_test.cc DEPS place memory device_context tensor) endif() -if (NOT WIN32) - cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) -else() - cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto version) -endif (NOT WIN32) +cc_library(lod_tensor SRCS lod_tensor.cc DEPS ddim place tensor framework_proto recordio version) cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor memory) nv_test(lod_tensor_gpu_test SRCS lod_tensor_test.cu DEPS lod_tensor) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index f06ef199d1654..edd062e1752a4 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -95,7 +95,8 @@ function(op_library TARGET) foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op" - "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op") + "fusion_seqexpand_concat_fc_op" "attention_lstm_op" "fused_embedding_fc_lstm_op" "fc_op" + ) if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() @@ -225,7 +226,6 @@ if(WITH_DISTRIBUTE) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET ibverbs PROPERTY IMPORTED_LOCATION ${IBVERBS_LIBRARY}) - find_library(RDMACM_LIBRARY NAMES rdmacm) ADD_LIBRARY(rdmacm SHARED IMPORTED GLOBAL) SET_PROPERTY(TARGET rdmacm PROPERTY IMPORTED_LOCATION ${RDMACM_LIBRARY}) @@ -338,11 +338,7 @@ foreach(src ${GENERAL_OPS}) endforeach() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(logical_and);\nUSE_NO_KERNEL_OP(read_from_array);\n") - - -if (NOT WIN32) add_subdirectory(reader) -endif(NOT WIN32) foreach(src ${READER_LIBRARY}) set(OP_LIBRARY ${src} ${OP_LIBRARY}) endforeach() diff --git a/paddle/fluid/operators/reader/create_py_reader_op.cc b/paddle/fluid/operators/reader/create_py_reader_op.cc index 0f31ca1a94326..901a92ab5b5c7 100644 --- a/paddle/fluid/operators/reader/create_py_reader_op.cc +++ b/paddle/fluid/operators/reader/create_py_reader_op.cc @@ -74,7 +74,7 @@ class CreatePyReaderOpMaker : public FileReaderMakerBase { "Name of the `LoDTensorBlockingQueueHolder` variable"); AddComment(R"DOC( - Create PyReader to support LoDTensor data feeding in Python side. + Create PyReader to support LoDTensor data feeding in Python side. )DOC"); } }; diff --git a/paddle/fluid/operators/roi_align_op.cc b/paddle/fluid/operators/roi_align_op.cc index c57a34c3a745e..79f189222ef37 100644 --- a/paddle/fluid/operators/roi_align_op.cc +++ b/paddle/fluid/operators/roi_align_op.cc @@ -35,10 +35,10 @@ class ROIAlignOp : public framework::OperatorWithKernel { "The format of input tensor is NCHW."); PADDLE_ENFORCE(rois_dims.size() == 2, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); PADDLE_ENFORCE(rois_dims[1] == 4, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); float spatial_scale = ctx->Attrs().Get("spatial_scale"); @@ -103,7 +103,7 @@ class ROIAlignOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]. " + "given as [[x1, y1, x2, y2], ...]. " "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); AddOutput("Out", diff --git a/paddle/fluid/operators/roi_pool_op.cc b/paddle/fluid/operators/roi_pool_op.cc index 043ea680d1506..3f6b2e46c7014 100644 --- a/paddle/fluid/operators/roi_pool_op.cc +++ b/paddle/fluid/operators/roi_pool_op.cc @@ -40,10 +40,10 @@ class ROIPoolOp : public framework::OperatorWithKernel { "The format of input tensor is NCHW."); PADDLE_ENFORCE(rois_dims.size() == 2, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); PADDLE_ENFORCE(rois_dims[1] == kROISize, "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]."); + "given as [[x1, y1, x2, y2], ...]."); int pooled_height = ctx->Attrs().Get("pooled_height"); int pooled_width = ctx->Attrs().Get("pooled_width"); @@ -110,7 +110,7 @@ class ROIPoolOpMaker : public framework::OpProtoAndCheckerMaker { "(LoDTensor), " "ROIs (Regions of Interest) to pool over. " "should be a 2-D LoDTensor of shape (num_rois, 4)" - "given as [[x1, y1, x2, y2], …]. " + "given as [[x1, y1, x2, y2], ...]. " "Where batch_id is the id of the data, " "(x1, y1) is the top left coordinates, and " "(x2, y2) is the bottom right coordinates."); diff --git a/paddle/fluid/operators/space_to_depth_op.cc b/paddle/fluid/operators/space_to_depth_op.cc index f109dd685c87a..b579244673fa1 100644 --- a/paddle/fluid/operators/space_to_depth_op.cc +++ b/paddle/fluid/operators/space_to_depth_op.cc @@ -86,7 +86,7 @@ class SpaceToDepthOpMaker : public framework::OpProtoAndCheckerMaker { .GreaterThan(1); AddComment(R"DOC( reorg operator used in Yolo v2. - The equation is: C2 = C1/blocksize * blocksize, W2 = W1 ∗ blocksize + offset % blocksize, H2 = H1 ∗ blocksize + offset / blocksize, + The equation is: C2 = C1/blocksize * blocksize, W2 = W1 * blocksize + offset % blocksize, H2 = H1 * blocksize + offset / blocksize, Reshape Input(X) into the shape according to Attr(blocksize). The data in Input(X) are unchanged. diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index 8823e97b0b696..a07b993c8a84e 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -132,10 +132,12 @@ static void MkDir(const char *path) { } } #else - CreateDirectory(path, NULL); - auto errorno = GetLastError(); - if (errorno != ERROR_ALREADY_EXISTS) { - throw std::runtime_error(path_error); + BOOL return_value = CreateDirectory(path, NULL); + if (!return_value) { + auto errorno = GetLastError(); + if (errorno != ERROR_ALREADY_EXISTS) { + throw std::runtime_error(path_error); + } } #endif // !_WIN32 } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 6afa53cd36dee..cd8256f1c70a9 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,9 +1,8 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) if(NOT WIN32) list(APPEND PYBIND_DEPS parallel_executor profiler) - list(APPEND PYBIND_SRCS recordio.cc) endif(NOT WIN32) if(WITH_PYTHON) if(WITH_AMD_GPU) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 0d059d8aea756..89959c389f8ba 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -357,19 +357,16 @@ All parameter, weight, gradient are variables in Paddle. return self.GetMutable(); }, py::return_value_policy::reference) +#endif .def("get_reader", [](Variable &self) -> framework::ReaderHolder * { PADDLE_ENFORCE(self.IsType()); return self.GetMutable(); }, - py::return_value_policy::reference) -#endif - ; + py::return_value_policy::reference); -#if !defined(_WIN32) py::class_(m, "Reader", "") .def("reset", &framework::ReaderHolder::ResetAll); -#endif using LoDTensorBlockingQueue = ::paddle::operators::reader::LoDTensorBlockingQueue; @@ -914,9 +911,9 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; self.Run(fetch_tensors, fetched_var_name); }); +#endif BindRecordIOWriter(&m); -#endif return m.ptr(); } } // namespace pybind diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 1b5009e76126e..2971319141ccb 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -169,6 +169,12 @@ 'bilinear_tensor_product', ] +# To avoid the api checker complains +if os.name == 'nt': + __all__.remove('dynamic_lstm') + __all__.remove('crf_decoding') + __all__.remove('roi_pool') + def fc(input, size, From f115eb0d1e6ffa1dd65bfcc7b30b419d52f3c68b Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Thu, 15 Nov 2018 21:05:28 +0800 Subject: [PATCH 037/684] enhance api. test=develop --- paddle/fluid/API.spec | 2 +- paddle/fluid/operators/yolov3_loss_op.cc | 50 ++++--- paddle/fluid/operators/yolov3_loss_op.h | 129 ++++++++++-------- python/paddle/fluid/layers/detection.py | 67 +++++---- python/paddle/fluid/tests/test_detection.py | 13 ++ .../fluid/tests/unittests/test_layers.py | 9 -- .../tests/unittests/test_yolov3_loss_op.py | 88 ++++++------ 7 files changed, 199 insertions(+), 159 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 7e0d5e60887b6..1f1dc3757d1a0 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -288,7 +288,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) -paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'anchors', 'class_num', 'ignore_thresh', 'lambda_xy', 'lambda_wh', 'lambda_conf_obj', 'lambda_conf_noobj', 'lambda_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index f6c134e1b4df8..1d7f4823626ee 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -25,11 +25,14 @@ class Yolov3LossOp : public framework::OperatorWithKernel { "Input(X) of Yolov3LossOp should not be null."); PADDLE_ENFORCE(ctx->HasInput("GTBox"), "Input(GTBox) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTLabel"), + "Input(GTLabel) of Yolov3LossOp should not be null."); PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) of Yolov3LossOp should not be null."); auto dim_x = ctx->GetInputDim("X"); - auto dim_gt = ctx->GetInputDim("GTBox"); + auto dim_gtbox = ctx->GetInputDim("GTBox"); + auto dim_gtlabel = ctx->GetInputDim("GTLabel"); auto anchors = ctx->Attrs().Get>("anchors"); auto class_num = ctx->Attrs().Get("class_num"); PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); @@ -38,8 +41,15 @@ class Yolov3LossOp : public framework::OperatorWithKernel { PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), "Input(X) dim[1] should be equal to (anchor_number * (5 " "+ class_num))."); - PADDLE_ENFORCE_EQ(dim_gt.size(), 3, "Input(GTBox) should be a 3-D tensor"); - PADDLE_ENFORCE_EQ(dim_gt[2], 5, "Input(GTBox) dim[2] should be 5"); + PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3, + "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5"); + PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2, + "Input(GTBox) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0], + "Input(GTBox) and Input(GTLabel) dim[0] should be same"); + PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1], + "Input(GTBox) and Input(GTLabel) dim[1] should be same"); PADDLE_ENFORCE_GT(anchors.size(), 0, "Attr(anchors) length should be greater then 0."); PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, @@ -73,11 +83,15 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "The input tensor of ground truth boxes, " "This is a 3-D tensor with shape of [N, max_box_num, 5], " "max_box_num is the max number of boxes in each image, " - "In the third dimention, stores label, x, y, w, h, " - "label is an integer to specify box class, x, y is the " - "center cordinate of boxes and w, h is the width and height" - "and x, y, w, h should be divided by input image height to " - "scale to [0, 1]."); + "In the third dimention, stores x, y, w, h coordinates, " + "x, y is the center cordinate of boxes and w, h is the " + "width and height and x, y, w, h should be divided by " + "input image height to scale to [0, 1]."); + AddInput("GTLabel", + "The input tensor of ground truth label, " + "This is a 2-D tensor with shape of [N, max_box_num], " + "and each element shoudl be an integer to indicate the " + "box class id."); AddOutput("Loss", "The output yolov3 loss tensor, " "This is a 1-D tensor with shape of [1]"); @@ -88,19 +102,19 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { "it will be parsed pair by pair."); AddAttr("ignore_thresh", "The ignore threshold to ignore confidence loss."); - AddAttr("lambda_xy", "The weight of x, y location loss.") + AddAttr("loss_weight_xy", "The weight of x, y location loss.") .SetDefault(1.0); - AddAttr("lambda_wh", "The weight of w, h location loss.") + AddAttr("loss_weight_wh", "The weight of w, h location loss.") .SetDefault(1.0); AddAttr( - "lambda_conf_obj", + "loss_weight_conf_target", "The weight of confidence score loss in locations with target object.") .SetDefault(1.0); - AddAttr("lambda_conf_noobj", + AddAttr("loss_weight_conf_notarget", "The weight of confidence score loss in locations without " "target object.") .SetDefault(1.0); - AddAttr("lambda_class", "The weight of classification loss.") + AddAttr("loss_weight_class", "The weight of classification loss.") .SetDefault(1.0); AddComment(R"DOC( This operator generate yolov3 loss by given predict result and ground @@ -141,10 +155,10 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { Final loss will be represented as follow. $$ - loss = \lambda_{xy} * loss_{xy} + \lambda_{wh} * loss_{wh} - + \lambda_{conf_obj} * loss_{conf_obj} - + \lambda_{conf_noobj} * loss_{conf_noobj} - + \lambda_{class} * loss_{class} + loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh} + + \loss_weight_{conf_target} * loss_{conf_target} + + \loss_weight_{conf_notarget} * loss_{conf_notarget} + + \loss_weight_{class} * loss_{class} $$ )DOC"); } @@ -182,12 +196,14 @@ class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { op->SetType("yolov3_loss_grad"); op->SetInput("X", Input("X")); op->SetInput("GTBox", Input("GTBox")); + op->SetInput("GTLabel", Input("GTLabel")); op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); op->SetAttrMap(Attrs()); op->SetOutput(framework::GradVarName("X"), InputGrad("X")); op->SetOutput(framework::GradVarName("GTBox"), {}); + op->SetOutput(framework::GradVarName("GTLabel"), {}); return std::unique_ptr(op); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index 608ef3f94bdc7..a1072aca10802 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -186,15 +186,17 @@ static T CalcBoxIoU(std::vector box1, std::vector box2) { } template -static void PreProcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, - std::vector anchors, const int grid_size, - Tensor* obj_mask, Tensor* noobj_mask, Tensor* tx, - Tensor* ty, Tensor* tw, Tensor* th, Tensor* tconf, +static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, + const float ignore_thresh, std::vector anchors, + const int grid_size, Tensor* obj_mask, + Tensor* noobj_mask, Tensor* tx, Tensor* ty, + Tensor* tw, Tensor* th, Tensor* tconf, Tensor* tclass) { - const int n = gt_boxes.dims()[0]; - const int b = gt_boxes.dims()[1]; + const int n = gt_box.dims()[0]; + const int b = gt_box.dims()[1]; const int anchor_num = anchors.size() / 2; - auto gt_boxes_t = EigenTensor::From(gt_boxes); + auto gt_box_t = EigenTensor::From(gt_box); + auto gt_label_t = EigenTensor::From(gt_label); auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0); auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); auto tx_t = EigenTensor::From(*tx).setConstant(0.0); @@ -206,28 +208,27 @@ static void PreProcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, for (int i = 0; i < n; i++) { for (int j = 0; j < b; j++) { - if (isZero(gt_boxes_t(i, j, 0)) && isZero(gt_boxes_t(i, j, 1)) && - isZero(gt_boxes_t(i, j, 2)) && isZero(gt_boxes_t(i, j, 3)) && - isZero(gt_boxes_t(i, j, 4))) { + if (isZero(gt_box_t(i, j, 0)) && isZero(gt_box_t(i, j, 1)) && + isZero(gt_box_t(i, j, 2)) && isZero(gt_box_t(i, j, 3))) { continue; } - int gt_label = static_cast(gt_boxes_t(i, j, 0)); - T gx = gt_boxes_t(i, j, 1) * grid_size; - T gy = gt_boxes_t(i, j, 2) * grid_size; - T gw = gt_boxes_t(i, j, 3) * grid_size; - T gh = gt_boxes_t(i, j, 4) * grid_size; + int cur_label = gt_label_t(i, j); + T gx = gt_box_t(i, j, 0) * grid_size; + T gy = gt_box_t(i, j, 1) * grid_size; + T gw = gt_box_t(i, j, 2) * grid_size; + T gh = gt_box_t(i, j, 3) * grid_size; int gi = static_cast(gx); int gj = static_cast(gy); T max_iou = static_cast(0); T iou; int best_an_index = -1; - std::vector gt_box({0, 0, gw, gh}); + std::vector gt_box_shape({0, 0, gw, gh}); for (int an_idx = 0; an_idx < anchor_num; an_idx++) { std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), static_cast(anchors[2 * an_idx + 1])}); - iou = CalcBoxIoU(gt_box, anchor_shape); + iou = CalcBoxIoU(gt_box_shape, anchor_shape); if (iou > max_iou) { max_iou = iou; best_an_index = an_idx; @@ -242,7 +243,7 @@ static void PreProcessGTBox(const Tensor& gt_boxes, const float ignore_thresh, ty_t(i, best_an_index, gj, gi) = gy - gj; tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); - tclass_t(i, best_an_index, gj, gi, gt_label) = 1; + tclass_t(i, best_an_index, gj, gi, cur_label) = 1; tconf_t(i, best_an_index, gj, gi) = 1; } } @@ -267,10 +268,10 @@ static void AddAllGradToInputGrad( Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y, const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, - const Tensor& grad_conf_obj, const Tensor& grad_conf_noobj, - const Tensor& grad_class, const int class_num, const float lambda_xy, - const float lambda_wh, const float lambda_conf_obj, - const float lambda_conf_noobj, const float lambda_class) { + const Tensor& grad_conf_target, const Tensor& grad_conf_notarget, + const Tensor& grad_class, const int class_num, const float loss_weight_xy, + const float loss_weight_wh, const float loss_weight_conf_target, + const float loss_weight_conf_notarget, const float loss_weight_class) { const int n = pred_x.dims()[0]; const int an_num = pred_x.dims()[1]; const int h = pred_x.dims()[2]; @@ -285,8 +286,8 @@ static void AddAllGradToInputGrad( auto grad_y_t = EigenTensor::From(grad_y); auto grad_w_t = EigenTensor::From(grad_w); auto grad_h_t = EigenTensor::From(grad_h); - auto grad_conf_obj_t = EigenTensor::From(grad_conf_obj); - auto grad_conf_noobj_t = EigenTensor::From(grad_conf_noobj); + auto grad_conf_target_t = EigenTensor::From(grad_conf_target); + auto grad_conf_notarget_t = EigenTensor::From(grad_conf_notarget); auto grad_class_t = EigenTensor::From(grad_class); for (int i = 0; i < n; i++) { @@ -295,25 +296,26 @@ static void AddAllGradToInputGrad( for (int l = 0; l < w; l++) { grad_t(i, j * attr_num, k, l) = grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) * - (1.0 - pred_x_t(i, j, k, l)) * loss * lambda_xy; + (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy; grad_t(i, j * attr_num + 1, k, l) = grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * - (1.0 - pred_y_t(i, j, k, l)) * loss * lambda_xy; + (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy; grad_t(i, j * attr_num + 2, k, l) = - grad_w_t(i, j, k, l) * loss * lambda_wh; + grad_w_t(i, j, k, l) * loss * loss_weight_wh; grad_t(i, j * attr_num + 3, k, l) = - grad_h_t(i, j, k, l) * loss * lambda_wh; + grad_h_t(i, j, k, l) * loss * loss_weight_wh; grad_t(i, j * attr_num + 4, k, l) = - grad_conf_obj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss * lambda_conf_obj; + grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target; grad_t(i, j * attr_num + 4, k, l) += - grad_conf_noobj_t(i, j, k, l) * pred_conf_t(i, j, k, l) * - (1.0 - pred_conf_t(i, j, k, l)) * loss * lambda_conf_noobj; + grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * + loss_weight_conf_notarget; for (int c = 0; c < class_num; c++) { grad_t(i, j * attr_num + 5 + c, k, l) = grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * - (1.0 - pred_class_t(i, j, k, l, c)) * loss * lambda_class; + (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class; } } } @@ -326,16 +328,18 @@ class Yolov3LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); - auto* gt_boxes = ctx.Input("GTBox"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); auto* loss = ctx.Output("Loss"); auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); - float lambda_xy = ctx.Attr("lambda_xy"); - float lambda_wh = ctx.Attr("lambda_wh"); - float lambda_conf_obj = ctx.Attr("lambda_conf_obj"); - float lambda_conf_noobj = ctx.Attr("lambda_conf_noobj"); - float lambda_class = ctx.Attr("lambda_class"); + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float loss_weight_class = ctx.Attr("loss_weight_class"); const int n = input->dims()[0]; const int h = input->dims()[2]; @@ -363,7 +367,7 @@ class Yolov3LossKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PreProcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; @@ -375,15 +379,16 @@ class Yolov3LossKernel : public framework::OpKernel { T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); - T loss_conf_obj = CalcBCEWithMask(pred_conf, tconf, obj_mask); - T loss_conf_noobj = CalcBCEWithMask(pred_conf, tconf, noobj_mask); + T loss_conf_target = CalcBCEWithMask(pred_conf, tconf, obj_mask); + T loss_conf_notarget = CalcBCEWithMask(pred_conf, tconf, noobj_mask); T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); - loss_data[0] = - lambda_xy * (loss_x + loss_y) + lambda_wh * (loss_w + loss_h) + - lambda_conf_obj * loss_conf_obj + lambda_conf_noobj * loss_conf_noobj + - lambda_class * loss_class; + loss_data[0] = loss_weight_xy * (loss_x + loss_y) + + loss_weight_wh * (loss_w + loss_h) + + loss_weight_conf_target * loss_conf_target + + loss_weight_conf_notarget * loss_conf_notarget + + loss_weight_class * loss_class; } }; @@ -392,18 +397,20 @@ class Yolov3LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { auto* input = ctx.Input("X"); - auto* gt_boxes = ctx.Input("GTBox"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); auto anchors = ctx.Attr>("anchors"); int class_num = ctx.Attr("class_num"); float ignore_thresh = ctx.Attr("ignore_thresh"); auto* input_grad = ctx.Output(framework::GradVarName("X")); auto* output_grad = ctx.Input(framework::GradVarName("Loss")); const T loss = output_grad->data()[0]; - float lambda_xy = ctx.Attr("lambda_xy"); - float lambda_wh = ctx.Attr("lambda_wh"); - float lambda_conf_obj = ctx.Attr("lambda_conf_obj"); - float lambda_conf_noobj = ctx.Attr("lambda_conf_noobj"); - float lambda_class = ctx.Attr("lambda_class"); + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float loss_weight_class = ctx.Attr("loss_weight_class"); const int n = input->dims()[0]; const int c = input->dims()[1]; @@ -432,7 +439,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); - PreProcessGTBox(*gt_boxes, ignore_thresh, anchors, h, &obj_mask, + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); Tensor obj_mask_expand; @@ -441,13 +448,13 @@ class Yolov3LossGradKernel : public framework::OpKernel { ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); Tensor grad_x, grad_y, grad_w, grad_h; - Tensor grad_conf_obj, grad_conf_noobj, grad_class; + Tensor grad_conf_target, grad_conf_notarget, grad_class; grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_conf_obj.mutable_data({n, an_num, h, w}, ctx.GetPlace()); - grad_conf_noobj.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_target.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_notarget.mutable_data({n, an_num, h, w}, ctx.GetPlace()); grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); T obj_mf = CalcMaskPointNum(obj_mask); T noobj_mf = CalcMaskPointNum(noobj_mask); @@ -456,8 +463,9 @@ class Yolov3LossGradKernel : public framework::OpKernel { CalcMSEGradWithMask(&grad_y, pred_y, ty, obj_mask, obj_mf); CalcMSEGradWithMask(&grad_w, pred_w, tw, obj_mask, obj_mf); CalcMSEGradWithMask(&grad_h, pred_h, th, obj_mask, obj_mf); - CalcBCEGradWithMask(&grad_conf_obj, pred_conf, tconf, obj_mask, obj_mf); - CalcBCEGradWithMask(&grad_conf_noobj, pred_conf, tconf, noobj_mask, + CalcBCEGradWithMask(&grad_conf_target, pred_conf, tconf, obj_mask, + obj_mf); + CalcBCEGradWithMask(&grad_conf_notarget, pred_conf, tconf, noobj_mask, noobj_mf); CalcBCEGradWithMask(&grad_class, pred_class, tclass, obj_mask_expand, obj_expand_mf); @@ -465,8 +473,9 @@ class Yolov3LossGradKernel : public framework::OpKernel { input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); AddAllGradToInputGrad( input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, - grad_w, grad_h, grad_conf_obj, grad_conf_noobj, grad_class, class_num, - lambda_xy, lambda_wh, lambda_conf_obj, lambda_conf_noobj, lambda_class); + grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class, + class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target, + loss_weight_conf_notarget, loss_weight_class); } }; diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 2bb9514803e97..cab5c3e2a4377 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -409,32 +409,36 @@ def polygon_box_transform(input, name=None): @templatedoc(op_type="yolov3_loss") def yolov3_loss(x, gtbox, + gtlabel, anchors, class_num, ignore_thresh, - lambda_xy=None, - lambda_wh=None, - lambda_conf_obj=None, - lambda_conf_noobj=None, - lambda_class=None, + loss_weight_xy=None, + loss_weight_wh=None, + loss_weight_conf_target=None, + loss_weight_conf_notarget=None, + loss_weight_class=None, name=None): """ ${comment} Args: x (Variable): ${x_comment} - gtbox (Variable): groud truth boxes, shoulb be in shape of [N, B, 5], - in the third dimenstion, class_id, x, y, w, h should - be stored and x, y, w, h should be relative valud of - input image. + gtbox (Variable): groud truth boxes, should be in shape of [N, B, 4], + in the third dimenstion, x, y, w, h should be stored + and x, y, w, h should be relative value of input image. + N is the batch number and B is the max box number in + an image. + gtlabel (Variable): class id of ground truth boxes, shoud be ins shape + of [N, B]. anchors (list|tuple): ${anchors_comment} class_num (int): ${class_num_comment} ignore_thresh (float): ${ignore_thresh_comment} - lambda_xy (float|None): ${lambda_xy_comment} - lambda_wh (float|None): ${lambda_wh_comment} - lambda_conf_obj (float|None): ${lambda_conf_obj_comment} - lambda_conf_noobj (float|None): ${lambda_conf_noobj_comment} - lambda_class (float|None): ${lambda_class_comment} + loss_weight_xy (float|None): ${loss_weight_xy_comment} + loss_weight_wh (float|None): ${loss_weight_wh_comment} + loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment} + loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment} + loss_weight_class (float|None): ${loss_weight_class_comment} name (string): the name of yolov3 loss Returns: @@ -443,6 +447,7 @@ def yolov3_loss(x, Raises: TypeError: Input x of yolov3_loss must be Variable TypeError: Input gtbox of yolov3_loss must be Variable" + TypeError: Input gtlabel of yolov3_loss must be Variable" TypeError: Attr anchors of yolov3_loss must be list or tuple TypeError: Attr class_num of yolov3_loss must be an integer TypeError: Attr ignore_thresh of yolov3_loss must be a float number @@ -450,8 +455,9 @@ def yolov3_loss(x, Examples: .. code-block:: python - x = fluid.layers.data(name='x', shape=[10, 255, 13, 13], dtype='float32') - gtbox = fluid.layers.data(name='gtbox', shape=[10, 6, 5], dtype='float32') + x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') + gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') + gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') anchors = [10, 13, 16, 30, 33, 23] loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 anchors=anchors, ignore_thresh=0.5) @@ -462,6 +468,8 @@ def yolov3_loss(x, raise TypeError("Input x of yolov3_loss must be Variable") if not isinstance(gtbox, Variable): raise TypeError("Input gtbox of yolov3_loss must be Variable") + if not isinstance(gtlabel, Variable): + raise TypeError("Input gtlabel of yolov3_loss must be Variable") if not isinstance(anchors, list) and not isinstance(anchors, tuple): raise TypeError("Attr anchors of yolov3_loss must be list or tuple") if not isinstance(class_num, int): @@ -482,21 +490,24 @@ def yolov3_loss(x, "ignore_thresh": ignore_thresh, } - if lambda_xy is not None and isinstance(lambda_xy, float): - self.attrs['lambda_xy'] = lambda_xy - if lambda_wh is not None and isinstance(lambda_wh, float): - self.attrs['lambda_wh'] = lambda_wh - if lambda_conf_obj is not None and isinstance(lambda_conf_obj, float): - self.attrs['lambda_conf_obj'] = lambda_conf_obj - if lambda_conf_noobj is not None and isinstance(lambda_conf_noobj, float): - self.attrs['lambda_conf_noobj'] = lambda_conf_noobj - if lambda_class is not None and isinstance(lambda_class, float): - self.attrs['lambda_class'] = lambda_class + if loss_weight_xy is not None and isinstance(loss_weight_xy, float): + self.attrs['loss_weight_xy'] = loss_weight_xy + if loss_weight_wh is not None and isinstance(loss_weight_wh, float): + self.attrs['loss_weight_wh'] = loss_weight_wh + if loss_weight_conf_target is not None and isinstance( + loss_weight_conf_target, float): + self.attrs['loss_weight_conf_target'] = loss_weight_conf_target + if loss_weight_conf_notarget is not None and isinstance( + loss_weight_conf_notarget, float): + self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget + if loss_weight_class is not None and isinstance(loss_weight_class, float): + self.attrs['loss_weight_class'] = loss_weight_class helper.append_op( type='yolov3_loss', - inputs={'X': x, - "GTBox": gtbox}, + inputs={"X": x, + "GTBox": gtbox, + "GTLabel": gtlabel}, outputs={'Loss': loss}, attrs=attrs) return loss diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 28dc7519571d8..527fd521d5eb3 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -366,5 +366,18 @@ def test_generate_proposals(self): print(rpn_rois.shape) +class TestYoloDetection(unittest.TestCase): + def test_yolov3_loss(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') + gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') + gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') + loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10, + 0.5) + + self.assertIsNotNone(loss) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index dd02968c30fc3..f48d9c84f9c10 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -911,15 +911,6 @@ def test_affine_grid(self): self.assertIsNotNone(data_1) print(str(program)) - def test_yolov3_loss(self): - program = Program() - with program_guard(program): - x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') - gtbox = layers.data(name='gtbox', shape=[10, 5], dtype='float32') - loss = layers.yolov3_loss(x, gtbox, [10, 13, 30, 13], 10, 0.5) - - self.assertIsNotNone(loss) - def test_bilinear_tensor_product_layer(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 03a64055f0bdc..335214b298d72 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -66,7 +66,7 @@ def box_iou(box1, box2): return inter_area / (b1_area + b2_area + inter_area) -def build_target(gtboxs, attrs, grid_size): +def build_target(gtboxs, gtlabel, attrs, grid_size): n, b, _ = gtboxs.shape ignore_thresh = attrs["ignore_thresh"] anchors = attrs["anchors"] @@ -87,11 +87,11 @@ def build_target(gtboxs, attrs, grid_size): if gtboxs[i, j, :].sum() == 0: continue - gt_label = int(gtboxs[i, j, 0]) - gx = gtboxs[i, j, 1] * grid_size - gy = gtboxs[i, j, 2] * grid_size - gw = gtboxs[i, j, 3] * grid_size - gh = gtboxs[i, j, 4] * grid_size + gt_label = gtlabel[i, j] + gx = gtboxs[i, j, 0] * grid_size + gy = gtboxs[i, j, 1] * grid_size + gw = gtboxs[i, j, 2] * grid_size + gh = gtboxs[i, j, 3] * grid_size gi = int(gx) gj = int(gy) @@ -121,7 +121,7 @@ def build_target(gtboxs, attrs, grid_size): return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask) -def YoloV3Loss(x, gtbox, attrs): +def YoloV3Loss(x, gtbox, gtlabel, attrs): n, c, h, w = x.shape an_num = len(attrs['anchors']) // 2 class_num = attrs["class_num"] @@ -134,7 +134,7 @@ def YoloV3Loss(x, gtbox, attrs): pred_cls = sigmoid(x[:, :, :, :, 5:]) tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = build_target( - gtbox, attrs, x.shape[2]) + gtbox, gtlabel, attrs, x.shape[2]) obj_mask_expand = np.tile( np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num']))) @@ -142,73 +142,73 @@ def YoloV3Loss(x, gtbox, attrs): loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum()) loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum()) loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum()) - loss_conf_obj = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) - loss_conf_noobj = bce(pred_conf * noobj_mask, tconf * noobj_mask, - noobj_mask) + loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) + loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask, + noobj_mask) loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, obj_mask_expand) - return attrs['lambda_xy'] * (loss_x + loss_y) \ - + attrs['lambda_wh'] * (loss_w + loss_h) \ - + attrs['lambda_conf_obj'] * loss_conf_obj \ - + attrs['lambda_conf_noobj'] * loss_conf_noobj \ - + attrs['lambda_class'] * loss_class + return attrs['loss_weight_xy'] * (loss_x + loss_y) \ + + attrs['loss_weight_wh'] * (loss_w + loss_h) \ + + attrs['loss_weight_conf_target'] * loss_conf_target \ + + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \ + + attrs['loss_weight_class'] * loss_class class TestYolov3LossOp(OpTest): def setUp(self): - self.lambda_xy = 1.0 - self.lambda_wh = 1.0 - self.lambda_conf_obj = 1.0 - self.lambda_conf_noobj = 1.0 - self.lambda_class = 1.0 + self.loss_weight_xy = 1.0 + self.loss_weight_wh = 1.0 + self.loss_weight_conf_target = 1.0 + self.loss_weight_conf_notarget = 1.0 + self.loss_weight_class = 1.0 self.initTestCase() self.op_type = 'yolov3_loss' x = np.random.random(size=self.x_shape).astype('float32') gtbox = np.random.random(size=self.gtbox_shape).astype('float32') - gtbox[:, :, 0] = np.random.randint(0, self.class_num, - self.gtbox_shape[:2]) + gtlabel = np.random.randint(0, self.class_num, + self.gtbox_shape[:2]).astype('int32') self.attrs = { "anchors": self.anchors, "class_num": self.class_num, "ignore_thresh": self.ignore_thresh, - "lambda_xy": self.lambda_xy, - "lambda_wh": self.lambda_wh, - "lambda_conf_obj": self.lambda_conf_obj, - "lambda_conf_noobj": self.lambda_conf_noobj, - "lambda_class": self.lambda_class, + "loss_weight_xy": self.loss_weight_xy, + "loss_weight_wh": self.loss_weight_wh, + "loss_weight_conf_target": self.loss_weight_conf_target, + "loss_weight_conf_notarget": self.loss_weight_conf_notarget, + "loss_weight_class": self.loss_weight_class, } - self.inputs = {'X': x, 'GTBox': gtbox} + self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel} self.outputs = { - 'Loss': - np.array([YoloV3Loss(x, gtbox, self.attrs)]).astype('float32') + 'Loss': np.array( + [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32') } def test_check_output(self): place = core.CPUPlace() self.check_output_with_place(place, atol=1e-3) - # def test_check_grad_ignore_gtbox(self): - # place = core.CPUPlace() - # self.check_grad_with_place( - # place, ['X'], - # 'Loss', - # no_grad_set=set("GTBox"), - # max_relative_error=0.06) + def test_check_grad_ignore_gtbox(self): + place = core.CPUPlace() + self.check_grad_with_place( + place, ['X'], + 'Loss', + no_grad_set=set("GTBox"), + max_relative_error=0.06) def initTestCase(self): self.anchors = [10, 13, 12, 12] self.class_num = 10 self.ignore_thresh = 0.5 self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) - self.gtbox_shape = (5, 5, 5) - self.lambda_xy = 2.5 - self.lambda_wh = 0.8 - self.lambda_conf_obj = 1.5 - self.lambda_conf_noobj = 0.5 - self.lambda_class = 1.2 + self.gtbox_shape = (5, 10, 4) + self.loss_weight_xy = 2.5 + self.loss_weight_wh = 0.8 + self.loss_weight_conf_target = 1.5 + self.loss_weight_conf_notarget = 0.5 + self.loss_weight_class = 1.2 if __name__ == "__main__": From 162f2d410912ebbe6dae12c4120d97ea69b9ffda Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 10:58:48 +0800 Subject: [PATCH 038/684] disable the openblas multi-thread on windows since no support adjust the python script --- paddle/fluid/platform/cpu_helper.cc | 6 + paddle/fluid/platform/init.cc | 7 - python/paddle/fluid/__init__.py | 3 +- python/paddle/fluid/contrib/inferencer.py | 4 +- python/paddle/fluid/contrib/trainer.py | 3 +- python/paddle/fluid/parallel_executor.py | 495 +++++++++++----------- 6 files changed, 258 insertions(+), 260 deletions(-) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 234a04b5c2eb5..4e52e8ff00ca5 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -29,6 +29,12 @@ namespace platform { void SetNumThreads(int num_threads) { #ifdef PADDLE_USE_OPENBLAS + // windows has no support for openblas multi-thread +#ifdef _WIN32 + if (num_threads > 1) { + num_threads = 1; + } +#endif int real_num_threads = num_threads > 1 ? num_threads : 1; openblas_set_num_threads(real_num_threads); #elif defined(PADDLE_WITH_MKLML) diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 84d1b852cbe5a..69bbe8794d336 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -113,13 +113,6 @@ void InitDevices(bool init_p2p, const std::vector devices) { places.emplace_back(platform::CPUPlace()); platform::DeviceContextPool::Init(places); -// windows has no support for openblas multi-thread -#ifdef _WIN32 - if (FLAGS_paddle_num_threads > 1) { - FLAGS_paddle_num_threads = 1; - } -#endif - #ifndef PADDLE_WITH_MKLDNN platform::SetNumThreads(FLAGS_paddle_num_threads); #endif diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 8129918916073..dbe49c98bd150 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -47,7 +47,8 @@ from . import unique_name from . import recordio_writer from . import parallel_executor -from .parallel_executor import * +if os.name != 'nt': + from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor diff --git a/python/paddle/fluid/contrib/inferencer.py b/python/paddle/fluid/contrib/inferencer.py index b966ae01d039d..b8d5f4ffeadca 100644 --- a/python/paddle/fluid/contrib/inferencer.py +++ b/python/paddle/fluid/contrib/inferencer.py @@ -15,15 +15,13 @@ from __future__ import print_function import contextlib -import os from .. import core from .. import executor from .. import framework from .. import io -if os.name != 'nt': - from .. import parallel_executor +from .. import parallel_executor from .. import unique_name from .trainer import check_and_get_place diff --git a/python/paddle/fluid/contrib/trainer.py b/python/paddle/fluid/contrib/trainer.py index 096821a5ba690..8569e486f9178 100644 --- a/python/paddle/fluid/contrib/trainer.py +++ b/python/paddle/fluid/contrib/trainer.py @@ -28,8 +28,7 @@ from .. import io # optimizer is same as the parameter of Trainer.__init__. Rename it to opt_module from .. import optimizer as opt_module -if os.name != 'nt': - from .. import parallel_executor +from .. import parallel_executor from ..transpiler import distribute_transpiler __all__ = [ diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 3f4dd5eb712e7..33f6df67a4210 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -25,263 +25,264 @@ __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] -ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy -BuildStrategy = core.ParallelExecutor.BuildStrategy - - -class ParallelExecutor(object): - """ - ParallelExecutor is designed for data parallelism, which focuses on distributing - the data across different nodes and every node operates on the data in parallel. - If you use ParallelExecutor to run the current program on GPU, the node means GPU - device, and ParallelExecutor will get the available GPU device automatically on - the current machine. If you use ParallelExecutor to run the current program on CPU, - the node means the CPU device, and you can specify the CPU device number by adding - 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable - is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number - of CPUs in the system. - - Args: - use_cuda (bool): Whether to use CUDA or not. - loss_name (str): The loss name must set in training. Default None. - main_program (Program): The program that need to run, if not provided, - then default_main_program will be used. Default None. - share_vars_from(ParallelExecutor): If provide, it will share variables - from the specified ParallelExecutor. Default None. - exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run - the program in ParallelExecutor, for example how many threads are used to - execute the program, how many iterations to clean up the temp variables - which is generated during execution. For more information, please refer - to fluid.ExecutionStrategy. Default None. - build_strategy(BuildStrategy): build_strategy is used to control how to - build the SSA Graph in ParallelExecutor by setting the property, - for example reduce_strategy, gradient_scale_strategy. For more information, - please refer to fluid.BuildStrategy. Default None. - num_trainers(int): If greater than 1, NCCL will be initialized with - multiple rank of nodes, each node should have same number of GPUs. - Distributed training will be enabled then. Default 1. - trainer_id(int): Must use together with num_trainers. trainer_id is the - "rank" of current node starts from 0. Default 0. - scope(Scope): scope to run with, default use fluid.global_scope(). - - Returns: - ParallelExecutor: The initialized ParallelExecutor object. - - Raises: - TypeError: If share_vars_from is provided, but not ParallelExecutor object. - - Examples: - .. code-block:: python - - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) - test_exe = fluid.ParallelExecutor(use_cuda=True, - main_program=test_program, - share_vars_from=train_exe) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) - test_loss, = test_exe.run([loss.name], feed=feed_dict) - """ - - def __init__(self, - use_cuda, - loss_name=None, - main_program=None, - share_vars_from=None, - exec_strategy=None, - build_strategy=None, - num_trainers=1, - trainer_id=0, - scope=None): - self._places = [] - self._act_places = [] - if use_cuda: - for i in six.moves.range(core.get_cuda_device_count()): - p = core.Place() - self._act_places.append(core.CUDAPlace(i)) - p.set_place(self._act_places[-1]) - self._places.append(p) - else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - for i in six.moves.range(cpu_num): - p = core.Place() - self._act_places.append(core.CPUPlace()) - p.set_place(self._act_places[-1]) - self._places.append(p) - assert self._places, "no place for execution" - - if exec_strategy is None: - exec_strategy = ExecutionStrategy() - exec_strategy.use_cuda = use_cuda - - if exec_strategy.num_threads == 0: - if use_cuda: - # Experiments on se-resnext shows that too many threads hurt - # performance. Worth tunning for other models in the future. - exec_strategy.num_threads = len(self._places) * 4 - else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - exec_strategy.num_threads = cpu_num * 2 - - # Set 1 thread num under nccl2 distribute - # env to make sure all gpus run ops in same order. - if num_trainers > 1: - assert (use_cuda) - # FIXME(gongwb): avoid this set. - exec_strategy.num_threads = 1 - - if build_strategy is None: - build_strategy = BuildStrategy() - - main = main_program - main = main if main else framework.default_main_program() - if scope == None: - scope = executor.global_scope() - - if share_vars_from and not isinstance(share_vars_from, - ParallelExecutor): - raise TypeError("share_vars_from must be ParallelExecutor.") - - local_scopes = share_vars_from.executor.local_scopes( - ) if share_vars_from else [] - - self.persistable_vars = [ - v.name for v in [ - var for var in main.list_vars() - if var.persistable and var.type != core.VarDesc.VarType.RAW - ] - ] - - self.executor = core.ParallelExecutor( - self._places, - set([ - cpt.to_text(p.name) - for p in main.global_block().iter_parameters() - if not p.stop_gradient - ]), - set(cpt.to_text(var) for var in self.persistable_vars), main.desc, - cpt.to_text(loss_name) - if loss_name else six.u(''), scope, local_scopes, exec_strategy, - build_strategy, num_trainers, trainer_id) - self.scope = scope - - def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): - """ - Run a parallel executor with fetch_list. - - The feed parameter can be a dict or a list. If feed is a dict, the - feed data will be split into multiple devices. If feed is a list, we - assume the data has been splitted into multiple devices, the each - element in the list will be copied to each device directly. - - For example, if the feed is a dict: - - >>> exe = ParallelExecutor() - >>> # the image will be splitted into devices. If there is two devices - >>> # each device will process an image with shape (24, 1, 28, 28) - >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) +if os.name != 'nt': + ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy + BuildStrategy = core.ParallelExecutor.BuildStrategy - For example, if the feed is a list: - >>> exe = ParallelExecutor() - >>> # each device will process each element in the list. - >>> # the 1st device will process an image with shape (48, 1, 28, 28) - >>> # the 2nd device will process an image with shape (32, 1, 28, 28) - >>> # - >>> # you can use exe.device_count to get the device number. - >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))}, - >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, - >>> ]) + class ParallelExecutor(object): + """ + ParallelExecutor is designed for data parallelism, which focuses on distributing + the data across different nodes and every node operates on the data in parallel. + If you use ParallelExecutor to run the current program on GPU, the node means GPU + device, and ParallelExecutor will get the available GPU device automatically on + the current machine. If you use ParallelExecutor to run the current program on CPU, + the node means the CPU device, and you can specify the CPU device number by adding + 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable + is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number + of CPUs in the system. Args: - fetch_list(list): The fetched variable names - feed(list|dict|None): The feed variables. If the feed is a dict, - tensors in that dict will be splitted into each devices. If - the feed is a list, each element of the list will be copied - to each device. Default None. - feed_dict: Alias for feed parameter, for backward compatibility. - This parameter has been deprecated. Default None. - return_numpy(bool): Whether converts the fetched tensor to numpy. - Default: True. + use_cuda (bool): Whether to use CUDA or not. + loss_name (str): The loss name must set in training. Default None. + main_program (Program): The program that need to run, if not provided, + then default_main_program will be used. Default None. + share_vars_from(ParallelExecutor): If provide, it will share variables + from the specified ParallelExecutor. Default None. + exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run + the program in ParallelExecutor, for example how many threads are used to + execute the program, how many iterations to clean up the temp variables + which is generated during execution. For more information, please refer + to fluid.ExecutionStrategy. Default None. + build_strategy(BuildStrategy): build_strategy is used to control how to + build the SSA Graph in ParallelExecutor by setting the property, + for example reduce_strategy, gradient_scale_strategy. For more information, + please refer to fluid.BuildStrategy. Default None. + num_trainers(int): If greater than 1, NCCL will be initialized with + multiple rank of nodes, each node should have same number of GPUs. + Distributed training will be enabled then. Default 1. + trainer_id(int): Must use together with num_trainers. trainer_id is the + "rank" of current node starts from 0. Default 0. + scope(Scope): scope to run with, default use fluid.global_scope(). Returns: - List: The fetched result list. + ParallelExecutor: The initialized ParallelExecutor object. Raises: - ValueError: If the feed is a list, but its length is not equal the - length of active places, or its element's is not dict. - - NOTES: - 1. If the feed's type is dict, the number of data that feeds to - ParallelExecutor must be bigger than active places. Otherwise, - it will throw exception from C++ side. Special attention should be - paid to check whether the last batch of the dataset is bigger - than active places. - 2. If active places are more than one, the fetch results for each - variable is a list, and each element of this list is the variable of - respective active place. + TypeError: If share_vars_from is provided, but not ParallelExecutor object. Examples: .. code-block:: python - pe = fluid.ParallelExecutor(use_cuda=use_cuda, - loss_name=avg_cost.name, - main_program=fluid.default_main_program()) - loss = pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name])) + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor(use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + test_loss, = test_exe.run([loss.name], feed=feed_dict) """ - if feed is None and feed_dict is not None: - feed = feed_dict - print( - "`feed_dict` is deprecated. Please use `feed=`", - file=sys.stderr) - - if isinstance(feed, dict): - feed_tensor_dict = dict() - for feed_name in feed: - feed_tensor = feed[feed_name] - if not isinstance(feed_tensor, core.LoDTensor): - feed_tensor = core.LoDTensor() - # always set to CPU place, since the tensor need to be splitted - # it is fast in CPU - feed_tensor.set(feed[feed_name], core.CPUPlace()) - feed_tensor_dict[feed_name] = feed_tensor - - self.executor.feed_and_split_tensor_into_local_scopes( - feed_tensor_dict) - elif isinstance(feed, list) or isinstance(feed, tuple): - if len(feed) != len(self._act_places): - raise ValueError( - "Feed a list of tensor, the list should be the same size as places" - ) - - res = list() - - for i, each in enumerate(feed): - if not isinstance(each, dict): - raise TypeError( - "Each element of feed list should be a dict") - res_dict = dict() - for feed_name in each: - tensor = each[feed_name] - if not isinstance(tensor, core.LoDTensor): - tmp = core.LoDTensor() - tmp.set(tensor, self._act_places[i]) - tensor = tmp - res_dict[feed_name] = tensor - res.append(res_dict) - self.executor.feed_tensors_into_local_scopes(res) - - fetch_var_name = '@FETCHED_VAR_NAME@' - self.executor.run(fetch_list, fetch_var_name) - arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() - - if return_numpy: - return executor.as_numpy(arr) - - return [arr[i] for i in range(len(arr))] - - @property - def device_count(self): - return len(self._act_places) + + def __init__(self, + use_cuda, + loss_name=None, + main_program=None, + share_vars_from=None, + exec_strategy=None, + build_strategy=None, + num_trainers=1, + trainer_id=0, + scope=None): + self._places = [] + self._act_places = [] + if use_cuda: + for i in six.moves.range(core.get_cuda_device_count()): + p = core.Place() + self._act_places.append(core.CUDAPlace(i)) + p.set_place(self._act_places[-1]) + self._places.append(p) + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + for i in six.moves.range(cpu_num): + p = core.Place() + self._act_places.append(core.CPUPlace()) + p.set_place(self._act_places[-1]) + self._places.append(p) + assert self._places, "no place for execution" + + if exec_strategy is None: + exec_strategy = ExecutionStrategy() + exec_strategy.use_cuda = use_cuda + + if exec_strategy.num_threads == 0: + if use_cuda: + # Experiments on se-resnext shows that too many threads hurt + # performance. Worth tunning for other models in the future. + exec_strategy.num_threads = len(self._places) * 4 + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exec_strategy.num_threads = cpu_num * 2 + + # Set 1 thread num under nccl2 distribute + # env to make sure all gpus run ops in same order. + if num_trainers > 1: + assert (use_cuda) + # FIXME(gongwb): avoid this set. + exec_strategy.num_threads = 1 + + if build_strategy is None: + build_strategy = BuildStrategy() + + main = main_program + main = main if main else framework.default_main_program() + if scope == None: + scope = executor.global_scope() + + if share_vars_from and not isinstance(share_vars_from, + ParallelExecutor): + raise TypeError("share_vars_from must be ParallelExecutor.") + + local_scopes = share_vars_from.executor.local_scopes( + ) if share_vars_from else [] + + self.persistable_vars = [ + v.name for v in [ + var for var in main.list_vars() + if var.persistable and var.type != core.VarDesc.VarType.RAW + ] + ] + + self.executor = core.ParallelExecutor( + self._places, + set([ + cpt.to_text(p.name) + for p in main.global_block().iter_parameters() + if not p.stop_gradient + ]), + set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + cpt.to_text(loss_name) + if loss_name else six.u(''), scope, local_scopes, exec_strategy, + build_strategy, num_trainers, trainer_id) + self.scope = scope + + def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): + """ + Run a parallel executor with fetch_list. + + The feed parameter can be a dict or a list. If feed is a dict, the + feed data will be split into multiple devices. If feed is a list, we + assume the data has been splitted into multiple devices, the each + element in the list will be copied to each device directly. + + For example, if the feed is a dict: + + >>> exe = ParallelExecutor() + >>> # the image will be splitted into devices. If there is two devices + >>> # each device will process an image with shape (24, 1, 28, 28) + >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) + + For example, if the feed is a list: + + >>> exe = ParallelExecutor() + >>> # each device will process each element in the list. + >>> # the 1st device will process an image with shape (48, 1, 28, 28) + >>> # the 2nd device will process an image with shape (32, 1, 28, 28) + >>> # + >>> # you can use exe.device_count to get the device number. + >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))}, + >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, + >>> ]) + + Args: + fetch_list(list): The fetched variable names + feed(list|dict|None): The feed variables. If the feed is a dict, + tensors in that dict will be splitted into each devices. If + the feed is a list, each element of the list will be copied + to each device. Default None. + feed_dict: Alias for feed parameter, for backward compatibility. + This parameter has been deprecated. Default None. + return_numpy(bool): Whether converts the fetched tensor to numpy. + Default: True. + + Returns: + List: The fetched result list. + + Raises: + ValueError: If the feed is a list, but its length is not equal the + length of active places, or its element's is not dict. + + NOTES: + 1. If the feed's type is dict, the number of data that feeds to + ParallelExecutor must be bigger than active places. Otherwise, + it will throw exception from C++ side. Special attention should be + paid to check whether the last batch of the dataset is bigger + than active places. + 2. If active places are more than one, the fetch results for each + variable is a list, and each element of this list is the variable of + respective active place. + + Examples: + .. code-block:: python + + pe = fluid.ParallelExecutor(use_cuda=use_cuda, + loss_name=avg_cost.name, + main_program=fluid.default_main_program()) + loss = pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])) + """ + if feed is None and feed_dict is not None: + feed = feed_dict + print( + "`feed_dict` is deprecated. Please use `feed=`", + file=sys.stderr) + + if isinstance(feed, dict): + feed_tensor_dict = dict() + for feed_name in feed: + feed_tensor = feed[feed_name] + if not isinstance(feed_tensor, core.LoDTensor): + feed_tensor = core.LoDTensor() + # always set to CPU place, since the tensor need to be splitted + # it is fast in CPU + feed_tensor.set(feed[feed_name], core.CPUPlace()) + feed_tensor_dict[feed_name] = feed_tensor + + self.executor.feed_and_split_tensor_into_local_scopes( + feed_tensor_dict) + elif isinstance(feed, list) or isinstance(feed, tuple): + if len(feed) != len(self._act_places): + raise ValueError( + "Feed a list of tensor, the list should be the same size as places" + ) + + res = list() + + for i, each in enumerate(feed): + if not isinstance(each, dict): + raise TypeError( + "Each element of feed list should be a dict") + res_dict = dict() + for feed_name in each: + tensor = each[feed_name] + if not isinstance(tensor, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(tensor, self._act_places[i]) + tensor = tmp + res_dict[feed_name] = tensor + res.append(res_dict) + self.executor.feed_tensors_into_local_scopes(res) + + fetch_var_name = '@FETCHED_VAR_NAME@' + self.executor.run(fetch_list, fetch_var_name) + arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() + + if return_numpy: + return executor.as_numpy(arr) + + return [arr[i] for i in range(len(arr))] + + @property + def device_count(self): + return len(self._act_places) From d1a1fafc4c933e51341004b83f225d786c3fed49 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 11:21:29 +0800 Subject: [PATCH 039/684] code style --- paddle/fluid/platform/cpu_helper.cc | 2 +- python/paddle/fluid/parallel_executor.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 4e52e8ff00ca5..bd6aedb3ac787 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -29,7 +29,7 @@ namespace platform { void SetNumThreads(int num_threads) { #ifdef PADDLE_USE_OPENBLAS - // windows has no support for openblas multi-thread +// windows has no support for openblas multi-thread #ifdef _WIN32 if (num_threads > 1) { num_threads = 1; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 33f6df67a4210..0d53f53a9ef6d 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,7 +29,6 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy - class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -161,7 +160,8 @@ def __init__(self, for p in main.global_block().iter_parameters() if not p.stop_gradient ]), - set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + set(cpt.to_text(var) + for var in self.persistable_vars), main.desc, cpt.to_text(loss_name) if loss_name else six.u(''), scope, local_scopes, exec_strategy, build_strategy, num_trainers, trainer_id) From dc80be275db8c1a75d222b0da11aba4c92c29aa8 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 11:21:29 +0800 Subject: [PATCH 040/684] code style test=develop --- cmake/external/eigen.cmake | 10 ++++------ cmake/external/gflags.cmake | 5 ++--- cmake/external/glog.cmake | 3 +-- cmake/external/gtest.cmake | 5 ++--- cmake/external/protobuf.cmake | 5 ++--- cmake/external/zlib.cmake | 5 ++--- paddle/fluid/platform/cpu_helper.cc | 2 +- python/paddle/fluid/parallel_executor.py | 4 ++-- 8 files changed, 16 insertions(+), 23 deletions(-) diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 98079678ae5fd..573ad5e5f06a9 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -16,9 +16,8 @@ if(WITH_AMD_GPU) ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" -# GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" + GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" + GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" @@ -30,11 +29,10 @@ else() ExternalProject_Add( extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/eigen3.git" + GIT_REPOSITORY "https://github.com/eigenteam/eigen-git-mirror" # eigen on cuda9.1 missing header of math_funtions.hpp # https://stackoverflow.com/questions/43113508/math-functions-hpp-not-found-when-using-cuda-with-eigen -# GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c + GIT_TAG 917060c364181f33a735dc023818d5a54f60e54c PREFIX ${EIGEN_SOURCE_DIR} DOWNLOAD_NAME "eigen" UPDATE_COMMAND "" diff --git a/cmake/external/gflags.cmake b/cmake/external/gflags.cmake index 7c062d682ceea..4e98e4bf889bc 100644 --- a/cmake/external/gflags.cmake +++ b/cmake/external/gflags.cmake @@ -28,9 +28,8 @@ INCLUDE_DIRECTORIES(${GFLAGS_INCLUDE_DIR}) ExternalProject_Add( extern_gflags ${EXTERNAL_PROJECT_LOG_ARGS} -# GIT_REPOSITORY "https://github.com/gflags/gflags.git" - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gflags.git" -# GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a + GIT_REPOSITORY "https://github.com/gflags/gflags.git" + GIT_TAG 77592648e3f3be87d6c7123eb81cbad75f9aef5a PREFIX ${GFLAGS_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/glog.cmake b/cmake/external/glog.cmake index a3f3c6adf304e..8cd0455c16bf8 100644 --- a/cmake/external/glog.cmake +++ b/cmake/external/glog.cmake @@ -34,14 +34,13 @@ ELSE() SET(GLOG_REPOSITORY "https://github.com/google/glog.git") SET(GLOG_TAG "v0.3.5") ENDIF() - SET(GLOG_REPOSITORY "http://admin@172.20.90.14:8080/r/glog.git") ExternalProject_Add( extern_glog ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS gflags GIT_REPOSITORY ${GLOG_REPOSITORY} - # GIT_TAG ${GLOG_TAG} + GIT_TAG ${GLOG_TAG} PREFIX ${GLOG_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index da539d52bd4c4..d335298742c73 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -43,9 +43,8 @@ IF(WITH_TESTING) extern_gtest ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${GTEST_DEPENDS} - # GIT_REPOSITORY "https://github.com/google/googletest.git" - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/gtest.git" -# GIT_TAG "release-1.8.0" + GIT_REPOSITORY "https://github.com/google/googletest.git" + GIT_TAG "release-1.8.0" PREFIX ${GTEST_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 94d8ac30cc5f7..e1e619e572b05 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -202,9 +202,8 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) SET(OPTIONAL_ARGS ${OPTIONAL_ARGS} "-DCMAKE_GENERATOR_PLATFORM=x64") ENDIF() - # SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") - # SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") - SET(PROTOBUF_REPO http://admin@172.20.90.14:8080/r/protobuf.git) + SET(PROTOBUF_REPO "https://github.com/google/protobuf.git") + SET(PROTOBUF_TAG "9f75c5aa851cd877fb0d93ccc31b8567a6706546") IF(MOBILE_INFERENCE) # The reason why the official version is not used is described in # https://github.com/PaddlePaddle/Paddle/issues/6114 diff --git a/cmake/external/zlib.cmake b/cmake/external/zlib.cmake index 456f26385c4f2..c3d73235453c8 100644 --- a/cmake/external/zlib.cmake +++ b/cmake/external/zlib.cmake @@ -31,9 +31,8 @@ INCLUDE_DIRECTORIES(${THIRD_PARTY_PATH}/install) # For Paddle code to include zl ExternalProject_Add( extern_zlib ${EXTERNAL_PROJECT_LOG_ARGS} - # GIT_REPOSITORY "https://github.com/madler/zlib.git" - GIT_REPOSITORY "http://admin@172.20.90.14:8080/r/zlib.git" -# GIT_TAG "v1.2.8" + GIT_REPOSITORY "https://github.com/madler/zlib.git" + GIT_TAG "v1.2.8" PREFIX ${ZLIB_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index 4e52e8ff00ca5..bd6aedb3ac787 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -29,7 +29,7 @@ namespace platform { void SetNumThreads(int num_threads) { #ifdef PADDLE_USE_OPENBLAS - // windows has no support for openblas multi-thread +// windows has no support for openblas multi-thread #ifdef _WIN32 if (num_threads > 1) { num_threads = 1; diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 33f6df67a4210..0d53f53a9ef6d 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -29,7 +29,6 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy BuildStrategy = core.ParallelExecutor.BuildStrategy - class ParallelExecutor(object): """ ParallelExecutor is designed for data parallelism, which focuses on distributing @@ -161,7 +160,8 @@ def __init__(self, for p in main.global_block().iter_parameters() if not p.stop_gradient ]), - set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + set(cpt.to_text(var) + for var in self.persistable_vars), main.desc, cpt.to_text(loss_name) if loss_name else six.u(''), scope, local_scopes, exec_strategy, build_strategy, num_trainers, trainer_id) From fcbd5a12b802560f279e30086d03ef152f760ab5 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 15:23:05 +0800 Subject: [PATCH 041/684] add create_recordio_file_reader back --- python/paddle/fluid/layers/io.py | 118 +++++++++++++++---------------- 1 file changed, 58 insertions(+), 60 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index a9075045a2d52..8e18a6e784bab 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -347,72 +347,70 @@ def _copy_reader_create_op_(block, op): return new_op -if os.name != 'nt': - - @templatedoc(op_type='create_recordio_file_reader') - def open_recordio_file(filename, - shapes, - lod_levels, - dtypes, - pass_num=1, - for_parallel=True): - """ - ${comment} - - Args: - filename(${filename_type}): ${filename_comment}. - shapes(list): List of tuples which declaring data shapes. - lod_levels(${lod_levels_type}): ${lod_levels_comment}. - dtypes(list): List of strs which declaring data type. - pass_num(int): Number of passes to run. - for_parallel(Bool): Set it as True if you are going to run - subsequent operators in parallel. - - Returns: - ${out_comment}. - - Examples: - - >>> import paddle.fluid as fluid - >>> reader = fluid.layers.io.open_recordio_file( - >>> filename='./data.recordio', - >>> shapes=[(3,224,224), (1)], - >>> lod_levels=[0, 0], - >>> dtypes=['float32', 'int64']) - >>> # Via the reader, we can use 'read_file' layer to get data: - >>> image, label = fluid.layers.io.read_file(reader) - """ - dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] - shape_concat = [] - ranks = [] +@templatedoc(op_type='create_recordio_file_reader') +def open_recordio_file(filename, + shapes, + lod_levels, + dtypes, + pass_num=1, + for_parallel=True): + """ + ${comment} - for shape in shapes: - shape_concat.extend(shape) - ranks.append(len(shape)) + Args: + filename(${filename_type}): ${filename_comment}. + shapes(list): List of tuples which declaring data shapes. + lod_levels(${lod_levels_type}): ${lod_levels_comment}. + dtypes(list): List of strs which declaring data type. + pass_num(int): Number of passes to run. + for_parallel(Bool): Set it as True if you are going to run + subsequent operators in parallel. - var_name = unique_name('open_recordio_file') + Returns: + ${out_comment}. - startup_blk = default_startup_program().current_block() - startup_var = startup_blk.create_var(name=var_name) - startup_blk.append_op( - type='create_recordio_file_reader', - outputs={'Out': [startup_var]}, - attrs={ - 'shape_concat': shape_concat, - 'lod_levels': lod_levels, - 'filename': filename, - 'ranks': ranks - }) + Examples: - startup_var.desc.set_dtypes(dtypes) - startup_var.persistable = True - main_prog_var = _copy_reader_var_( - default_main_program().current_block(), startup_var) + >>> import paddle.fluid as fluid + >>> reader = fluid.layers.io.open_recordio_file( + >>> filename='./data.recordio', + >>> shapes=[(3,224,224), (1)], + >>> lod_levels=[0, 0], + >>> dtypes=['float32', 'int64']) + >>> # Via the reader, we can use 'read_file' layer to get data: + >>> image, label = fluid.layers.io.read_file(reader) + """ + dtypes = [convert_np_dtype_to_dtype_(dt) for dt in dtypes] + shape_concat = [] + ranks = [] + + for shape in shapes: + shape_concat.extend(shape) + ranks.append(len(shape)) - if pass_num > 1: - main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + var_name = unique_name('open_recordio_file') - return monkey_patch_reader_methods(main_prog_var) + startup_blk = default_startup_program().current_block() + startup_var = startup_blk.create_var(name=var_name) + startup_blk.append_op( + type='create_recordio_file_reader', + outputs={'Out': [startup_var]}, + attrs={ + 'shape_concat': shape_concat, + 'lod_levels': lod_levels, + 'filename': filename, + 'ranks': ranks + }) + + startup_var.desc.set_dtypes(dtypes) + startup_var.persistable = True + main_prog_var = _copy_reader_var_( + default_main_program().current_block(), startup_var) + + if pass_num > 1: + main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) + + return monkey_patch_reader_methods(main_prog_var) def random_data_generator(low, high, shapes, lod_levels, for_parallel=True): From 1dc1dd94d1c349c6bbed6fe826ae1d25a3983603 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 16:07:25 +0800 Subject: [PATCH 042/684] fix code style test=develop --- python/paddle/fluid/layers/io.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 8e18a6e784bab..3f47053961bcc 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -404,8 +404,8 @@ def open_recordio_file(filename, startup_var.desc.set_dtypes(dtypes) startup_var.persistable = True - main_prog_var = _copy_reader_var_( - default_main_program().current_block(), startup_var) + main_prog_var = _copy_reader_var_(default_main_program().current_block(), + startup_var) if pass_num > 1: main_prog_var = multi_pass(reader=main_prog_var, pass_num=pass_num) From 853878cbf218728608a783260ae74c408ef4b8a2 Mon Sep 17 00:00:00 2001 From: tink2123 Date: Fri, 16 Nov 2018 02:05:56 -0800 Subject: [PATCH 043/684] fix the wrong format test=develop --- python/paddle/fluid/average.py | 1 + 1 file changed, 1 insertion(+) diff --git a/python/paddle/fluid/average.py b/python/paddle/fluid/average.py index 42cd3b36420ef..40a734af311e2 100644 --- a/python/paddle/fluid/average.py +++ b/python/paddle/fluid/average.py @@ -48,6 +48,7 @@ class WeightedAverage(object): Examples: .. code-block:: python + avg = fluid.average.WeightedAverage() avg.add(value=2.0, weight=1) avg.add(value=4.0, weight=2) From 695e2aba5e8396ff0719da8516e38b1ef4782c05 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 19:33:50 +0800 Subject: [PATCH 044/684] fix the gtest.cmake on windows --- cmake/external/gtest.cmake | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cmake/external/gtest.cmake b/cmake/external/gtest.cmake index da539d52bd4c4..943767fb17bbc 100644 --- a/cmake/external/gtest.cmake +++ b/cmake/external/gtest.cmake @@ -51,7 +51,11 @@ IF(WITH_TESTING) CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER} -DCMAKE_CXX_FLAGS=${CMAKE_CXX_FLAGS} + -DCMAKE_CXX_FLAGS_RELEASE=${CMAKE_CXX_FLAGS_RELEASE} + -DCMAKE_CXX_FLAGS_DEBUG=${CMAKE_CXX_FLAGS_DEBUG} -DCMAKE_C_FLAGS=${CMAKE_C_FLAGS} + -DCMAKE_C_FLAGS_DEBUG=${CMAKE_C_FLAGS_DEBUG} + -DCMAKE_C_FLAGS_RELEASE=${CMAKE_C_FLAGS_RELEASE} -DCMAKE_INSTALL_PREFIX=${GTEST_INSTALL_DIR} -DCMAKE_POSITION_INDEPENDENT_CODE=ON -DBUILD_GMOCK=ON From b942f4760ab30f6a107c6cf944032c9dde143528 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 16 Nov 2018 22:04:11 +0800 Subject: [PATCH 045/684] fix cc_test on windows --- cmake/generic.cmake | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index e21f89c7c5850..111627a932afe 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -351,6 +351,9 @@ function(cc_test TARGET_NAME) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_executable(${TARGET_NAME} ${cc_test_SRCS}) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) + if(WIN32) + target_link_libraries(${TARGET_NAME} shlwapi) + endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(NAME ${TARGET_NAME} COMMAND ${TARGET_NAME} ${cc_test_ARGS} From ef943bd6cd463b4e00bb18cc137334a6b78fca55 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 00:32:33 +0800 Subject: [PATCH 046/684] fix the win build test=develop --- paddle/fluid/operators/CMakeLists.txt | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index df2a3e7aa635c..a2334c1499648 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE) add_subdirectory(distributed_ops) endif() -if (NOT WIN32) - add_subdirectory(reader) -endif() +add_subdirectory(reader) if (NOT WIN32) add_subdirectory(nccl) @@ -49,9 +47,9 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler) if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions dynload_warpctc jit_kernel) endif() if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) From a395942c6a5246f1702e592e1f3d369caa3accc1 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 08:38:02 +0800 Subject: [PATCH 047/684] remove fused compile support on windows test=develop --- paddle/fluid/operators/CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index a2334c1499648..40246d05e996e 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -11,7 +11,9 @@ add_subdirectory(controlflow) add_subdirectory(csp) add_subdirectory(detection) add_subdirectory(elementwise) -add_subdirectory(fused) +if(NOT WIN32) + add_subdirectory(fused) +endif(NOT WIN32) add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) From c75dc885b58000b018414ab442097ee515244b9c Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 14:36:56 +0800 Subject: [PATCH 048/684] add the jit support test=develop --- paddle/fluid/operators/CMakeLists.txt | 7 ++-- paddle/fluid/operators/math/CMakeLists.txt | 36 +++++++++---------- .../math/detail/activation_functions.h | 7 ++++ paddle/fluid/operators/math/jit_code.cc | 5 +++ 4 files changed, 31 insertions(+), 24 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 40246d05e996e..10748b0cda485 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -11,9 +11,7 @@ add_subdirectory(controlflow) add_subdirectory(csp) add_subdirectory(detection) add_subdirectory(elementwise) -if(NOT WIN32) - add_subdirectory(fused) -endif(NOT WIN32) +add_subdirectory(fused) add_subdirectory(metrics) add_subdirectory(optimizers) add_subdirectory(reduce_ops) @@ -50,8 +48,9 @@ endif() set(COMMON_OP_DEPS "") set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code gru_compute activation_functions jit_kernel) if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions dynload_warpctc jit_kernel) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch dynload_warpctc) endif() if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 4cd014cbadb88..08c8dbbfe82f3 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,6 +1,4 @@ -if (NOT WIN32) - add_subdirectory(detail) -endif(NOT WIN32) +add_subdirectory(detail) function(math_library TARGET) # math_library is a function to create math library. @@ -43,10 +41,8 @@ math_library(depthwise_conv) math_library(im2col) math_library(sampler) -if (NOT WIN32) # windows do not support avx functions yet. - math_library(gru_compute DEPS activation_functions math_function) - math_library(lstm_compute DEPS activation_functions) -endif (NOT WIN32) +math_library(gru_compute DEPS activation_functions math_function) +math_library(lstm_compute DEPS activation_functions) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) @@ -58,9 +54,9 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) -if (NOT WIN32) - math_library(matrix_bit_code) -endif (NOT WIN32) + +math_library(matrix_bit_code) + math_library(unpooling) math_library(vol2col) @@ -76,13 +72,13 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -if (NOT WIN32) - set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) - set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) - if(WITH_XBYAK) - list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) - list(APPEND JIT_KERNEL_DEPS xbyak) - endif() - cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) - cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) -endif (NOT WIN32) + +set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) +if(WITH_XBYAK) + list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) + list(APPEND JIT_KERNEL_DEPS xbyak) +endif() +cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) + diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index b127fbe8c8515..42fb45a8a5ea9 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -15,6 +15,13 @@ limitations under the License. */ #pragma once #include #include + +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX__2 +#endif + + #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e3b600d442767..0f4b8f65acada 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -118,7 +118,12 @@ void VXXJitCode::generate() { ret(); } +#ifdef _WIN32 +#define ALIGN32 +#else #define ALIGN32 __attribute__((aligned(32))) +#endif + #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f #define CEPHES_LOG2EF 1.44269504088896341 From 5e46c98362897fbf043eb8387618740fbbd6fd07 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 14:41:29 +0800 Subject: [PATCH 049/684] add the jit support, test=develop --- paddle/fluid/operators/math/detail/activation_functions.h | 6 ------ paddle/fluid/platform/cpu_info.h | 5 +++++ paddle/fluid/platform/enforce.h | 5 +++++ 3 files changed, 10 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index 42fb45a8a5ea9..2b3d38d95a18f 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -16,12 +16,6 @@ limitations under the License. */ #include #include -#ifdef _WIN32 -#undef __AVX__ -#undef __AVX__2 -#endif - - #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 6810a1651a14c..1b4840d9a117b 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -14,6 +14,11 @@ limitations under the License. */ #pragma once +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX2__ +#endif + #include namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a251bfcd99144..c03bbd59ac922 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -14,6 +14,11 @@ limitations under the License. */ #pragma once +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX2__ +#endif + #ifdef __GNUC__ #include // for __cxa_demangle #endif // __GNUC__ From 928efeed46132df27d1c389be046bdc31c9451f4 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sat, 17 Nov 2018 14:41:29 +0800 Subject: [PATCH 050/684] add the jit support, test=develop --- cmake/operators.cmake | 5 +++-- paddle/fluid/operators/math/detail/activation_functions.h | 6 ------ paddle/fluid/platform/cpu_info.h | 5 +++++ paddle/fluid/platform/enforce.h | 5 +++++ 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/cmake/operators.cmake b/cmake/operators.cmake index c9d0f80da2962..5636342ef72f2 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,8 +84,9 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" - "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" +# "hierarchical_sigmoid_op" "cumsum_op" +# "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() diff --git a/paddle/fluid/operators/math/detail/activation_functions.h b/paddle/fluid/operators/math/detail/activation_functions.h index 42fb45a8a5ea9..2b3d38d95a18f 100644 --- a/paddle/fluid/operators/math/detail/activation_functions.h +++ b/paddle/fluid/operators/math/detail/activation_functions.h @@ -16,12 +16,6 @@ limitations under the License. */ #include #include -#ifdef _WIN32 -#undef __AVX__ -#undef __AVX__2 -#endif - - #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/hostdevice.h" diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 6810a1651a14c..1b4840d9a117b 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -14,6 +14,11 @@ limitations under the License. */ #pragma once +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX2__ +#endif + #include namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a251bfcd99144..c03bbd59ac922 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -14,6 +14,11 @@ limitations under the License. */ #pragma once +#ifdef _WIN32 +#undef __AVX__ +#undef __AVX2__ +#endif + #ifdef __GNUC__ #include // for __cxa_demangle #endif // __GNUC__ From a3e952f41d9081b8d0f69128f7d758fd95f97f96 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sun, 18 Nov 2018 12:19:05 +0800 Subject: [PATCH 051/684] add the jit back fix compile error on windows --- CMakeLists.txt | 5 + cmake/operators.cmake | 5 +- cmake/simd.cmake | 25 +- paddle/fluid/operators/CMakeLists.txt | 9 +- .../fluid/operators/hierarchical_sigmoid_op.h | 2 +- paddle/fluid/operators/math/CMakeLists.txt | 35 +- paddle/fluid/operators/math/matrix_bit_code.h | 3 +- python/paddle/fluid/layers/nn.py | 374 +++++++++--------- python/paddle/fluid/layers/ops.py | 41 +- 9 files changed, 241 insertions(+), 258 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c3bc60d57b58f..c2804e234d0d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -130,6 +130,11 @@ if (APPLE OR WIN32) "Disable MKL for building on mac and windows" FORCE) endif() +if (WIN32) + set(WITH_AVX OFF CACHE STRING + "Disable AVX when compiling for Windows" FORCE) +endif() + set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING "A path setting third party libraries download & build directories.") diff --git a/cmake/operators.cmake b/cmake/operators.cmake index c9d0f80da2962..5e8b95b3e2392 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,9 +84,8 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" "hierarchical_sigmoid_op" - "crf_decoding_op" "select_op" "lstmp_op" "gru_op" "fusion_gru_op" "lstm_op" "fusion_lstm_op" "cumsum_op" - "fusion_seqconv_eltadd_relu_op" "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" + "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 566dc75fda019..4926fb991332b 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -70,17 +70,20 @@ int main() return 0; }" AVX_FOUND) -# Check AVX 2 -set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) -set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); - __m256i result = _mm256_abs_epi32 (a); - return 0; -}" AVX2_FOUND) +# disable AVX2 by default on windows +if(NOT WIN32) + # Check AVX 2 + set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) + set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256i a = _mm256_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4); + __m256i result = _mm256_abs_epi32 (a); + return 0; + }" AVX2_FOUND) +endif(NOT WIN32) # Check AVX512F set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index df2a3e7aa635c..284bf5dc9e2b5 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -22,9 +22,7 @@ if(WITH_DISTRIBUTE) add_subdirectory(distributed_ops) endif() -if (NOT WIN32) - add_subdirectory(reader) -endif() +add_subdirectory(reader) if (NOT WIN32) add_subdirectory(nccl) @@ -49,9 +47,10 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor dynload_warpctc sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code sequence2batch gru_compute activation_functions jit_kernel) if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 64096a717b12e..79980cda53bef 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -111,7 +111,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { auto pre_out_mat = EigenMatrix::From(*pre_out); auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); auto out_grad_mat = EigenMatrix::From(*out_grad); - Eigen::array bcast({{1, static_cast(pre_out_grad.dims()[1])}}); + Eigen::array bcast{1, static_cast(pre_out_grad.dims()[1])}; // softrelu derivative pre_out_grad_mat.device(place) = diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 4cd014cbadb88..e9397d552d2cb 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -1,6 +1,4 @@ -if (NOT WIN32) - add_subdirectory(detail) -endif(NOT WIN32) +add_subdirectory(detail) function(math_library TARGET) # math_library is a function to create math library. @@ -43,10 +41,8 @@ math_library(depthwise_conv) math_library(im2col) math_library(sampler) -if (NOT WIN32) # windows do not support avx functions yet. - math_library(gru_compute DEPS activation_functions math_function) - math_library(lstm_compute DEPS activation_functions) -endif (NOT WIN32) +math_library(gru_compute DEPS activation_functions math_function) +math_library(lstm_compute DEPS activation_functions) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) math_library(math_function DEPS blas) @@ -58,9 +54,9 @@ math_library(sequence_padding) math_library(sequence_pooling DEPS math_function) math_library(sequence_scale) math_library(softmax DEPS math_function) -if (NOT WIN32) - math_library(matrix_bit_code) -endif (NOT WIN32) + +math_library(matrix_bit_code) + math_library(unpooling) math_library(vol2col) @@ -76,13 +72,12 @@ if(WITH_GPU) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info) -if (NOT WIN32) - set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) - set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) - if(WITH_XBYAK) - list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) - list(APPEND JIT_KERNEL_DEPS xbyak) - endif() - cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) - cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) -endif (NOT WIN32) + +set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc) +set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce) +if(WITH_XBYAK) + list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc) + list(APPEND JIT_KERNEL_DEPS xbyak) +endif() +cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS}) +cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel) diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 07854c83584f9..c329b8b6113e8 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -67,7 +67,7 @@ inline constexpr size_t FindLastSet(size_t x) { : (std::is_same::value // NOLINT ? (x ? 8 * sizeof(x) - __builtin_clzl(x) : 0) : (x ? 8 * sizeof(x) - __builtin_clzll(x) : 0)); - +} #else // windows don't have built-in clz, ctz function template @@ -92,7 +92,6 @@ inline int clz(const T& value) { inline size_t FindLastSet(size_t x) { return sizeof(size_t) * 8 - clz(x); } #endif // !_WIN32 -} struct SimpleCode { SimpleCode(size_t code, size_t num_classes) : c_(code + num_classes) {} diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 9465d9756477e..a2bab6438456e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -170,12 +170,6 @@ 'bilinear_tensor_product', ] -# To avoid the api checker complains -if os.name == 'nt': - __all__.remove('dynamic_lstm') - __all__.remove('crf_decoding') - __all__.remove('roi_pool') - def fc(input, size, @@ -349,128 +343,126 @@ def embedding(input, return tmp -if os.name != 'nt': +@templatedoc(op_type="lstm") +def dynamic_lstm(input, + size, + h_0=None, + c_0=None, + param_attr=None, + bias_attr=None, + use_peepholes=True, + is_reverse=False, + gate_activation='sigmoid', + cell_activation='tanh', + candidate_activation='tanh', + dtype='float32', + name=None): + """ + ${comment} - @templatedoc(op_type="lstm") - def dynamic_lstm(input, - size, - h_0=None, - c_0=None, - param_attr=None, - bias_attr=None, - use_peepholes=True, - is_reverse=False, - gate_activation='sigmoid', - cell_activation='tanh', - candidate_activation='tanh', - dtype='float32', - name=None): - """ - ${comment} - - Args: - input (Variable): ${input_comment} - size (int): 4 * hidden size. - h_0(Variable): The initial hidden state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size and D is the hidden size. - c_0(Variable): The initial cell state is an optional input, default is zero. - This is a tensor with shape (N x D), where N is the - batch size. `h_0` and `c_0` can be NULL but only at the same time. - param_attr(ParamAttr|None): The parameter attribute for the learnable - hidden-hidden weights. - - - Weights = {:math:`W_{ch}, W_{ih}, \ - W_{fh}, W_{oh}`} - - The shape is (D x 4D), where D is the hidden - size. - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as param_attr. - If the Initializer of the param_attr is not set, the - parameter is initialized with Xavier. Default: None. - bias_attr (ParamAttr|None): The bias attribute for the learnable bias - weights, which contains two parts, input-hidden - bias weights and peephole connections weights if - setting `use_peepholes` to `True`. - - 1. `use_peepholes = False` - - Biases = {:math:`b_c, b_i, b_f, b_o`}. - - The shape is (1 x 4D). - 2. `use_peepholes = True` - - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ - W_{fc}, W_{oc}`}. - - The shape is (1 x 7D). - - If it is set to None or one attribute of ParamAttr, - dynamic_lstm will create ParamAttr as bias_attr. - If the Initializer of the bias_attr is not set, - the bias is initialized zero. Default: None. - use_peepholes (bool): ${use_peepholes_comment} - is_reverse (bool): ${is_reverse_comment} - gate_activation (str): ${gate_activation_comment} - cell_activation (str): ${cell_activation_comment} - candidate_activation (str): ${candidate_activation_comment} - dtype (str): Data type. Choices = ["float32", "float64"], default "float32". - name (str|None): A name for this layer(optional). If set None, the layer - will be named automatically. - - Returns: - tuple: The hidden state, and cell state of LSTM. The shape of both \ - is (T x D), and lod is the same with the `input`. - - Examples: - .. code-block:: python - - hidden_dim = 512 - forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, - bias_attr=False) - forward, _ = fluid.layers.dynamic_lstm( - input=forward_proj, size=hidden_dim * 4, use_peepholes=False) - """ - assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." - helper = LayerHelper('lstm', **locals()) - size = size // 4 - weight = helper.create_parameter( - attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) - bias_size = [1, 7 * size] - if not use_peepholes: - bias_size[1] = 4 * size - bias = helper.create_parameter( - attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + Args: + input (Variable): ${input_comment} + size (int): 4 * hidden size. + h_0(Variable): The initial hidden state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size and D is the hidden size. + c_0(Variable): The initial cell state is an optional input, default is zero. + This is a tensor with shape (N x D), where N is the + batch size. `h_0` and `c_0` can be NULL but only at the same time. + param_attr(ParamAttr|None): The parameter attribute for the learnable + hidden-hidden weights. - hidden = helper.create_variable_for_type_inference(dtype) - cell = helper.create_variable_for_type_inference(dtype) - batch_gate = helper.create_variable_for_type_inference(dtype) - batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) - inputs = {'Input': input, 'Weight': weight, 'Bias': bias} - batch_size = input.shape[0] - if h_0: - assert h_0.shape == (batch_size, size), \ - 'The shape of h0 should be (batch_size, %d)' % size - inputs['H0'] = h_0 - if c_0: - assert c_0.shape == (batch_size, size), \ - 'The shape of c0 should be (batch_size, %d)' % size - inputs['C0'] = c_0 + - Weights = {:math:`W_{ch}, W_{ih}, \ + W_{fh}, W_{oh}`} + - The shape is (D x 4D), where D is the hidden + size. - helper.append_op( - type='lstm', - inputs=inputs, - outputs={ - 'Hidden': hidden, - 'Cell': cell, - 'BatchGate': batch_gate, - 'BatchCellPreAct': batch_cell_pre_act - }, - attrs={ - 'use_peepholes': use_peepholes, - 'is_reverse': is_reverse, - 'gate_activation': gate_activation, - 'cell_activation': cell_activation, - 'candidate_activation': candidate_activation - }) - return hidden, cell + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as param_attr. + If the Initializer of the param_attr is not set, the + parameter is initialized with Xavier. Default: None. + bias_attr (ParamAttr|None): The bias attribute for the learnable bias + weights, which contains two parts, input-hidden + bias weights and peephole connections weights if + setting `use_peepholes` to `True`. + + 1. `use_peepholes = False` + - Biases = {:math:`b_c, b_i, b_f, b_o`}. + - The shape is (1 x 4D). + 2. `use_peepholes = True` + - Biases = { :math:`b_c, b_i, b_f, b_o, W_{ic}, \ + W_{fc}, W_{oc}`}. + - The shape is (1 x 7D). + + If it is set to None or one attribute of ParamAttr, + dynamic_lstm will create ParamAttr as bias_attr. + If the Initializer of the bias_attr is not set, + the bias is initialized zero. Default: None. + use_peepholes (bool): ${use_peepholes_comment} + is_reverse (bool): ${is_reverse_comment} + gate_activation (str): ${gate_activation_comment} + cell_activation (str): ${cell_activation_comment} + candidate_activation (str): ${candidate_activation_comment} + dtype (str): Data type. Choices = ["float32", "float64"], default "float32". + name (str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + tuple: The hidden state, and cell state of LSTM. The shape of both \ + is (T x D), and lod is the same with the `input`. + + Examples: + .. code-block:: python + + hidden_dim = 512 + forward_proj = fluid.layers.fc(input=input_seq, size=hidden_dim * 4, + bias_attr=False) + forward, _ = fluid.layers.dynamic_lstm( + input=forward_proj, size=hidden_dim * 4, use_peepholes=False) + """ + assert bias_attr is not False, "bias_attr should not be False in dynamic_lstmp." + helper = LayerHelper('lstm', **locals()) + size = size // 4 + weight = helper.create_parameter( + attr=helper.param_attr, shape=[size, 4 * size], dtype=dtype) + bias_size = [1, 7 * size] + if not use_peepholes: + bias_size[1] = 4 * size + bias = helper.create_parameter( + attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True) + + hidden = helper.create_variable_for_type_inference(dtype) + cell = helper.create_variable_for_type_inference(dtype) + batch_gate = helper.create_variable_for_type_inference(dtype) + batch_cell_pre_act = helper.create_variable_for_type_inference(dtype) + inputs = {'Input': input, 'Weight': weight, 'Bias': bias} + batch_size = input.shape[0] + if h_0: + assert h_0.shape == (batch_size, size), \ + 'The shape of h0 should be (batch_size, %d)' % size + inputs['H0'] = h_0 + if c_0: + assert c_0.shape == (batch_size, size), \ + 'The shape of c0 should be (batch_size, %d)' % size + inputs['C0'] = c_0 + + helper.append_op( + type='lstm', + inputs=inputs, + outputs={ + 'Hidden': hidden, + 'Cell': cell, + 'BatchGate': batch_gate, + 'BatchCellPreAct': batch_cell_pre_act + }, + attrs={ + 'use_peepholes': use_peepholes, + 'is_reverse': is_reverse, + 'gate_activation': gate_activation, + 'cell_activation': cell_activation, + 'candidate_activation': candidate_activation + }) + return hidden, cell def dynamic_lstmp(input, @@ -969,43 +961,39 @@ def linear_chain_crf(input, label, param_attr=None): return log_likelihood -if os.name != 'nt': - - @templatedoc() - def crf_decoding(input, param_attr, label=None): - """ - ${comment} +@templatedoc() +def crf_decoding(input, param_attr, label=None): + """ + ${comment} - Args: - input(${emission_type}): ${emission_comment} + Args: + input(${emission_type}): ${emission_comment} - param_attr(ParamAttr): The parameter attribute for training. + param_attr(ParamAttr): The parameter attribute for training. - label(${label_type}): ${label_comment} + label(${label_type}): ${label_comment} - Returns: - Variable: ${viterbi_path_comment} + Returns: + Variable: ${viterbi_path_comment} - Examples: - .. code-block:: python + Examples: + .. code-block:: python - crf_decode = layers.crf_decoding( - input=hidden, param_attr=ParamAttr(name="crfw")) - """ - helper = LayerHelper('crf_decoding', **locals()) - transition = helper.get_parameter(param_attr.name) - viterbi_path = helper.create_variable_for_type_inference( - dtype=helper.input_dtype()) - helper.append_op( - type='crf_decoding', - inputs={ - "Emission": [input], + crf_decode = layers.crf_decoding( + input=hidden, param_attr=ParamAttr(name="crfw")) + """ + helper = LayerHelper('crf_decoding', **locals()) + transition = helper.get_parameter(param_attr.name) + viterbi_path = helper.create_variable_for_type_inference( + dtype=helper.input_dtype()) + helper.append_op( + type='crf_decoding', + inputs={"Emission": [input], "Transition": transition, - "Label": label - }, - outputs={"ViterbiPath": [viterbi_path]}) + "Label": label}, + outputs={"ViterbiPath": [viterbi_path]}) - return viterbi_path + return viterbi_path @templatedoc() @@ -5599,48 +5587,42 @@ def label_smooth(label, return smooth_label -if os.name != 'nt': - - @templatedoc() - def roi_pool(input, - rois, - pooled_height=1, - pooled_width=1, - spatial_scale=1.0): - """ - ${comment} - - Args: - input (Variable): ${x_comment} - rois (Variable): ROIs (Regions of Interest) to pool over. - pooled_height (integer): ${pooled_height_comment} Default: 1 - pooled_width (integer): ${pooled_width_comment} Default: 1 - spatial_scale (float): ${spatial_scale_comment} Default: 1.0 - - Returns: - Variable: ${out_comment}. - - Examples: - .. code-block:: python - - pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) - """ - helper = LayerHelper('roi_pool', **locals()) - dtype = helper.input_dtype() - pool_out = helper.create_variable_for_type_inference(dtype) - argmaxes = helper.create_variable_for_type_inference(dtype='int32') - helper.append_op( - type="roi_pool", - inputs={"X": input, - "ROIs": rois}, - outputs={"Out": pool_out, - "Argmax": argmaxes}, - attrs={ - "pooled_height": pooled_height, - "pooled_width": pooled_width, - "spatial_scale": spatial_scale - }) - return pool_out +@templatedoc() +def roi_pool(input, rois, pooled_height=1, pooled_width=1, spatial_scale=1.0): + """ + ${comment} + + Args: + input (Variable): ${x_comment} + rois (Variable): ROIs (Regions of Interest) to pool over. + pooled_height (integer): ${pooled_height_comment} Default: 1 + pooled_width (integer): ${pooled_width_comment} Default: 1 + spatial_scale (float): ${spatial_scale_comment} Default: 1.0 + + Returns: + Variable: ${out_comment}. + + Examples: + .. code-block:: python + + pool_out = fluid.layers.roi_pool(input=x, rois=rois, 7, 7, 1.0) + """ + helper = LayerHelper('roi_pool', **locals()) + dtype = helper.input_dtype() + pool_out = helper.create_variable_for_type_inference(dtype) + argmaxes = helper.create_variable_for_type_inference(dtype='int32') + helper.append_op( + type="roi_pool", + inputs={"X": input, + "ROIs": rois}, + outputs={"Out": pool_out, + "Argmax": argmaxes}, + attrs={ + "pooled_height": pooled_height, + "pooled_width": pooled_width, + "spatial_scale": spatial_scale + }) + return pool_out @templatedoc() diff --git a/python/paddle/fluid/layers/ops.py b/python/paddle/fluid/layers/ops.py index 66eb1229aa3ec..6c18af7283e19 100644 --- a/python/paddle/fluid/layers/ops.py +++ b/python/paddle/fluid/layers/ops.py @@ -100,26 +100,27 @@ def hard_shrink(x, threshold=None): >>> result = fluid.layers.hard_shrink(x=data, threshold=0.3) """ -if os.name != 'nt': - __all__ += ['cumsum'] - - _cum_sum_ = generate_layer_fn('cumsum') - - def cumsum(x, axis=None, exclusive=None, reverse=None): - locals_var = locals().keys() - kwargs = dict() - for name in locals_var: - val = locals()[name] - if val is not None: - kwargs[name] = val - return _cum_sum_(**kwargs) - - cumsum.__doc__ = _cum_sum_.__doc__ + """ - Examples: - - >>> data = fluid.layers.data(name="input", shape=[32, 784]) - >>> result = fluid.layers.cumsum(data, axis=0) - """ +__all__ += ['cumsum'] + +_cum_sum_ = generate_layer_fn('cumsum') + + +def cumsum(x, axis=None, exclusive=None, reverse=None): + locals_var = locals().keys() + kwargs = dict() + for name in locals_var: + val = locals()[name] + if val is not None: + kwargs[name] = val + return _cum_sum_(**kwargs) + + +cumsum.__doc__ = _cum_sum_.__doc__ + """ +Examples: + + >>> data = fluid.layers.data(name="input", shape=[32, 784]) + >>> result = fluid.layers.cumsum(data, axis=0) +""" __all__ += ['thresholded_relu'] From a1fa18542f308cc6c8a495f36ef72428b03ee704 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sun, 18 Nov 2018 12:56:03 +0800 Subject: [PATCH 052/684] rollback test=develop --- paddle/fluid/operators/math/jit_code.cc | 5 ----- paddle/fluid/platform/cpu_info.h | 5 ----- paddle/fluid/platform/enforce.h | 5 ----- 3 files changed, 15 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 0f4b8f65acada..e3b600d442767 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -118,12 +118,7 @@ void VXXJitCode::generate() { ret(); } -#ifdef _WIN32 -#define ALIGN32 -#else #define ALIGN32 __attribute__((aligned(32))) -#endif - #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f #define CEPHES_LOG2EF 1.44269504088896341 diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index 1b4840d9a117b..6810a1651a14c 100644 --- a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -14,11 +14,6 @@ limitations under the License. */ #pragma once -#ifdef _WIN32 -#undef __AVX__ -#undef __AVX2__ -#endif - #include namespace paddle { diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index c03bbd59ac922..a251bfcd99144 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -14,11 +14,6 @@ limitations under the License. */ #pragma once -#ifdef _WIN32 -#undef __AVX__ -#undef __AVX2__ -#endif - #ifdef __GNUC__ #include // for __cxa_demangle #endif // __GNUC__ From c59d3e83bc98d2dd3a8d9370b368c7d12c97d314 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Sun, 18 Nov 2018 18:06:09 +0800 Subject: [PATCH 053/684] test case fix --- paddle/fluid/platform/enforce.h | 64 ++++++++------------------------- 1 file changed, 15 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index a251bfcd99144..3643d2ad15b9e 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -127,14 +127,14 @@ struct EOFException : public std::exception { #define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) #else // there is no equivalent intrinsics in msvc. -#define UNLIKELY(condition) (condition == 0) +#define UNLIKELY(condition) (condition) #endif #if !defined(_WIN32) #define LIKELY(condition) __builtin_expect(static_cast(condition), 1) #else // there is no equivalent intrinsics in msvc. -#define LIKELY(condition) (condition != 0) +#define LIKELY(condition) !(condition) #endif template @@ -248,7 +248,6 @@ inline void throw_on_error(T e) { throw_on_error(e, ""); } -#if !defined(_WIN32) #define PADDLE_THROW(...) \ do { \ throw ::paddle::platform::EnforceNotMet( \ @@ -272,17 +271,6 @@ inline void throw_on_error(T e) { #define PADDLE_ENFORCE(...) ::paddle::platform::throw_on_error(__VA_ARGS__); #endif // REPLACE_ENFORCE_GLOG -#else // !_WIN32 -// disable enforce, caused by the varardic macro exception error -#define PADDLE_THROW(x) \ - do { \ - throw std::make_exception_ptr( \ - std::runtime_error("Windows disable the enforce.")); \ - } while (false) - -#define PADDLE_ENFORCE(x, ...) x -#endif // !_WIN32 - #define PADDLE_THROW_EOF() \ do { \ throw ::paddle::platform::EOFException("There is no next data.", __FILE__, \ @@ -302,20 +290,6 @@ inline void throw_on_error(T e) { * extra messages is also supported, for example: * PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2) */ -#if !defined(_WIN32) -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) -#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) -#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) -#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) -#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) -#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ - __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) - #define PADDLE_ENFORCE_NOT_NULL(__VAL, ...) \ do { \ if (UNLIKELY(nullptr == (__VAL))) { \ @@ -335,27 +309,19 @@ inline void throw_on_error(T e) { paddle::string::Sprintf("" __VA_ARGS__)); \ } \ } while (0) -#else -#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) ((__VAL0) == (__VAL1)) -#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) ((__VAL0) != (__VAL1)) -#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) ((__VAL0) > (__VAL1)) -#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) ((__VAL0) >= (__VAL1)) -#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) ((__VAL0) < (__VAL1)) -#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) ((__VAL0) <= (__VAL1)) - -#define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...) \ - do { \ - if (!((__VAL0)__CMP(__VAL1))) { \ - PADDLE_THROW("Windows disable the enforce. Enforce failed."); \ - } \ - } while (0) -#define PADDLE_ENFORCE_NOT_NULL(__VAL1, ...) \ - do { \ - if (nullptr == (__VAL1)) { \ - PADDLE_THROW("Windows disable the enforce. Enforce failed"); \ - } \ - } while (0) -#endif // !_WIN32 + +#define PADDLE_ENFORCE_EQ(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, ==, !=, __VA_ARGS__) +#define PADDLE_ENFORCE_NE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, !=, ==, __VA_ARGS__) +#define PADDLE_ENFORCE_GT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >, <=, __VA_ARGS__) +#define PADDLE_ENFORCE_GE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, >=, <, __VA_ARGS__) +#define PADDLE_ENFORCE_LT(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <, >=, __VA_ARGS__) +#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ + __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) } // namespace platform } // namespace paddle From 7d51a0e887c121b292a217e2ef3898b5c619b48a Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 10:07:45 +0800 Subject: [PATCH 054/684] disable DSO by default on windows --- CMakeLists.txt | 2 ++ paddle/fluid/operators/CMakeLists.txt | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c2804e234d0d7..0b42e60e1753e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -133,6 +133,8 @@ endif() if (WIN32) set(WITH_AVX OFF CACHE STRING "Disable AVX when compiling for Windows" FORCE) + set(WITH_DSO OFF CACHE STRING + "Disable DSO when compiling for Windows" FORCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 284bf5dc9e2b5..73f44f3b67506 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -35,7 +35,7 @@ endif() register_operators(EXCLUDES warpctc_op) # warpctc_cudnn need cudnn 7 above -if (WITH_GPU) +if (WITH_GPU AND NOT WIN32) if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale SRCS warpctc_op.cc warpctc_op.cu.cc) else() From 1aff40a4c600d88fffc9117fc37d8feb0e7050e4 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 10:32:50 +0800 Subject: [PATCH 055/684] exclude warpctc_op on windows --- paddle/fluid/operators/CMakeLists.txt | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 73f44f3b67506..9b1b272292e6a 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -32,7 +32,9 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -register_operators(EXCLUDES warpctc_op) +if (NOT WIN32) + register_operators(EXCLUDES warpctc_op) +endif() # warpctc_cudnn need cudnn 7 above if (WITH_GPU AND NOT WIN32) @@ -47,10 +49,10 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory concat_and_split cross_entropy softmax vol2col im2col sampler) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} lstm_compute matrix_bit_code sequence2batch gru_compute activation_functions jit_kernel) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (NOT WIN32) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) From 8cf63475b096e0ce1f8421d644897fd294ec9c18 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 10:53:16 +0800 Subject: [PATCH 056/684] exclude the dynload_warpctc out on windows test=develop --- paddle/fluid/operators/CMakeLists.txt | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9b1b272292e6a..041de46ff57df 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -49,11 +49,12 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) if (NOT WIN32) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) endif() From 449406434ec680dff564847cad0d453590282e99 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 10:58:23 +0800 Subject: [PATCH 057/684] fix the scripts error test=develop --- paddle/fluid/operators/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 041de46ff57df..60a42cf56817f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -53,7 +53,7 @@ set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows if (NOT WIN32) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) From 8ef6280c034602f776554432672d42b826afbaee Mon Sep 17 00:00:00 2001 From: dengkaipeng Date: Fri, 16 Nov 2018 19:14:40 +0800 Subject: [PATCH 058/684] Add operator double support. test=develop --- paddle/fluid/operators/yolov3_loss_op.cc | 10 ++++------ paddle/fluid/operators/yolov3_loss_op.h | 4 ++-- .../fluid/tests/unittests/test_yolov3_loss_op.py | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc index 1d7f4823626ee..e7597f7324300 100644 --- a/paddle/fluid/operators/yolov3_loss_op.cc +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -215,9 +215,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, ops::Yolov3LossGradMaker); REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); -REGISTER_OP_CPU_KERNEL( - yolov3_loss, - ops::Yolov3LossKernel); -REGISTER_OP_CPU_KERNEL( - yolov3_loss_grad, - ops::Yolov3LossGradKernel); +REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel, + ops::Yolov3LossKernel); +REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel, + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h index a1072aca10802..0bb285722dded 100644 --- a/paddle/fluid/operators/yolov3_loss_op.h +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -323,7 +323,7 @@ static void AddAllGradToInputGrad( } } -template +template class Yolov3LossKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { @@ -392,7 +392,7 @@ class Yolov3LossKernel : public framework::OpKernel { } }; -template +template class Yolov3LossGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py index 335214b298d72..544fe4b4f8190 100644 --- a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -195,7 +195,7 @@ def test_check_grad_ignore_gtbox(self): self.check_grad_with_place( place, ['X'], 'Loss', - no_grad_set=set("GTBox"), + no_grad_set=set(["GTBox", "GTLabel"]), max_relative_error=0.06) def initTestCase(self): From cc319f64cbd2b2cccb0fe4e9117c1517927ba515 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 11:15:12 +0800 Subject: [PATCH 059/684] disable avx on windows by default test=develop --- cmake/simd.cmake | 54 ++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 4926fb991332b..86096d4feaace 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,21 +57,21 @@ int main() return 0; }" SSE3_FOUND) -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) - -# disable AVX2 by default on windows +# disable AVX by default on windows if(NOT WIN32) + # Check AVX + set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) + set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; + }" AVX_FOUND) + # Check AVX 2 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) @@ -83,20 +83,20 @@ if(NOT WIN32) __m256i result = _mm256_abs_epi32 (a); return 0; }" AVX2_FOUND) -endif(NOT WIN32) -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) + # Check AVX512F + set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) + set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; + }" AVX512F_FOUND) +endif(NOT WIN32) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) From 4a6769da84ac9f8a5dafbc0b9a00ef70944a2395 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 11:39:56 +0800 Subject: [PATCH 060/684] re-organize the cmake file --- cmake/simd.cmake | 54 +++++++++++++-------------- paddle/fluid/operators/CMakeLists.txt | 5 ++- 2 files changed, 30 insertions(+), 29 deletions(-) diff --git a/cmake/simd.cmake b/cmake/simd.cmake index 4926fb991332b..86096d4feaace 100644 --- a/cmake/simd.cmake +++ b/cmake/simd.cmake @@ -57,21 +57,21 @@ int main() return 0; }" SSE3_FOUND) -# Check AVX -set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) -set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); - __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); - __m256 result = _mm256_add_ps (a, b); - return 0; -}" AVX_FOUND) - -# disable AVX2 by default on windows +# disable AVX by default on windows if(NOT WIN32) + # Check AVX + set(CMAKE_REQUIRED_FLAGS ${AVX_FLAG}) + set(AVX_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m256 a = _mm256_set_ps (-1.0f, 2.0f, -3.0f, 4.0f, -1.0f, 2.0f, -3.0f, 4.0f); + __m256 b = _mm256_set_ps (1.0f, 2.0f, 3.0f, 4.0f, 1.0f, 2.0f, 3.0f, 4.0f); + __m256 result = _mm256_add_ps (a, b); + return 0; + }" AVX_FOUND) + # Check AVX 2 set(CMAKE_REQUIRED_FLAGS ${AVX2_FLAG}) set(AVX2_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) @@ -83,20 +83,20 @@ if(NOT WIN32) __m256i result = _mm256_abs_epi32 (a); return 0; }" AVX2_FOUND) -endif(NOT WIN32) -# Check AVX512F -set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) -set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) -CHECK_CXX_SOURCE_RUNS(" -#include -int main() -{ - __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, - 13, -5, 6, -7, 9, 2, -6, 3); - __m512i result = _mm512_abs_epi32 (a); - return 0; -}" AVX512F_FOUND) + # Check AVX512F + set(CMAKE_REQUIRED_FLAGS ${AVX512F_FLAG}) + set(AVX512F_FOUND_EXITCODE 1 CACHE STRING "Result from TRY_RUN" FORCE) + CHECK_CXX_SOURCE_RUNS(" + #include + int main() + { + __m512i a = _mm512_set_epi32 (-1, 2, -3, 4, -1, 2, -3, 4, + 13, -5, 6, -7, 9, 2, -6, 3); + __m512i result = _mm512_abs_epi32 (a); + return 0; + }" AVX512F_FOUND) +endif(NOT WIN32) set(CMAKE_REQUIRED_FLAGS ${CMAKE_REQUIRED_FLAGS_RETAINED}) mark_as_advanced(MMX_FOUND SSE2_FOUND SSE3_FOUND AVX_FOUND AVX2_FOUND AVX512F_FOUND) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 9b1b272292e6a..60a42cf56817f 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -49,11 +49,12 @@ endif() set(COMMON_OP_DEPS "") -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) -set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} xxhash selected_rows_functor selected_rows lod_tensor maxouting unpooling pooling lod_rank_table context_project sequence_pooling executor) if (NOT WIN32) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} dynload_warpctc) endif() +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) +set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv cub) endif() From 6e23d6a2d7e918d18e20380f9ac2192a7aaa91c8 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 13:46:21 +0800 Subject: [PATCH 061/684] disable mkl on windows by default --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 0b42e60e1753e..d9b797f3d1e7e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -135,6 +135,8 @@ if (WIN32) "Disable AVX when compiling for Windows" FORCE) set(WITH_DSO OFF CACHE STRING "Disable DSO when compiling for Windows" FORCE) + set(WITH_MKL OFF CACHE STRING + "Disable MKL when compiling for Windows" FORCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING From 8443961a4f8b09ca1cfe632633ef21df87f5788a Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 16:55:49 +0800 Subject: [PATCH 062/684] add warp_ctc back --- paddle/fluid/operators/CMakeLists.txt | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 60a42cf56817f..412ab6670951a 100644 --- a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -32,9 +32,7 @@ if (WITH_GPU AND TENSORRT_FOUND) add_subdirectory(tensorrt) endif() -if (NOT WIN32) - register_operators(EXCLUDES warpctc_op) -endif() +register_operators(EXCLUDES warpctc_op) # warpctc_cudnn need cudnn 7 above if (WITH_GPU AND NOT WIN32) From a43bc612ad2590eeab42196b0adf87288f1c41f4 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 17:08:06 +0800 Subject: [PATCH 063/684] fix the dependency --- paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 6611e2e4b35ee..b6811f9183aaa 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce) +nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context) From 81f750a88c354a7f34ee6732a152a87b0ee25d2f Mon Sep 17 00:00:00 2001 From: peizhilin Date: Mon, 19 Nov 2018 17:10:38 +0800 Subject: [PATCH 064/684] fix the dependency --- paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index 6611e2e4b35ee..b6811f9183aaa 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1 +1 @@ -nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce) +nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu prelu_op_plugin.cu DEPS enforce device_context) From 16bc8f2a755438cac5a279f27b082dbaf0e3e3a0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Mon, 19 Nov 2018 21:52:59 +0800 Subject: [PATCH 065/684] Add debug info --- .dockerignore | 1 + Dockerfile | 86 +++++++++++++++++++++++++-------------------------- 2 files changed, 44 insertions(+), 43 deletions(-) diff --git a/.dockerignore b/.dockerignore index 2b2e74053d33c..49adfe4f0accc 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,5 +1,6 @@ *.DS_Store build/ +build* *.user .vscode .idea diff --git a/Dockerfile b/Dockerfile index c8b9eed6d60e5..b36102175c42f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -71,46 +71,46 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3 install -U wheel && \ - pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - easy_install -U pip && \ - pip install -U pip setuptools wheel && \ - pip install -U docopt PyYAML sphinx==1.5.6 && \ - pip install sphinx-rtd-theme==0.1.9 recommonmark - -RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3 install opencv-python && \ - pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip install opencv-python - -#For docstring checker -RUN pip3 install pylint pytest astroid isort -RUN pip install pylint pytest astroid isort LinkChecker - -COPY ./python/requirements.txt /root/ -RUN pip3 install -r /root/requirements.txt -RUN pip install -r /root/requirements.txt - -# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use -# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -RUN apt-get install -y libssl-dev libffi-dev -RUN pip3 install certifi urllib3[secure] -RUN pip install certifi urllib3[secure] - - -# Install woboq_codebrowser to /woboq -RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - (cd /woboq \ - cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - -DCMAKE_BUILD_TYPE=Release . \ - make) - -# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service -RUN mkdir /var/run/sshd -RUN echo 'root:root' | chpasswd -RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config -EXPOSE 22 +# RUN pip3 install -U wheel && \ + # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ + # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ + # easy_install -U pip && \ + # pip install -U pip setuptools wheel && \ + # pip install -U docopt PyYAML sphinx==1.5.6 && \ + # pip install sphinx-rtd-theme==0.1.9 recommonmark + +# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + # pip3 install opencv-python && \ + # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + # pip install opencv-python + +# #For docstring checker +# RUN pip3 install pylint pytest astroid isort +# RUN pip install pylint pytest astroid isort LinkChecker + +# COPY ./python/requirements.txt /root/ +# RUN pip3 install -r /root/requirements.txt +# RUN pip install -r /root/requirements.txt + +# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use +# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 +# RUN apt-get install -y libssl-dev libffi-dev +# RUN pip3 install certifi urllib3[secure] +# RUN pip install certifi urllib3[secure] + + +# # Install woboq_codebrowser to /woboq +# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ + # (cd /woboq \ + # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + # -DCMAKE_BUILD_TYPE=Release . \ + # make) + +# # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +# RUN mkdir /var/run/sshd +# RUN echo 'root:root' | chpasswd +# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config +# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +# EXPOSE 22 From 3f73c0a70d641b2b84b4764ee55f3f942fb2c6da Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 10:26:59 +0800 Subject: [PATCH 066/684] fix the build issue on windows --- paddle/fluid/memory/allocation/cpu_allocator.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 9e0044c47ae4e..165f11cd3b001 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -15,6 +15,11 @@ #pragma once #include "paddle/fluid/memory/allocation/allocator.h" +#ifdef _WIN32 +#define posix_memalign_free _aligned_free +#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#endif + namespace paddle { namespace memory { namespace allocation { From 301ed153231f0e0f6066663c1adc572af4907c97 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 13:36:10 +0800 Subject: [PATCH 067/684] remove unsupported flag on windows --- python/paddle/fluid/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index a7dfc6e9e3265..091697aaa50ad 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,7 +116,7 @@ def __bootstrap__(): 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'cpu_deterministic', 'eager_delete_tensor_gb', + "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', 'reader_queue_speed_test_mode' ] if os.name != 'nt': From 935387f3fc4c36a13443443cb820868b65a3c667 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 14:40:19 +0800 Subject: [PATCH 068/684] code style --- python/paddle/fluid/__init__.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 2a28f7b2d1cad..6a4a5e098fc27 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -116,9 +116,8 @@ def __bootstrap__(): 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'eager_delete_tensor_gb', - 'allocator_strategy', 'reader_queue_speed_test_mode', - 'print_sub_graph_dir' + "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', + 'reader_queue_speed_test_mode', 'print_sub_graph_dir' ] if os.name != 'nt': read_env_flags.append('warpctc_dir') From afeadf58f9ce90d4dd05f1eb1e4936cb83bc0cde Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 14:59:20 +0800 Subject: [PATCH 069/684] code style test=develop --- paddle/fluid/memory/allocation/cpu_allocator.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 165f11cd3b001..26d3643f4edff 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -17,7 +17,8 @@ #ifdef _WIN32 #define posix_memalign_free _aligned_free -#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) #endif namespace paddle { From 0e1b426c839d8c91952b7d18139d3681beaa726f Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Tue, 20 Nov 2018 07:02:12 +0000 Subject: [PATCH 070/684] refine prelu api doc, test=develop --- python/paddle/fluid/layers/nn.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 99acd7e30884b..5749bcc54a618 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6900,18 +6900,18 @@ def prelu(x, mode, param_attr=None, name=None): """ Equation: - y = \max(0, x) + alpha \min(0, x) + y = \max(0, x) + alpha * \min(0, x) Args: x (Variable): The input tensor. param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha). - mode (string): The mode for weight sharing - all: all elements share same weight - channel:elements in a channel share same weight - element:each element has a weight + weight (alpha). + mode (string): The mode for weight sharing. It supports all, channel + and element. all: all elements share same weight + channel:elements in a channel share same weight + element:each element has a weight name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Returns: Variable: The output tensor with the same shape as input. @@ -6921,8 +6921,8 @@ def prelu(x, mode, param_attr=None, name=None): .. code-block:: python x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") - mode = 'channel' - output = fluid.layers.prelu(x,mode) + mode = 'channel' + output = fluid.layers.prelu(x,mode) """ helper = LayerHelper('prelu', **locals()) if mode not in ['all', 'channel', 'element']: From c2cfb03a7277a92297b4617cb5c778bb495a998b Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 20 Nov 2018 08:50:24 +0000 Subject: [PATCH 071/684] add lstm jitcode --- paddle/fluid/operators/math/jit_code.cc | 49 +++++++++ paddle/fluid/operators/math/jit_code.h | 102 ++++++++++++++++-- paddle/fluid/operators/math/jit_kernel.h | 15 ++- paddle/fluid/operators/math/jit_kernel_impl.h | 49 +++++++++ 4 files changed, 198 insertions(+), 17 deletions(-) create mode 100644 paddle/fluid/operators/math/jit_kernel_impl.h diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index e484e9a3c705c..418c8433625cb 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_code.h" +#include // offsetof #include "paddle/fluid/operators/math/jit_kernel.h" // TODO(TJ): remove me namespace paddle { @@ -210,6 +211,54 @@ void VActJitCode::generate() { ret(); } +bool LSTMJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } + +void LSTMJitCode::generate() { + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ct_1 = r9; + reg64_t reg_ptr_ct = r10; + reg64_t reg_ptr_ht = r11; + mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); + mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); + mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + + int offset = 0; + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + /* C_t = C_t-1 * fgated + cand_gated * igated*/ + // c + vmovups(ymm_src, ptr[reg_ptr_gates + offset]); + act(ymm_c, ymm_src, act_cand_); + // i + vmovups(ymm_src, ptr[reg_ptr_gates + offset + num_]); + act(ymm_i, ymm_src, act_gate_); + vmulps(ymm_c, ymm_c, ymm_i); + if (first_) { + // f + vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * num_]); + act(ymm_f, ymm_src, act_gate_); + vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]); + vmulps(ymm_f, ymm_f, ymm_i); + vaddps(ymm_f, ymm_f, ymm_c); + } + /* H_t = act_cell(C_t) * ogated */ + ymm_t ymm_ct = first_ ? ymm_c : ymm_f; + ymm_t ymm_o = first_ ? ymm_f : ymm_c; + ymm_t ymm_tmp = ymm_i; + act(ymm_tmp, ymm_ct, act_cell_); + vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * num_]); + act(ymm_o, ymm_src, act_gate_); + vmulps(ymm_o, ymm_tmp, ymm_o); + // save ct and ht + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); + vmovups(ptr[reg_ptr_ht + offset], ymm_o); + + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + + ret(); +} + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 65f83ff484660..938b5525c1c2b 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/jit_gen.h" +#include "paddle/fluid/operators/math/jit_kernel_impl.h" #include "paddle/fluid/platform/cpu_info.h" namespace paddle { @@ -46,14 +47,6 @@ extern const float exp_float_consts[]; extern const int exp_int_0x7f[]; extern int g_tmp_mem[]; -// TODO(TJ): move these to some proper place -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - #define ALIGN32 __attribute__((aligned(32))) #define EXP_HIG 88.3762626647949f #define EXP_LOW -88.3762626647949f @@ -322,6 +315,99 @@ class VActJitCode : public JitCode { ymm_t ymm_dst = ymm_t(1); }; +class LSTMJitCode : public VActJitCode { + public: + const char* name() const override { + std::string base = "LSTMJitCode"; + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + if (first_) { + base += "_C1H1"; + } + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + AddTypeStr(act_cell_); + return base.c_str(); + } + + explicit LSTMJitCode(int d, bool first, operand_type act_gate, + operand_type act_cand, operand_type act_cell, + size_t code_size = 256 * 1024, void* code_ptr = nullptr) + : VActJitCode(d, act_gate, code_size, code_ptr), + num_(d), + first_(first), + act_gate_(act_gate), + act_cand_(act_cand), + act_cell_(act_cell) {} + static bool init(int d); + void generate() override; + + protected: + int num_; + bool first_; + operand_type act_gate_; + operand_type act_cand_; + operand_type act_cell_; + reg64_t param1{abi_param1}; + + xmm_t xmm_src = xmm_t(0); + xmm_t xmm_c = xmm_t(1); + xmm_t xmm_i = xmm_t(2); + xmm_t xmm_f = xmm_t(3); + + ymm_t ymm_src = ymm_t(0); + ymm_t ymm_c = ymm_t(1); + ymm_t ymm_i = ymm_t(2); + ymm_t ymm_f = ymm_t(3); + + template + void act(JMM& dst, JMM& src, operand_type type) { // NOLINT + // use 15 + JMM zero = JMM(15); + if (type_ == operand_type::relu) { + vxorps(zero, zero, zero); + } + switch (type) { + case operand_type::relu: + relu_jmm(dst, src, zero); + break; + case operand_type::exp: + exp_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::sigmoid: + sigmoid_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::tanh: + tanh_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::identity: + break; + default: + // throw error + break; + } + } +}; + } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 7e163c1349e73..b5e54fcc1b80f 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -17,6 +17,7 @@ limitations under the License. */ #include // for shared_ptr #include #include +#include "paddle/fluid/operators/math/jit_kernel_impl.h" #include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/macros.h" @@ -26,14 +27,7 @@ namespace operators { namespace math { namespace jitkernel { -// TODO(TJ): move these to some proper place -#define SIGMOID_THRESHOLD_MIN -40.0 -#define SIGMOID_THRESHOLD_MAX 13.0 -#define EXP_MAX_INPUT 40.0 -#define XMM_FLOAT_BLOCK 4 -#define YMM_FLOAT_BLOCK 8 -#define ZMM_FLOAT_BLOCK 16 - +// TODO(TJ): remove me typedef enum { kLT8, kEQ8, kGT8LT16, kEQ16, kGT16 } jit_block; class Kernel { @@ -124,10 +118,13 @@ class LSTMKernel : public Kernel { const T *wp_data = nullptr, T *checked = nullptr) const = 0; - // compute c1 and h1 without c0 or h0 virtual void ComputeC1H1(T *gates, T *ct, T *ht, /* below only used in peephole*/ const T *wp_data = nullptr) const = 0; + + // void (*ComputeCtHt)(lstm_t *); + // // compute c1 and h1 without c0 or h0 + // void (*ComputeC1H1)(lstm_t *); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h new file mode 100644 index 0000000000000..fcb6a7c09710c --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { + +#define SIGMOID_THRESHOLD_MIN -40.0 +#define SIGMOID_THRESHOLD_MAX 13.0 +#define EXP_MAX_INPUT 40.0 +#define XMM_FLOAT_BLOCK 4 +#define YMM_FLOAT_BLOCK 8 +#define ZMM_FLOAT_BLOCK 16 + +typedef struct { + void* gates; // gates: W_ch, W_ih, W_fh, W_oh + const void* ct_1; + void* ct; + void* ht; + /* below only used in peephole*/ + const void* wp_data{nullptr}; + void* checked{nullptr}; +} lstm_t; + +typedef struct { + int d; + std::string act_gate, act_cand, act_cell; +} lstm_attr_t; + +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle From 5d6b370a4968fc4bc7dea369ee588ebec0b8f660 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Tue, 20 Nov 2018 20:17:16 +0800 Subject: [PATCH 072/684] fix issue --- paddle/fluid/platform/enforce.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3643d2ad15b9e..31309738a52f0 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -134,7 +134,7 @@ struct EOFException : public std::exception { #define LIKELY(condition) __builtin_expect(static_cast(condition), 1) #else // there is no equivalent intrinsics in msvc. -#define LIKELY(condition) !(condition) +#define LIKELY(condition) (condition) #endif template From 79cec5311179e6e50b0126fea0e6dfa8a7cf354a Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Tue, 20 Nov 2018 12:37:04 +0000 Subject: [PATCH 073/684] add ignore index for sigmoid cross entropy with logits op, test=develop --- .../sigmoid_cross_entropy_with_logits_op.cc | 5 + .../sigmoid_cross_entropy_with_logits_op.h | 93 ++++++++++++++----- python/paddle/fluid/layers/nn.py | 5 +- .../fluid/tests/unittests/test_layers.py | 3 +- ...st_sigmoid_cross_entropy_with_logits_op.py | 35 +++++++ 5 files changed, 116 insertions(+), 25 deletions(-) diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index 193de05422bb7..d6a2fa6a1797e 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -100,6 +100,11 @@ class SigmoidCrossEntropyWithLogitsOpMaker AddOutput("Out", "(Tensor, default Tensor), a 2-D tensor with shape N x D " " of elementwise logistic losses."); + AddAttr( + "ignore_index", + "(int, default -1), Specifies a target value that is ignored and" + "does not contribute to the input gradient.") + .SetDefault(-1); AddComment(R"DOC( SigmoidCrossEntropyWithLogits Operator. diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index faef72866eb49..2bfba6f1704f8 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -15,33 +15,82 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/platform/hostdevice.h" +#include "paddle/legacy/utils/Logging.h" namespace paddle { namespace operators { +using Tensor = framework::Tensor; +template +using EigenVector = framework::EigenVector; +template +using EigenMatrix = framework::EigenMatrix; + +template +struct SigmoidCrossEntropyWithLogitsForward { + // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward) + HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T term1 = (x > 0) ? x : 0; + T term2 = x * label; + T term3 = std::log(static_cast(1) + std::exp(-(std::abs(x)))); + return term1 - term2 + term3; + } + + int ignore_index; +}; + +template +struct SigmoidCrossEntropyWithLogitsBackward { + // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward) + HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) + : ignore_index(ignore_index) {} + + HOSTDEVICE T operator()(const T &x, const T &label) const { + if (static_cast(label) == ignore_index) { + return static_cast(0.); + } + T simoid_x = static_cast(1) / (static_cast(1) + std::exp(-x)); + return simoid_x - label; + } + + int ignore_index; +}; + // Out = max(X, 0) - X * Labels + log(1 + exp(-abs(X))) template class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - framework::Tensor *Out = context.Output("Out"); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + Tensor *Out = context.Output("Out"); Out->mutable_data(context.GetPlace()); + int ignore_index = context.Attr("ignore_index"); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto out = framework::EigenVector::Flatten(*Out); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto out = EigenVector::Flatten(*Out); auto &place = *context.device_context().eigen_device(); + out.device(place) = x.binaryExpr( + labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); // term1 = max(x, 0) - auto term1 = x.cwiseMax(static_cast(0)); + // auto term1 = x.cwiseMax(static_cast(0)); // term2 = x * labels - auto term2 = x * labels; + // auto term2 = x * labels; // term3 = log(1 + exp(-abs(x))) - auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); + // auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); - out.device(place) = term1 - term2 + term3; + // out.device(place) = term1 - term2 + term3; } }; @@ -50,23 +99,23 @@ template class SigmoidCrossEntropyWithLogitsGradKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &context) const override { - const framework::Tensor *X = context.Input("X"); - const framework::Tensor *Labels = context.Input("Label"); - const framework::Tensor *dOut = - context.Input(framework::GradVarName("Out")); - framework::Tensor *dX = - context.Output(framework::GradVarName("X")); + const Tensor *X = context.Input("X"); + const Tensor *Labels = context.Input("Label"); + const Tensor *dOut = context.Input(framework::GradVarName("Out")); + Tensor *dX = context.Output(framework::GradVarName("X")); dX->mutable_data(context.GetPlace()); - auto x = framework::EigenVector::Flatten(*X); - auto labels = framework::EigenVector::Flatten(*Labels); - auto dout = framework::EigenVector::Flatten(*dOut); - auto dx = framework::EigenVector::Flatten(*dX); + auto ignore_index = context.Attr("ignore_index"); + auto x = EigenVector::Flatten(*X); + auto labels = EigenVector::Flatten(*Labels); + auto dout = EigenVector::Flatten(*dOut); + auto dx = EigenVector::Flatten(*dX); auto &place = *context.template device_context().eigen_device(); - auto sigmoid_x = static_cast(1) / (static_cast(1) + (-x).exp()); - dx.device(place) = dout * (sigmoid_x - labels); + auto diff = x.binaryExpr(labels, SigmoidCrossEntropyWithLogitsBackward( + static_cast(ignore_index))); + dx.device(place) = dout * diff; } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 99acd7e30884b..e032835de32a5 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7892,13 +7892,14 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): @templatedoc() -def sigmoid_cross_entropy_with_logits(x, label, name=None): +def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-1, name=None): """ ${comment} Args: x(${x_type}): ${x_comment} label(${label_type}): ${label_comment} + ignore_index(&{ignore_index}): ${ignore_index_comment} name(basestring|None): Name of the output. Returns: @@ -7917,7 +7918,7 @@ def sigmoid_cross_entropy_with_logits(x, label, name=None): type="sigmoid_cross_entropy_with_logits", inputs={"X": x, "Label": label}, - attrs={}, + attrs={"ignore_index": ignore_index}, outputs={"Out": out}) return out diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index a8fa5436c43d2..8e098e4961feb 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -170,9 +170,10 @@ def test_sigmoid_cross_entropy(self): with program_guard(program): dat = layers.data(name='data', shape=[10], dtype='float32') lbl = layers.data(name='label', shape=[10], dtype='float32') + ignore_index = -1 self.assertIsNotNone( layers.sigmoid_cross_entropy_with_logits( - x=dat, label=lbl)) + x=dat, label=lbl, ignore_index=-1)) print(str(program)) def test_hsigmoid(self): diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index 97ff203499c0b..64f6f088e1075 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -56,6 +56,40 @@ class TestSigmoidCrossEntropyWithLogitsOp2(OpTest): """Test sigmoid_cross_entropy_with_logit_op with probabalistic label """ + def setUp(self): + self.op_type = "sigmoid_cross_entropy_with_logits" + batch_size = 64 + num_classes = 20 + ignore_index = -1 + self.inputs = { + 'X': logit( + np.random.uniform(0, 1, (batch_size, num_classes)) + .astype("float32")), + 'Label': np.random.randint(-1, 2, (batch_size, num_classes)) + .astype("float32") + } + self.attrs = {'ignore_index': ignore_index, } + # Fw Pass is implemented as elementwise sigmoid followed by + # elementwise logistic loss + # Label * -log(sigmoid(X)) + (1 - label) * -log(1 - sigmoid(X)) + sigmoid_X = expit(self.inputs['X']) + term1 = self.inputs['Label'] * np.log(sigmoid_X) + term2 = (1 - self.inputs['Label']) * np.log(1 - sigmoid_X) + out = -term1 - term2 + out[np.where(self.inputs['Label'] == ignore_index)] = 0 + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out') + + +class TestSigmoidCrossEntropyWithLogitsOp3(OpTest): + """Test sigmoid_cross_entropy_with_logit_op with probabalistic label + """ + def setUp(self): self.op_type = "sigmoid_cross_entropy_with_logits" batch_size = 64 @@ -85,3 +119,4 @@ def test_check_grad(self): if __name__ == '__main__': unittest.main() + np.random.seed(0) From ce31deb7e938270249b719bce93ef6d8baf5c0c4 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Tue, 20 Nov 2018 12:37:43 +0000 Subject: [PATCH 074/684] refine refer code and add lstm refer code test=develop --- .../fluid/operators/math/jit_kernel_blas.cc | 65 +------ paddle/fluid/operators/math/jit_kernel_exp.cc | 40 +--- paddle/fluid/operators/math/jit_kernel_impl.h | 6 +- .../fluid/operators/math/jit_kernel_refer.h | 171 ++++++++++++++++++ .../fluid/operators/math/jit_kernel_test.cc | 139 +++----------- 5 files changed, 220 insertions(+), 201 deletions(-) create mode 100644 paddle/fluid/operators/math/jit_kernel_refer.h diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index 36a50f20434f3..90b7029371a21 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -15,6 +15,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #include "paddle/fluid/platform/enforce.h" #ifdef PADDLE_WITH_XBYAK @@ -31,49 +32,6 @@ namespace math { namespace jitkernel { namespace jit = platform::jit; -template -void VMulRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - -template -void VAddRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - -template -void VAddReluRefer(const T* x, const T* y, T* z, int n) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} - -template -void VScalRefer(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] * x[i]; - } -} - -template -void VAddBiasRefer(const T* a, const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = a[0] + x[i]; - } -} - -template -void VReluRefer(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0 ? x[i] : 0; - } -} - #ifdef PADDLE_WITH_MKLML template void VMulMKL(const T* x, const T* y, T* z, int n); @@ -109,7 +67,7 @@ void VScalMKL(const float* a, const float* x, float* y, int n) { if (x == y) { platform::dynload::cblas_sscal(n, *a, y, 1); } else { - VScalRefer(a, x, y, n); + refer::VScal(a, x, y, n); } } @@ -118,7 +76,7 @@ void VScalMKL(const double* a, const double* x, double* y, int n) { if (x == y) { platform::dynload::cblas_dscal(n, *a, y, 1); } else { - VScalRefer(a, x, y, n); + refer::VScal(a, x, y, n); } } @@ -147,7 +105,7 @@ class VMulKernelImpl : public VMulKernel { return; } #endif - this->Compute = VMulRefer; + this->Compute = refer::VMul; } #ifdef PADDLE_WITH_XBYAK @@ -198,7 +156,7 @@ class VAddKernelImpl : public VAddKernel { return; } #endif - this->Compute = VAddRefer; + this->Compute = refer::VAdd; } #ifdef PADDLE_WITH_XBYAK @@ -242,7 +200,7 @@ class VAddReluKernelImpl : public VAddReluKernel { return; } #endif - this->Compute = VAddReluRefer; + this->Compute = refer::VAddRelu; } #ifdef PADDLE_WITH_XBYAK @@ -280,7 +238,7 @@ class VScalKernelImpl : public VScalKernel { return; } #endif - this->Compute = VScalRefer; + this->Compute = refer::VScal; } #ifdef PADDLE_WITH_XBYAK @@ -324,7 +282,7 @@ class VAddBiasKernelImpl : public VAddBiasKernel { } #endif - this->Compute = VAddBiasRefer; + this->Compute = refer::VAddBias; } #ifdef PADDLE_WITH_XBYAK @@ -358,7 +316,7 @@ class VReluKernelImpl : public VReluKernel { } #endif - this->Compute = VReluRefer; + this->Compute = refer::VRelu; } #ifdef PADDLE_WITH_XBYAK @@ -374,16 +332,13 @@ bool VReluKernelImpl::useJIT(int d) { } #endif -template -inline void VIdentityRefer(const T* x, T* y, int n) {} - /* An empty JitKernel */ template class VIdentityKernelImpl : public VIdentityKernel { public: JITKERNEL_DECLARE_STATIC_FUNC; explicit VIdentityKernelImpl(int d) : VIdentityKernel() { - this->Compute = VIdentityRefer; + this->Compute = refer::VIdentity; } }; diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index f26815300de31..1fe7d66c752d3 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -13,9 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" -#include // for exp #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #ifdef PADDLE_WITH_XBYAK #include "paddle/fluid/operators/math/jit_code.h" @@ -35,38 +35,6 @@ namespace math { namespace jitkernel { namespace jit = platform::jit; -// TODO(TJ): move refer codes to one file -// Refer code only focus on correctness -template -void VExpRefer(const T* x, T* y, int n) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - -template -void VSigmoidRefer(const T* x, T* y, int n) { - // y = 1 / (1 + e^-x) - const T min = SIGMOID_THRESHOLD_MIN; - const T max = SIGMOID_THRESHOLD_MAX; - for (int i = 0; i < n; ++i) { - T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); - y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); - } -} - -template -void VTanhRefer(const T* x, T* y, int n) { - // y = 2 * sigmoid(2x) - 1 - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * x[i]; - } - VSigmoidRefer(y, y, n); - for (int i = 0; i < n; ++i) { - y[i] = static_cast(2) * y[i] - static_cast(1); - } -} - #ifdef PADDLE_WITH_MKLML // try to use MKL to speedup template @@ -129,7 +97,7 @@ class VExpKernelImpl : public VExpKernel { return; } #endif - this->Compute = VExpRefer; + this->Compute = refer::VExp; } #ifdef PADDLE_WITH_XBYAK @@ -182,7 +150,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel { return; } #endif - this->Compute = VSigmoidRefer; + this->Compute = refer::VSigmoid; } #ifdef PADDLE_WITH_XBYAK @@ -234,7 +202,7 @@ class VTanhKernelImpl : public VTanhKernel { return; } #endif - this->Compute = VTanhRefer; + this->Compute = refer::VTanh; } #ifdef PADDLE_WITH_XBYAK diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h index fcb6a7c09710c..337d5ae91413a 100644 --- a/paddle/fluid/operators/math/jit_kernel_impl.h +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -38,9 +38,13 @@ typedef struct { void* checked{nullptr}; } lstm_t; -typedef struct { +typedef struct lstm_attr_s { int d; std::string act_gate, act_cand, act_cell; + lstm_attr_s() = default; + lstm_attr_s(int _d, const std::string& _act_gate, + const std::string& _act_cand, const std::string& _act_cell) + : d(_d), act_gate(_act_gate), act_cand(_act_cand), act_cell(_act_cell) {} } lstm_attr_t; } // namespace jitkernel diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h new file mode 100644 index 0000000000000..9c60ebc5873d2 --- /dev/null +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -0,0 +1,171 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include "paddle/fluid/operators/math/jit_kernel_impl.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace math { +namespace jitkernel { +namespace refer { +/* Refer code only focus on correctness */ + +template +void VMul(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] * y[i]; + } +} + +template +void VAdd(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + } +} + +template +void VAddRelu(const T* x, const T* y, T* z, int n) { + for (int i = 0; i < n; ++i) { + z[i] = x[i] + y[i]; + z[i] = z[i] > 0 ? z[i] : 0; + } +} + +template +void VScal(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] * x[i]; + } +} + +template +void VAddBias(const T* a, const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = a[0] + x[i]; + } +} + +template +void VRelu(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = x[i] > 0 ? x[i] : 0; + } +} + +template +inline void VIdentity(const T* x, T* y, int n) {} + +template +void VExp(const T* x, T* y, int n) { + for (int i = 0; i < n; ++i) { + y[i] = std::exp(x[i]); + } +} + +template +void VSigmoid(const T* x, T* y, int n) { + // y = 1 / (1 + e^-x) + const T min = SIGMOID_THRESHOLD_MIN; + const T max = SIGMOID_THRESHOLD_MAX; + for (int i = 0; i < n; ++i) { + T tmp = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]); + y[i] = static_cast(1) / (static_cast(1) + std::exp(-tmp)); + } +} + +template +void VTanh(const T* x, T* y, int n) { + // y = 2 * sigmoid(2x) - 1 + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * x[i]; + } + VSigmoid(y, y, n); + for (int i = 0; i < n; ++i) { + y[i] = static_cast(2) * y[i] - static_cast(1); + } +} + +template +void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT + if (type == "sigmoid") { + return VSigmoid; + } else if (type == "relu") { + return VRelu; + } else if (type == "tanh") { + return VTanh; + } else if (type == "identity" || type == "") { + return VIdentity; + } + PADDLE_THROW("Not support type: %s", type); + return nullptr; +} + +template +void LSTMCtHt(lstm_t* step, lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + // gates: W_ch, W_ih, W_fh, W_oh + act_gate(gates + d, gates + d, d3); + + /* C_t = C_t-1 * fgated + cand_gated * igated */ + act_cand(gates, gates, d); + VMul(gates, gates + d, gates + d, d); + VMul(ct_1, gates + d2, gates + d2, d); + VAdd(gates + d, gates + d2, ct, d); + + /* H_t = act_cell(C_t) * ogated */ + act_cell(ct, gates + d2, d); + VMul(gates + d2, gates + d3, ht, d); +} + +template +void LSTMC1H1(lstm_t* step, lstm_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + const T* ct_1 = reinterpret_cast(step->ct_1); + T* ct = reinterpret_cast(step->ct); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + auto act_cell = getActFunc(attr->act_cell); + int d = attr->d; + int d2 = d * 2; + int d3 = d * 3; + /* C_t = igated * cgated*/ + act_gate(gates + d, gates + d, d); + act_cand(gates, gates, d); + VMul(gates, gates + d, ct, d); + /* H_t = act_cell(C_t) * ogated */ + act_gate(gates + d3, gates + d3, d); + act_cell(ct, gates + d2, d); + Vmul(gates + d2, gates + d3, ht, d); +} + +} // namespace refer +} // namespace jitkernel +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index b6c62a26348cd..a1705a81c47d7 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -22,6 +22,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" @@ -53,12 +54,6 @@ void RandomVec(const int n, T* a, const T lower = static_cast(-20.f), } } -void vrelu_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] > 0.f ? x[i] : 0.f; - } -} - #if defined __AVX__ || defined __AVX2__ void vrelu_intri8(const int n, const float* x, float* y) { __m256 tmp = _mm256_loadu_ps(x); @@ -69,6 +64,7 @@ void vrelu_intri8(const int n, const float* x, float* y) { TEST(JitKernel, vrelu) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {3, 7, 8, 15, 16, 30, 256, 512}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -80,7 +76,7 @@ TEST(JitKernel, vrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vrelu_ref(d, x_data, zref_data); + refer::VRelu(x_data, zref_data, d); } auto trefe = GetCurrentUS(); #if defined __AVX__ || defined __AVX2__ @@ -107,14 +103,9 @@ TEST(JitKernel, vrelu) { } } -void vaddbias_ref(const int n, const float a, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = x[i] + a; - } -} - TEST(JitKernel, vaddbias) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -127,7 +118,7 @@ TEST(JitKernel, vaddbias) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vaddbias_ref(d, a, x_data, zref_data); + refer::VAddBias(&a, x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -145,12 +136,6 @@ TEST(JitKernel, vaddbias) { } } -void vexp_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -} - #ifdef PADDLE_WITH_MKLML void vexp_mkl(const int n, const float* x, float* y) { paddle::platform::dynload::vsExp(n, x, y); @@ -159,6 +144,7 @@ void vexp_mkl(const int n, const float* x, float* y) { TEST(JitKernel, vexp) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 3, 4, 6, 7, 8, 12, 15, 16, 20, 30, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -170,7 +156,7 @@ TEST(JitKernel, vexp) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vexp_ref(d, x_data, zref_data); + refer::VExp(x_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -203,19 +189,6 @@ TEST(JitKernel, vexp) { } } -inline float _sigmoid(float x) { - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - float tmp = (x < min) ? min : ((x > max) ? max : x); - return 1.f / (1.f + std::exp(-tmp)); -} - -void vsigmoid_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = _sigmoid(x[i]); - } -} - void vsigmoid_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VExpKernel>& vexp, @@ -234,6 +207,7 @@ void vsigmoid_better( TEST(JitKernel, vsigmoid) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 3, 4, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -252,7 +226,7 @@ TEST(JitKernel, vsigmoid) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vsigmoid_ref(d, x_data, zref_data); + refer::VSigmoid(x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -271,14 +245,6 @@ TEST(JitKernel, vsigmoid) { } } -inline float _tanh(float x) { return 2.f * _sigmoid(2.f * x) - 1.f; } - -void vtanh_ref(const int n, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = _tanh(x[i]); - } -} - void vtanh_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VScalKernel>& vscal, @@ -298,6 +264,7 @@ void vtanh_better( TEST(JitKernel, vtanh) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) { std::vector x(d); std::vector zref(d), ztgt(d); @@ -320,7 +287,7 @@ TEST(JitKernel, vtanh) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vtanh_ref(d, x_data, zref_data); + refer::VTanh(x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -339,32 +306,6 @@ TEST(JitKernel, vtanh) { } } -void lstm_ctht_ref( - const std::shared_ptr< - const paddle::operators::math::jitkernel::VSigmoidKernel>& - vsigmoid_3d, - const std::shared_ptr< - const paddle::operators::math::jitkernel::VTanhKernel>& vtanh_d, - const std::shared_ptr< - const paddle::operators::math::jitkernel::VExpKernel>& vexp_1, - const int d, float* gates, const float* ct_1, float* ct, float* ht) { - vsigmoid_3d->Compute(gates + d, gates + d, 3 * d); - vtanh_d->Compute(gates, gates, d); - const float *i = gates + d, *f = gates + d * 2, *o = gates + d * 3; - const float min = SIGMOID_THRESHOLD_MIN; - const float max = SIGMOID_THRESHOLD_MAX; - for (int k = 0; k < d; ++k) { - // C_t = C_t-1 * fgated + cand_gated * igated - ct[k] = ct_1[k] * f[k] + gates[k] * i[k]; - // H_t = act_cell(C_t) * ogated - float tmp = ct[k] * 2; - tmp = 0.f - ((tmp < min) ? min : ((tmp > max) ? max : tmp)); - vexp_1->Compute(&tmp, &tmp, 1); - tmp = 2.f / (1.f + tmp) - 1.f; - ht[k] = tmp * o[k]; - } -} - void lstm_ctht_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VSigmoidKernel>& @@ -389,6 +330,7 @@ void lstm_ctht_better( TEST(JitKernel, lstm) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {1, 2, 3, 4, 5, 6, 7, 8, 15, 16, 30, 32, 64, 100}) { int d4 = d * 4; int d3 = d * 3; @@ -410,8 +352,6 @@ TEST(JitKernel, lstm) { d3); const auto& vtanh_d = jit::KernelPool::Instance().template Get>(d); - const auto& vexp_1 = - jit::KernelPool::Instance().template Get>(1); const auto& vmul_d = jit::KernelPool::Instance().template Get>(d); const auto& vadd_d = @@ -425,8 +365,14 @@ TEST(JitKernel, lstm) { float* ct_ref_data = ct_ref.data(); float* ht_ref_data = ht_ref.data(); // compute once to check correctness - lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, - ct_ref_data, ht_ref_data); + jit::lstm_t step; + jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell); + step.gates = xref_data; + step.ct_1 = ct_1_data; + step.ct = ct_ref_data; + step.ht = ht_ref_data; + refer::LSTMCtHt(&step, &attr); + ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); for (int i = 0; i < d; ++i) { EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); @@ -441,8 +387,7 @@ TEST(JitKernel, lstm) { auto tmkle = GetCurrentUS(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - lstm_ctht_ref(vsigmoid_3d, vtanh_d, vexp_1, d, xref_data, ct_1_data, - ct_ref_data, ht_ref_data); + refer::LSTMCtHt(&step, &attr); } auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); @@ -457,16 +402,6 @@ TEST(JitKernel, lstm) { } } -void vscal_ref(const int n, const float a, const float* x, float* y) { - for (int i = 0; i < n; ++i) { - y[i] = a * x[i]; - } -} -void vscal_inp_ref(const int n, const float a, float* x) { - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -} #if defined __AVX__ || defined __AVX2__ void vscal_intri8(const int n, const float a, const float* x, float* y) { __m256 tmp; @@ -492,6 +427,7 @@ void vscal_inp_mkl(const int n, const float a, float* x) { TEST(JitKernel, vscal) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -506,12 +442,12 @@ TEST(JitKernel, vscal) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vscal_ref(d, a, x_data, zref_data); + refer::VScal(&a, x_data, zref_data, d); } auto trefe = GetCurrentUS(); auto trefs1 = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vscal_inp_ref(d, a, y_data); + refer::VScal(&a, y_data, y_data, d); } auto trefe1 = GetCurrentUS(); @@ -567,12 +503,6 @@ TEST(JitKernel, vscal) { } } -void vmul_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -} - #if defined __AVX__ || defined __AVX2__ void vmul_intri8(const int n, const float* x, const float* y, float* z) { __m256 tmpx, tmpy; @@ -591,6 +521,7 @@ void vmul_mkl(const int n, const float* x, const float* y, float* z) { TEST(JitKernel, vmul) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 20, 30, 256, 512, 1000, 1024}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -604,7 +535,7 @@ TEST(JitKernel, vmul) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vmul_ref(d, x_data, y_data, zref_data); + refer::VMul(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -647,12 +578,6 @@ TEST(JitKernel, vmul) { } } -void vadd_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - } -} - #if defined __AVX__ || defined __AVX2__ void vadd_intri8(const int n, const float* x, const float* y, float* z) { __m256 tmpx, tmpy; @@ -671,6 +596,7 @@ void vadd_mkl(const int n, const float* x, const float* y, float* z) { TEST(JitKernel, vadd) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -684,7 +610,7 @@ TEST(JitKernel, vadd) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vadd_ref(d, x_data, y_data, zref_data); + refer::VAdd(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); @@ -727,12 +653,6 @@ TEST(JitKernel, vadd) { } } -void vaddrelu_ref(const int n, const float* x, const float* y, float* z) { - for (int i = 0; i < n; ++i) { - z[i] = x[i] + y[i]; - z[i] = z[i] > 0 ? z[i] : 0; - } -} void vaddrelu_better( const std::shared_ptr< const paddle::operators::math::jitkernel::VAddKernel>& vadd, @@ -745,6 +665,7 @@ void vaddrelu_better( TEST(JitKernel, vaddrelu) { namespace jit = paddle::operators::math::jitkernel; + namespace refer = paddle::operators::math::jitkernel::refer; for (int d : {7, 8, 15, 16, 30, 256, 512}) { std::vector x(d), y(d); std::vector zref(d), ztgt(d); @@ -762,7 +683,7 @@ TEST(JitKernel, vaddrelu) { float* zref_data = zref.data(); auto trefs = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - vaddrelu_ref(d, x_data, y_data, zref_data); + refer::VAddRelu(x_data, y_data, zref_data, d); } auto trefe = GetCurrentUS(); auto tmkls = GetCurrentUS(); From 13e254faedd2c464fa14057d90c66995b2b4f159 Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Tue, 20 Nov 2018 13:08:23 +0000 Subject: [PATCH 075/684] refine code, test=develop --- paddle/fluid/API.spec | 2 +- .../operators/sigmoid_cross_entropy_with_logits_op.cc | 4 ++-- .../operators/sigmoid_cross_entropy_with_logits_op.h | 10 ---------- python/paddle/fluid/layers/nn.py | 2 +- python/paddle/fluid/tests/unittests/test_layers.py | 2 +- .../test_sigmoid_cross_entropy_with_logits_op.py | 1 - 6 files changed, 5 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index da8941c351571..f84ec4cb3e189 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -174,7 +174,7 @@ paddle.fluid.layers.clip ArgSpec(args=['x', 'min', 'max', 'name'], varargs=None, paddle.fluid.layers.clip_by_norm ArgSpec(args=['x', 'max_norm', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mean ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.mul ArgSpec(args=['x', 'y', 'x_num_col_dims', 'y_num_col_dims', 'name'], varargs=None, keywords=None, defaults=(1, 1, None)) -paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.sigmoid_cross_entropy_with_logits ArgSpec(args=['x', 'label', 'ignore_index', 'name'], varargs=None, keywords=None, defaults=(-100, None)) paddle.fluid.layers.maxout ArgSpec(args=['x', 'groups', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.space_to_depth ArgSpec(args=['x', 'blocksize', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.affine_grid ArgSpec(args=['theta', 'out_shape', 'name'], varargs=None, keywords=None, defaults=(None,)) diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc index d6a2fa6a1797e..368988d60da1a 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc @@ -102,9 +102,9 @@ class SigmoidCrossEntropyWithLogitsOpMaker " of elementwise logistic losses."); AddAttr( "ignore_index", - "(int, default -1), Specifies a target value that is ignored and" + "(int, default -100), Specifies a target value that is ignored and" "does not contribute to the input gradient.") - .SetDefault(-1); + .SetDefault(-100); AddComment(R"DOC( SigmoidCrossEntropyWithLogits Operator. diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h index 2bfba6f1704f8..b8731c2327530 100644 --- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h +++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.h @@ -31,7 +31,6 @@ using EigenMatrix = framework::EigenMatrix; template struct SigmoidCrossEntropyWithLogitsForward { - // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward) HOSTDEVICE SigmoidCrossEntropyWithLogitsForward(const int &ignore_index) : ignore_index(ignore_index) {} @@ -50,7 +49,6 @@ struct SigmoidCrossEntropyWithLogitsForward { template struct SigmoidCrossEntropyWithLogitsBackward { - // EIGEN_EMPTY_STRUCT_CTOR(SigmoidCrossEntropyWithLogitsForward) HOSTDEVICE SigmoidCrossEntropyWithLogitsBackward(const int &ignore_index) : ignore_index(ignore_index) {} @@ -83,14 +81,6 @@ class SigmoidCrossEntropyWithLogitsKernel : public framework::OpKernel { out.device(place) = x.binaryExpr( labels, SigmoidCrossEntropyWithLogitsForward(ignore_index)); - // term1 = max(x, 0) - // auto term1 = x.cwiseMax(static_cast(0)); - // term2 = x * labels - // auto term2 = x * labels; - // term3 = log(1 + exp(-abs(x))) - // auto term3 = (static_cast(1) + (-(x.abs())).exp()).log(); - - // out.device(place) = term1 - term2 + term3; } }; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e032835de32a5..38da9173cce59 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -7892,7 +7892,7 @@ def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): @templatedoc() -def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-1, name=None): +def sigmoid_cross_entropy_with_logits(x, label, ignore_index=-100, name=None): """ ${comment} diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 8e098e4961feb..326938e1150ea 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -173,7 +173,7 @@ def test_sigmoid_cross_entropy(self): ignore_index = -1 self.assertIsNotNone( layers.sigmoid_cross_entropy_with_logits( - x=dat, label=lbl, ignore_index=-1)) + x=dat, label=lbl, ignore_index=ignore_index)) print(str(program)) def test_hsigmoid(self): diff --git a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py index 64f6f088e1075..41797a241cab9 100644 --- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py +++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py @@ -119,4 +119,3 @@ def test_check_grad(self): if __name__ == '__main__': unittest.main() - np.random.seed(0) From 703b26e697c0a15a903d2e346d191e033e181073 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 21 Nov 2018 11:22:34 +0800 Subject: [PATCH 076/684] add profiler, parallel_executor back --- paddle/fluid/framework/CMakeLists.txt | 9 - .../fast_threaded_ssa_graph_executor.h | 2 +- .../fluid/memory/allocation/cpu_allocator.h | 3 +- paddle/fluid/platform/CMakeLists.txt | 12 +- paddle/fluid/platform/device_tracer.h | 12 +- paddle/fluid/platform/enforce.h | 2 +- paddle/fluid/platform/port.h | 21 + paddle/fluid/platform/profiler.cc | 6 +- paddle/fluid/platform/profiler.h | 10 - .../fluid/platform/stream_callback_manager.h | 13 +- paddle/fluid/pybind/CMakeLists.txt | 5 +- paddle/fluid/pybind/pybind.cc | 6 - python/paddle/fluid/__init__.py | 3 +- python/paddle/fluid/parallel_executor.py | 497 +++++++++--------- 14 files changed, 293 insertions(+), 308 deletions(-) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 42af482f852a1..43e1bc6b2efec 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -31,9 +31,7 @@ function(windows_symbolic TARGET) endfunction() add_subdirectory(ir) -if (NOT WIN32) add_subdirectory(details) -endif (NOT WIN32) # ddim lib proto_library(framework_proto SRCS framework.proto) @@ -118,13 +116,8 @@ cc_test(op_proto_maker_test SRCS op_proto_maker_test.cc DEPS op_proto_maker) cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) -if (NOT WIN32) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog shape_inference data_transform lod_tensor profiler) -else() -cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor) -endif(NOT WIN32) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -179,12 +172,10 @@ else() cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op) endif() -if (NOT WIN32) cc_library(parallel_executor SRCS parallel_executor.cc DEPS threaded_ssa_graph_executor scope_buffered_ssa_graph_executor graph build_strategy fast_threaded_ssa_graph_executor) -endif() # NOT WIN32 cc_library(prune SRCS prune.cc DEPS framework_proto) cc_test(prune_test SRCS prune_test.cc DEPS op_info prune recurrent_op device_context) diff --git a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h index 949616f02d516..c3a8b85423403 100644 --- a/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h @@ -13,9 +13,9 @@ // limitations under the License. #pragma once +#include #include #include -#include "ThreadPool.h" #include "paddle/fluid/framework/blocking_queue.h" #include "paddle/fluid/framework/details/exception_holder.h" #include "paddle/fluid/framework/details/execution_strategy.h" diff --git a/paddle/fluid/memory/allocation/cpu_allocator.h b/paddle/fluid/memory/allocation/cpu_allocator.h index 165f11cd3b001..26d3643f4edff 100644 --- a/paddle/fluid/memory/allocation/cpu_allocator.h +++ b/paddle/fluid/memory/allocation/cpu_allocator.h @@ -17,7 +17,8 @@ #ifdef _WIN32 #define posix_memalign_free _aligned_free -#define posix_memalign(p, a, s) (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) +#define posix_memalign(p, a, s) \ + (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno) #endif namespace paddle { diff --git a/paddle/fluid/platform/CMakeLists.txt b/paddle/fluid/platform/CMakeLists.txt index 0d0613e1a4364..93cb5eb2dc0b3 100644 --- a/paddle/fluid/platform/CMakeLists.txt +++ b/paddle/fluid/platform/CMakeLists.txt @@ -1,4 +1,3 @@ -if (NOT WIN32) proto_library(profiler_proto SRCS profiler.proto DEPS framework_proto) py_proto_compile(profiler_py_proto SRCS profiler.proto) @@ -6,11 +5,19 @@ add_custom_target(profiler_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch _ add_dependencies(profiler_py_proto profiler_py_proto_init) +if (NOT WIN32) add_custom_command(TARGET profiler_py_proto POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) +else(NOT WIN32) +string(REPLACE "/" "\\" proto_dstpath "${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler/") +add_custom_command(TARGET profiler_py_proto POST_BUILD + COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/profiler + COMMAND copy /Y *.py ${proto_dstpath} + COMMENT "Copy generated python proto into directory paddle/fluid/proto/profiler." + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endif(NOT WIN32) if(WITH_GPU) @@ -60,12 +67,9 @@ cc_test(init_test SRCS init_test.cc DEPS device_context) nv_test(cudnn_helper_test SRCS cudnn_helper_test.cc DEPS dynload_cuda) nv_test(transform_test SRCS transform_test.cu DEPS memory place device_context) - -if (NOT WIN32) cc_library(device_tracer SRCS device_tracer.cc DEPS boost profiler_proto framework_proto ${GPU_CTX_DEPS}) cc_library(profiler SRCS profiler.cc DEPS device_context device_tracer) cc_test(profiler_test SRCS profiler_test.cc DEPS profiler) -endif(NOT WIN32) nv_test(float16_gpu_test SRCS float16_test.cu DEPS lod_tensor) cc_test(float16_test SRCS float16_test.cc DEPS lod_tensor) diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index f59fc40b71699..eaf047d474476 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -13,17 +13,11 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#if !defined(_WIN32) -#include -#else -#include -#endif // !_WIN32 - -#include #include // NOLINT #include #include "paddle/fluid/platform/dynload/cupti.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.pb.h" namespace paddle { @@ -32,15 +26,11 @@ namespace platform { /////////////////////// // WARN: Under Development. Don't depend on it yet. ////////////////////// -#if !defined(_WIN32) inline uint64_t PosixInNsec() { struct timeval tv; gettimeofday(&tv, nullptr); return 1000 * (static_cast(tv.tv_sec) * 1000000 + tv.tv_usec); } -#else -inline uint64_t PosixInNsec() { return static_cast(0); } -#endif // !_WIN32 // DeviceTracer performs the following tasks: // 1. Register cuda callbacks for various events: kernel, memcpy, etc. diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 3643d2ad15b9e..31309738a52f0 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -134,7 +134,7 @@ struct EOFException : public std::exception { #define LIKELY(condition) __builtin_expect(static_cast(condition), 1) #else // there is no equivalent intrinsics in msvc. -#define LIKELY(condition) !(condition) +#define LIKELY(condition) (condition) #endif template diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index a07b993c8a84e..8be77fe464575 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -17,6 +17,7 @@ #include #include +#include #include #include @@ -27,6 +28,7 @@ #include // dladdr #include // backtrace #include +#include #include // std::accumulate #else #include // _popen, _pclose @@ -57,6 +59,25 @@ static void *dlopen(const char *filename, int flag) { return reinterpret_cast(hModule); } +static int gettimeofday(struct timeval *tp, void *tzp) { + time_t clock; + struct tm tm; + SYSTEMTIME wtm; + + GetLocalTime(&wtm); + tm.tm_year = wtm.wYear - 1900; + tm.tm_mon = wtm.wMonth - 1; + tm.tm_mday = wtm.wDay; + tm.tm_hour = wtm.wHour; + tm.tm_min = wtm.wMinute; + tm.tm_sec = wtm.wSecond; + tm.tm_isdst = -1; + clock = mktime(&tm); + tp->tv_sec = clock; + tp->tv_usec = wtm.wMilliseconds * 1000; + + return (0); +} #endif // !_WIN32 static void ExecShellCommand(const std::string &cmd, std::string *message) { diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 56bf9e31a35fd..03c102e24a189 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/platform/profiler.h" +#include "paddle/fluid/platform/port.h" -#include #include #include #include @@ -438,10 +438,10 @@ void ParseEvents(const std::vector>& events, event_items[index].total_time += event_time; // min time event_items[index].min_time = - std::min(event_time, event_items[index].min_time); + (std::min)(event_time, event_items[index].min_time); // max time event_items[index].max_time = - std::max(event_time, event_items[index].max_time); + (std::max)(event_time, event_items[index].max_time); } // remove the push marker from the list diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index e8eae874afa3d..f5d3490634f31 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -69,7 +69,6 @@ void PushEvent(const std::string& name, const DeviceContext* dev_ctx); void PopEvent(const std::string& name, const DeviceContext* dev_ctx); -#if !defined(_WIN32) struct RecordEvent { // dev_ctx can be set to nullptr if device is cpu. RecordEvent(const std::string& name, const DeviceContext* dev_ctx); @@ -106,15 +105,6 @@ struct RecordBlock { std::string name_; uint64_t start_ns_; }; -#else -// windows do not support profiler temporarily. -struct RecordEvent { - RecordEvent(const std::string& name, const DeviceContext* dev_ctx) {} -}; -struct RecordBlock { - explicit RecordBlock(int block_id) {} -}; -#endif // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 0e88a439cf6ca..11c68f3449ee2 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -45,16 +45,15 @@ class StreamCallbackManager { inline void AddCallback(Callback &&callback) const { auto *stream_callback_context = new StreamCallbackContext(this, std::forward(callback)); - PADDLE_ENFORCE( #if CUDA_VERSION >= 10000 - cudaLaunchHostFunc(stream_, StreamCallbackManager::StreamCallbackFunc, - stream_callback_context) + PADDLE_ENFORCE(cudaLaunchHostFunc(stream_, + StreamCallbackManager::StreamCallbackFunc, + stream_callback_context)); // NOLINT #else - cudaStreamAddCallback(stream_, - StreamCallbackManager::StreamCallbackFunc, - stream_callback_context, 0) + PADDLE_ENFORCE(cudaStreamAddCallback( + stream_, StreamCallbackManager::StreamCallbackFunc, + stream_callback_context, 0)); // NOLINT #endif - ); // NOLINT } void Wait() const { thread_pool_.reset(new ThreadPool(1)); } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 25e919105cba4..fb6ee2f4a5392 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,9 +1,6 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder) +set(PYBIND_DEPS pybind python proto_desc memory executor prune feed_fetch_method pass_builder parallel_executor profiler) set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc) -if(NOT WIN32) - list(APPEND PYBIND_DEPS parallel_executor profiler) -endif(NOT WIN32) if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 2f040e1c34c79..102fa02adf3fb 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -36,9 +36,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" -#ifndef _WIN32 #include "paddle/fluid/framework/parallel_executor.h" -#endif #include "paddle/fluid/framework/prune.h" #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" @@ -637,7 +635,6 @@ All parameter, weight, gradient are variables in Paddle. #endif #endif -#ifndef _WIN32 py::enum_(m, "ProfilerState", py::arithmetic()) .value("kDisabled", platform::ProfilerState::kDisabled) .value("kCPU", platform::ProfilerState::kCPU) @@ -658,7 +655,6 @@ All parameter, weight, gradient are variables in Paddle. m.def("disable_profiler", platform::DisableProfiler); m.def("is_profiler_enabled", platform::IsProfileEnabled); m.def("reset_profiler", platform::ResetProfiler); -#endif py::class_> pass(m, "Pass"); pass.def(py::init()) @@ -687,7 +683,6 @@ All parameter, weight, gradient are variables in Paddle. .def("remove_pass", [](ir::PassBuilder &self, size_t idx) { self.RemovePass(idx); }); -#ifndef _WIN32 // -- python binds for parallel executor. py::class_ pe(m, "ParallelExecutor"); py::class_ exec_strategy(pe, "ExecutionStrategy", R"DOC( @@ -913,7 +908,6 @@ All parameter, weight, gradient are variables in Paddle. pybind11::gil_scoped_release release; self.Run(fetch_tensors, fetched_var_name); }); -#endif BindRecordIOWriter(&m); return m.ptr(); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 6a4a5e098fc27..543acf2d349c7 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -47,8 +47,7 @@ from . import unique_name from . import recordio_writer from . import parallel_executor -if os.name != 'nt': - from .parallel_executor import * +from .parallel_executor import * from paddle.fluid.layers.math_op_patch import monkey_patch_variable Tensor = LoDTensor diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index 0d53f53a9ef6d..3f4dd5eb712e7 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -25,264 +25,263 @@ __all__ = ['ParallelExecutor', 'ExecutionStrategy', 'BuildStrategy'] -if os.name != 'nt': - ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy - BuildStrategy = core.ParallelExecutor.BuildStrategy - - class ParallelExecutor(object): +ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy +BuildStrategy = core.ParallelExecutor.BuildStrategy + + +class ParallelExecutor(object): + """ + ParallelExecutor is designed for data parallelism, which focuses on distributing + the data across different nodes and every node operates on the data in parallel. + If you use ParallelExecutor to run the current program on GPU, the node means GPU + device, and ParallelExecutor will get the available GPU device automatically on + the current machine. If you use ParallelExecutor to run the current program on CPU, + the node means the CPU device, and you can specify the CPU device number by adding + 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable + is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number + of CPUs in the system. + + Args: + use_cuda (bool): Whether to use CUDA or not. + loss_name (str): The loss name must set in training. Default None. + main_program (Program): The program that need to run, if not provided, + then default_main_program will be used. Default None. + share_vars_from(ParallelExecutor): If provide, it will share variables + from the specified ParallelExecutor. Default None. + exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run + the program in ParallelExecutor, for example how many threads are used to + execute the program, how many iterations to clean up the temp variables + which is generated during execution. For more information, please refer + to fluid.ExecutionStrategy. Default None. + build_strategy(BuildStrategy): build_strategy is used to control how to + build the SSA Graph in ParallelExecutor by setting the property, + for example reduce_strategy, gradient_scale_strategy. For more information, + please refer to fluid.BuildStrategy. Default None. + num_trainers(int): If greater than 1, NCCL will be initialized with + multiple rank of nodes, each node should have same number of GPUs. + Distributed training will be enabled then. Default 1. + trainer_id(int): Must use together with num_trainers. trainer_id is the + "rank" of current node starts from 0. Default 0. + scope(Scope): scope to run with, default use fluid.global_scope(). + + Returns: + ParallelExecutor: The initialized ParallelExecutor object. + + Raises: + TypeError: If share_vars_from is provided, but not ParallelExecutor object. + + Examples: + .. code-block:: python + + train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) + test_exe = fluid.ParallelExecutor(use_cuda=True, + main_program=test_program, + share_vars_from=train_exe) + + train_loss, = train_exe.run([loss.name], feed=feed_dict) + test_loss, = test_exe.run([loss.name], feed=feed_dict) + """ + + def __init__(self, + use_cuda, + loss_name=None, + main_program=None, + share_vars_from=None, + exec_strategy=None, + build_strategy=None, + num_trainers=1, + trainer_id=0, + scope=None): + self._places = [] + self._act_places = [] + if use_cuda: + for i in six.moves.range(core.get_cuda_device_count()): + p = core.Place() + self._act_places.append(core.CUDAPlace(i)) + p.set_place(self._act_places[-1]) + self._places.append(p) + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + for i in six.moves.range(cpu_num): + p = core.Place() + self._act_places.append(core.CPUPlace()) + p.set_place(self._act_places[-1]) + self._places.append(p) + assert self._places, "no place for execution" + + if exec_strategy is None: + exec_strategy = ExecutionStrategy() + exec_strategy.use_cuda = use_cuda + + if exec_strategy.num_threads == 0: + if use_cuda: + # Experiments on se-resnext shows that too many threads hurt + # performance. Worth tunning for other models in the future. + exec_strategy.num_threads = len(self._places) * 4 + else: + cpu_num = int( + os.environ.get('CPU_NUM', multiprocessing.cpu_count())) + exec_strategy.num_threads = cpu_num * 2 + + # Set 1 thread num under nccl2 distribute + # env to make sure all gpus run ops in same order. + if num_trainers > 1: + assert (use_cuda) + # FIXME(gongwb): avoid this set. + exec_strategy.num_threads = 1 + + if build_strategy is None: + build_strategy = BuildStrategy() + + main = main_program + main = main if main else framework.default_main_program() + if scope == None: + scope = executor.global_scope() + + if share_vars_from and not isinstance(share_vars_from, + ParallelExecutor): + raise TypeError("share_vars_from must be ParallelExecutor.") + + local_scopes = share_vars_from.executor.local_scopes( + ) if share_vars_from else [] + + self.persistable_vars = [ + v.name for v in [ + var for var in main.list_vars() + if var.persistable and var.type != core.VarDesc.VarType.RAW + ] + ] + + self.executor = core.ParallelExecutor( + self._places, + set([ + cpt.to_text(p.name) + for p in main.global_block().iter_parameters() + if not p.stop_gradient + ]), + set(cpt.to_text(var) for var in self.persistable_vars), main.desc, + cpt.to_text(loss_name) + if loss_name else six.u(''), scope, local_scopes, exec_strategy, + build_strategy, num_trainers, trainer_id) + self.scope = scope + + def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): """ - ParallelExecutor is designed for data parallelism, which focuses on distributing - the data across different nodes and every node operates on the data in parallel. - If you use ParallelExecutor to run the current program on GPU, the node means GPU - device, and ParallelExecutor will get the available GPU device automatically on - the current machine. If you use ParallelExecutor to run the current program on CPU, - the node means the CPU device, and you can specify the CPU device number by adding - 'CPU_NUM' environment variable, for example 'CPU_NUM=4', if the environment variable - is not found, ParallelExecutor will call `multiprocessing.cpu_count` to get the number - of CPUs in the system. + Run a parallel executor with fetch_list. + + The feed parameter can be a dict or a list. If feed is a dict, the + feed data will be split into multiple devices. If feed is a list, we + assume the data has been splitted into multiple devices, the each + element in the list will be copied to each device directly. + + For example, if the feed is a dict: + + >>> exe = ParallelExecutor() + >>> # the image will be splitted into devices. If there is two devices + >>> # each device will process an image with shape (24, 1, 28, 28) + >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) + + For example, if the feed is a list: + + >>> exe = ParallelExecutor() + >>> # each device will process each element in the list. + >>> # the 1st device will process an image with shape (48, 1, 28, 28) + >>> # the 2nd device will process an image with shape (32, 1, 28, 28) + >>> # + >>> # you can use exe.device_count to get the device number. + >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))}, + >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, + >>> ]) Args: - use_cuda (bool): Whether to use CUDA or not. - loss_name (str): The loss name must set in training. Default None. - main_program (Program): The program that need to run, if not provided, - then default_main_program will be used. Default None. - share_vars_from(ParallelExecutor): If provide, it will share variables - from the specified ParallelExecutor. Default None. - exec_strategy(ExecutionStrategy): exec_strategy is used to control how to run - the program in ParallelExecutor, for example how many threads are used to - execute the program, how many iterations to clean up the temp variables - which is generated during execution. For more information, please refer - to fluid.ExecutionStrategy. Default None. - build_strategy(BuildStrategy): build_strategy is used to control how to - build the SSA Graph in ParallelExecutor by setting the property, - for example reduce_strategy, gradient_scale_strategy. For more information, - please refer to fluid.BuildStrategy. Default None. - num_trainers(int): If greater than 1, NCCL will be initialized with - multiple rank of nodes, each node should have same number of GPUs. - Distributed training will be enabled then. Default 1. - trainer_id(int): Must use together with num_trainers. trainer_id is the - "rank" of current node starts from 0. Default 0. - scope(Scope): scope to run with, default use fluid.global_scope(). + fetch_list(list): The fetched variable names + feed(list|dict|None): The feed variables. If the feed is a dict, + tensors in that dict will be splitted into each devices. If + the feed is a list, each element of the list will be copied + to each device. Default None. + feed_dict: Alias for feed parameter, for backward compatibility. + This parameter has been deprecated. Default None. + return_numpy(bool): Whether converts the fetched tensor to numpy. + Default: True. Returns: - ParallelExecutor: The initialized ParallelExecutor object. + List: The fetched result list. Raises: - TypeError: If share_vars_from is provided, but not ParallelExecutor object. + ValueError: If the feed is a list, but its length is not equal the + length of active places, or its element's is not dict. + + NOTES: + 1. If the feed's type is dict, the number of data that feeds to + ParallelExecutor must be bigger than active places. Otherwise, + it will throw exception from C++ side. Special attention should be + paid to check whether the last batch of the dataset is bigger + than active places. + 2. If active places are more than one, the fetch results for each + variable is a list, and each element of this list is the variable of + respective active place. Examples: .. code-block:: python - train_exe = fluid.ParallelExecutor(use_cuda=True, loss_name=loss.name) - test_exe = fluid.ParallelExecutor(use_cuda=True, - main_program=test_program, - share_vars_from=train_exe) - - train_loss, = train_exe.run([loss.name], feed=feed_dict) - test_loss, = test_exe.run([loss.name], feed=feed_dict) + pe = fluid.ParallelExecutor(use_cuda=use_cuda, + loss_name=avg_cost.name, + main_program=fluid.default_main_program()) + loss = pe.run(feed=feeder.feed(cur_batch), + fetch_list=[avg_cost.name])) """ - - def __init__(self, - use_cuda, - loss_name=None, - main_program=None, - share_vars_from=None, - exec_strategy=None, - build_strategy=None, - num_trainers=1, - trainer_id=0, - scope=None): - self._places = [] - self._act_places = [] - if use_cuda: - for i in six.moves.range(core.get_cuda_device_count()): - p = core.Place() - self._act_places.append(core.CUDAPlace(i)) - p.set_place(self._act_places[-1]) - self._places.append(p) - else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - for i in six.moves.range(cpu_num): - p = core.Place() - self._act_places.append(core.CPUPlace()) - p.set_place(self._act_places[-1]) - self._places.append(p) - assert self._places, "no place for execution" - - if exec_strategy is None: - exec_strategy = ExecutionStrategy() - exec_strategy.use_cuda = use_cuda - - if exec_strategy.num_threads == 0: - if use_cuda: - # Experiments on se-resnext shows that too many threads hurt - # performance. Worth tunning for other models in the future. - exec_strategy.num_threads = len(self._places) * 4 - else: - cpu_num = int( - os.environ.get('CPU_NUM', multiprocessing.cpu_count())) - exec_strategy.num_threads = cpu_num * 2 - - # Set 1 thread num under nccl2 distribute - # env to make sure all gpus run ops in same order. - if num_trainers > 1: - assert (use_cuda) - # FIXME(gongwb): avoid this set. - exec_strategy.num_threads = 1 - - if build_strategy is None: - build_strategy = BuildStrategy() - - main = main_program - main = main if main else framework.default_main_program() - if scope == None: - scope = executor.global_scope() - - if share_vars_from and not isinstance(share_vars_from, - ParallelExecutor): - raise TypeError("share_vars_from must be ParallelExecutor.") - - local_scopes = share_vars_from.executor.local_scopes( - ) if share_vars_from else [] - - self.persistable_vars = [ - v.name for v in [ - var for var in main.list_vars() - if var.persistable and var.type != core.VarDesc.VarType.RAW - ] - ] - - self.executor = core.ParallelExecutor( - self._places, - set([ - cpt.to_text(p.name) - for p in main.global_block().iter_parameters() - if not p.stop_gradient - ]), - set(cpt.to_text(var) - for var in self.persistable_vars), main.desc, - cpt.to_text(loss_name) - if loss_name else six.u(''), scope, local_scopes, exec_strategy, - build_strategy, num_trainers, trainer_id) - self.scope = scope - - def run(self, fetch_list, feed=None, feed_dict=None, return_numpy=True): - """ - Run a parallel executor with fetch_list. - - The feed parameter can be a dict or a list. If feed is a dict, the - feed data will be split into multiple devices. If feed is a list, we - assume the data has been splitted into multiple devices, the each - element in the list will be copied to each device directly. - - For example, if the feed is a dict: - - >>> exe = ParallelExecutor() - >>> # the image will be splitted into devices. If there is two devices - >>> # each device will process an image with shape (24, 1, 28, 28) - >>> exe.run(feed={'image': numpy.random.random(size=(48, 1, 28, 28))}) - - For example, if the feed is a list: - - >>> exe = ParallelExecutor() - >>> # each device will process each element in the list. - >>> # the 1st device will process an image with shape (48, 1, 28, 28) - >>> # the 2nd device will process an image with shape (32, 1, 28, 28) - >>> # - >>> # you can use exe.device_count to get the device number. - >>> exe.run(feed=[{"image": numpy.random.random(size=(48, 1, 28, 28))}, - >>> {"image": numpy.random.random(size=(32, 1, 28, 28))}, - >>> ]) - - Args: - fetch_list(list): The fetched variable names - feed(list|dict|None): The feed variables. If the feed is a dict, - tensors in that dict will be splitted into each devices. If - the feed is a list, each element of the list will be copied - to each device. Default None. - feed_dict: Alias for feed parameter, for backward compatibility. - This parameter has been deprecated. Default None. - return_numpy(bool): Whether converts the fetched tensor to numpy. - Default: True. - - Returns: - List: The fetched result list. - - Raises: - ValueError: If the feed is a list, but its length is not equal the - length of active places, or its element's is not dict. - - NOTES: - 1. If the feed's type is dict, the number of data that feeds to - ParallelExecutor must be bigger than active places. Otherwise, - it will throw exception from C++ side. Special attention should be - paid to check whether the last batch of the dataset is bigger - than active places. - 2. If active places are more than one, the fetch results for each - variable is a list, and each element of this list is the variable of - respective active place. - - Examples: - .. code-block:: python - - pe = fluid.ParallelExecutor(use_cuda=use_cuda, - loss_name=avg_cost.name, - main_program=fluid.default_main_program()) - loss = pe.run(feed=feeder.feed(cur_batch), - fetch_list=[avg_cost.name])) - """ - if feed is None and feed_dict is not None: - feed = feed_dict - print( - "`feed_dict` is deprecated. Please use `feed=`", - file=sys.stderr) - - if isinstance(feed, dict): - feed_tensor_dict = dict() - for feed_name in feed: - feed_tensor = feed[feed_name] - if not isinstance(feed_tensor, core.LoDTensor): - feed_tensor = core.LoDTensor() - # always set to CPU place, since the tensor need to be splitted - # it is fast in CPU - feed_tensor.set(feed[feed_name], core.CPUPlace()) - feed_tensor_dict[feed_name] = feed_tensor - - self.executor.feed_and_split_tensor_into_local_scopes( - feed_tensor_dict) - elif isinstance(feed, list) or isinstance(feed, tuple): - if len(feed) != len(self._act_places): - raise ValueError( - "Feed a list of tensor, the list should be the same size as places" - ) - - res = list() - - for i, each in enumerate(feed): - if not isinstance(each, dict): - raise TypeError( - "Each element of feed list should be a dict") - res_dict = dict() - for feed_name in each: - tensor = each[feed_name] - if not isinstance(tensor, core.LoDTensor): - tmp = core.LoDTensor() - tmp.set(tensor, self._act_places[i]) - tensor = tmp - res_dict[feed_name] = tensor - res.append(res_dict) - self.executor.feed_tensors_into_local_scopes(res) - - fetch_var_name = '@FETCHED_VAR_NAME@' - self.executor.run(fetch_list, fetch_var_name) - arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() - - if return_numpy: - return executor.as_numpy(arr) - - return [arr[i] for i in range(len(arr))] - - @property - def device_count(self): - return len(self._act_places) + if feed is None and feed_dict is not None: + feed = feed_dict + print( + "`feed_dict` is deprecated. Please use `feed=`", + file=sys.stderr) + + if isinstance(feed, dict): + feed_tensor_dict = dict() + for feed_name in feed: + feed_tensor = feed[feed_name] + if not isinstance(feed_tensor, core.LoDTensor): + feed_tensor = core.LoDTensor() + # always set to CPU place, since the tensor need to be splitted + # it is fast in CPU + feed_tensor.set(feed[feed_name], core.CPUPlace()) + feed_tensor_dict[feed_name] = feed_tensor + + self.executor.feed_and_split_tensor_into_local_scopes( + feed_tensor_dict) + elif isinstance(feed, list) or isinstance(feed, tuple): + if len(feed) != len(self._act_places): + raise ValueError( + "Feed a list of tensor, the list should be the same size as places" + ) + + res = list() + + for i, each in enumerate(feed): + if not isinstance(each, dict): + raise TypeError( + "Each element of feed list should be a dict") + res_dict = dict() + for feed_name in each: + tensor = each[feed_name] + if not isinstance(tensor, core.LoDTensor): + tmp = core.LoDTensor() + tmp.set(tensor, self._act_places[i]) + tensor = tmp + res_dict[feed_name] = tensor + res.append(res_dict) + self.executor.feed_tensors_into_local_scopes(res) + + fetch_var_name = '@FETCHED_VAR_NAME@' + self.executor.run(fetch_list, fetch_var_name) + arr = self.scope.find_var(fetch_var_name).get_lod_tensor_array() + + if return_numpy: + return executor.as_numpy(arr) + + return [arr[i] for i in range(len(arr))] + + @property + def device_count(self): + return len(self._act_places) From 6e66fadb951fe02218ab2be2916bc12c4b966e00 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 21 Nov 2018 15:24:23 +0800 Subject: [PATCH 077/684] clean up the pre-definitions on windows --- CMakeLists.txt | 2 ++ cmake/operators.cmake | 3 +-- paddle/fluid/framework/eigen.h | 5 ----- paddle/fluid/framework/op_registry.h | 5 ----- paddle/fluid/framework/operator.cc | 2 -- paddle/fluid/framework/operator.h | 2 -- paddle/fluid/inference/api/api_impl.h | 6 ------ paddle/fluid/platform/cpu_helper.cc | 1 + paddle/fluid/platform/dynload/cudnn.h | 2 -- paddle/fluid/platform/enforce.h | 6 ------ paddle/fluid/platform/init.h | 3 --- paddle/fluid/platform/port.h | 4 ++++ paddle/fluid/platform/profiler.cc | 4 ++-- paddle/fluid/pybind/pybind.cc | 7 ------- 14 files changed, 10 insertions(+), 42 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 27f2d81dd5d3d..5325e3034c854 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -137,6 +137,8 @@ if (WIN32) "Disable DSO when compiling for Windows" FORCE) set(WITH_MKL OFF CACHE STRING "Disable MKL when compiling for Windows" FORCE) + set(WITH_DISTRIBUTE OFF CACHE STRING + "Disable DISTRIBUTE when compiling for Windows" FORCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 0bc4dbe6cfa32..17107e0698757 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -84,8 +84,7 @@ function(op_library TARGET) endif() if (WIN32) # remove windows unsupported op, because windows has no nccl, no warpctc such ops. - foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op" - "channel_send_op" "channel_create_op" "channel_close_op" "channel_recv_op") + foreach(windows_unsupport_op "nccl_op" "gen_nccl_id_op" "warpctc_op") if ("${TARGET}" STREQUAL "${windows_unsupport_op}") return() endif() diff --git a/paddle/fluid/framework/eigen.h b/paddle/fluid/framework/eigen.h index 2b265a773fe96..5bafa4345f42a 100644 --- a/paddle/fluid/framework/eigen.h +++ b/paddle/fluid/framework/eigen.h @@ -13,11 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -// logging.h and windows.h conflict -#define GLOG_NO_ABBREVIATED_SEVERITIES -// solve static linking error in windows -// https://github.com/google/glog/issues/301 -#define GOOGLE_GLOG_DLL_DECL #include "paddle/fluid/framework/tensor.h" #include "unsupported/Eigen/CXX11/Tensor" diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index ef2eb334a4e7f..0e6e74293c30d 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -23,11 +23,6 @@ limitations under the License. */ #include #include -#if defined(_WIN32) -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#define GOOGLE_GLOG_DLL_DECL -#endif - #include "glog/logging.h" // For VLOG() #include "paddle/fluid/framework/attribute.h" #include "paddle/fluid/framework/details/op_registry.h" diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 2b35943d09251..1ec170b6f65f9 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -11,8 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL #include #include diff --git a/paddle/fluid/framework/operator.h b/paddle/fluid/framework/operator.h index 6918e030bf859..ef838332177c0 100644 --- a/paddle/fluid/framework/operator.h +++ b/paddle/fluid/framework/operator.h @@ -20,8 +20,6 @@ limitations under the License. */ #include #include #include -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL #include "glog/logging.h" // For VLOG #include "paddle/fluid/framework/attribute.h" diff --git a/paddle/fluid/inference/api/api_impl.h b/paddle/fluid/inference/api/api_impl.h index 4e4ab47ca9c5e..9dfa48d501f17 100644 --- a/paddle/fluid/inference/api/api_impl.h +++ b/paddle/fluid/inference/api/api_impl.h @@ -14,12 +14,6 @@ limitations under the License. */ #pragma once -// logging.h and windows.h conflict -#define GLOG_NO_ABBREVIATED_SEVERITIES -// solve static linking error in windows -// https://github.com/google/glog/issues/301 -#define GOOGLE_GLOG_DLL_DECL - #include #include #include diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index bd6aedb3ac787..f2d691b2931f5 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -30,6 +30,7 @@ namespace platform { void SetNumThreads(int num_threads) { #ifdef PADDLE_USE_OPENBLAS // windows has no support for openblas multi-thread +// please refer to: https://github.com/PaddlePaddle/Paddle/issues/7234 #ifdef _WIN32 if (num_threads > 1) { num_threads = 1; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 065b940b9ca6f..1a83ac7780a01 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -13,8 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL #include #include diff --git a/paddle/fluid/platform/enforce.h b/paddle/fluid/platform/enforce.h index 31309738a52f0..a85972bdb72ca 100644 --- a/paddle/fluid/platform/enforce.h +++ b/paddle/fluid/platform/enforce.h @@ -18,12 +18,6 @@ limitations under the License. */ #include // for __cxa_demangle #endif // __GNUC__ -#if defined(_WIN32) -#define NOMINMAX // msvc max/min macro conflict with std::min/max -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#define GOOGLE_GLOG_DLL_DECL -#endif - #ifdef PADDLE_WITH_CUDA #include #include diff --git a/paddle/fluid/platform/init.h b/paddle/fluid/platform/init.h index 992ca5e6f6a96..0e30594672927 100644 --- a/paddle/fluid/platform/init.h +++ b/paddle/fluid/platform/init.h @@ -16,9 +16,6 @@ limitations under the License. */ #include #include -#define GLOG_NO_ABBREVIATED_SEVERITIES -#define GOOGLE_GLOG_DLL_DECL - #include "gflags/gflags.h" #include "glog/logging.h" diff --git a/paddle/fluid/platform/port.h b/paddle/fluid/platform/port.h index 8be77fe464575..ad070171df32f 100644 --- a/paddle/fluid/platform/port.h +++ b/paddle/fluid/platform/port.h @@ -31,6 +31,10 @@ #include #include // std::accumulate #else +#define NOMINMAX // msvc max/min macro conflict with std::min/max +// solve static linking error in windows +// https://github.com/google/glog/issues/301 +#define GOOGLE_GLOG_DLL_DECL #include // _popen, _pclose #include #include diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index 03c102e24a189..998242fb4a091 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -438,10 +438,10 @@ void ParseEvents(const std::vector>& events, event_items[index].total_time += event_time; // min time event_items[index].min_time = - (std::min)(event_time, event_items[index].min_time); + std::min(event_time, event_items[index].min_time); // max time event_items[index].max_time = - (std::max)(event_time, event_items[index].max_time); + std::max(event_time, event_items[index].max_time); } // remove the push marker from the list diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 102fa02adf3fb..6cc3a1739a5cc 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -21,13 +21,6 @@ limitations under the License. */ #include #include -#if defined(_WIN32) -#define NOMINMAX -#define GLOG_NO_ABBREVIATED_SEVERITIES // msvc conflict logging with windows.h -#define GOOGLE_GLOG_DLL_DECL -#include -#endif - #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/feed_fetch_method.h" #include "paddle/fluid/framework/framework.pb.h" From c19ff1f3d28b38867de8b98d63f19b8c759c4535 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 21 Nov 2018 15:37:36 +0800 Subject: [PATCH 078/684] Add python3.6 and python3.7 support in padde build scripts test=develop --- paddle/scripts/paddle_build.sh | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 32f9bca645d80..569e56e5a9372 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -116,6 +116,18 @@ function cmake_gen() { export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.5.1/bin/python3 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.5.1/include/python3.5m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.5.1/lib/libpython3.so" + elif [ "$1" == "cp36-cp36m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.6.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.6.0/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.6.0/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.6.0/include/python3.6m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.6.0/lib/libpython3.so" + elif [ "$1" == "cp37-cp37m" ]; then + export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} + export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH} + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m + -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" fi fi fi @@ -419,7 +431,7 @@ function assert_api_not_changed() { source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec - if [ "$1" == "cp35-cp35m" ]; then + if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec From 255cc1eb6540785c8cb786a6c9f291fa53010ca0 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Wed, 21 Nov 2018 15:43:17 +0800 Subject: [PATCH 079/684] Add support for Mac build test=develop --- paddle/scripts/paddle_build.sh | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 569e56e5a9372..9632eaec005df 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -94,6 +94,30 @@ function cmake_gen() { else exit 1 fi + elif [ "$1" == "cp36-cp36m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.6" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.6/include/python3.6m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.6/lib/libpython3.6m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi + elif [ "$1" == "cp37-cp37m" ]; then + if [ -d "/Library/Frameworks/Python.framework/Versions/3.7" ]; then + export LD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export DYLD_LIBRARY_PATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/ + export PATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/:${PATH} + PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/bin/python3 + -DPYTHON_INCLUDE_DIR:PATH=/Library/Frameworks/Python.framework/Versions/3.7/include/python3.7m/ + -DPYTHON_LIBRARY:FILEPATH=/Library/Frameworks/Python.framework/Versions/3.7/lib/libpython3.7m.dylib" + WITH_FLUID_ONLY=${WITH_FLUID_ONLY:-ON} + else + exit 1 + fi fi else if [ "$1" != "" ]; then From 014e50c284eb9698cc02d0457f8eb3b566687e70 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 21 Nov 2018 07:53:15 +0000 Subject: [PATCH 080/684] test=develop --- paddle/fluid/framework/mixed_vector.h | 6 + .../operators/hierarchical_sigmoid_op.cc | 68 ++++-- .../fluid/operators/hierarchical_sigmoid_op.h | 92 +++++--- .../fluid/operators/math/matrix_bit_code.cc | 85 ++++---- paddle/fluid/operators/math/matrix_bit_code.h | 53 +++-- python/paddle/fluid/layers/nn.py | 10 +- .../fluid/tests/unittests/test_hsigmoid_op.py | 206 ++++++++++++------ 7 files changed, 349 insertions(+), 171 deletions(-) diff --git a/paddle/fluid/framework/mixed_vector.h b/paddle/fluid/framework/mixed_vector.h index e1aac6dc5a92f..cd06da9d05c73 100644 --- a/paddle/fluid/framework/mixed_vector.h +++ b/paddle/fluid/framework/mixed_vector.h @@ -533,6 +533,12 @@ class CPUVector : public std::vector> { return os; } + size_t size() const noexcept { + size_t size = + static_cast(std::vector>::size()); + return size; + } + T &operator[](size_t id) { return this->at(id); } const T &operator[](size_t id) const { return this->at(id); } diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 8d4e0556dd6a7..b2f4616441582 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -70,13 +70,14 @@ class HierarchicalSigmoidOp : public framework::OperatorWithKernel { const int64_t batch_size = ctx->GetInputDim("X")[0]; std::vector output_shape({batch_size, 1}); ctx->SetOutputDim("Out", framework::make_ddim(output_shape)); + ctx->ShareLoD("X", /*->*/ "Out"); } protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); } }; @@ -86,32 +87,34 @@ class HierarchicalSigmoidOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { AddInput("X", - "(Tensor, required) The input tensor with shape [N, D], " + "(LoDTensor, required) The input tensor with shape [N, D], " "where N is the size of mini-batch, and D is the feature size."); AddInput("W", - "(Tensor, required), The parameters of hierarchical " + "(LoDTensor, required), The parameters of hierarchical " "sigmoid operator, each of them is a 2-D tensor, the shape is" "[K, D]. Which K is the num of non-leaf node in Path Tree"); AddInput("Label", - "(Tensor, required), The labels of training data. It's a" + "(LoDTensor, required), The labels of training data. It's a" "tensor with shape [N, 1]."); AddInput("PTable", - "(Tensor, optional), The Path Table from root to current word" + "(LoDTensor, optional), The Path Table from root to current word" "it should have shape like [N, L], L is the length of the Path") .AsDispensable(); - AddInput("PCode", - "(Tensor, optional), The Code on each Node of the Path from root " - "to current word" - "it should have shape like [N, L], L is the length of the Path") + AddInput( + "PCode", + "(LoDTensor, optional), The Code on each Node of the Path from root " + "to current word" + "it should have shape like [N, L], L is the length of the Path") .AsDispensable(); AddInput("Bias", - "(Tensor, optional), The bias is a tensor with shape" + "(LoDTensor, optional), The bias is a tensor with shape" "[1, num_classes - 1]."); - AddOutput("Out", - "(Tensor, required) The output of hierarchical sigmoid operator." - "The shape is [N, 1]."); + AddOutput( + "Out", + "(LoDTensor, required) The output of hierarchical sigmoid operator." + "The shape is [N, 1]."); AddOutput("PreOut", - "(Tensor, required) A intermedia 2-D tensor with shape " + "(LoDTensor, required) A intermedia 2-D tensor with shape " "[batch_size, code_length], where code_length represents the " "maximum path length from root to leaf nodes.") .AsIntermediate(); @@ -124,6 +127,10 @@ belonging to the right branch. This idea is from "F. Morin, Y. Bengio (AISTATS 05): Hierarchical Probabilistic Neural Network Language Model." )DOC"); + AddAttr("is_sparse", + "(boolean, default false) " + "Sparse update.") + .SetDefault(false); } }; @@ -133,6 +140,8 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override { PADDLE_ENFORCE(ctx->HasInput("W"), "Input(W) should not be null."); PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should not be null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")), + "Input(Out@Grad) should not be null"); PADDLE_ENFORCE(ctx->HasInput("PreOut"), "Input(Preout) should not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")), @@ -142,7 +151,9 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->GetInputDim("Bias")); } - ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + if (!ctx->Attrs().Get("is_sparse")) { + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + } ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); } @@ -150,11 +161,33 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( - framework::ToDataType(ctx.Input("X")->type()), + framework::ToDataType(ctx.Input("X")->type()), ctx.GetPlace()); } }; +class HierarchicalSigmoidGradOpGradVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc& op_desc, + framework::BlockDesc* block) const override { + auto out_var_name = op_desc.Output(framework::GradVarName("W")).front(); + auto attr = op_desc.GetAttr("is_sparse"); + bool is_sparse = boost::get(attr); + if (is_sparse) { + VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") + << " is set to SelectedRows"; + block->Var(out_var_name) + ->SetType(framework::proto::VarType::SELECTED_ROWS); + } else { + VLOG(3) << "hierarchical_sigmoid_grad op " << framework::GradVarName("W") + << " is set to LoDTensor"; + block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); + } + block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType()); + } +}; + } // namespace operators } // namespace paddle @@ -162,7 +195,8 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(hierarchical_sigmoid, ops::HierarchicalSigmoidOp, ops::HierarchicalSigmoidOpMaker, paddle::framework::DefaultGradOpDescMaker); -REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp); +REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp, + ops::HierarchicalSigmoidGradOpGradVarTypeInference); REGISTER_OP_CPU_KERNEL( hierarchical_sigmoid, ops::HierarchicalSigmoidOpKernel, diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index df4f5f561a270..3e2fbafa2669e 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -14,9 +14,10 @@ limitations under the License. */ #pragma once #include +#include #include +#include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/operators/clip_op.h" #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_bit_code.h" @@ -29,18 +30,37 @@ template ; using platform::Transform; +std::vector cal_rows(const framework::LoDTensor* path) { + std::set tmp; + std::vector rows; + rows.clear(); + for (size_t i = 0; i < static_cast(path->dims()[0]); i++) { + for (size_t j = 0; j < static_cast(path->dims()[1]); j++) { + int64_t temp = + path->data()[i * static_cast(path->dims()[1]) + j]; + if (temp >= 0) { + tmp.insert(temp); + } + } + } + for (std::set::iterator it = tmp.begin(); it != tmp.end(); ++it) { + rows.push_back(*it); + } + return rows; +} + template class HierarchicalSigmoidOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* w = ctx.Input("W"); - auto* path = ctx.Input("PTable"); - auto* code = ctx.Input("PCode"); - auto* label = ctx.Input("Label"); - auto* bias = ctx.Input("Bias"); - auto* out = ctx.Output("Out"); - auto* pre_out = ctx.Output("PreOut"); + auto* in = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* path = ctx.Input("PTable"); + auto* code = ctx.Input("PCode"); + auto* label = ctx.Input("Label"); + auto* bias = ctx.Input("Bias"); + auto* out = ctx.Output("Out"); + auto* pre_out = ctx.Output("PreOut"); size_t num_classes = static_cast(ctx.Attr("num_classes")); bool is_custom = false; if (path) { @@ -51,7 +71,7 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { int64_t code_length = path ? path->dims()[1] : math::FindLastSet(num_classes - 1); int64_t batch_size = in->dims()[0]; - framework::Tensor sum; + framework::LoDTensor sum; auto& dev_ctx = ctx.template device_context(); auto* pre_out_data = pre_out->mutable_data( framework::make_ddim({batch_size, code_length}), ctx.GetPlace()); @@ -102,27 +122,26 @@ template class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in = ctx.Input("X"); - auto* w = ctx.Input("W"); - auto* path = ctx.Input("PTable"); - auto* code = ctx.Input("PCode"); - auto* in_grad = ctx.Output(framework::GradVarName("X")); - auto* w_grad = ctx.Output(framework::GradVarName("W")); + auto* in = ctx.Input("X"); + auto* w = ctx.Input("W"); + auto* path = ctx.Input("PTable"); + auto* code = ctx.Input("PCode"); + auto* in_grad = + ctx.Output(framework::GradVarName("X")); + bool is_sparse = ctx.Attr("is_sparse"); + auto& dev_ctx = ctx.template device_context(); + math::SetConstant zero; auto* bias_grad = - ctx.Output(framework::GradVarName("Bias")); - auto* label = ctx.Input("Label"); - auto* pre_out = ctx.Input("PreOut"); + ctx.Output(framework::GradVarName("Bias")); + auto* label = ctx.Input("Label"); + auto* pre_out = ctx.Input("PreOut"); auto* out_grad = - ctx.Input(framework::GradVarName("Out")); - framework::Tensor pre_out_grad; + ctx.Input(framework::GradVarName("Out")); + framework::LoDTensor pre_out_grad; pre_out_grad.mutable_data(pre_out->dims(), ctx.GetPlace()); in_grad->mutable_data(ctx.GetPlace()); - w_grad->mutable_data(ctx.GetPlace()); - auto& dev_ctx = ctx.template device_context(); - math::SetConstant zero; zero(dev_ctx, in_grad, static_cast(0.0)); - zero(dev_ctx, w_grad, static_cast(0.0)); size_t num_classes = static_cast(ctx.Attr("num_classes")); @@ -162,7 +181,28 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { zero(dev_ctx, bias_grad, static_cast(0.0)); bit_code->AddGrad(pre_out_grad, bias_grad); } - bit_code->MulGradWeight(pre_out_grad, w_grad, *in); + if (!is_sparse) { + auto* w_grad = + ctx.Output(framework::GradVarName("W")); + w_grad->mutable_data(ctx.GetPlace()); + zero(dev_ctx, w_grad, static_cast(0.0)); + bit_code->MulGradWeight(pre_out_grad, w_grad, *in); + } else { + framework::Vector real_rows = cal_rows(path); + auto* w_grad = + ctx.Output(framework::GradVarName("W")); + + w_grad->set_rows(real_rows); + // build ids -> rows index map + w_grad->SyncIndex(); + auto* w_grad_value = w_grad->mutable_value(); + framework::DDim temp_dim(w->dims()); + set(temp_dim, 0, real_rows.size()); + + w_grad_value->mutable_data(temp_dim, ctx.GetPlace()); + zero(dev_ctx, w_grad_value, static_cast(0.0)); + bit_code->MulGradWeight(pre_out_grad, w_grad, *in); + } bit_code->MulGradError(pre_out_grad, *w, in_grad); } }; diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 090c0cca36607..8baffe1ba1e5f 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -19,8 +19,8 @@ namespace operators { namespace math { template -void MatrixBitCodeFunctor::Add(framework::Tensor* tmat, - const framework::Tensor& vec) { +void MatrixBitCodeFunctor::Add(framework::LoDTensor* tmat, + const framework::LoDTensor& vec) { size_t batch_size = tmat->dims()[0]; size_t width = tmat->dims()[1]; for (size_t i = 0; i < batch_size; ++i) { @@ -34,8 +34,8 @@ void MatrixBitCodeFunctor::Add(framework::Tensor* tmat, } template -void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, - framework::Tensor* vec) { +void MatrixBitCodeFunctor::AddGrad(const framework::LoDTensor& tmat, + framework::LoDTensor* vec) { size_t batch_size = tmat.dims()[0]; size_t width = tmat.dims()[1]; for (size_t i = 0; i < batch_size; ++i) { @@ -49,8 +49,8 @@ void MatrixBitCodeFunctor::AddGrad(const framework::Tensor& tmat, } template -void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, - framework::Tensor* sum, T scale_sum) { +void MatrixBitCodeFunctor::Sum(const framework::LoDTensor& tmat, + framework::LoDTensor* sum, T scale_sum) { size_t num_samples = tmat.dims()[0]; size_t o_width = tmat.dims()[1]; for (size_t i = 0; i < num_samples; ++i) { @@ -69,9 +69,9 @@ void MatrixBitCodeFunctor::Sum(const framework::Tensor& tmat, } template -void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, - const framework::Tensor& weight, - const framework::Tensor& input) { +void MatrixBitCodeFunctor::Mul(framework::LoDTensor* tmat, + const framework::LoDTensor& weight, + const framework::LoDTensor& input) { size_t num_samples = tmat->dims()[0]; size_t tmat_width = tmat->dims()[1]; size_t input_width = input.dims()[1]; @@ -95,9 +95,9 @@ void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, } template -void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, - framework::Tensor* weight, - const framework::Tensor& input) { +void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, + framework::LoDTensor* weight, + const framework::LoDTensor& input) { size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -119,37 +119,38 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, } } -// template -// void MatrixBitCodeFunctor::MulGradSparseWeight(const framework::Tensor& -// tmat, -// framework::SelectedRows* weight, -// const framework::Tensor& input) { -// size_t num_samples = tmat.dims()[0]; -// size_t input_width = input.dims()[1]; -// size_t tmat_width = tmat.dims()[1]; -// size_t weight_width = weight->dims()[1]; -// auto tmat_value = tmat.data(); -// auto weight_value = weight->data(); -// auto input_value = input.data(); -// for (size_t i = 0; i < num_samples; ++i) { -// auto code = code_table->get_code(i); -// int code_length = code->get_length(); -// for (int j = 0; j < code_length; ++j) { -// // size_t index = code->calc_index(j); - -// for (size_t k = 0; k < input_width; ++k) { -// weight_value[j * weight_width + k] += -// tmat_value[i * tmat_width + j] * input_value[input_width * i + -// k]; -// } -// } -// } -// } +template +void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, + framework::SelectedRows* weight, + const framework::LoDTensor& input) { + size_t num_samples = tmat.dims()[0]; + size_t input_width = input.dims()[1]; + size_t tmat_width = tmat.dims()[1]; + size_t weight_width = weight->value().dims()[1]; + auto tmat_value = tmat.data(); + auto weight_value = weight->mutable_value()->data(); + auto input_value = input.data(); + for (size_t i = 0; i < num_samples; ++i) { + auto code = code_table->get_code(i); + int code_length = code->get_length(); + for (int j = 0; j < code_length; ++j) { + size_t index = code->calc_index(j); + + for (size_t k = 0; k < input_width; ++k) { + int64_t row_index = + weight->AutoGrownIndex(static_cast(index), false); + + weight_value[row_index * weight_width + k] += + tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; + } + } + } +} template -void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, - const framework::Tensor& weight, - framework::Tensor* input) { +void MatrixBitCodeFunctor::MulGradError(const framework::LoDTensor& tmat, + const framework::LoDTensor& weight, + framework::LoDTensor* input) { size_t num_samples = tmat.dims()[0]; size_t tmat_width = tmat.dims()[1]; size_t input_width = input->dims()[1]; @@ -174,7 +175,7 @@ void MatrixBitCodeFunctor::MulGradError(const framework::Tensor& tmat, } template -void MatrixBitCodeFunctor::Sub(framework::Tensor* tmat) { +void MatrixBitCodeFunctor::Sub(framework::LoDTensor* tmat) { size_t num_samples = tmat->dims()[0]; size_t o_width = tmat->dims()[1]; for (size_t i = 0; i < num_samples; ++i) { diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index 39c3b1520b41e..e4fe43ce9866c 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -14,6 +14,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/platform/device_context.h" @@ -134,8 +136,9 @@ class SimpleCode : public Code { template class CustomCode : public Code { public: - CustomCode(const framework::Tensor* ptable, const framework::Tensor* pcode, - const int64_t* ids, const int index) + CustomCode(const framework::LoDTensor* ptable, + const framework::LoDTensor* pcode, const int64_t* ids, + const int index) : ptable_(ptable), pcode_(pcode), ids_(ids), index_(index) {} /** * Here the id of root shoud be 1 rather than 0, thus the encoding of class c @@ -169,8 +172,8 @@ class CustomCode : public Code { } private: - const framework::Tensor* ptable_; - const framework::Tensor* pcode_; + const framework::LoDTensor* ptable_; + const framework::LoDTensor* pcode_; const int64_t* ids_; const int index_; }; @@ -194,8 +197,9 @@ class SimpleCodeTable : public CodeTable { template class CustomCodeTable : public CodeTable { public: - explicit CustomCodeTable(const framework::Tensor* ptable, - const framework::Tensor* pcode, const int64_t* ids) + explicit CustomCodeTable(const framework::LoDTensor* ptable, + const framework::LoDTensor* pcode, + const int64_t* ids) : ptable_(ptable), pcode_(pcode), ids_(ids) {} std::unique_ptr get_code(int64_t code) const { @@ -209,8 +213,8 @@ class CustomCodeTable : public CodeTable { } private: - const framework::Tensor* ptable_; - const framework::Tensor* pcode_; + const framework::LoDTensor* ptable_; + const framework::LoDTensor* pcode_; const int64_t* ids_; }; @@ -222,8 +226,8 @@ class MatrixBitCodeFunctor { ids_(ids), code_table(new SimpleCodeTable(num_classes, ids)) {} - explicit MatrixBitCodeFunctor(const framework::Tensor* ptable, - const framework::Tensor* pcode, + explicit MatrixBitCodeFunctor(const framework::LoDTensor* ptable, + const framework::LoDTensor* pcode, const int64_t* ids) : num_classes_(static_cast(ptable->dims()[1])), ids_(ids), @@ -231,38 +235,47 @@ class MatrixBitCodeFunctor { /* For j < code_length tmat(i, j) += vec(0, index(i, j)) */ - void Add(framework::Tensor* tmat, const framework::Tensor& vec); + void Add(framework::LoDTensor* tmat, const framework::LoDTensor& vec); /* For j < code_length vec(0, index(i, j)) += tmat(i, j) */ - void AddGrad(const framework::Tensor& tmat, framework::Tensor* vec); + void AddGrad(const framework::LoDTensor& tmat, framework::LoDTensor* vec); /* For j < code_length sum(i, 0) = \sum_j bit(i, j) * tmat(i, j) */ - void Sum(const framework::Tensor& tmat, framework::Tensor* sum, T scale_sum); + void Sum(const framework::LoDTensor& tmat, framework::LoDTensor* sum, + T scale_sum); /* For j < code_length tmat(i, j) -= bit(i, j) */ - void Sub(framework::Tensor* tmat); + void Sub(framework::LoDTensor* tmat); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) */ - void Mul(framework::Tensor* tmat, const framework::Tensor& weight, - const framework::Tensor& input); + void Mul(framework::LoDTensor* tmat, const framework::LoDTensor& weight, + const framework::LoDTensor& input); /* For index(i, j) >= 0: weight.row(index(i, j)) += tmat(i, j) * input.row(i) */ - void MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, - const framework::Tensor& input); + void MulGradWeight(const framework::LoDTensor& tmat, + framework::LoDTensor* weight, + const framework::LoDTensor& input); + /* For SelectedRows Weight, For index(i, j) >= 0: + weight.row(index(i, j)) += tmat(i, j) * input.row(i) + */ + void MulGradWeight(const framework::LoDTensor& tmat, + framework::SelectedRows* weight, + const framework::LoDTensor& input); /* For j < code_length input.row(i) += tmat(i, j) * weight.row(index(i, j)) */ - void MulGradError(const framework::Tensor& tmat, - const framework::Tensor& weight, framework::Tensor* input); + void MulGradError(const framework::LoDTensor& tmat, + const framework::LoDTensor& weight, + framework::LoDTensor* input); size_t num_classes_; const int64_t* ids_; diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 4472f20409f8b..7c92bdd882412 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -4355,7 +4355,8 @@ def hsigmoid(input, param_attr=None, bias_attr=None, name=None, - is_costum=False): + is_costum=False, + is_sparse=False): """ The hierarchical sigmoid operator is used to accelerate the training process of language model. This operator organizes the classes into a @@ -4394,9 +4395,11 @@ def hsigmoid(input, is not set, the bias is initialized zero. Default: None. name (str|None): A name for this layer(optional). If set None, the layer will be named automatically. Default: None. + is_costum: (bool|False)using user defined binary tree instead of default complete binary tree + is_sparse: (bool|False)using sparse update instead of dense update Returns: - Out: (Tensor) The cost of hierarchical sigmoid operator. the shape is [N, 1] + Out: (LodTensor) The cost of hierarchical sigmoid operator. the shape is [N, 1] Examples: @@ -4466,7 +4469,8 @@ def hsigmoid(input, inputs=inputs, outputs={"Out": out, "PreOut": pre_out}, - attrs={"num_classes": num_classes}) + attrs={"num_classes": num_classes, + "is_sparse": is_sparse}) return out diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 6152b96912da0..50dfaee76fda3 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -16,10 +16,9 @@ import unittest import numpy as np +import paddle.fluid.core as core +import paddle.fluid as fluid import math -# import paddle.fluid as fluid -# import paddle.fluid.core as core -# from op_builder import OpBuilder from op_test import OpTest np.random.seed(100) @@ -141,67 +140,148 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes): return pre_output, out -class TestHSigmoidOp(OpTest): - def setUp(self): - self.op_type = "hierarchical_sigmoid" - num_classes = 6 - feature_size = 8 - batch_size = 4 - x = np.random.random((batch_size, feature_size)).astype("float32") * 2 - w = np.random.random( - (num_classes - 1, feature_size)).astype("float32") * 2 - label = np.random.randint(0, num_classes, (batch_size, 1)) - bias = np.random.random((1, num_classes - 1)).astype("float32") - self.attrs = {'num_classes': num_classes} - self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} - pre_output, out = hsigmoid(x, w, label, bias, num_classes) - self.outputs = {'PreOut': pre_output, 'Out': out} - - def test_check_output(self): - self.check_output() - - def test_check_grad(self): - self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) - - -class TestHSigmoidOpWithCostumTree(OpTest): - def setUp(self): - self.op_type = "hierarchical_sigmoid" - num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample - feature_size = 8 - batch_size = 4 - x = np.random.random((batch_size, feature_size)).astype("float32") * 2 - w = np.random.random( - (num_classes - 1, feature_size)).astype("float32") * 2 - label = np.array([0, 1, 4, 5]) - ptable = np.array( - [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), - (0, 2, -1, -1, - -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) - pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( - 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store - bias = np.random.random((1, num_classes - 1)).astype("float32") - self.attrs = {'num_classes': num_classes} - self.inputs = { - 'X': x, - 'W': w, - 'PTable': ptable, - 'PCode': pcode, - 'Label': label, - 'Bias': bias - } - pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label, - bias, num_classes) - self.outputs = {'PreOut': pre_output, 'Out': out} - - def test_check_output(self): - print("checking output in CostumTree") - self.check_output() - - def test_check_grad(self): - print("checking outputGrad in CostumTree") - self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) +# class TestHSigmoidOp(OpTest): +# def setUp(self): +# self.op_type = "hierarchical_sigmoid" +# num_classes = 6 +# feature_size = 8 +# batch_size = 4 +# x = np.random.random((batch_size, feature_size)).astype("float32") * 2 +# w = np.random.random( +# (num_classes - 1, feature_size)).astype("float32") * 2 +# label = np.random.randint(0, num_classes, (batch_size, 1)) +# bias = np.random.random((1, num_classes - 1)).astype("float32") +# self.attrs = {'num_classes': num_classes, 'is_sparse': False} +# self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} +# pre_output, out = hsigmoid(x, w, label, bias, num_classes) +# self.outputs = {'PreOut': pre_output, 'Out': out} +# def test_check_output(self): +# self.check_output() + +# def test_check_grad(self): +# self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) + +# class TestHSigmoidOpSparse(OpTest): +# def setUp(self): +# self.op_type = "hierarchical_sigmoid" +# num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample +# feature_size = 8 +# batch_size = 4 +# x = np.random.random((batch_size, feature_size)).astype("float32") * 2 +# w = np.random.random( +# (num_classes - 1, feature_size)).astype("float32") * 2 +# label = np.array([0, 1, 4, 5]) +# ptable = np.array( +# [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), +# (0, 2, -1, -1, +# -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) +# pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( +# 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store +# bias = np.random.random((1, num_classes - 1)).astype("float32") +# self.attrs = {'num_classes': num_classes, 'is_sparse': True} +# self.inputs = { +# 'X': x, +# 'W': w, +# 'PTable': ptable, +# 'PCode': pcode, +# 'Label': label, +# 'Bias': bias +# } +# pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label, +# bias, num_classes) +# self.outputs = {'PreOut': pre_output, 'Out': out} + +# def test_check_output(self): +# print("checking output in CostumTree") +# self.check_output() + + +class TestHSigmoidOpWithSparseGrad(): + def hs_net_conf(self): + emb = fluid.layers.data(name="x", shape=[3], dtype='int64') + ptable = fluid.layers.data(name='ptable', shape=[3], dtype='int64') + pcode = fluid.layers.data(name='pcode', shape=[3], dtype='int64') + label = fluid.layers.data(name='label', shape=[1], dtype='int64') + data_list = [emb, ptable, pcode, label] + cost = fluid.layers.hsigmoid( + input=emb, + label=predict_word, + non_leaf_num=4, + ptable=ptable, + pcode=pcode, + is_costum=True, + is_sparse=True) + + avg_cost = fluid.layers.reduce_mean(cost) + + return avg_cost, data_list + + def test_training_test(self): + print("im here") + w = np.arange(12).reshape(4, 3) + x = np.ones((2, 3)) + ptable = np.array([(1, 2, -1), (1, 2, -1)]) + pcode = np.array([(1, 0, -1), (0, 0, -1)]) + label = np.array([(1, 4)]) + + loss, data_list = hs_net_conf() + optimizer = fluid.optimizer.SGD(learning_rate=1e-3) + optimizer.minimize(loss) + + main_program = fluid.default_main_program() + + place = fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=data_list, place=place) + data_name_list = [var.name for var in data_list] + exe = fluid.Executor(place) + exe.run(fluid.default_startup_program()) + for pass_id in range(args.num_passes): + for i in range(10): + data = [w, x[i % 2], ptable[i % 2], pcode[i % 2], label[i % 2]] + loss_val = exe.run(main_program, + feed=feeder.feed(data), + fetch_list=[loss]) + print("loss is: {loss}".format(loss=loss)) + + +# class TestHSigmoidOpWithCostumTree(OpTest): +# def setUp(self): +# self.op_type = "hierarchical_sigmoid" +# num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample +# feature_size = 8 +# batch_size = 4 +# x = np.random.random((batch_size, feature_size)).astype("float32") * 2 +# w = np.random.random( +# (num_classes - 1, feature_size)).astype("float32") * 2 +# label = np.array([0, 1, 4, 5]) +# ptable = np.array( +# [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), +# (0, 2, -1, -1, +# -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) +# pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( +# 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store +# bias = np.random.random((1, num_classes - 1)).astype("float32") +# self.attrs = {'num_classes': num_classes, 'is_sparse': False} +# self.inputs = { +# 'X': x, +# 'W': w, +# 'PTable': ptable, +# 'PCode': pcode, +# 'Label': label, +# 'Bias': bias +# } +# pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label, +# bias, num_classes) +# self.outputs = {'PreOut': pre_output, 'Out': out} + +# def test_check_output(self): +# print("checking output in CostumTree") +# self.check_output() + +# def test_check_grad(self): +# print("checking outputGrad in CostumTree") +# self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) if __name__ == '__main__': unittest.main() From f913860873781ff4ccc9ee2eba73365d530fae22 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 21 Nov 2018 08:47:12 +0000 Subject: [PATCH 081/684] jitkernel lstm refer support peephole test=develop --- .../fluid/operators/fused/fusion_lstm_op.cc | 73 +++-- paddle/fluid/operators/math/jit_code.cc | 6 +- paddle/fluid/operators/math/jit_code.h | 42 ++- paddle/fluid/operators/math/jit_kernel.h | 15 +- paddle/fluid/operators/math/jit_kernel_impl.h | 14 +- .../fluid/operators/math/jit_kernel_macro.h | 8 +- .../fluid/operators/math/jit_kernel_refer.h | 35 ++- paddle/fluid/operators/math/jit_kernel_rnn.cc | 288 +++++++----------- .../fluid/operators/math/jit_kernel_test.cc | 32 +- 9 files changed, 250 insertions(+), 263 deletions(-) diff --git a/paddle/fluid/operators/fused/fusion_lstm_op.cc b/paddle/fluid/operators/fused/fusion_lstm_op.cc index 0959539068eef..8021a896ceaa8 100644 --- a/paddle/fluid/operators/fused/fusion_lstm_op.cc +++ b/paddle/fluid/operators/fused/fusion_lstm_op.cc @@ -236,27 +236,31 @@ class FuisonLSTMKernel : public framework::OpKernel { const int D = wh_dims[0]; \ const int D4 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - /* diagonal weight*/ \ - const T* wp_data = bias->data() + D4; \ - /* for peephole only*/ \ - T* checked_cell_data = nullptr; \ - auto place = ctx.GetPlace(); \ - if (use_peepholes) { \ - /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ - auto* checked_cell = ctx.Output("CheckedCell"); \ - checked_cell_data = checked_cell->mutable_data(place); \ - } \ - const auto& ker = \ - math::jitkernel::KernelPool::Instance() \ - .template Get, const std::string&, \ - const std::string&, const std::string&>( \ - ctx.Attr("gate_activation"), \ - ctx.Attr("candidate_activation"), \ - ctx.Attr("cell_activation"), D, use_peepholes) +#define INIT_OTHER_DEFINES \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + /* diagonal weight*/ \ + const T* wp_data = bias->data() + D4; \ + /* for peephole only*/ \ + T* checked_cell_data = nullptr; \ + auto place = ctx.GetPlace(); \ + if (use_peepholes) { \ + /* w_ic * Ct-1, w_fc * Ct-1 ; w_oc * Ct => ih*/ \ + auto* checked_cell = ctx.Output("CheckedCell"); \ + checked_cell_data = checked_cell->mutable_data(place); \ + } \ + const math::jitkernel::lstm_attr_t attr( \ + D, ctx.Attr("gate_activation"), \ + ctx.Attr("candidate_activation"), \ + ctx.Attr("cell_activation"), use_peepholes); \ + math::jitkernel::lstm_t one_step; \ + one_step.wp = wp_data; \ + one_step.checked = checked_cell_data; \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, \ + const math::jitkernel::lstm_attr_t&>(attr) // Wh GEMM #define GEMM_WH_ADDON(bs, prev, out) \ @@ -299,7 +303,10 @@ class FuisonLSTMKernel : public framework::OpKernel { prev_h_data = h0_data + bid * D; prev_c_data = c0_data + bid * D; } else { - ker->ComputeC1H1(xx_data, c_out_data, h_out_data, wp_data); + one_step.gates = xx_data; + one_step.ct = c_out_data; + one_step.ht = h_out_data; + ker->ComputeC1H1(&one_step, &attr); tstart = 1; // move one step prev_h_data = h_out_data; @@ -310,8 +317,12 @@ class FuisonLSTMKernel : public framework::OpKernel { } for (int step = tstart; step < seq_len; ++step) { GEMM_WH_ADDON(1, prev_h_data, xx_data); - ker->ComputeCtHt(xx_data, prev_c_data, c_out_data, h_out_data, wp_data, - checked_cell_data); + + one_step.gates = xx_data; + one_step.ct_1 = prev_c_data; + one_step.ct = c_out_data; + one_step.ht = h_out_data; + ker->ComputeCtHt(&one_step, &attr); // move one step prev_h_data = h_out_data; prev_c_data = c_out_data; @@ -388,7 +399,11 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_h_out_data = batched_h_out_data; T* cur_c_out_data = batched_c_out_data; for (int i = 0; i < max_bs; ++i) { - ker->ComputeC1H1(cur_in_data, cur_c_out_data, cur_h_out_data, wp_data); + one_step.gates = cur_in_data; + one_step.ct = cur_c_out_data; + one_step.ht = cur_h_out_data; + ker->ComputeC1H1(&one_step, &attr); + cur_in_data += D4; cur_c_out_data += D; cur_h_out_data += D; @@ -413,8 +428,12 @@ class FuisonLSTMKernel : public framework::OpKernel { T* cur_c_out_data = batched_c_out_data; T* cur_h_out_data = batched_h_out_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeCtHt(cur_in_data, cur_prev_c_data, cur_c_out_data, - cur_h_out_data, wp_data, checked_cell_data); + one_step.gates = cur_in_data; + one_step.ct_1 = cur_prev_c_data; + one_step.ct = cur_c_out_data; + one_step.ht = cur_h_out_data; + ker->ComputeCtHt(&one_step, &attr); + // move one batch cur_in_data += D4; cur_prev_c_data += D; diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 418c8433625cb..ccc9206f5cda8 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -233,7 +233,7 @@ void LSTMJitCode::generate() { vmovups(ymm_src, ptr[reg_ptr_gates + offset + num_]); act(ymm_i, ymm_src, act_gate_); vmulps(ymm_c, ymm_c, ymm_i); - if (first_) { + if (!compute_c1h1_) { // f vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * num_]); act(ymm_f, ymm_src, act_gate_); @@ -242,8 +242,8 @@ void LSTMJitCode::generate() { vaddps(ymm_f, ymm_f, ymm_c); } /* H_t = act_cell(C_t) * ogated */ - ymm_t ymm_ct = first_ ? ymm_c : ymm_f; - ymm_t ymm_o = first_ ? ymm_f : ymm_c; + ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f; + ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c; ymm_t ymm_tmp = ymm_i; act(ymm_tmp, ymm_ct, act_cell_); vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * num_]); diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 9782f5414c7de..bf28d444b7712 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -319,6 +319,12 @@ class LSTMJitCode : public VActJitCode { public: const char* name() const override { std::string base = "LSTMJitCode"; + if (use_peephole_) { + base += "_Peephole"; + } + if (compute_c1h1_) { + base += "_C1H1"; + } auto AddTypeStr = [&](operand_type type) { switch (type) { case operand_type::relu: @@ -340,30 +346,42 @@ class LSTMJitCode : public VActJitCode { break; } }; - if (first_) { - base += "_C1H1"; - } AddTypeStr(act_gate_); AddTypeStr(act_cand_); AddTypeStr(act_cell_); return base.c_str(); } - explicit LSTMJitCode(int d, bool first, operand_type act_gate, - operand_type act_cand, operand_type act_cell, + explicit LSTMJitCode(bool compute_c1h1, const lstm_attr_t& attr, size_t code_size = 256 * 1024, void* code_ptr = nullptr) - : VActJitCode(d, act_gate, code_size, code_ptr), - num_(d), - first_(first), - act_gate_(act_gate), - act_cand_(act_cand), - act_cell_(act_cell) {} + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + compute_c1h1_(compute_c1h1) { + auto typeExchange = [](const std::string& type) -> gen::operand_type { + if (type == "sigmoid") { + return operand_type::sigmoid; + } else if (type == "relu") { + return operand_type::relu; + } else if (type == "tanh") { + return operand_type::tanh; + } else if (type == "identity" || type == "") { + return operand_type::identity; + } // else throw error + return operand_type::identity; + }; + num_ = attr.d; + use_peephole_ = attr.use_peephole; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + act_cell_ = typeExchange(attr.act_cell); + } static bool init(int d); void generate() override; protected: int num_; - bool first_; + bool compute_c1h1_; + bool use_peephole_; operand_type act_gate_; operand_type act_cand_; operand_type act_cell_; diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index 36199eddaf5f1..bb5ba5813a7e6 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -122,18 +122,9 @@ class VTanhKernel : public VActKernel {}; template class LSTMKernel : public Kernel { public: - virtual void ComputeCtHt(T *gates, const T *ct_1, T *ct, T *ht, - /* below only used in peephole*/ - const T *wp_data = nullptr, - T *checked = nullptr) const = 0; - - virtual void ComputeC1H1(T *gates, T *ct, T *ht, - /* below only used in peephole*/ - const T *wp_data = nullptr) const = 0; - - // void (*ComputeCtHt)(lstm_t *); - // // compute c1 and h1 without c0 or h0 - // void (*ComputeC1H1)(lstm_t *); + void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *); + // compute c1 and h1 without c0 or h0 + void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h index 337d5ae91413a..2e734ca940895 100644 --- a/paddle/fluid/operators/math/jit_kernel_impl.h +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -33,18 +33,24 @@ typedef struct { const void* ct_1; void* ct; void* ht; - /* below only used in peephole*/ - const void* wp_data{nullptr}; + /* weight_peephole and checked data are only used in peephole*/ + const void* wp{nullptr}; void* checked{nullptr}; } lstm_t; typedef struct lstm_attr_s { + bool use_peephole; int d; std::string act_gate, act_cand, act_cell; lstm_attr_s() = default; lstm_attr_s(int _d, const std::string& _act_gate, - const std::string& _act_cand, const std::string& _act_cell) - : d(_d), act_gate(_act_gate), act_cand(_act_cand), act_cell(_act_cell) {} + const std::string& _act_cand, const std::string& _act_cell, + bool _use_peephole = false) + : use_peephole(_use_peephole), + d(_d), + act_gate(_act_gate), + act_cand(_act_cand), + act_cell(_act_cell) {} } lstm_attr_t; } // namespace jitkernel diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 8acf60cfbfd3d..5a3efd979f803 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -82,10 +82,10 @@ namespace jitkernel { #define REGISTER_JITKERNEL_ARGS(ker_key, ker_class, marco_define_name, \ marco_declare, macro_find_key, macro_impl) \ marco_define_name(ker_key, ker_class); \ - REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, JITKERNEL_DECLARE, \ - JITKERNEL_FIND_KEY, JITKERNEL_IMPL); \ - REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, JITKERNEL_DECLARE, \ - JITKERNEL_FIND_KEY, JITKERNEL_IMPL) + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, float, marco_declare, \ + macro_find_key, macro_impl); \ + REGISTER_JITKERNEL_WITH_DTYPE(ker_class, double, marco_declare, \ + macro_find_key, macro_impl) #define REGISTER_JITKERNEL(ker_key, ker_class) \ REGISTER_JITKERNEL_ARGS(ker_key, ker_class, JITKERNEL_DEFINE_NAME, \ diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index 9c60ebc5873d2..097bb8595612b 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -117,11 +117,13 @@ void (*getActFunc(const std::string& type))(const T*, T*, int) { // NOLINT } template -void LSTMCtHt(lstm_t* step, lstm_attr_t* attr) { +void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr) { T* gates = reinterpret_cast(step->gates); const T* ct_1 = reinterpret_cast(step->ct_1); T* ct = reinterpret_cast(step->ct); T* ht = reinterpret_cast(step->ht); + const T* wp = reinterpret_cast(step->wp); + T* checked = reinterpret_cast(step->checked); auto act_gate = getActFunc(attr->act_gate); auto act_cand = getActFunc(attr->act_cand); auto act_cell = getActFunc(attr->act_cell); @@ -129,23 +131,36 @@ void LSTMCtHt(lstm_t* step, lstm_attr_t* attr) { int d2 = d * 2; int d3 = d * 3; // gates: W_ch, W_ih, W_fh, W_oh - act_gate(gates + d, gates + d, d3); + if (attr->use_peephole) { + VMul(wp, ct_1, checked, d); + VMul(wp + d, ct_1, checked + d, d); + VAdd(checked, gates + d, gates + d, d2); + act_gate(gates + d, gates + d, d2); + } else { + act_gate(gates + d, gates + d, d3); + } - /* C_t = C_t-1 * fgated + cand_gated * igated */ + // C_t = C_t-1 * fgated + cand_gated * igated act_cand(gates, gates, d); VMul(gates, gates + d, gates + d, d); VMul(ct_1, gates + d2, gates + d2, d); VAdd(gates + d, gates + d2, ct, d); - /* H_t = act_cell(C_t) * ogated */ + if (attr->use_peephole) { + // get ogated + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + act_gate(gates + d3, gates + d3, d); + } + // H_t = act_cell(C_t) * ogated act_cell(ct, gates + d2, d); VMul(gates + d2, gates + d3, ht, d); } +// compute c1 and h1 without c0 or h0 template -void LSTMC1H1(lstm_t* step, lstm_attr_t* attr) { +void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { T* gates = reinterpret_cast(step->gates); - const T* ct_1 = reinterpret_cast(step->ct_1); T* ct = reinterpret_cast(step->ct); T* ht = reinterpret_cast(step->ht); auto act_gate = getActFunc(attr->act_gate); @@ -158,10 +173,16 @@ void LSTMC1H1(lstm_t* step, lstm_attr_t* attr) { act_gate(gates + d, gates + d, d); act_cand(gates, gates, d); VMul(gates, gates + d, ct, d); + if (attr->use_peephole) { + // get outgated, put W_oc * C_t on igated + const T* wp = reinterpret_cast(step->wp); + VMul(wp + d2, ct, gates + d, d); + VAdd(gates + d, gates + d3, gates + d3, d); + } /* H_t = act_cell(C_t) * ogated */ act_gate(gates + d3, gates + d3, d); act_cell(ct, gates + d2, d); - Vmul(gates + d2, gates + d3, ht, d); + VMul(gates + d2, gates + d3, ht, d); } } // namespace refer diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index e79b0400ab75d..6b7463aa52b93 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -15,9 +15,14 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" #include #include "paddle/fluid/operators/math/jit_kernel_macro.h" +#include "paddle/fluid/operators/math/jit_kernel_refer.h" #include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/macros.h" +#ifdef PADDLE_WITH_XBYAK +#include "paddle/fluid/operators/math/jit_code.h" +#endif + #ifdef __AVX__ #include #endif @@ -154,211 +159,136 @@ static std::unique_ptr GetAVXAct(const std::string& type) { #endif /* LSTM JitKernel */ -template +template class LSTMKernelImpl : public LSTMKernel { public: - explicit LSTMKernelImpl(const std::string& act_gate, - const std::string& act_cand, - const std::string& act_cell, int d) - : LSTMKernel() { - d_ = d; - d2_ = d * 2; - d3_ = d * 3; - act_gate_d3_ = GetActKernel(act_gate, d3_); - act_gate_d_ = GetActKernel(act_gate, d); - act_cand_d_ = GetActKernel(act_cand, d); - act_cell_d_ = GetActKernel(act_cell, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - vadd_d_ = KernelPool::Instance().template Get>(d); + static inline std::string name(const lstm_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); + this->ComputeCtHt = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096)); + this->ComputeC1H1 = + jitcode1_->getCode(); + return; + } +#endif - void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, - T* checked) const override { - // gates: W_ch, W_ih, W_fh, W_oh - act_gate_d3_->Compute(gates + d_, gates + d_, d3_); - - /* C_t = C_t-1 * fgated + cand_gated * igated */ - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, gates + d_, d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); - - /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } - void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { - /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_, d_); - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, ct, d_); - /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); + this->ComputeCtHt = refer::LSTMCtHt; + this->ComputeC1H1 = refer::LSTMC1H1; } +#ifdef PADDLE_WITH_XBYAK + private: - int d_, d2_, d3_; - std::shared_ptr> act_gate_d3_, act_gate_d_, act_cand_d_, - act_cell_d_; - std::shared_ptr> vmul_d_; - std::shared_ptr> vadd_d_; -#ifdef __AVX__ - std::unique_ptr avx_act_gate_, avx_act_cand_, avx_act_cell_; + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}; #endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - LSTMKernelImpl::LSTMKernelImpl( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d) \ - : LSTMKernel() { \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_cand_ = GetAVXAct(act_cand); \ - avx_act_cell_ = GetAVXAct(act_cell); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeCtHt( \ - float* gates, const float* ct_1, float* ct, float* ht, \ - const float* wp_data, float* checked) const { \ - /* gates: W_ch, W_ih, W_fh, W_oh */ \ - __m256 c, i, f, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - f = _mm256_loadu_ps(gates + 16); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = C_t-1 * fgated + cand_gated * igated*/ \ - c = _mm256_mul_ps(avx_act_cand_->Compute(c), avx_act_gate_->Compute(i)); \ - i = _mm256_loadu_ps(ct_1); \ - f = _mm256_mul_ps(i, avx_act_gate_->Compute(f)); \ - f = _mm256_add_ps(c, f); \ - _mm256_storeu_ps(ct, f); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(f), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } \ - template <> \ - void LSTMKernelImpl::ComputeC1H1( \ - float* gates, float* ct, float* ht, const float* wp_data) const { \ - __m256 c, i, o; \ - c = _mm256_loadu_ps(gates); \ - i = _mm256_loadu_ps(gates + 8); \ - o = _mm256_loadu_ps(gates + 24); \ - /* C_t = igated * cgated*/ \ - c = _mm256_mul_ps(avx_act_gate_->Compute(i), avx_act_cand_->Compute(c)); \ - _mm256_storeu_ps(ct, c); \ - /* H_t = act_cell(C_t) * ogated */ \ - o = _mm256_mul_ps(avx_act_cell_->Compute(c), avx_act_gate_->Compute(o)); \ - _mm256_storeu_ps(ht, o); \ - } - -// TODO(TJ): optimize keq16 - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); +#ifdef PADDLE_WITH_XBYAK +template <> +bool LSTMKernelImpl::useJIT(int d) { + return false; // not ready yet gen::LSTMJitCode::init(d); +} #endif /* Peephole JitKernel */ -template +template class PeepholeKernelImpl : public LSTMKernel { public: - explicit PeepholeKernelImpl(const std::string& act_gate, - const std::string& act_cand, - const std::string& act_cell, int d) - : LSTMKernel() { - d_ = d; - d2_ = d * 2; - d3_ = d * 3; - act_gate_d_ = GetActKernel(act_gate, d); - act_cand_d_ = GetActKernel(act_cand, d); - act_cell_d_ = GetActKernel(act_cell, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - vadd_d_ = KernelPool::Instance().template Get>(d); - vadd_d2_ = KernelPool::Instance().template Get>(d2_); - act_gate_d2_ = GetActKernel(act_gate, d2_); + static inline std::string name(const lstm_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); + this->ComputeCtHt = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::LSTMJitCode(true, attr, sz > 4096 ? sz : 4096)); + this->ComputeC1H1 = + jitcode1_->getCode(); + return; + } +#endif - void ComputeCtHt(T* gates, const T* ct_1, T* ct, T* ht, const T* wp_data, - T* checked) const override { - /* get fgated and igated*/ - vmul_d_->Compute(wp_data, ct_1, checked, d_); - vmul_d_->Compute(wp_data + d_, ct_1, checked + d_, d_); - vadd_d2_->Compute(checked, gates + d_, gates + d_, d2_); - act_gate_d2_->Compute(gates + d_, gates + d_, d2_); - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, gates + d_, d_); - vmul_d_->Compute(ct_1, gates + d2_, gates + d2_, d_); - vadd_d_->Compute(gates + d_, gates + d2_, ct, d_); - /* get ogated*/ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - /* H_t = act_cell(C_t) * ogated */ - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); + this->ComputeCtHt = refer::LSTMCtHt; + this->ComputeC1H1 = refer::LSTMC1H1; } - void ComputeC1H1(T* gates, T* ct, T* ht, const T* wp_data) const override { - /* C_t = igated * cgated*/ - act_gate_d_->Compute(gates + d_, gates + d_, d_); - act_cand_d_->Compute(gates, gates, d_); - vmul_d_->Compute(gates, gates + d_, ct, d_); - /* get outgated, put W_oc * C_t on igated */ - vmul_d_->Compute(wp_data + d2_, ct, gates + d_, d_); - vadd_d_->Compute(gates + d_, gates + d3_, gates + d3_, d_); - /* H_t = act_cell(C_t) * ogated */ - act_gate_d_->Compute(gates + d3_, gates + d3_, d_); - act_cell_d_->Compute(ct, gates + d2_, d_); - vmul_d_->Compute(gates + d2_, gates + d3_, ht, d_); - } +#ifdef PADDLE_WITH_XBYAK private: - int d_, d2_, d3_; - std::shared_ptr> act_gate_d2_, act_gate_d_, act_cand_d_, - act_cell_d_; - std::shared_ptr> vmul_d_; - std::shared_ptr> vadd_d_, vadd_d2_; + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}; +#endif }; -#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ - template <> \ - std::shared_ptr> \ - KernelPool::Get, const std::string&, \ - const std::string&, const std::string&, int, bool>( \ - const std::string& act_gate, const std::string& act_cand, \ - const std::string& act_cell, int d, bool use_peephole) - -#define JITKERNEL_KEY_LSTM(ker_key, dtype_key) \ - #ker_key #dtype_key + std::to_string(d) + act_gate + act_cand + act_cell + \ - (use_peephole ? "p" : "n") - -#define JITKERNEL_NEW_LSTM_IMPL(ker, dtype, isa, k) \ - if (use_peephole) { \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>( \ - act_gate, act_cand, act_cell, d)); \ - } else { \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>(act_gate, act_cand, \ - act_cell, d)); \ +#ifdef PADDLE_WITH_XBYAK +template <> +bool PeepholeKernelImpl::useJIT(int d) { + return false; // peephole jitcode not ready yet +} +#endif + +#define JITKERNEL_DEFINE_NAME_LSTM(ker_key, ker_class) \ + template <> \ + std::string ker_class##Impl::name(const lstm_attr_t& attr) { \ + std::string key(#ker_key "f"); \ + key += (attr.act_gate + attr.act_cand + attr.act_cell + \ + (attr.use_peephole ? "p" : "n")); \ + if (useJIT(attr.d)) { \ + /* only jit code need record d*/ \ + return key + "jit" + std::to_string(attr.d); \ + } else if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } \ + template <> \ + std::string ker_class##Impl::name(const lstm_attr_t& attr) { \ + std::string key(#ker_key "d"); \ + /* jit code do not support double yet*/ \ + if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ } -REGISTER_JITKERNEL_ARGS_DEPRECATED(lstm, LSTMKernel, JITKERNEL_DECLARE_LSTM, - JITKERNEL_KEY_LSTM, JITKERNEL_NEW_LSTM_IMPL); +#define JITKERNEL_DECLARE_LSTM(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const lstm_attr_t&>( \ + const lstm_attr_t& attr) + +#define JITKERNEL_FIND_KEY_LSTM(ker_class, ker_dtype) \ + std::string key = ker_class##Impl::name(attr) + +#define JITKERNEL_LSTM_IMPL(ker, dtype) \ + if (attr.use_peephole) { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); \ + } else { \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); \ + } -#undef INTRI8_FLOAT -#undef JITKERNEL_DECLARE_LSTM -#undef JITKERNEL_KEY_LSTM -#undef JITKERNEL_NEW_LSTM_IMPL +REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM, + JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM, + JITKERNEL_LSTM_IMPL); /* GRU JitKernel */ template diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index a1705a81c47d7..1cbe1b5d952f0 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -341,11 +341,11 @@ TEST(JitKernel, lstm) { RandomVec(d, ct_1.data(), -2.f, 2.f); memcpy(xref.data(), x.data(), sizeof(float) * d4); std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + const jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell, false); const auto& ker = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, d, false); + .template Get, const jit::lstm_attr_t&>( + attr); // below kernels are used to compute refer const auto& vsigmoid_3d = jit::KernelPool::Instance().template Get>( @@ -366,14 +366,16 @@ TEST(JitKernel, lstm) { float* ht_ref_data = ht_ref.data(); // compute once to check correctness jit::lstm_t step; - jit::lstm_attr_t attr(d, act_gate, act_cand, act_cell); step.gates = xref_data; step.ct_1 = ct_1_data; step.ct = ct_ref_data; step.ht = ht_ref_data; refer::LSTMCtHt(&step, &attr); - ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + step.gates = x_data; + step.ct = ct_tgt_data; + step.ht = ht_tgt_data; + ker->ComputeCtHt(&step, &attr); for (int i = 0; i < d; ++i) { EXPECT_NEAR(ct_tgt_data[i], ct_ref_data[i], 1e-3); EXPECT_NEAR(ht_tgt_data[i], ht_ref_data[i], 1e-3); @@ -392,7 +394,7 @@ TEST(JitKernel, lstm) { auto trefe = GetCurrentUS(); auto ttgts = GetCurrentUS(); for (int i = 0; i < repeat; ++i) { - ker->ComputeCtHt(x_data, ct_1_data, ct_tgt_data, ht_tgt_data); + ker->ComputeCtHt(&step, &attr); } auto ttgte = GetCurrentUS(); VLOG(30) << "Vec size " << d @@ -710,21 +712,21 @@ TEST(JitKernel, pool) { namespace jit = paddle::operators::math::jitkernel; const int frame_size = 4; std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; + jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); + const auto& plstm1 = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, false); + .template Get, const jit::lstm_attr_t&>(attr); + const auto& plstm2 = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, false); + .template Get, const jit::lstm_attr_t&>(attr); + EXPECT_EQ(plstm1, plstm2); + const auto& peephole = jit::KernelPool::Instance() - .template Get, const std::string&, - const std::string&, const std::string&>( - act_gate, act_cand, act_cell, frame_size, true); + .template Get, const jit::lstm_attr_t&>( + jit::lstm_attr_t(frame_size, act_gate, act_cand, act_cell, true)); EXPECT_TRUE(plstm1 != peephole); const auto& pvmul_f = From 94de2290f4cbc6df0050d06a2d381ba941eb78d1 Mon Sep 17 00:00:00 2001 From: jerrywgz Date: Wed, 21 Nov 2018 08:58:04 +0000 Subject: [PATCH 082/684] fix format in api doc, test=develop --- python/paddle/fluid/layers/nn.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 5749bcc54a618..be120e45d2d52 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6904,13 +6904,13 @@ def prelu(x, mode, param_attr=None, name=None): Args: x (Variable): The input tensor. - param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha). + param_attr(ParamAttr|None): The parameter attribute for the learnable + weight (alpha). mode (string): The mode for weight sharing. It supports all, channel and element. all: all elements share same weight channel:elements in a channel share same weight element:each element has a weight - name(str|None): A name for this layer(optional). If set None, the layer + name(str|None): A name for this layer(optional). If set None, the layer will be named automatically. Returns: @@ -6920,9 +6920,9 @@ def prelu(x, mode, param_attr=None, name=None): .. code-block:: python - x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") - mode = 'channel' - output = fluid.layers.prelu(x,mode) + x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") + mode = 'channel' + output = fluid.layers.prelu(x,mode) """ helper = LayerHelper('prelu', **locals()) if mode not in ['all', 'channel', 'element']: From f10e196fc8de5d76333940a263dabf33f0450fa5 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 21 Nov 2018 18:09:44 +0800 Subject: [PATCH 083/684] fix build issue --- paddle/fluid/inference/tensorrt/convert/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt index 27fb41d16ead6..840abd26a755c 100644 --- a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt @@ -18,7 +18,7 @@ nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc nv_test(test_trt_conv_op SRCS test_conv2d_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine conv_op conv_transpose_op SERIAL) nv_test(test_trt_pool2d_op SRCS test_pool2d_op.cc pool2d_op.cc - DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op SERIAL) + DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine pool_op tensorrt_plugin SERIAL) nv_test(test_trt_elementwise_op SRCS test_elementwise_op.cc elementwise_op.cc DEPS ${FLUID_CORE_MODULES} ${GLOB_OPERATOR_DEPS} tensorrt_engine tensorrt_plugin elementwise_add_op elementwise_mul_op SERIAL) From 3e3599f3d937e0444606056f3c9f2261b74dfd93 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 11:31:04 +0000 Subject: [PATCH 084/684] Refine split tensorrt plugin --- .../inference/tensorrt/convert/split_op.cc | 3 +- .../tensorrt/plugin/split_op_plugin.cu | 157 ++++++++++++++---- .../tensorrt/plugin/split_op_plugin.h | 9 +- 3 files changed, 134 insertions(+), 35 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 6620c76318f99..871354267e936 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -40,7 +40,7 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - PADDLE_ENFORCE(axis != 0); + // PADDLE_ENFORCE(axis != 0); if (axis < 0) { axis += input_dims.nbDims; } else { @@ -48,7 +48,6 @@ class SplitOpConverter : public OpConverter { } PADDLE_ENFORCE(output_lengths.size() == output_num); - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 4adea2db1ee80..1ec0753e9fb01 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -12,6 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include +#include #include "paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h" namespace paddle { @@ -19,6 +21,52 @@ namespace inference { namespace tensorrt { namespace plugin { +// copied from operators::math::SplitFunctor +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int* out_cols, + int out_cols_size, T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + int curr_segment = 0; + int curr_offset = out_cols[0]; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int curr_col_offset = out_cols[curr_segment + 1]; + while (curr_col_offset <= tid_x) { + curr_offset = curr_col_offset; + ++curr_segment; + curr_col_offset = out_cols[curr_segment + 1]; + } + + int local_col = tid_x - curr_offset; + int segment_width = curr_col_offset - curr_offset; + T* output_ptr = outputs_data[curr_segment]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * segment_width + local_col] = + input_data[tid_y * in_col + tid_x]; + } + } +} + +template +__global__ void SplitKernel(const T* input_data, const int in_row, + const int in_col, const int fixed_out_col, + T** outputs_data) { + int tid_x = blockIdx.x * blockDim.x + threadIdx.x; + for (; tid_x < in_col; tid_x += blockDim.x * gridDim.x) { + int split = tid_x / fixed_out_col; + int in_offset = tid_x - split * fixed_out_col; + T* output_ptr = outputs_data[split]; + if (output_ptr != nullptr) { + int tid_y = blockIdx.y * blockDim.y + threadIdx.y; + for (; tid_y < in_row; tid_y += blockDim.y * gridDim.y) + output_ptr[tid_y * fixed_out_col + in_offset] = + input_data[tid_y * in_col + tid_x]; + } + } +} + nvinfer1::Dims SplitPlugin::getOutputDimensions( int index, const nvinfer1::Dims* input_dims, int num_inputs) { PADDLE_ENFORCE_EQ(num_inputs, 1); @@ -31,48 +79,95 @@ nvinfer1::Dims SplitPlugin::getOutputDimensions( int SplitPlugin::initialize() { PADDLE_ENFORCE_LE(axis_, nvinfer1::Dims::MAX_DIMS); - + // notice input dims is [C, H, W] + nvinfer1::Dims dims = this->getInputDims(0); + outer_rows_ = 1; + inner_cols_ = 1; + for (int i = 0; i < axis_; ++i) { + outer_rows_ *= dims.d[i]; + } + for (int i = axis_ + 1; i < dims.nbDims; ++i) { + inner_cols_ *= dims.d[i]; + } + same_shape_ = true; std::vector segment_offsets(1, 0); for (int i = 0; i < this->getNbOutputs(); ++i) { - segment_offsets.push_back(segment_offsets.back() + output_length_[i]); + if (output_length_[i] != output_length_[0]) { + same_shape_ = false; + } + segment_offsets.push_back(segment_offsets.back() + + output_length_[i] * inner_cols_); } - segment_offsets_ = segment_offsets; - nvinfer1::Dims dims = this->getInputDims(0); - nx_ = 1; - for (int i = dims.nbDims - 1; i > axis_; --i) { - nx_ *= dims.d[i]; + inner_cols_ *= dims.d[axis_]; + d_segment_offsets_ = segment_offsets; + segment_offsets_ = std::move(segment_offsets); + d_output_ptrs_.resize(this->getNbOutputs(), nullptr); + return 0; +} + +template +inline void Split(cudaStream_t stream, const bool same_shape, + const int outer_rows, const int inner_cols, + const std::vector& segment_offsets, + const int* d_segment_offsets, const T* input, T** outputs) { + const int kThreadsPerBlock = 1024; + const int kMaxBlocks = 65535; + int block_cols = kThreadsPerBlock; + if (inner_cols < kThreadsPerBlock) { // block_cols is aligned by 32. + block_cols = ((inner_cols + 31) >> 5) << 5; } - ny_ = dims.d[axis_]; - nz_ = 1; - for (int i = axis_ - 1; i >= 0; --i) { - nz_ *= dims.d[i]; + int block_rows = kThreadsPerBlock / block_cols; + dim3 block_size = dim3(block_cols, block_rows, 1); + + int grid_cols = + std::min((inner_cols + block_cols - 1) / block_cols, kMaxBlocks); + int grid_rows = + std::min(kMaxBlocks / grid_cols, std::max(outer_rows / block_rows, 1)); + dim3 grid_size = dim3(grid_cols, grid_rows, 1); + + if (same_shape) { + SplitKernel<<>>( + input, outer_rows, inner_cols, segment_offsets[1], outputs); + } else { + SplitKernel<<>>( + input, outer_rows, inner_cols, d_segment_offsets, + static_cast(segment_offsets.size()), outputs); } - return 0; } int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { - auto const& input_dims = this->getInputDims(0); - int input_size = 0; - float const* idata = reinterpret_cast(inputs[0]); - float** odatas = reinterpret_cast(outputs); - - // kernel impl here. - int inputBatchOffset = nx_ * ny_ * nz_; - for (size_t i = 0; i < this->getNbOutputs(); i++) { - for (size_t j = 0; j < batchSize; j++) { - cudaMemcpyAsync( - odatas[i] + - j * (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * - sizeof(float), - inputs[0] + - (inputBatchOffset * j + segment_offsets_[i] * nx_) * - sizeof(float), - (segment_offsets_[i + 1] - segment_offsets_[i]) * nx_ * sizeof(float), - cudaMemcpyDeviceToDevice, stream); + float const* input_ptr = reinterpret_cast(inputs[0]); + if (axis_ == -1 && this->getNbOutputs() < 10) { + float** output_ptrs = reinterpret_cast(outputs); + int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) + ? sizeof(__half) + : sizeof(float); + for (int i = 0; i < this->getNbOutputs(); ++i) { + PADDLE_ENFORCE( + cudaMemcpyAsync( + output_ptrs[i], input_ptr + segment_offsets_[i], + (segment_offsets_[i + 1] - segment_offsets_[i]) * data_type_size, + cudaMemcpyDeviceToDevice, stream) == cudaSuccess); + } + } else { + outer_rows_ *= batchSize; + const int* d_segment_offsets_ptr = + thrust::raw_pointer_cast(&d_segment_offsets_[0]); + float** output_ptrs = thrust::raw_pointer_cast(&d_output_ptrs_[0]); + PADDLE_ENFORCE(cudaMemcpyAsync(output_ptrs, outputs, + this->getNbOutputs() * sizeof(float*), + cudaMemcpyHostToDevice, + stream) == cudaSuccess); + if (this->getDataType() == nvinfer1::DataType::kFLOAT) { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, input_ptr, output_ptrs); + } else { + Split(stream, same_shape_, outer_rows_, inner_cols_, segment_offsets_, + d_segment_offsets_ptr, (__half*)input_ptr, // NOLINT + (__half**)output_ptrs); // NOLINT } } - return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h index b5b6e69992b05..6f028d3d72ae3 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.h @@ -14,6 +14,7 @@ #pragma once +#include #include #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin.h" @@ -25,7 +26,7 @@ namespace plugin { class SplitPlugin : public PluginTensorRT { public: SplitPlugin(int axis, std::vector const &output_lengths) - : axis_(axis), output_length_(output_lengths) {} + : axis_(axis), same_shape_(true), output_length_(output_lengths) {} SplitPlugin(void const *serial_data, size_t serial_length) { deserializeBase(serial_data, serial_length); @@ -60,9 +61,13 @@ class SplitPlugin : public PluginTensorRT { } int axis_; + int outer_rows_; + int inner_cols_; + bool same_shape_; std::vector output_length_; - int nx_, ny_, nz_; std::vector segment_offsets_; + thrust::device_vector d_segment_offsets_; + thrust::device_vector d_output_ptrs_; }; } // namespace plugin From af9a3301dab9ab291d3cdd278734ae129de8a0f0 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Wed, 21 Nov 2018 12:35:21 +0000 Subject: [PATCH 085/684] test=develop --- paddle/fluid/framework/selected_rows.h | 6 +- .../operators/hierarchical_sigmoid_op.cc | 5 +- .../fluid/operators/hierarchical_sigmoid_op.h | 2 +- .../fluid/tests/unittests/test_hsigmoid_op.py | 269 ++++++++++-------- 4 files changed, 152 insertions(+), 130 deletions(-) diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 4d728ae54ae6b..9d87c3eac7fd9 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -121,7 +121,9 @@ class SelectedRows { int64_t AutoGrownIndex(int64_t key, bool auto_grown); void SyncIndex(); - + /* + * @brief Get complete Dims before + */ DDim GetCompleteDims() const { std::vector dims = vectorize(value_->dims()); dims[0] = height_; @@ -136,7 +138,7 @@ class SelectedRows { std::unordered_map id_to_index_; // should not be used when ids has duplicate member std::unique_ptr value_{nullptr}; - int64_t height_; + int64_t height_; // height indicates the underline tensor's height std::unique_ptr rwlock_{nullptr}; }; diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index b2f4616441582..c350e6489ddec 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -145,8 +145,9 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasInput("PreOut"), "Input(Preout) should not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("W")), - "Output(W@Grad should not be null.)"); - PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X"))); + "Output(W@Grad should not be null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@Grad should not be null."); if (ctx->HasOutput(framework::GradVarName("Bias"))) { ctx->SetOutputDim(framework::GradVarName("Bias"), ctx->GetInputDim("Bias")); diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 3e2fbafa2669e..35a1de3e1917f 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -191,10 +191,10 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { framework::Vector real_rows = cal_rows(path); auto* w_grad = ctx.Output(framework::GradVarName("W")); - w_grad->set_rows(real_rows); // build ids -> rows index map w_grad->SyncIndex(); + w_grad->set_height(w->dims()[0]); auto* w_grad_value = w_grad->mutable_value(); framework::DDim temp_dim(w->dims()); set(temp_dim, 0, real_rows.size()); diff --git a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py index 50dfaee76fda3..2f4225f912d1c 100644 --- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py +++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py @@ -140,148 +140,167 @@ def hsigmoidWithCustomTree(x, w, ptable, pcode, label, bias, num_classes): return pre_output, out -# class TestHSigmoidOp(OpTest): -# def setUp(self): -# self.op_type = "hierarchical_sigmoid" -# num_classes = 6 -# feature_size = 8 -# batch_size = 4 -# x = np.random.random((batch_size, feature_size)).astype("float32") * 2 -# w = np.random.random( -# (num_classes - 1, feature_size)).astype("float32") * 2 -# label = np.random.randint(0, num_classes, (batch_size, 1)) -# bias = np.random.random((1, num_classes - 1)).astype("float32") -# self.attrs = {'num_classes': num_classes, 'is_sparse': False} -# self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} -# pre_output, out = hsigmoid(x, w, label, bias, num_classes) -# self.outputs = {'PreOut': pre_output, 'Out': out} - -# def test_check_output(self): -# self.check_output() - -# def test_check_grad(self): -# self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) - -# class TestHSigmoidOpSparse(OpTest): -# def setUp(self): -# self.op_type = "hierarchical_sigmoid" -# num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample -# feature_size = 8 -# batch_size = 4 -# x = np.random.random((batch_size, feature_size)).astype("float32") * 2 -# w = np.random.random( -# (num_classes - 1, feature_size)).astype("float32") * 2 -# label = np.array([0, 1, 4, 5]) -# ptable = np.array( -# [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), -# (0, 2, -1, -1, -# -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) -# pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( -# 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store -# bias = np.random.random((1, num_classes - 1)).astype("float32") -# self.attrs = {'num_classes': num_classes, 'is_sparse': True} -# self.inputs = { -# 'X': x, -# 'W': w, -# 'PTable': ptable, -# 'PCode': pcode, -# 'Label': label, -# 'Bias': bias -# } -# pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label, -# bias, num_classes) -# self.outputs = {'PreOut': pre_output, 'Out': out} - -# def test_check_output(self): -# print("checking output in CostumTree") -# self.check_output() - - -class TestHSigmoidOpWithSparseGrad(): - def hs_net_conf(self): - emb = fluid.layers.data(name="x", shape=[3], dtype='int64') +class TestHSigmoidOp(OpTest): + def setUp(self): + self.op_type = "hierarchical_sigmoid" + num_classes = 6 + feature_size = 8 + batch_size = 4 + x = np.random.random((batch_size, feature_size)).astype("float32") * 2 + w = np.random.random( + (num_classes - 1, feature_size)).astype("float32") * 2 + label = np.random.randint(0, num_classes, (batch_size, 1)) + bias = np.random.random((1, num_classes - 1)).astype("float32") + self.attrs = {'num_classes': num_classes, 'is_sparse': False} + self.inputs = {'X': x, 'W': w, 'Label': label, 'Bias': bias} + pre_output, out = hsigmoid(x, w, label, bias, num_classes) + self.outputs = {'PreOut': pre_output, 'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) + + +class TestHSigmoidOpSparse(OpTest): + def setUp(self): + self.op_type = "hierarchical_sigmoid" + num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample + feature_size = 8 + batch_size = 4 + x = np.random.random((batch_size, feature_size)).astype("float32") + w = np.random.random((num_classes - 1, feature_size)).astype("float32") + label = np.array([0, 1, 4, 5]) + ptable = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, + -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( + 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store + bias = np.random.random((1, num_classes - 1)).astype("float32") + self.attrs = {'num_classes': num_classes, 'is_sparse': True} + self.inputs = { + 'X': x, + 'W': w, + 'PTable': ptable, + 'PCode': pcode, + 'Label': label, + 'Bias': bias + } + pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label, + bias, num_classes) + self.outputs = {'PreOut': pre_output, 'Out': out} + + def test_check_output(self): + print("checking output in CostumTree") + self.check_output() + + +class TestHSigmoidOpWithSparseGrad(unittest.TestCase): + def hs_net_conf(self, is_sparse): + input_word = fluid.layers.data(name="x", shape=[1], dtype='int64') ptable = fluid.layers.data(name='ptable', shape=[3], dtype='int64') pcode = fluid.layers.data(name='pcode', shape=[3], dtype='int64') label = fluid.layers.data(name='label', shape=[1], dtype='int64') - data_list = [emb, ptable, pcode, label] + + data_list = [input_word, ptable, pcode, label] + + emb = fluid.layers.embedding( + input=input_word, + is_sparse=False, + size=[3, 3], + param_attr=fluid.ParamAttr(initializer=fluid.initializer.Normal( + scale=1 / math.sqrt(3)))) + cost = fluid.layers.hsigmoid( input=emb, - label=predict_word, - non_leaf_num=4, + label=label, + non_leaf_num=3, ptable=ptable, pcode=pcode, is_costum=True, - is_sparse=True) + is_sparse=is_sparse) avg_cost = fluid.layers.reduce_mean(cost) return avg_cost, data_list - def test_training_test(self): - print("im here") - w = np.arange(12).reshape(4, 3) - x = np.ones((2, 3)) - ptable = np.array([(1, 2, -1), (1, 2, -1)]) - pcode = np.array([(1, 0, -1), (0, 0, -1)]) - label = np.array([(1, 4)]) - - loss, data_list = hs_net_conf() - optimizer = fluid.optimizer.SGD(learning_rate=1e-3) - optimizer.minimize(loss) - - main_program = fluid.default_main_program() - - place = fluid.CPUPlace() - feeder = fluid.DataFeeder(feed_list=data_list, place=place) - data_name_list = [var.name for var in data_list] - exe = fluid.Executor(place) - exe.run(fluid.default_startup_program()) - for pass_id in range(args.num_passes): + def training_test(self, is_sparse): + with fluid.program_guard(fluid.Program(), fluid.Program()): + start_up = fluid.default_startup_program() + start_up.random_seed = 1 # Fix random seed + x = np.arange(6).reshape(6) + ptable = np.array([(1, 2, -1), (1, 2, -1)]) + pcode = np.array([(1, 0, -1), (0, 0, -1)]) + label = np.array([1, 4]) + + loss, data_list = self.hs_net_conf(is_sparse) + optimizer = fluid.optimizer.SGD(learning_rate=1e-3) + optimizer.minimize(loss) + + main_program = fluid.default_main_program() + # print("main program: {program}".format{program=str(main_program)}) + place = fluid.CPUPlace() + feeder = fluid.DataFeeder(feed_list=data_list, place=place) + exe = fluid.Executor(place) + + exe.run(start_up) + result = list() for i in range(10): - data = [w, x[i % 2], ptable[i % 2], pcode[i % 2], label[i % 2]] + data = [([[x[i % 2]]], [list(ptable[i % 2])], + [list(pcode[i % 2])], [label[i % 2]])] + loss_val = exe.run(main_program, feed=feeder.feed(data), fetch_list=[loss]) - print("loss is: {loss}".format(loss=loss)) - - -# class TestHSigmoidOpWithCostumTree(OpTest): -# def setUp(self): -# self.op_type = "hierarchical_sigmoid" -# num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample -# feature_size = 8 -# batch_size = 4 -# x = np.random.random((batch_size, feature_size)).astype("float32") * 2 -# w = np.random.random( -# (num_classes - 1, feature_size)).astype("float32") * 2 -# label = np.array([0, 1, 4, 5]) -# ptable = np.array( -# [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), -# (0, 2, -1, -1, -# -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) -# pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( -# 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store -# bias = np.random.random((1, num_classes - 1)).astype("float32") -# self.attrs = {'num_classes': num_classes, 'is_sparse': False} -# self.inputs = { -# 'X': x, -# 'W': w, -# 'PTable': ptable, -# 'PCode': pcode, -# 'Label': label, -# 'Bias': bias -# } -# pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label, -# bias, num_classes) -# self.outputs = {'PreOut': pre_output, 'Out': out} - -# def test_check_output(self): -# print("checking output in CostumTree") -# self.check_output() - -# def test_check_grad(self): -# print("checking outputGrad in CostumTree") -# self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) + result.append(loss_val) + return result + + def test_hs_grad_with_sparse(self): + dense_result = self.training_test(is_sparse=False) + sparse_result = self.training_test(is_sparse=True) + assert (dense_result == sparse_result) + + +class TestHSigmoidOpWithCostumTree(OpTest): + def setUp(self): + self.op_type = "hierarchical_sigmoid" + num_classes = 6 #using 1,2,3,4,5,6 to build a huffman tree and select 1,2,5,6 as sample + feature_size = 8 + batch_size = 4 + x = np.random.random((batch_size, feature_size)).astype("float32") * 2 + w = np.random.random( + (num_classes - 1, feature_size)).astype("float32") * 2 + label = np.array([0, 1, 4, 5]) + ptable = np.array( + [(0, 2, -1, -1, -1), (0, 1, 3, -1, -1), (0, 1, 4, -1, -1), + (0, 2, -1, -1, + -1)]) #np.array to store 1,2,5,6s' non-leaf path(root -> leaf) + pcode = np.array([(0, 0, -1, -1, -1), (1, 1, 1, -1, -1), ( + 1, 0, 0, -1, -1), (0, 1, -1, -1, -1)]) #np.array to store + bias = np.random.random((1, num_classes - 1)).astype("float32") + self.attrs = {'num_classes': num_classes, 'is_sparse': False} + self.inputs = { + 'X': x, + 'W': w, + 'PTable': ptable, + 'PCode': pcode, + 'Label': label, + 'Bias': bias + } + pre_output, out = hsigmoidWithCustomTree(x, w, ptable, pcode, label, + bias, num_classes) + self.outputs = {'PreOut': pre_output, 'Out': out} + + def test_check_output(self): + print("checking output in CostumTree") + self.check_output() + + def test_check_grad(self): + print("checking outputGrad in CostumTree") + self.check_grad(['Bias', 'X', 'W'], ['Out'], no_grad_set=set('Label')) + if __name__ == '__main__': unittest.main() From 35620513023000ceb47ec0b57909dae4f0634355 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Wed, 21 Nov 2018 12:37:28 +0000 Subject: [PATCH 086/684] add gru refer code and remove redundant avx code test=develop --- paddle/fluid/operators/fused/fusion_gru_op.cc | 67 ++-- paddle/fluid/operators/math/jit_kernel.h | 8 +- paddle/fluid/operators/math/jit_kernel_exp.cc | 152 --------- paddle/fluid/operators/math/jit_kernel_impl.h | 30 +- .../fluid/operators/math/jit_kernel_refer.h | 40 +++ paddle/fluid/operators/math/jit_kernel_rnn.cc | 294 ++++-------------- 6 files changed, 163 insertions(+), 428 deletions(-) diff --git a/paddle/fluid/operators/fused/fusion_gru_op.cc b/paddle/fluid/operators/fused/fusion_gru_op.cc index 7e34d1019c9e6..25b7ae7c2828b 100644 --- a/paddle/fluid/operators/fused/fusion_gru_op.cc +++ b/paddle/fluid/operators/fused/fusion_gru_op.cc @@ -183,24 +183,27 @@ class FusionGRUKernel : public framework::OpKernel { const int total_T = x_dims[0]; \ const int D3 = wh_dims[1] -#define INIT_OTHER_DEFINES \ - auto* h0 = ctx.Input("H0"); \ - auto* wx = ctx.Input("WeightX"); \ - auto* bias = ctx.Input("Bias"); \ - auto* hidden_out = ctx.Output("Hidden"); \ - bool is_reverse = ctx.Attr("is_reverse"); \ - const int M = x_dims[1]; \ - const int D = wh_dims[0]; \ - const int D2 = D * 2; \ - const auto& ker = math::jitkernel::KernelPool::Instance() \ - .template Get, \ - const std::string&, const std::string&>( \ - ctx.Attr("gate_activation"), \ - ctx.Attr("activation"), D); \ - const T* x_data = x->data(); \ - const T* wx_data = wx->data(); \ - const T* wh_data = wh->data(); \ - auto place = ctx.GetPlace(); \ +#define INIT_OTHER_DEFINES \ + auto* h0 = ctx.Input("H0"); \ + auto* wx = ctx.Input("WeightX"); \ + auto* bias = ctx.Input("Bias"); \ + auto* hidden_out = ctx.Output("Hidden"); \ + bool is_reverse = ctx.Attr("is_reverse"); \ + const int M = x_dims[1]; \ + const int D = wh_dims[0]; \ + const int D2 = D * 2; \ + const math::jitkernel::gru_attr_t attr( \ + D, ctx.Attr("gate_activation"), \ + ctx.Attr("activation")); \ + math::jitkernel::gru_t one_step; \ + const auto& ker = \ + math::jitkernel::KernelPool::Instance() \ + .template Get, \ + const math::jitkernel::gru_attr_t&>(attr); \ + const T* x_data = x->data(); \ + const T* wx_data = wx->data(); \ + const T* wh_data = wh->data(); \ + auto place = ctx.GetPlace(); \ T* xx_data = xx->mutable_data(place) void SeqCompute(const framework::ExecutionContext& ctx) const { @@ -237,7 +240,9 @@ class FusionGRUKernel : public framework::OpKernel { if (h0_data) { prev_hidden_data = h0_data + bid * D; } else { - ker->ComputeH1(xx_data, hidden_out_data); + one_step.gates = xx_data; + one_step.ht = hidden_out_data; + ker->ComputeH1(&one_step, &attr); prev_hidden_data = hidden_out_data; tstart = 1; move_step(); @@ -247,12 +252,15 @@ class FusionGRUKernel : public framework::OpKernel { blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D2, D, static_cast(1), prev_hidden_data, D, wh_data, D2, static_cast(1), xx_data, D3); - ker->ComputeHtPart1(xx_data, prev_hidden_data, hidden_out_data); + one_step.gates = xx_data; + one_step.ht_1 = prev_hidden_data; + one_step.ht = hidden_out_data; + ker->ComputeHtPart1(&one_step, &attr); // gemm rt * Ws blas.GEMM(CblasNoTrans, CblasNoTrans, 1, D, D, static_cast(1), hidden_out_data, D, wh_state_data, D, static_cast(1), xx_data + D2, D3); - ker->ComputeHtPart2(xx_data, prev_hidden_data, hidden_out_data); + ker->ComputeHtPart2(&one_step, &attr); // save prev prev_hidden_data = hidden_out_data; move_step(); @@ -314,7 +322,9 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; // W: {W_update, W_reset; W_state} for (int i = 0; i < max_bs; ++i) { - ker->ComputeH1(cur_in_data, cur_out_data); + one_step.gates = cur_in_data; + one_step.ht = cur_out_data; + ker->ComputeH1(&one_step, &attr); // add offset cur_in_data += D3; cur_out_data += D; @@ -339,8 +349,11 @@ class FusionGRUKernel : public framework::OpKernel { T* cur_out_data = batched_out_data; T* cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeHtPart1(cur_batched_data, cur_prev_hidden_data, - cur_out_data); + one_step.gates = cur_batched_data; + one_step.ht_1 = cur_prev_hidden_data; + one_step.ht = cur_out_data; + ker->ComputeHtPart1(&one_step, &attr); + cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; @@ -354,8 +367,10 @@ class FusionGRUKernel : public framework::OpKernel { cur_prev_hidden_data = prev_hidden_data; for (int i = 0; i < cur_bs; ++i) { - ker->ComputeHtPart2(cur_batched_data, cur_prev_hidden_data, - cur_out_data); + one_step.gates = cur_batched_data; + one_step.ht_1 = cur_prev_hidden_data; + one_step.ht = cur_out_data; + ker->ComputeHtPart2(&one_step, &attr); cur_batched_data += D3; cur_prev_hidden_data += D; cur_out_data += D; diff --git a/paddle/fluid/operators/math/jit_kernel.h b/paddle/fluid/operators/math/jit_kernel.h index bb5ba5813a7e6..b78b92b4f97b7 100644 --- a/paddle/fluid/operators/math/jit_kernel.h +++ b/paddle/fluid/operators/math/jit_kernel.h @@ -122,18 +122,18 @@ class VTanhKernel : public VActKernel {}; template class LSTMKernel : public Kernel { public: - void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *); // compute c1 and h1 without c0 or h0 void (*ComputeC1H1)(lstm_t *, const lstm_attr_t *); + void (*ComputeCtHt)(lstm_t *, const lstm_attr_t *); }; template class GRUKernel : public Kernel { public: // compute h1 without h0 - virtual void ComputeH1(T *gates, T *ht) const = 0; - virtual void ComputeHtPart1(T *gates, const T *ht_1, T *ht) const = 0; - virtual void ComputeHtPart2(T *gates, const T *ht_1, T *ht) const = 0; + void (*ComputeH1)(gru_t *, const gru_attr_t *); + void (*ComputeHtPart1)(gru_t *, const gru_attr_t *); + void (*ComputeHtPart2)(gru_t *, const gru_attr_t *); }; template diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 1fe7d66c752d3..686f3dd9836cb 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -25,10 +25,6 @@ limitations under the License. */ #include "paddle/fluid/platform/dynload/mklml.h" #endif -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { @@ -235,154 +231,6 @@ REGISTER_JITKERNEL(vexp, VExpKernel); REGISTER_JITKERNEL(vsigmoid, VSigmoidKernel); REGISTER_JITKERNEL(vtanh, VTanhKernel); -namespace detail { - -#ifdef __AVX__ - -#define ALIGN32 __attribute__((aligned(32))) - -#define _PS256_CONST(Name, Val) \ - static const float _ps256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ - Val, Val, Val, Val} - -#define _PI256_CONST(Name, Val) \ - static const int _pi256_##Name[8] ALIGN32 = {Val, Val, Val, Val, \ - Val, Val, Val, Val} - -_PI256_CONST(0x7f, 0x7f); -_PS256_CONST(one, 1.f); -_PS256_CONST(0p5, 0.5f); -_PS256_CONST(exp_hi, 88.3762626647949f); -_PS256_CONST(exp_lo, -88.3762626647949f); -_PS256_CONST(cephes_LOG2EF, 1.44269504088896341); -_PS256_CONST(cephes_exp_C1, 0.693359375); -_PS256_CONST(cephes_exp_C2, -2.12194440e-4); -_PS256_CONST(cephes_exp_p0, 1.9875691500E-4); -_PS256_CONST(cephes_exp_p1, 1.3981999507E-3); -_PS256_CONST(cephes_exp_p2, 8.3334519073E-3); -_PS256_CONST(cephes_exp_p3, 4.1665795894E-2); -_PS256_CONST(cephes_exp_p4, 1.6666665459E-1); -_PS256_CONST(cephes_exp_p5, 5.0000001201E-1); - -typedef union imm_xmm_union { - __m256i imm; - __m128i xmm[2]; -} imm_xmm_union; - -#define COPY_IMM_TO_XMM(imm_, xmm0_, xmm1_) \ - { \ - imm_xmm_union u ALIGN32; \ - u.imm = imm_; \ - xmm0_ = u.xmm[0]; \ - xmm1_ = u.xmm[1]; \ - } - -#define COPY_XMM_TO_IMM(xmm0_, xmm1_, imm_) \ - { \ - imm_xmm_union u ALIGN32; \ - u.xmm[0] = xmm0_; \ - u.xmm[1] = xmm1_; \ - imm_ = u.imm; \ - } - -#define AVX2_BITOP_USING_SSE2(fn) \ - static inline __m256i avx2_mm256_##fn(__m256i x, int y) { \ - /* use SSE2 to perform the bitop AVX2 */ \ - __m128i x1, x2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - x1 = _mm_##fn(x1, y); \ - x2 = _mm_##fn(x2, y); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return ret; \ - } - -#define AVX2_INTOP_USING_SSE2(fn) \ - static inline __m256i avx2_mm256_add_epi32(__m256i x, __m256i y) { \ - /* use SSE2 to perform the AVX2 integer operation */ \ - __m128i x1, x2; \ - __m128i y1, y2; \ - __m256i ret; \ - COPY_IMM_TO_XMM(x, x1, x2); \ - COPY_IMM_TO_XMM(y, y1, y2); \ - x1 = _mm_##fn(x1, y1); \ - x2 = _mm_##fn(x2, y2); \ - COPY_XMM_TO_IMM(x1, x2, ret); \ - return ret; \ - } - -AVX2_BITOP_USING_SSE2(slli_epi32); -AVX2_INTOP_USING_SSE2(add_epi32); - -#define AVXEXP_BASE \ - __m256 tmp = _mm256_setzero_ps(), fx; \ - __m256 one = *reinterpret_cast(_ps256_one); \ - __m256i imm0; \ - x = _mm256_min_ps(x, *reinterpret_cast(_ps256_exp_hi)); \ - x = _mm256_max_ps(x, *reinterpret_cast(_ps256_exp_lo)); \ - /* express exp(x) as exp(g + n*log(2)) */ \ - fx = _mm256_mul_ps(x, \ - *reinterpret_cast(_ps256_cephes_LOG2EF)); \ - fx = _mm256_add_ps(fx, *reinterpret_cast(_ps256_0p5)); \ - tmp = _mm256_floor_ps(fx); \ - /* if greater, substract 1 */ \ - __m256 mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); \ - mask = _mm256_and_ps(mask, one); \ - fx = _mm256_sub_ps(tmp, mask); \ - tmp = _mm256_mul_ps(fx, \ - *reinterpret_cast(_ps256_cephes_exp_C1)); \ - __m256 z = _mm256_mul_ps( \ - fx, *reinterpret_cast(_ps256_cephes_exp_C2)); \ - x = _mm256_sub_ps(x, tmp); \ - x = _mm256_sub_ps(x, z); \ - z = _mm256_mul_ps(x, x); \ - __m256 y = *reinterpret_cast(_ps256_cephes_exp_p0); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p1)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p2)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p3)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p4)); \ - y = _mm256_mul_ps(y, x); \ - y = _mm256_add_ps(y, \ - *reinterpret_cast(_ps256_cephes_exp_p5)); \ - y = _mm256_mul_ps(y, z); \ - y = _mm256_add_ps(y, x); \ - y = _mm256_add_ps(y, one); \ - /* build 2^n */ \ - imm0 = _mm256_cvttps_epi32(fx) - -__m256 ExpAVX(__m256 x) { - AVXEXP_BASE; - // two AVX2 instructions using SSE2 - imm0 = avx2_mm256_add_epi32(imm0, - *reinterpret_cast(_pi256_0x7f)); - imm0 = avx2_mm256_slli_epi32(imm0, 23); - __m256 pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} -#endif - -#ifdef __AVX2__ -__m256 ExpAVX2(__m256 x) { - AVXEXP_BASE; - // two AVX2 instructions - imm0 = _mm256_add_epi32(imm0, *reinterpret_cast(_pi256_0x7f)); - imm0 = _mm256_slli_epi32(imm0, 23); - __m256 pow2n = _mm256_castsi256_ps(imm0); - y = _mm256_mul_ps(y, pow2n); - return y; -} -#endif - -} // namespace detail } // namespace jitkernel } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/jit_kernel_impl.h b/paddle/fluid/operators/math/jit_kernel_impl.h index 2e734ca940895..ba5f20e53383d 100644 --- a/paddle/fluid/operators/math/jit_kernel_impl.h +++ b/paddle/fluid/operators/math/jit_kernel_impl.h @@ -38,20 +38,34 @@ typedef struct { void* checked{nullptr}; } lstm_t; -typedef struct lstm_attr_s { - bool use_peephole; +typedef struct { + void* gates; // gates: {W_update, W_reset; W_state} + const void* ht_1; + void* ht; +} gru_t; + +struct rnn_attr_s { int d; - std::string act_gate, act_cand, act_cell; + std::string act_gate, act_cand; + rnn_attr_s() = default; + rnn_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand) + : d(_d), act_gate(_act_gate), act_cand(_act_cand) {} +}; + +struct lstm_attr_s : public rnn_attr_s { + bool use_peephole; + std::string act_cell; lstm_attr_s() = default; lstm_attr_s(int _d, const std::string& _act_gate, const std::string& _act_cand, const std::string& _act_cell, bool _use_peephole = false) - : use_peephole(_use_peephole), - d(_d), - act_gate(_act_gate), - act_cand(_act_cand), + : rnn_attr_s(_d, _act_gate, _act_cand), + use_peephole(_use_peephole), act_cell(_act_cell) {} -} lstm_attr_t; +}; + +typedef struct rnn_attr_s gru_attr_t; +typedef struct lstm_attr_s lstm_attr_t; } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index 097bb8595612b..2e1a7f22db95a 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -185,6 +185,46 @@ void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr) { VMul(gates + d2, gates + d3, ht, d); } +// compute h1 without h0 +template +void GRUH1(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + auto act_gate = getActFunc(attr->act_gate); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + int d2 = d * 2; + act_gate(gates, gates, d); + act_cand(gates + d2, gates + d2, d); + VMul(gates, gates + d2, ht, d); +} + +template +void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { + // W: {W_update, W_reset; W_state} + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); + act_gate(gates, gates, attr->d * 2); + VMul(ht_1, gates + attr->d, ht, attr->d); +} + +template +void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { + T* gates = reinterpret_cast(step->gates); + T* ht = reinterpret_cast(step->ht); + const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_cand = getActFunc(attr->act_cand); + int d = attr->d; + T* y = gates + d * 2; + act_cand(y, y, d); + // out = zt*ht~ + (1-zt)*ht_1 + for (int i = 0; i < d; ++i) { + ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; + } +} + } // namespace refer } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index 6b7463aa52b93..dbfd212e6e7b2 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -23,140 +23,10 @@ limitations under the License. */ #include "paddle/fluid/operators/math/jit_code.h" #endif -#ifdef __AVX__ -#include -#endif - namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace detail { -#ifdef __AVX__ -__m256 ExpAVX(__m256 x); -#endif - -#ifdef __AVX2__ -__m256 ExpAVX2(__m256 x); -#endif - -} // namespace detail - -namespace jit = platform::jit; - -#ifdef __AVX__ -typedef enum { kSigmoid, kRelu, kTanh, kIdentity } act_type; - -class AVXAct { - public: - virtual ~AVXAct() = default; - virtual __m256 Compute(__m256 x) const = 0; -}; - -template -class AVXActImpl : public AVXAct { - public: - __m256 Compute(__m256 x) const override { PADDLE_THROW("Unkown type!"); } -}; - -#define AVX_SIGMOID(isa, expisa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - __m256 ones = _mm256_set1_ps(1.0f); \ - x = _mm256_max_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MIN)); \ - x = _mm256_min_ps(x, _mm256_set1_ps(SIGMOID_THRESHOLD_MAX)); \ - x = _mm256_sub_ps(_mm256_set1_ps(0.0f), x); \ - x = expisa(x); \ - x = _mm256_add_ps(ones, x); \ - return _mm256_div_ps(ones, x); \ - } - -#define AVX_TANH(isa, expisa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - __m256 ones = _mm256_set1_ps(1.0f); \ - x = _mm256_mul_ps(_mm256_set1_ps(-2.0f), x); \ - x = _mm256_min_ps(x, _mm256_set1_ps(EXP_MAX_INPUT)); \ - x = expisa(x); \ - x = _mm256_add_ps(ones, x); \ - x = _mm256_div_ps(_mm256_set1_ps(2.0f), x); \ - return _mm256_sub_ps(x, ones); \ - } - -#define AVX_RELU(isa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - return _mm256_max_ps(x, _mm256_setzero_ps()); \ - } - -#define AVX_IDENTITY(isa) \ - template <> \ - __m256 AVXActImpl::Compute(__m256 x) const { \ - return x; \ - } - -#define FOR_EACH_AVX_ISA(macro_) \ - macro_(jit::avx); \ - macro_(jit::avx2); \ - macro_(jit::avx512f) - -FOR_EACH_AVX_ISA(AVX_RELU); -FOR_EACH_AVX_ISA(AVX_IDENTITY); - -AVX_SIGMOID(jit::avx, detail::ExpAVX); -AVX_TANH(jit::avx, detail::ExpAVX); - -#ifdef __AVX2__ -AVX_SIGMOID(jit::avx2, detail::ExpAVX2); -AVX_SIGMOID(jit::avx512f, detail::ExpAVX2); -AVX_TANH(jit::avx2, detail::ExpAVX2); -AVX_TANH(jit::avx512f, detail::ExpAVX2); -#endif - -#undef FOR_EACH_AVX_ISA -#undef AVX_IDENTITY -#undef AVX_RELU -#undef AVX_TANH -#undef AVX_SIGMOID - -#endif - -template -static std::shared_ptr> GetActKernel( - const std::string& type, int n) { - if (type == "sigmoid") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "relu") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "tanh") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } else if (type == "identity" || type == "") { - return std::dynamic_pointer_cast>( - KernelPool::Instance().template Get>(n)); - } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} - -#ifdef __AVX__ -template -static std::unique_ptr GetAVXAct(const std::string& type) { - if (type == "sigmoid") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "relu") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "tanh") { - return std::unique_ptr(new AVXActImpl()); - } else if (type == "identity" || type == "") { - return std::unique_ptr(new AVXActImpl()); - } - PADDLE_THROW("Not support type: %s", type); - return nullptr; -} -#endif /* LSTM JitKernel */ template @@ -290,125 +160,73 @@ REGISTER_JITKERNEL_ARGS(lstm, LSTMKernel, JITKERNEL_DEFINE_NAME_LSTM, JITKERNEL_DECLARE_LSTM, JITKERNEL_FIND_KEY_LSTM, JITKERNEL_LSTM_IMPL); +#undef JITKERNEL_LSTM_IMPL +#undef JITKERNEL_FIND_KEY_LSTM +#undef JITKERNEL_DECLARE_LSTM +#undef JITKERNEL_DEFINE_NAME_LSTM + /* GRU JitKernel */ -template +template class GRUKernelImpl : public GRUKernel { public: - explicit GRUKernelImpl(const std::string& act_gate, - const std::string& act_state, int d) - : GRUKernel() { - d_ = d; - d2_ = d * 2; - act_gate_d2_ = GetActKernel(act_gate, d2_); - act_gate_d_ = GetActKernel(act_gate, d); - act_state_d_ = GetActKernel(act_state, d); - vmul_d_ = KernelPool::Instance().template Get>(d); - } - - void ComputeH1(T* gates, T* ht) const override { - act_gate_d_->Compute(gates, gates, d_); - act_state_d_->Compute(gates + d2_, gates + d2_, d_); - vmul_d_->Compute(gates, gates + d2_, ht, d_); - } - - void ComputeHtPart1(T* gates, const T* ht_1, T* ht) const override { - // W: {W_update, W_reset; W_state} - act_gate_d2_->Compute(gates, gates, d2_); - vmul_d_->Compute(ht_1, gates + d_, ht, d_); + static inline std::string name(const gru_attr_t& attr) { + PADDLE_THROW("DType should be either float or double"); } - - void ComputeHtPart2(T* gates, const T* ht_1, T* ht) const override { - T* y = gates + d2_; - act_state_d_->Compute(y, y, d_); - // out = zt*ht~ + (1-zt)*ht_1 - for (int i = 0; i < d_; ++i) { - ht[i] = gates[i] * y[i] + (static_cast(1) - gates[i]) * ht_1[i]; - } + static inline bool useJIT(int d) { return false; } + static inline bool useMKL(int d) { return false; } + explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel() { + this->ComputeH1 = refer::GRUH1; + this->ComputeHtPart1 = refer::GRUHtPart1; + this->ComputeHtPart2 = refer::GRUHtPart2; } - - private: - int d_, d2_; - std::shared_ptr> act_gate_d2_, act_gate_d_, act_state_d_; - std::shared_ptr> vmul_d_; -#ifdef __AVX__ - std::unique_ptr avx_act_gate_, avx_act_state_; -#endif }; -#define INTRI8_FLOAT(isa) \ - template <> \ - GRUKernelImpl::GRUKernelImpl( \ - const std::string& act_gate, const std::string& act_state, int d) \ - : GRUKernel() { \ - avx_act_gate_ = GetAVXAct(act_gate); \ - avx_act_state_ = GetAVXAct(act_state); \ - } \ - template <> \ - void GRUKernelImpl::ComputeH1(float* gates, float* ht) \ - const { \ - __m256 u, s; \ - /* W: {W_update, W_reset; W_state} */ \ - u = _mm256_loadu_ps(gates); \ - s = _mm256_loadu_ps(gates + 16); \ - s = _mm256_mul_ps(avx_act_gate_->Compute(u), avx_act_state_->Compute(s)); \ - _mm256_storeu_ps(ht, s); \ - } \ - template <> \ - void GRUKernelImpl::ComputeHtPart1( \ - float* gates, const float* ht_1, float* ht) const { \ - /* not exactly equal the any implementation */ \ - __m256 r, ht0; \ - r = _mm256_loadu_ps(gates + 8); \ - ht0 = _mm256_loadu_ps(ht_1); \ - r = _mm256_mul_ps(avx_act_gate_->Compute(r), ht0); \ - _mm256_storeu_ps(ht, r); \ - } \ - template <> \ - void GRUKernelImpl::ComputeHtPart2( \ - float* gates, const float* ht_1, float* ht) const { \ - /* not exactly equal the any implementation */ \ - __m256 u, s, ht0; \ - u = _mm256_loadu_ps(gates); \ - s = _mm256_loadu_ps(gates + 16); \ - ht0 = _mm256_loadu_ps(ht_1); \ - u = avx_act_gate_->Compute(u); \ - s = _mm256_mul_ps(u, avx_act_state_->Compute(s)); \ - u = _mm256_sub_ps(_mm256_set1_ps(1.f), u); \ - u = _mm256_mul_ps(u, ht0); \ - u = _mm256_add_ps(s, u); \ - _mm256_storeu_ps(ht, u); \ - } - -#ifdef __AVX__ -INTRI8_FLOAT(jit::avx); -#endif -#ifdef __AVX2__ -INTRI8_FLOAT(jit::avx2); -#endif -#ifdef __AVX512F__ -INTRI8_FLOAT(jit::avx512f); -#endif - -#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ - template <> \ - std::shared_ptr> KernelPool::Get< \ - GRUKernel, const std::string&, const std::string&, int>( \ - const std::string& act_gate, const std::string& act_state, int d) - -#define JITKERNEL_KEY_GRU(ker_key, dtype_key) \ - #ker_key #dtype_key + std::to_string(d) + act_gate + act_state +#define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class) \ + template <> \ + std::string ker_class##Impl::name(const gru_attr_t& attr) { \ + std::string key(#ker_key "f"); \ + key += (attr.act_gate + attr.act_cand); \ + if (useJIT(attr.d)) { \ + /* only jit code need record d*/ \ + return key + "jit" + std::to_string(attr.d); \ + } else if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } \ + template <> \ + std::string ker_class##Impl::name(const gru_attr_t& attr) { \ + std::string key(#ker_key "d"); \ + /* jit code do not support double yet*/ \ + if (useMKL(attr.d)) { \ + return key + "mkl"; \ + } else { \ + return key + "any"; \ + } \ + } + +#define JITKERNEL_DECLARE_GRU(ker_class, ker_dtype) \ + template <> \ + std::shared_ptr> \ + KernelPool::Get, const gru_attr_t&>( \ + const gru_attr_t& attr) + +#define JITKERNEL_FIND_KEY_GRU(ker_class, ker_dtype) \ + std::string key = ker_class##Impl::name(attr) -#define JITKERNEL_NEW_GRU_IMPL(ker, dtype, isa, k) \ - p = std::dynamic_pointer_cast>( \ - std::make_shared>(act_gate, act_state, d)); +#define JITKERNEL_GRU_IMPL(ker, dtype) \ + p = std::dynamic_pointer_cast>( \ + std::make_shared>(attr)); -REGISTER_JITKERNEL_ARGS_DEPRECATED(gru, GRUKernel, JITKERNEL_DECLARE_GRU, - JITKERNEL_KEY_GRU, JITKERNEL_NEW_GRU_IMPL); +REGISTER_JITKERNEL_ARGS(gru, GRUKernel, JITKERNEL_DEFINE_NAME_GRU, + JITKERNEL_DECLARE_GRU, JITKERNEL_FIND_KEY_GRU, + JITKERNEL_GRU_IMPL); -#undef INTRI8_FLOAT -#undef JITKERNEL_NEW_GRU_IMPL -#undef JITKERNEL_KEY_GRU +#undef JITKERNEL_GRU_IMPL +#undef JITKERNEL_FIND_KEY_GRU #undef JITKERNEL_DECLARE_GRU +#undef JITKERNEL_DEFINE_NAME_GRU } // namespace jitkernel } // namespace math } // namespace operators From 6193dc76368f5f888d8270f938ec81b78e06ffdd Mon Sep 17 00:00:00 2001 From: peizhilin Date: Wed, 21 Nov 2018 23:17:26 +0800 Subject: [PATCH 087/684] test=develop --- CMakeLists.txt | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5325e3034c854..bc2ac2cd93969 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -139,6 +139,10 @@ if (WIN32) "Disable MKL when compiling for Windows" FORCE) set(WITH_DISTRIBUTE OFF CACHE STRING "Disable DISTRIBUTE when compiling for Windows" FORCE) + set(WITH_C_API OFF CACHE STRING + "Disable C_API when compiling for Windows" FORCE) + set(WITH_FLUID_ONLY ON CACHE STRING + "Enable FLUID_ONLY when compiling for Windows" FORCE) endif() set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING From 6eba5bd276a8d79d5611ec42db0c47273fb4950c Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Wed, 21 Nov 2018 15:32:25 +0000 Subject: [PATCH 088/684] Fix direct copy and refine split ut test=develop --- .../tensorrt/convert/test_split_op.cc | 55 ++++++++++++++----- .../tensorrt/plugin/split_op_plugin.cu | 7 ++- 2 files changed, 46 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index f81d011552c15..23909378dde06 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -20,30 +20,59 @@ namespace paddle { namespace inference { namespace tensorrt { -TEST(split_op, test) { +template +void TensorRTSplitTest(const std::vector &in_shape, + const std::vector §ions) { std::unordered_set parameters({""}); framework::Scope scope; - TRTConvertValidation validator(10, parameters, scope, 1000); - validator.DeclInputVar("split_input", nvinfer1::DimsCHW(3, 2, 2)); - validator.DeclOutputVar("split_out1", nvinfer1::DimsCHW(2, 2, 2)); - validator.DeclOutputVar("split_out2", nvinfer1::DimsCHW(1, 2, 2)); + TRTConvertValidation validator(BatchSize + 1, parameters, scope, 10000); + + auto make_dim = [](const std::vector &shape) { + nvinfer1::DimsCHW dim; + dim.c() = shape[0]; + dim.h() = shape[1]; + dim.w() = shape[2]; + return dim; + }; + validator.DeclInputVar("split_input", make_dim(in_shape)); + std::vector output_vars; + for (size_t i = 0; i < sections.size(); ++i) { + auto out_shape = in_shape; + out_shape[Axis - 1] = sections[i]; + std::string output_name = "split_out" + std::to_string(i); + validator.DeclOutputVar(output_name, make_dim(out_shape)); + output_vars.push_back(output_name); + } // Prepare Op description framework::OpDesc desc; desc.SetType("split"); desc.SetInput("X", {"split_input"}); - desc.SetOutput("Out", {"split_out1", "split_out2"}); + desc.SetOutput("Out", output_vars); - int num = 0; - int axis = 1; - std::vector output_lengths = {2, 1}; - desc.SetAttr("axis", axis); - desc.SetAttr("num", num); - desc.SetAttr("sections", output_lengths); + desc.SetAttr("axis", Axis); + desc.SetAttr("num", 0); + desc.SetAttr("sections", sections); validator.SetOp(*desc.Proto()); - validator.Execute(1); + validator.Execute(BatchSize); +} + +TEST(split_op, test_same_shape_batch1) { + TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch1) { + TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); +} + +TEST(split_op, test_same_shape_batch10) { + TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); +} + +TEST(split_op, test_different_shape_batch10) { + TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); } } // namespace tensorrt diff --git a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu index 1ec0753e9fb01..de61ace59e299 100644 --- a/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/split_op_plugin.cu @@ -138,11 +138,12 @@ inline void Split(cudaStream_t stream, const bool same_shape, int SplitPlugin::enqueue(int batchSize, const void* const* inputs, void** outputs, void* workspace, cudaStream_t stream) { float const* input_ptr = reinterpret_cast(inputs[0]); - if (axis_ == -1 && this->getNbOutputs() < 10) { + if (((batchSize == 1 && axis_ == 0) || axis_ == -1) && + this->getNbOutputs() < 10) { float** output_ptrs = reinterpret_cast(outputs); int data_type_size = (this->getDataType() == nvinfer1::DataType::kFLOAT) - ? sizeof(__half) - : sizeof(float); + ? sizeof(float) + : sizeof(__half); for (int i = 0; i < this->getNbOutputs(); ++i) { PADDLE_ENFORCE( cudaMemcpyAsync( From 57a18e32a18232b65920a8ecb0ea014453bbdf7a Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 04:26:13 +0000 Subject: [PATCH 089/684] test=develop --- paddle/fluid/operators/hierarchical_sigmoid_op.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 35a1de3e1917f..418fe86f69fff 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -43,9 +43,7 @@ std::vector cal_rows(const framework::LoDTensor* path) { } } } - for (std::set::iterator it = tmp.begin(); it != tmp.end(); ++it) { - rows.push_back(*it); - } + rows.assign(tmp.begin(), tmp.end()); return rows; } From 3912545ffec3ea5a850420f0a804afadc9f0352a Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Thu, 22 Nov 2018 04:30:19 +0000 Subject: [PATCH 090/684] add dlpack support test=develop --- CMakeLists.txt | 1 + cmake/external/dlpack.cmake | 31 +++++ paddle/fluid/framework/CMakeLists.txt | 3 + paddle/fluid/framework/dlpack_tensor.cc | 127 +++++++++++++++++++ paddle/fluid/framework/dlpack_tensor.h | 45 +++++++ paddle/fluid/framework/dlpack_tensor_test.cc | 113 +++++++++++++++++ 6 files changed, 320 insertions(+) create mode 100644 cmake/external/dlpack.cmake create mode 100644 paddle/fluid/framework/dlpack_tensor.cc create mode 100644 paddle/fluid/framework/dlpack_tensor.h create mode 100644 paddle/fluid/framework/dlpack_tensor_test.cc diff --git a/CMakeLists.txt b/CMakeLists.txt index c62cc9bfd70d7..b6ae241272bae 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -190,6 +190,7 @@ include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) include(external/xxhash) # download xxhash +include(external/dlpack) if (NOT WIN32) # there is no official support of snappystream, warpctc, nccl, cupti in windows diff --git a/cmake/external/dlpack.cmake b/cmake/external/dlpack.cmake new file mode 100644 index 0000000000000..94d8fcc668556 --- /dev/null +++ b/cmake/external/dlpack.cmake @@ -0,0 +1,31 @@ +include(ExternalProject) + +set(DLPACK_SOURCE_DIR ${THIRD_PARTY_PATH}/dlpack) +set(DLPACK_INCLUDE_DIR ${DLPACK_SOURCE_DIR}/src/extern_dlpack/include) + +include_directories(${DLPACK_INCLUDE_DIR}) + +ExternalProject_Add( + extern_dlpack + ${EXTERNAL_PROJECT_LOG_ARGS} + GIT_REPOSITORY "https://github.com/dmlc/dlpack.git" + GIT_TAG "v0.2" + PREFIX ${DLPACK_SOURCE_DIR} + UPDATE_COMMAND "" + CONFIGURE_COMMAND "" + BUILD_COMMAND "" + INSTALL_COMMAND "" + TEST_COMMAND "" +) + +if(${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/dlpack_dummy.c) + file(WRITE ${dummyfile} "const char *dummy = \"${dummyfile}\";") + add_library(dlpack STATIC ${dummyfile}) +else() + add_library(dlpack INTERFACE) +endif() + +add_dependencies(dlpack extern_dlpack) + +LIST(APPEND externl_project_dependencies dlpack) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index cb9057672cc2c..d7d7834b49ec5 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -205,3 +205,6 @@ cc_test(tuple_test SRCS tuple_test.cc ) if (NOT WIN32) cc_test(rw_lock_test SRCS rw_lock_test.cc) endif (NOT WIN32) + +cc_library(dlpack_tensor SRCS dlpack_tensor.cc DEPS tensor dlpack) +cc_test(dlpack_tensor_test SRCS dlpack_tensor_test.cc DEPS dlpack_tensor glog) diff --git a/paddle/fluid/framework/dlpack_tensor.cc b/paddle/fluid/framework/dlpack_tensor.cc new file mode 100644 index 0000000000000..04e3f78afe44b --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.cc @@ -0,0 +1,127 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" + +namespace paddle { +namespace framework { + +namespace internal { +template +static ::DLDataType GetDLDataTypeCode() { + ::DLDataType dtype; + if (std::is_same::value || + std::is_floating_point::value) { + dtype.code = kDLFloat; + } else if (std::is_unsigned::value) { + dtype.code = kDLUInt; + } else if (std::is_integral::value) { + dtype.code = kDLInt; + } else { + PADDLE_THROW("Unsupported data type %s", typeid(T).name()); + } + dtype.bits = 8 * sizeof(T); + dtype.lanes = 1; + return dtype; +} + +static DLDataType GetDLDataTypeFromTypeIndex(const std::type_index &type) { +#define REG_DL_DATA_TYPE(type) \ + { std::type_index(typeid(type)), GetDLDataTypeCode() } + static const std::unordered_map + type_to_dtype_map({ + REG_DL_DATA_TYPE(platform::float16), // NOLINT + REG_DL_DATA_TYPE(float), // NOLINT + REG_DL_DATA_TYPE(double), // NOLINT + REG_DL_DATA_TYPE(int), // NOLINT + REG_DL_DATA_TYPE(int64_t), // NOLINT + REG_DL_DATA_TYPE(bool), // NOLINT + REG_DL_DATA_TYPE(size_t), // NOLINT + REG_DL_DATA_TYPE(int16_t), // NOLINT + REG_DL_DATA_TYPE(uint8_t), // NOLINT + REG_DL_DATA_TYPE(int8_t) // NOLINT + }); + static auto type_to_dtype_map_end_it = type_to_dtype_map.end(); + auto it = type_to_dtype_map.find(type); + PADDLE_ENFORCE(it != type_to_dtype_map_end_it, "Unsupported data type %s", + type.name()); + return it->second; +#undef REG_DL_DATA_TYPE +} + +struct DLContextVisitor : public boost::static_visitor<::DLContext> { + inline ::DLContext operator()(const platform::CPUPlace &place) const { + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + return ctx; + } + + inline ::DLContext operator()(const platform::CUDAPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLGPU; + ctx.device_id = place.device; + return ctx; +#else + PADDLE_THROW("platform::CUDAPlace is not supported in CPU only version"); +#endif + } + + inline ::DLContext operator()(const platform::CUDAPinnedPlace &place) const { +#ifdef PADDLE_WITH_CUDA + DLContext ctx; + ctx.device_type = kDLCPUPinned; + ctx.device_id = 0; + return ctx; +#else + PADDLE_THROW( + "platform::CUDAPinnedPlace is not supported in CPU only version"); +#endif + } +}; +} // namespace internal + +DLPackTensor::DLPackTensor(const Tensor &tensor, LaneType lanes) { + // init data, data buffer + t_.data = const_cast(tensor.data()); + + // init ctx, DLContext type with device_type and device_id + auto place = tensor.place(); + t_.ctx = boost::apply_visitor(internal::DLContextVisitor(), place); + + // init dtype + t_.dtype = internal::GetDLDataTypeFromTypeIndex(tensor.type()); + t_.dtype.lanes = lanes; + + // init ndim, tensor rank + auto &dims = tensor.dims(); + using DimType = decltype(t_.ndim); // int + t_.ndim = static_cast(dims.size()); + + // init shape, tensor dims + t_.shape = shape_; + for (DimType i = 0; i < t_.ndim; ++i) { + t_.shape[i] = dims[i]; + } + + // init strides, nullptr means the tensor is compact + t_.strides = nullptr; + + // init byte_offset + t_.byte_offset = 0; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor.h b/paddle/fluid/framework/dlpack_tensor.h new file mode 100644 index 0000000000000..0c52bce1ef6af --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor.h @@ -0,0 +1,45 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace framework { + +class DLPackTensor { + public: + using LaneType = decltype(::DLTensor::dtype.lanes); // uint16_t + using ShapeType = + std::remove_reference::type; // int64_t + + // lanes is only used in CPU to enable vectorization + explicit DLPackTensor(const Tensor& tensor, LaneType lanes = 1); + + inline operator const ::DLTensor&() const { return t_; } + + inline operator ::DLTensor&() { return t_; } + + private: + ::DLTensor t_; + + // The shape in DLTensor is defined as int64_t* + // Add this member to make TVMTensor init without heap allocation + ShapeType shape_[9]; +}; + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/dlpack_tensor_test.cc b/paddle/fluid/framework/dlpack_tensor_test.cc new file mode 100644 index 0000000000000..938b05635004f --- /dev/null +++ b/paddle/fluid/framework/dlpack_tensor_test.cc @@ -0,0 +1,113 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/dlpack_tensor.h" +#include +#include +#include + +namespace paddle { +namespace framework { + +namespace { // NOLINT +template +constexpr uint8_t GetDLDataTypeCode() { + return std::is_same::value || + std::is_floating_point::value + ? static_cast(kDLFloat) + : (std::is_unsigned::value + ? static_cast(kDLUInt) + : (std::is_integral::value ? static_cast(kDLInt) + : static_cast(-1))); +} +} // NOLINT + +template +void TestMain(const platform::Place &place, uint16_t lanes) { + DDim dims{4, 5, 6, 7}; + Tensor tensor; + tensor.Resize(dims); + void *p = tensor.mutable_data(place); + + DLPackTensor dlpack_tensor(tensor, lanes); + ::DLTensor &dl_tensor = dlpack_tensor; + + CHECK_EQ(p, dl_tensor.data); + if (platform::is_cpu_place(place)) { + CHECK_EQ(kDLCPU, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else if (platform::is_gpu_place(place)) { + CHECK_EQ(kDLGPU, dl_tensor.ctx.device_type); + CHECK_EQ(boost::get(place).device, + dl_tensor.ctx.device_id); + } else if (platform::is_cuda_pinned_place(place)) { + CHECK_EQ(kDLCPUPinned, dl_tensor.ctx.device_type); + CHECK_EQ(0, dl_tensor.ctx.device_id); + } else { + CHECK_EQ(false, true); + } + + CHECK_EQ(dims.size(), dl_tensor.ndim); + for (auto i = 0; i < dims.size(); ++i) { + CHECK_EQ(dims[i], dl_tensor.shape[i]); + } + + CHECK_EQ(dl_tensor.strides == nullptr, true); + CHECK_EQ(static_cast(0), dl_tensor.byte_offset); + + CHECK_EQ(lanes, dl_tensor.dtype.lanes); + CHECK_EQ(sizeof(T) * 8, dl_tensor.dtype.bits); + + CHECK_EQ(GetDLDataTypeCode(), dl_tensor.dtype.code); +} + +template +void TestMainLoop() { +#ifdef PADDLE_WITH_CUDA + std::vector places{platform::CPUPlace(), + platform::CUDAPlace(0), + platform::CUDAPinnedPlace()}; + if (platform::GetCUDADeviceCount() > 1) { + places.emplace_back(platform::CUDAPlace(1)); + } +#else + std::vector places{platform::CPUPlace()}; +#endif + std::vector lanes{1, 2}; + for (auto &p : places) { + for (auto &l : lanes) { + TestMain(p, l); + } + } +} + +#define PADDLE_DLPACK_TEST(type) \ + TEST(dlpack, test_##type) { TestMainLoop(); } + +using float16 = platform::float16; +PADDLE_DLPACK_TEST(float16); +PADDLE_DLPACK_TEST(float); +PADDLE_DLPACK_TEST(double); +PADDLE_DLPACK_TEST(int); +PADDLE_DLPACK_TEST(int64_t); +PADDLE_DLPACK_TEST(bool); +PADDLE_DLPACK_TEST(size_t); +PADDLE_DLPACK_TEST(int16_t); +PADDLE_DLPACK_TEST(uint8_t); +PADDLE_DLPACK_TEST(int8_t); + +#undef PADDLE_DLPACK_TEST + +} // namespace framework +} // namespace paddle From 982e48922020e8d5f3ddcfc682068fcbdc5b7fe2 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 06:26:04 +0000 Subject: [PATCH 091/684] test=develop --- python/paddle/fluid/layers/nn.py | 5 +++-- python/paddle/fluid/tests/unittests/test_layers.py | 6 ++++++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 99acd7e30884b..32d411b8309d9 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2139,8 +2139,9 @@ def pool2d(input, input tensor is NCHW, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - pool_size (int): The side length of pooling windows. All pooling - windows are squares with pool_size on a side. + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple, + it must contain two integers, (pool_size_Height, pool_size_Width). + Otherwise, the pool kernel size will be a square of an int. pool_type: ${pooling_type_comment} pool_stride (int): stride of the pooling layer. pool_padding (int): padding size. diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index a8fa5436c43d2..c4310fe006766 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -202,6 +202,12 @@ def test_sequence_unpad(self): self.assertIsNotNone(layers.sequence_unpad(x=x, length=length)) print(str(program)) + def test_pool2d(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') + self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + def test_lstm_unit(self): program = Program() with program_guard(program): From 1adda8e06c075d55edcc6aa50804eab62b903f72 Mon Sep 17 00:00:00 2001 From: hjchen2 Date: Thu, 22 Nov 2018 06:53:16 +0000 Subject: [PATCH 092/684] Add more unit tests for split plugin test=develop --- .../inference/tensorrt/convert/split_op.cc | 13 ++--- .../tensorrt/convert/test_split_op.cc | 47 ++++++++++++++++--- 2 files changed, 43 insertions(+), 17 deletions(-) diff --git a/paddle/fluid/inference/tensorrt/convert/split_op.cc b/paddle/fluid/inference/tensorrt/convert/split_op.cc index 871354267e936..ae5b1b98060a4 100644 --- a/paddle/fluid/inference/tensorrt/convert/split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/split_op.cc @@ -19,9 +19,6 @@ namespace paddle { namespace inference { namespace tensorrt { -/* - * SplitOp. - */ class SplitOpConverter : public OpConverter { public: void operator()(const framework::proto::OpDesc& op, @@ -40,15 +37,11 @@ class SplitOpConverter : public OpConverter { int axis = boost::get(op_desc.GetAttr("axis")); std::vector output_lengths = boost::get>(op_desc.GetAttr("sections")); - // PADDLE_ENFORCE(axis != 0); - if (axis < 0) { - axis += input_dims.nbDims; - } else { - axis -= 1; - } + // split on batch is not supported in TensorRT + PADDLE_ENFORCE(axis != 0); + axis += (axis < 0) ? input_dims.nbDims : -1; PADDLE_ENFORCE(output_lengths.size() == output_num); - // plugin::SplitPlugin* plugin = new plugin::SplitPlugin(axis, output_lengths); nvinfer1::IPluginLayer* layer = engine_->AddPlugin(&input, input_num, plugin); diff --git a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc index 23909378dde06..5aacc5c600dd1 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_split_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_split_op.cc @@ -59,21 +59,54 @@ void TensorRTSplitTest(const std::vector &in_shape, validator.Execute(BatchSize); } -TEST(split_op, test_same_shape_batch1) { +// batch = 0, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch1) { TensorRTSplitTest<1, 1>({4, 2, 2}, {2, 2}); } - -TEST(split_op, test_different_shape_batch1) { +// batch = 0, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch1) { TensorRTSplitTest<1, 1>({3, 2, 2}, {2, 1}); } - -TEST(split_op, test_same_shape_batch10) { +// batch = 10, axis = 1, same shape +TEST(split_op, test_same_shape_axis1_batch10) { TensorRTSplitTest<10, 1>({4, 2, 2}, {2, 2}); } - -TEST(split_op, test_different_shape_batch10) { +// batch = 10, axis = 1, different shape +TEST(split_op, test_different_shape_axis1_batch10) { TensorRTSplitTest<10, 1>({3, 2, 2}, {2, 1}); } +// batch = 0, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 4, 2}, {2, 2}); +} +// batch = 0, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch1) { + TensorRTSplitTest<1, 2>({3, 3, 2}, {2, 1}); +} +// batch = 10, axis = 2, same shape +TEST(split_op, test_same_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 4, 2}, {2, 2}); +} +// batch = 10, axis = 2, different shape +TEST(split_op, test_different_shape_axis2_batch10) { + TensorRTSplitTest<10, 2>({3, 3, 2}, {2, 1}); +} +// batch = 0, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 4}, {2, 2}); +} +// batch = 0, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch1) { + TensorRTSplitTest<1, 3>({3, 2, 3}, {2, 1}); +} +// batch = 10, axis = 3, same shape +TEST(split_op, test_same_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 4}, {2, 2}); +} +// batch = 10, axis = 3, different shape +TEST(split_op, test_different_shape_axis3_batch10) { + TensorRTSplitTest<10, 3>({3, 2, 3}, {2, 1}); +} } // namespace tensorrt } // namespace inference From e3b61cf52b88b1350de8776afcfd8e5ae348e164 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 22 Nov 2018 08:24:01 +0000 Subject: [PATCH 093/684] init gru jitcode and fix lstm jitcode test=develop --- paddle/fluid/operators/math/jit_code.cc | 36 ++++- paddle/fluid/operators/math/jit_code.h | 140 ++++++++++++++---- paddle/fluid/operators/math/jit_kernel_rnn.cc | 36 ++++- 3 files changed, 170 insertions(+), 42 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index ccc9206f5cda8..03b67238fe7eb 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -214,6 +214,9 @@ void VActJitCode::generate() { bool LSTMJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } void LSTMJitCode::generate() { + if (use_peephole_) { + preCode(); + } reg64_t reg_ptr_gates = rax; reg64_t reg_ptr_ct_1 = r9; reg64_t reg_ptr_ct = r10; @@ -224,18 +227,19 @@ void LSTMJitCode::generate() { mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); int offset = 0; + int d = num_ * sizeof(float); for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { /* C_t = C_t-1 * fgated + cand_gated * igated*/ // c vmovups(ymm_src, ptr[reg_ptr_gates + offset]); act(ymm_c, ymm_src, act_cand_); // i - vmovups(ymm_src, ptr[reg_ptr_gates + offset + num_]); + vmovups(ymm_src, ptr[reg_ptr_gates + offset + d]); act(ymm_i, ymm_src, act_gate_); vmulps(ymm_c, ymm_c, ymm_i); if (!compute_c1h1_) { // f - vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * num_]); + vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * d]); act(ymm_f, ymm_src, act_gate_); vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]); vmulps(ymm_f, ymm_f, ymm_i); @@ -245,20 +249,36 @@ void LSTMJitCode::generate() { ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f; ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c; ymm_t ymm_tmp = ymm_i; + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); // save ct act(ymm_tmp, ymm_ct, act_cell_); - vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * num_]); + vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * d]); act(ymm_o, ymm_src, act_gate_); vmulps(ymm_o, ymm_tmp, ymm_o); - // save ct and ht - vmovups(ptr[reg_ptr_ct + offset], ymm_ct); - vmovups(ptr[reg_ptr_ht + offset], ymm_o); - + vmovups(ptr[reg_ptr_ht + offset], ymm_o); // save ht offset += sizeof(float) * YMM_FLOAT_BLOCK; } - ret(); + if (use_peephole_) { + postCode(); + } else { + ret(); + } } +bool GRUJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } + +void GRUJitCode::generate() { + reg64_t reg_ptr_gates = rax; + reg64_t reg_ptr_ct_1 = r9; + reg64_t reg_ptr_ct = r10; + reg64_t reg_ptr_ht = r11; + mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); + mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); + mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + + ret(); +} } // namespace gen } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index bf28d444b7712..403cea399102c 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -302,6 +302,34 @@ class VActJitCode : public JitCode { pop(reg_ptr_global); } + template + void act(JMM& dst, JMM& src, operand_type type) { // NOLINT + // use 15 + JMM zero = JMM(15); + if (type_ == operand_type::relu) { + vxorps(zero, zero, zero); + } + switch (type) { + case operand_type::relu: + relu_jmm(dst, src, zero); + break; + case operand_type::exp: + exp_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::sigmoid: + sigmoid_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::tanh: + tanh_jmm(dst, src, 2, 3, 4, 5); + break; + case operand_type::identity: + break; + default: + // throw error + break; + } + } + protected: int num_; operand_type type_; @@ -386,44 +414,94 @@ class LSTMJitCode : public VActJitCode { operand_type act_cand_; operand_type act_cell_; reg64_t param1{abi_param1}; - xmm_t xmm_src = xmm_t(0); xmm_t xmm_c = xmm_t(1); - xmm_t xmm_i = xmm_t(2); - xmm_t xmm_f = xmm_t(3); + xmm_t xmm_i = xmm_t(6); + xmm_t xmm_f = xmm_t(7); ymm_t ymm_src = ymm_t(0); - ymm_t ymm_c = ymm_t(1); - ymm_t ymm_i = ymm_t(2); - ymm_t ymm_f = ymm_t(3); + ymm_t ymm_c = ymm_t(1); // 2~5 for act + ymm_t ymm_i = ymm_t(6); + ymm_t ymm_f = ymm_t(7); +}; - template - void act(JMM& dst, JMM& src, operand_type type) { // NOLINT - // use 15 - JMM zero = JMM(15); - if (type_ == operand_type::relu) { - vxorps(zero, zero, zero); - } - switch (type) { - case operand_type::relu: - relu_jmm(dst, src, zero); - break; - case operand_type::exp: - exp_jmm(dst, src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(dst, src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(dst, src, 2, 3, 4, 5); - break; - case operand_type::identity: - break; - default: - // throw error - break; +class GRUJitCode : public VActJitCode { + public: + const char* name() const override { + std::string base = "GRUJitCode"; + if (id_ == 0) { + base += "_H1"; + } else if (id_ == 1) { + base += "_HtPart1"; + } else if (id_ == 2) { + base += "_HtPart2"; } + auto AddTypeStr = [&](operand_type type) { + switch (type) { + case operand_type::relu: + base += "_Relu"; + break; + case operand_type::exp: + base += "_Exp"; + break; + case operand_type::sigmoid: + base += "_Sigmoid"; + break; + case operand_type::tanh: + base += "_Tanh"; + break; + case operand_type::identity: + base += "_Identity"; + break; + default: + break; + } + }; + AddTypeStr(act_gate_); + AddTypeStr(act_cand_); + return base.c_str(); } + + explicit GRUJitCode(int id, const gru_attr_t& attr, + size_t code_size = 256 * 1024, void* code_ptr = nullptr) + : VActJitCode(attr.d, operand_type::sigmoid /* this is bugy*/, code_size, + code_ptr), + id_(id) { + auto typeExchange = [](const std::string& type) -> gen::operand_type { + if (type == "sigmoid") { + return operand_type::sigmoid; + } else if (type == "relu") { + return operand_type::relu; + } else if (type == "tanh") { + return operand_type::tanh; + } else if (type == "identity" || type == "") { + return operand_type::identity; + } // else throw error + return operand_type::identity; + }; + num_ = attr.d; + act_gate_ = typeExchange(attr.act_gate); + act_cand_ = typeExchange(attr.act_cand); + } + static bool init(int d); + void generate() override; + + protected: + int id_; + int num_; + operand_type act_gate_; + operand_type act_cand_; + reg64_t param1{abi_param1}; + + xmm_t xmm_src = xmm_t(0); + xmm_t xmm_c = xmm_t(1); + xmm_t xmm_i = xmm_t(6); + xmm_t xmm_f = xmm_t(7); + + ymm_t ymm_src = ymm_t(0); + ymm_t ymm_c = ymm_t(1); + ymm_t ymm_i = ymm_t(6); + ymm_t ymm_f = ymm_t(7); }; #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index dbfd212e6e7b2..e571d8adf4af6 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -40,7 +40,7 @@ class LSTMKernelImpl : public LSTMKernel { explicit LSTMKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(attr.d)) { - size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); this->ComputeCtHt = jitcode0_->getCode(); @@ -66,7 +66,7 @@ class LSTMKernelImpl : public LSTMKernel { #ifdef PADDLE_WITH_XBYAK template <> bool LSTMKernelImpl::useJIT(int d) { - return false; // not ready yet gen::LSTMJitCode::init(d); + return gen::LSTMJitCode::init(d); } #endif @@ -82,7 +82,7 @@ class PeepholeKernelImpl : public LSTMKernel { explicit PeepholeKernelImpl(const lstm_attr_t& attr) : LSTMKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(attr.d)) { - size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 4 * 8; jitcode0_.reset(new gen::LSTMJitCode(false, attr, sz > 4096 ? sz : 4096)); this->ComputeCtHt = jitcode0_->getCode(); @@ -175,12 +175,42 @@ class GRUKernelImpl : public GRUKernel { static inline bool useJIT(int d) { return false; } static inline bool useMKL(int d) { return false; } explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel() { +#ifdef PADDLE_WITH_XBYAK + if (useJIT(attr.d)) { + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096)); + this->ComputeH1 = + jitcode0_->getCode(); + + jitcode1_.reset(new gen::GRUJitCode(1, attr, sz > 4096 ? sz : 4096)); + this->ComputeHtPart1 = + jitcode1_->getCode(); + + jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096)); + this->ComputeHtPart2 = + jitcode1_->getCode(); + return; + } +#endif this->ComputeH1 = refer::GRUH1; this->ComputeHtPart1 = refer::GRUHtPart1; this->ComputeHtPart2 = refer::GRUHtPart2; } +#ifdef PADDLE_WITH_XBYAK + + private: + std::unique_ptr jitcode0_{nullptr}, jitcode1_{nullptr}, + jitcode2_{nullptr}; +#endif }; +#ifdef PADDLE_WITH_XBYAK +template <> +bool GRUKernelImpl::useJIT(int d) { + return false; // jitcode not ready yet +} +#endif + #define JITKERNEL_DEFINE_NAME_GRU(ker_key, ker_class) \ template <> \ std::string ker_class##Impl::name(const gru_attr_t& attr) { \ From e0b48f7e29fced72f439896fed46b76adc945035 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 22 Nov 2018 16:44:15 +0800 Subject: [PATCH 094/684] init lookup remote table --- .../distributed_ops/lookup_remote_table.h | 192 ++++++++++++++++++ 1 file changed, 192 insertions(+) create mode 100644 paddle/fluid/operators/distributed_ops/lookup_remote_table.h diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table.h new file mode 100644 index 0000000000000..5b066c8196108 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table.h @@ -0,0 +1,192 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include // NOLINT +#include +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/distributed_ops/send_recv_util.h" + +namespace paddle { +namespace operators { +namespace distributed { + +inline size_t GetSectionIndex(int64_t id, const std::vector& abs_sections) { + for (size_t i = 1; i < abs_sections.size(); ++i) { + if (row < abs_sections[i]) { + return i - 1; + } + } + return abs_sections.size() - 1; +} + +inline std::vector ToAbsoluteSection( + const std::vector& height_sections) { + std::vector abs_sections; + abs_sections.resize(height_sections.size()); + abs_sections[0] = 0; + for (size_t i = 1; i < height_sections.size(); ++i) { + abs_sections[i] = height_sections[i - 1] + abs_sections[i - 1]; + } + return abs_sections; +} + +inline std::vector> SplitIds( + const std::string& id_name, + const std::vector& height_section, + framework::Scope* scope) { + auto& id_tensor = scope->Var(id_name)->Get(); + auto* id_data = id_tensor.data(); + std::set all_ids; + for (size_t i = 0; i < id_tensor.numel(); ++i) { + all_ids.insert(id_data[i]); + } + auto abs_sections = ToAbsoluteSection(height_section); + std::vector> splited_ids; + splited_ids.resize(height_section.size() + 1); + for (auto& id : all_ids) { + auto section_index = GetSectionIndex(id); + splited_ids[section_index].push_back(id - abs_sections[section_index]); + } +} + +inline void SplitIdsIntoMultipleVarsBySection( + const std::string& id_name, + const std::vector& in_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { + PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, ""); + + auto place = platform::CPUPlace(); + + for (size_t i = 0; i < in_var_names.size(); ++i) { + auto* id_tensor = scope->Var(in_var_names[i])->GetMutable(); + auto& ids = splited_ids[i]; + if (!ids.empty()) { + auto* id_tensor_data = id_tensor->mutable_data(framework::make_ddim({ids.size(), 1}), place); + memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); + } + } +} + +inline void MergeMultipleVarsIntoOnBySection( + const std::string& id_name, + const std::string& out_name, + const std::vector& out_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { + PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, ""); + + auto cpu_place = platform::CPUPlace(); + + auto abs_sections = ToAbsoluteSection(height_section); + auto& id_tensor = scope->Var(id_name)->Get(); + auto* id_data = id_tensor.data(); + std::unordered_map> id_to_offset; + for (size_t i = 0; i < id_tensor.numel(); ++i) { + id_to_offset[id_data[i]].push_back(i); + } + + auto& out_tensor = scope->Var(out_name)->Get(); + auto* out_tensor_data = out_tensor.mutable_data(); + + for (size_t section_idx = 0; section_idx < out_var_names.size(); ++section_idx) { + auto& ids_in_this_section = splited_ids[section_idx]; + auto& prefetch_out_var = scope->Var(out_var_names[section_idx])->Get(); + const auto* out_var_data = prefetch_out_var.mutable_data(); + auto& dims = prefetch_out_var.dims(); + + PADDLE_ENFORCE_EQ(dims.size(), 2, ""); + PADDLE_ENFORCE_EQ(ids_in_this_section.size(), dims[0]); + + auto row_numel = dims[1]; + + for (size_t i = 0; i < dims[0]; ++i) { + auto id = ids_in_this_section[i]; + auto origin_id = id + abs_sections[section_idx]; + auto& offsets = id_to_offset[origin_id]; + for (auto& offset : offsets) { + // should support GPU tensor + memory::Copy(cpu_place, out_tensor_data + offset * row_numel, + cpu_place, out_var_data + i * grad_row_numel, + sizeof(T) * grad_row_numel); + } + } + } +} + +inline void prefetch( + const std::string& table_name, + const std::string& id_name, + const std::string& out_name, + const std::vector& epmap, + const std::vector& height_section, + const framework::Scope& scope, + const platform::Place& place) const { + + auto local_scope = scope.NewScope(); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + Attr("trainer_id")); + + std::vector in_var_names; + std::vector out_var_names; + for (size_t i = 0; i < epmap.size(); ++i) { + in_var_names.push_back(id_name + "@" + epmap[i]); + out_var_names.push_back(out_name + "@" + epmap[i]); + } + + auto splited_ids = SplitIds(id_name, height_section, local_scope); + SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section, splited_ids, local_scope); + + // create output var in local scope + for (auto& name : out_var_names) { + local_scope.Var(name)->GetMutable(); + } + + std::vector rets; + for (size_t i = 0; i < ins.size(); i++) { + if (NeedSend(local_scope, ins[i])) { + VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get " + << outs[i] << " back"; + rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, local_scope, + in_var_names[i], out_var_names[i])); + } else { + VLOG(30) << "don't send no-initialied variable: " << out_var_names[i]; + } + } + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + + MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, height_section, plited_ids, scope) + + scope.DeleteScope(local_scope); +} + +} // namespace distributed +} // namespace operators +} // namespace paddle From dd6fd4c747df9ad5ffdf0f6eef8ef3683df871cb Mon Sep 17 00:00:00 2001 From: tangwei12 Date: Thu, 22 Nov 2018 16:49:45 +0800 Subject: [PATCH 095/684] Utils for download and upload files with HDFS (#14473) * add hdfs utils * add hdfs utils * test=develop * update hdfs utils and add demo * fix multi_download return local files * test=develop * add sync multi upload, test=develop --- python/paddle/fluid/contrib/utils/__init__.py | 20 + .../paddle/fluid/contrib/utils/hdfs_utils.py | 505 ++++++++++++++++++ 2 files changed, 525 insertions(+) create mode 100644 python/paddle/fluid/contrib/utils/__init__.py create mode 100644 python/paddle/fluid/contrib/utils/hdfs_utils.py diff --git a/python/paddle/fluid/contrib/utils/__init__.py b/python/paddle/fluid/contrib/utils/__init__.py new file mode 100644 index 0000000000000..df6d367782327 --- /dev/null +++ b/python/paddle/fluid/contrib/utils/__init__.py @@ -0,0 +1,20 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import hdfs_utils +from .hdfs_utils import * + +__all__ = hdfs_utils.__all__ diff --git a/python/paddle/fluid/contrib/utils/hdfs_utils.py b/python/paddle/fluid/contrib/utils/hdfs_utils.py new file mode 100644 index 0000000000000..251665d85e166 --- /dev/null +++ b/python/paddle/fluid/contrib/utils/hdfs_utils.py @@ -0,0 +1,505 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""HDFS Utils""" + +import os +import subprocess +import multiprocessing +from datetime import datetime + +import re +import copy +import errno + +import logging + +__all__ = ["HDFSClient", "multi_download"] + +logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s') +_logger = logging.getLogger("hdfs_utils") +_logger.setLevel(logging.INFO) + + +class HDFSClient(object): + def __init__(self, hadoop_home, configs): + self.pre_commands = [] + hadoop_bin = '%s/bin/hadoop' % hadoop_home + self.pre_commands.append(hadoop_bin) + dfs = 'fs' + self.pre_commands.append(dfs) + + for k, v in configs.iteritems(): + config_command = '-D%s=%s' % (k, v) + self.pre_commands.append(config_command) + + def __run_hdfs_cmd(self, commands, retry_times=5): + whole_commands = copy.deepcopy(self.pre_commands) + whole_commands.extend(commands) + + print('Running system command: {0}'.format(' '.join(whole_commands))) + + ret_code = 0 + ret_out = None + ret_err = None + for x in range(retry_times + 1): + proc = subprocess.Popen( + whole_commands, stdout=subprocess.PIPE, stderr=subprocess.PIPE) + (output, errors) = proc.communicate() + ret_code, ret_out, ret_err = proc.returncode, output, errors + if ret_code: + _logger.warn( + 'Times: %d, Error running command: %s. Return code: %d, Error: %s' + % (x, ' '.join(whole_commands), proc.returncode, errors)) + else: + break + return ret_code, ret_out, ret_err + + def upload(self, hdfs_path, local_path, overwrite=False, retry_times=5): + """ + upload the local file to hdfs + args: + local_file_path: the local file path + remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) + return: + True or False + """ + assert hdfs_path is not None + assert local_path is not None and os.path.exists(local_path) + + if os.path.isdir(local_path): + _logger.warn( + "The Local path: {} is dir and I will support it later, return". + format(local_path)) + return + + base = os.path.basename(local_path) + if not self.is_exist(hdfs_path): + self.makedirs(hdfs_path) + else: + if self.is_exist(os.path.join(hdfs_path, base)): + if overwrite: + _logger.error( + "The HDFS path: {} is exist and overwrite is True, delete it". + format(hdfs_path)) + self.delete(hdfs_path) + else: + _logger.error( + "The HDFS path: {} is exist and overwrite is False, return". + format(hdfs_path)) + return False + + put_commands = ["-put", local_path, hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd(put_commands, + retry_times) + if returncode: + _logger.error("Put local path: {} to HDFS path: {} failed".format( + local_path, hdfs_path)) + return False + else: + _logger.info("Put local path: {} to HDFS path: {} successfully". + format(local_path, hdfs_path)) + return True + + def download(self, hdfs_path, local_path, overwrite=False, unzip=False): + """ + download from hdfs + args: + local_file_path: the local file path + remote_file_path: remote dir on hdfs + return: + True or False + """ + _logger.info('Downloading %r to %r.', hdfs_path, local_path) + _logger.info('Download of %s to %r complete.', hdfs_path, local_path) + + if not self.is_exist(hdfs_path): + print("HDFS path: {} do not exist".format(hdfs_path)) + return False + if self.is_dir(hdfs_path): + _logger.error( + "The HDFS path: {} is dir and I will support it later, return". + format(hdfs_path)) + + if os.path.exists(local_path): + base = os.path.basename(hdfs_path) + local_file = os.path.join(local_path, base) + if os.path.exists(local_file): + if overwrite: + os.remove(local_file) + else: + _logger.error( + "The Local path: {} is exist and overwrite is False, return". + format(local_file)) + return False + + self.make_local_dirs(local_path) + + download_commands = ["-get", hdfs_path, local_path] + returncode, output, errors = self.__run_hdfs_cmd(download_commands) + if returncode: + _logger.error("Get local path: {} from HDFS path: {} failed".format( + local_path, hdfs_path)) + return False + else: + _logger.info("Get local path: {} from HDFS path: {} successfully". + format(local_path, hdfs_path)) + return True + + def is_exist(self, hdfs_path=None): + """ + whether the remote hdfs path exists? + args: + remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) + fs_name: The default values are the same as in the job configuration + fs_ugi: The default values are the same as in the job configuration + return: + True or False + """ + exist_cmd = ['-test', '-e', hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd( + exist_cmd, retry_times=1) + + if returncode: + _logger.error("HDFS is_exist HDFS path: {} failed".format( + hdfs_path)) + return False + else: + _logger.info("HDFS is_exist HDFS path: {} successfully".format( + hdfs_path)) + return True + + def is_dir(self, hdfs_path=None): + """ + whether the remote hdfs path exists? + args: + remote_file_path: default value(${OUTPUT_PATH}/${SYS_USER_ID}/${SYS_JOB_ID}/tmp) + fs_name: The default values are the same as in the job configuration + fs_ugi: The default values are the same as in the job configuration + return: + True or False + """ + + if not self.is_exist(hdfs_path): + return False + + dir_cmd = ['-test', '-d', hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd(dir_cmd, retry_times=1) + + if returncode: + _logger.error("HDFS path: {} failed is not a directory".format( + hdfs_path)) + return False + else: + _logger.info("HDFS path: {} successfully is a directory".format( + hdfs_path)) + return True + + def delete(self, hdfs_path): + """Remove a file or directory from HDFS. + + :param hdfs_path: HDFS path. + :param recursive: Recursively delete files and directories. By default, + this method will raise an :class:`HdfsError` if trying to delete a + non-empty directory. + + This function returns `True` if the deletion was successful and `False` if + no file or directory previously existed at `hdfs_path`. + + """ + _logger.info('Deleting %r.', hdfs_path) + + if not self.is_exist(hdfs_path): + _logger.warn("HDFS path: {} do not exist".format(hdfs_path)) + return True + + if self.is_dir(hdfs_path): + del_cmd = ['-rmr', hdfs_path] + else: + del_cmd = ['-rm', hdfs_path] + + returncode, output, errors = self.__run_hdfs_cmd(del_cmd, retry_times=0) + + if returncode: + _logger.error("HDFS path: {} delete files failure".format( + hdfs_path)) + return False + else: + _logger.info("HDFS path: {} delete files successfully".format( + hdfs_path)) + return True + + def rename(self, hdfs_src_path, hdfs_dst_path, overwrite=False): + """Move a file or folder. + + :param hdfs_src_path: Source path. + :param hdfs_dst_path: Destination path. If the path already exists and is + a directory, the source will be moved into it. If the path exists and is + a file, or if a parent destination directory is missing, this method will + raise an :class:`HdfsError`. + + """ + assert hdfs_src_path is not None + assert hdfs_dst_path is not None + + if not self.is_exist(hdfs_src_path): + _logger.info("HDFS path do not exist: {}".format(hdfs_src_path)) + if self.is_exist(hdfs_dst_path) and not overwrite: + _logger.error("HDFS path is exist: {} and overwrite=False".format( + hdfs_dst_path)) + + rename_command = ['-mv', hdfs_src_path, hdfs_dst_path] + returncode, output, errors = self.__run_hdfs_cmd( + rename_command, retry_times=1) + + if returncode: + _logger.error("HDFS rename path: {} to {} failed".format( + hdfs_src_path, hdfs_dst_path)) + return False + else: + _logger.info("HDFS rename path: {} to {} successfully".format( + hdfs_src_path, hdfs_dst_path)) + return True + + @staticmethod + def make_local_dirs(local_path): + try: + os.makedirs(local_path) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + def makedirs(self, hdfs_path): + """Create a remote directory, recursively if necessary. + + :param hdfs_path: Remote path. Intermediate directories will be created + appropriately. + """ + _logger.info('Creating directories to %r.', hdfs_path) + assert hdfs_path is not None + + if self.is_exist(hdfs_path): + return + + mkdirs_commands = ['-mkdir', hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd( + mkdirs_commands, retry_times=1) + + if returncode: + _logger.error("HDFS mkdir path: {} failed".format(hdfs_path)) + return False + else: + _logger.error("HDFS mkdir path: {} successfully".format(hdfs_path)) + return True + + def ls(self, hdfs_path): + assert hdfs_path is not None + + if not self.is_exist(hdfs_path): + return [] + + ls_commands = ['-ls', hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd( + ls_commands, retry_times=1) + + if returncode: + _logger.error("HDFS list path: {} failed".format(hdfs_path)) + return [] + else: + _logger.info("HDFS list path: {} successfully".format(hdfs_path)) + + ret_lines = [] + regex = re.compile('\s+') + out_lines = output.strip().split("\n") + for line in out_lines: + re_line = regex.split(line) + if len(re_line) == 8: + ret_lines.append(re_line[7]) + return ret_lines + + def lsr(self, hdfs_path, only_file=True, sort=True): + def sort_by_time(v1, v2): + v1_time = datetime.strptime(v1[1], '%Y-%m-%d %H:%M') + v2_time = datetime.strptime(v2[1], '%Y-%m-%d %H:%M') + return v1_time > v2_time + + assert hdfs_path is not None + + if not self.is_exist(hdfs_path): + return [] + + ls_commands = ['-lsr', hdfs_path] + returncode, output, errors = self.__run_hdfs_cmd( + ls_commands, retry_times=1) + + if returncode: + _logger.error("HDFS list all files: {} failed".format(hdfs_path)) + return [] + else: + _logger.info("HDFS list all files: {} successfully".format( + hdfs_path)) + lines = [] + regex = re.compile('\s+') + out_lines = output.strip().split("\n") + for line in out_lines: + re_line = regex.split(line) + if len(re_line) == 8: + if only_file and re_line[0][0] == "d": + continue + else: + lines.append( + (re_line[7], re_line[5] + " " + re_line[6])) + if sort: + sorted(lines, cmp=sort_by_time) + ret_lines = [ret[0] for ret in lines] + return ret_lines + + +def multi_upload(client, + hdfs_path, + local_path, + multi_processes=5, + overwrite=False): + """ + :param overwrite: will overwrite hdfs file or not + :param multi_processes: the upload data process at the same time, default=5 + :param client: instance of HDFSClient + :param hdfs_path: path on hdfs + :param local_path: path on local + :return: + """ + + def __subprocess_upload(datas): + for data in datas: + re_path = os.path.relpath(os.path.dirname(data), local_path) + hdfs_re_path = os.path.join(hdfs_path, re_path) + client.upload(hdfs_re_path, data, overwrite, retry_times=5) + + def get_local_files(path): + rlist = [] + + if not os.path.isdir(path): + return rlist + + for dirname, folder, files in os.walk(path): + for i in files: + t = os.path.join(dirname, i) + rlist.append(t) + return rlist + + assert isinstance(client, HDFSClient) + + all_files = get_local_files(local_path) + if not all_files: + _logger.info("there are nothing need to upload, exit") + return + _logger.info("Start {} multi process to upload datas".format( + multi_processes)) + procs = [] + for i in range(multi_processes): + process_datas = all_files[i::multi_processes] + p = multiprocessing.Process( + target=__subprocess_upload, args=(process_datas, )) + procs.append(p) + p.start() + + # complete the processes + for proc in procs: + proc.join() + + _logger.info("Finish {} multi process to upload datas".format( + multi_processes)) + + +def multi_download(client, + hdfs_path, + local_path, + trainer_id, + trainers, + multi_processes=5): + """ + multi_download + :param client: instance of HDFSClient + :param hdfs_path: path on hdfs + :param local_path: path on local + :param trainer_id: current trainer id + :param trainers: all trainers number + :param multi_processes: the download data process at the same time, default=5 + :return: None + """ + + def __subprocess_download(datas): + for data in datas: + re_path = os.path.relpath(os.path.dirname(data), hdfs_path) + local_re_path = os.path.join(local_path, re_path) + client.download(data, local_re_path) + + assert isinstance(client, HDFSClient) + + client.make_local_dirs(local_path) + _logger.info("Make local dir {} successfully".format(local_path)) + + all_need_download = client.lsr(hdfs_path, sort=True) + need_download = all_need_download[trainer_id::trainers] + _logger.info("Get {} files From all {} files need to be download from {}". + format(len(need_download), len(all_need_download), hdfs_path)) + + _logger.info("Start {} multi process to download datas".format( + multi_processes)) + procs = [] + for i in range(multi_processes): + process_datas = need_download[i::multi_processes] + p = multiprocessing.Process( + target=__subprocess_download, args=(process_datas, )) + procs.append(p) + p.start() + + # complete the processes + for proc in procs: + proc.join() + + _logger.info("Finish {} multi process to download datas".format( + multi_processes)) + + local_downloads = [] + for data in need_download: + data_name = os.path.basename(data) + re_path = os.path.relpath(os.path.dirname(data), hdfs_path) + local_re_path = os.path.join(local_path, re_path, data_name) + local_downloads.append(local_re_path) + + return local_downloads + + +if __name__ == "__main__": + hadoop_home = "/home/client/hadoop-client/hadoop/" + + configs = { + "fs.default.name": "hdfs://xxx.hadoop.com:54310", + "hadoop.job.ugi": "hello,hello123" + } + + client = HDFSClient(hadoop_home, configs) + + client.ls("/user/com/train-25") + files = client.lsr("/user/com/train-25/models") + + downloads = multi_download( + client, + "/user/com/train-25/model", + "/home/xx/data1", + 1, + 5, + multi_processes=5) + + multi_upload(client, "/user/com/train-25/model", "/home/xx/data1") From 510601b2793047858763032b7816af07ab2b2bc7 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Thu, 22 Nov 2018 09:01:08 +0000 Subject: [PATCH 096/684] test=develop --- python/paddle/fluid/layers/nn.py | 10 +++++++--- python/paddle/fluid/tests/unittests/test_layers.py | 7 ++++++- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 32d411b8309d9..27f83a60bd55e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -2139,12 +2139,16 @@ def pool2d(input, input tensor is NCHW, where N is batch size, C is the number of channels, H is the height of the feature, and W is the width of the feature. - pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple, + pool_size (int|list|tuple): The pool kernel size. If pool kernel size is a tuple or list, it must contain two integers, (pool_size_Height, pool_size_Width). Otherwise, the pool kernel size will be a square of an int. pool_type: ${pooling_type_comment} - pool_stride (int): stride of the pooling layer. - pool_padding (int): padding size. + pool_stride (int|list|tuple): The pool stride size. If pool stride size is a tuple or list, + it must contain two integers, (pool_stride_Height, pool_stride_Width). + Otherwise, the pool stride size will be a square of an int. + pool_padding (int|list|tuple): The pool padding size. If pool padding size is a tuple, + it must contain two integers, (pool_padding_on_Height, pool_padding_on_Width). + Otherwise, the pool padding size will be a square of an int. global_pooling (bool): ${global_pooling_comment} use_cudnn (bool): ${use_cudnn_comment} ceil_mode (bool): ${ceil_mode_comment} diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index c4310fe006766..559c9cda4812e 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -206,7 +206,12 @@ def test_pool2d(self): program = Program() with program_guard(program): x = layers.data(name='x', shape=[3, 224, 224], dtype='float32') - self.assertIsNotNone(layers.pool2d(x, pool_size=[5, 3])) + self.assertIsNotNone( + layers.pool2d( + x, + pool_size=[5, 3], + pool_stride=[1, 2], + pool_padding=(2, 1))) def test_lstm_unit(self): program = Program() From 60a4f69b3c1af76e27c9c91e929eb6cac8c07730 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Thu, 22 Nov 2018 17:11:15 +0800 Subject: [PATCH 097/684] add lookup remote table op --- .../distributed_ops/lookup_remote_table_op.cc | 104 +++++++++++++ ...emote_table.h => lookup_remote_table_op.h} | 141 +++++++++++++----- 2 files changed, 204 insertions(+), 41 deletions(-) create mode 100644 paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc rename paddle/fluid/operators/distributed_ops/{lookup_remote_table.h => lookup_remote_table_op.h} (54%) diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc new file mode 100644 index 0000000000000..06e96a7f98303 --- /dev/null +++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc @@ -0,0 +1,104 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h" +#include "paddle/fluid/framework/var_type_inference.h" + +namespace paddle { +namespace operators { + +class LookupRemoteTableOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("W"), + "Input(W) of LookupRemoteTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Ids"), + "Input(Ids) of LookupRemoteTableOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of LookupRemoteTableOp should not be null."); + + auto table_dims = ctx->GetInputDim("W"); + auto ids_dims = ctx->GetInputDim("Ids"); + int ids_rank = ids_dims.size(); + + PADDLE_ENFORCE_EQ(table_dims.size(), 2); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + "The last dimension of the 'Ids' tensor must be 1."); + + auto output_dims = + framework::vectorize(framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + output_dims.push_back(table_dims[1]); + ctx->SetOutputDim("Out", framework::make_ddim(output_dims)); + + if (ctx->GetOutputsVarType("Out")[0] == + framework::proto::VarType::LOD_TENSOR) { + ctx->ShareLoD("Ids", /*->*/ "Out"); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + auto data_type = framework::GetDataTypeOfVar(ctx.InputVar("W")); + return framework::OpKernelType(data_type, ctx.device_context()); + } +}; + +class LookupRemoteTableOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("W", + "(Tensor) The input represents embedding tensors, " + "which is a learnable parameter."); + AddInput("Ids", + "An input with type int32 or int64 " + "contains the ids to be looked up in W. " + "The last dimension size must be 1."); + AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr("padding_idx", + "(int64, default -1) " + "If the value is -1, it makes no effect to lookup. " + "Otherwise the given value indicates padding the output " + "with zeros whenever lookup encounters it in Ids.") + .SetDefault(kNoPadding); + // NOTE(minqiyang): grad_inplace is an temporal attribute, + // please do NOT set this attribute in python layer. + AddAttr("grad_inplace", + "(boolean, default false) " + "If the grad op reuse the input's variable.") + .SetDefault(false); + AddComment(R"DOC( +Lookup Remote Table Operator. + +This operator is used to perform lookups on the parameter W, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. + +)DOC"); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(lookup_remote_table, ops::LookupRemoteTableOp, + ops::EmptyGradOpMaker, ops::LookupRemoteTableOpMaker); + +REGISTER_OP_CPU_KERNEL(lookup_remote_table, ops::LookupRemoteTableKernel, + ops::LookupRemoteTableKernel); diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h similarity index 54% rename from paddle/fluid/operators/distributed_ops/lookup_remote_table.h rename to paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h index 5b066c8196108..1a383f6d3e6f0 100644 --- a/paddle/fluid/operators/distributed_ops/lookup_remote_table.h +++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h @@ -14,21 +14,22 @@ limitations under the License. */ #include // NOLINT #include -#include #include #include +#include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/lod_tensor.h" -#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" namespace paddle { namespace operators { namespace distributed { -inline size_t GetSectionIndex(int64_t id, const std::vector& abs_sections) { +inline size_t GetSectionIndex(int64_t id, + const std::vector& abs_sections) { for (size_t i = 1; i < abs_sections.size(); ++i) { if (row < abs_sections[i]) { return i - 1; @@ -38,7 +39,7 @@ inline size_t GetSectionIndex(int64_t id, const std::vector& abs_sectio } inline std::vector ToAbsoluteSection( - const std::vector& height_sections) { + const std::vector& height_sections) { std::vector abs_sections; abs_sections.resize(height_sections.size()); abs_sections[0] = 0; @@ -49,9 +50,8 @@ inline std::vector ToAbsoluteSection( } inline std::vector> SplitIds( - const std::string& id_name, - const std::vector& height_section, - framework::Scope* scope) { + const std::string& id_name, const std::vector& height_section, + framework::Scope* scope) { auto& id_tensor = scope->Var(id_name)->Get(); auto* id_data = id_tensor.data(); std::set all_ids; @@ -68,32 +68,32 @@ inline std::vector> SplitIds( } inline void SplitIdsIntoMultipleVarsBySection( - const std::string& id_name, - const std::vector& in_var_names, - const std::vector& height_section, - const std::vector>& splited_ids, - framework::Scope* scope) { + const std::string& id_name, const std::vector& in_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, ""); auto place = platform::CPUPlace(); for (size_t i = 0; i < in_var_names.size(); ++i) { - auto* id_tensor = scope->Var(in_var_names[i])->GetMutable(); + auto* id_tensor = + scope->Var(in_var_names[i])->GetMutable(); auto& ids = splited_ids[i]; if (!ids.empty()) { - auto* id_tensor_data = id_tensor->mutable_data(framework::make_ddim({ids.size(), 1}), place); + auto* id_tensor_data = id_tensor->mutable_data( + framework::make_ddim({ids.size(), 1}), place); memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); } } } inline void MergeMultipleVarsIntoOnBySection( - const std::string& id_name, - const std::string& out_name, - const std::vector& out_var_names, - const std::vector& height_section, - const std::vector>& splited_ids, - framework::Scope* scope) { + const std::string& id_name, const std::string& out_name, + const std::vector& out_var_names, + const std::vector& height_section, + const std::vector>& splited_ids, + framework::Scope* scope) { PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, ""); auto cpu_place = platform::CPUPlace(); @@ -109,9 +109,11 @@ inline void MergeMultipleVarsIntoOnBySection( auto& out_tensor = scope->Var(out_name)->Get(); auto* out_tensor_data = out_tensor.mutable_data(); - for (size_t section_idx = 0; section_idx < out_var_names.size(); ++section_idx) { + for (size_t section_idx = 0; section_idx < out_var_names.size(); + ++section_idx) { auto& ids_in_this_section = splited_ids[section_idx]; - auto& prefetch_out_var = scope->Var(out_var_names[section_idx])->Get(); + auto& prefetch_out_var = + scope->Var(out_var_names[section_idx])->Get(); const auto* out_var_data = prefetch_out_var.mutable_data(); auto& dims = prefetch_out_var.dims(); @@ -126,31 +128,27 @@ inline void MergeMultipleVarsIntoOnBySection( auto& offsets = id_to_offset[origin_id]; for (auto& offset : offsets) { // should support GPU tensor - memory::Copy(cpu_place, out_tensor_data + offset * row_numel, - cpu_place, out_var_data + i * grad_row_numel, + memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place, + out_var_data + i * grad_row_numel, sizeof(T) * grad_row_numel); } } } } -inline void prefetch( - const std::string& table_name, - const std::string& id_name, - const std::string& out_name, - const std::vector& epmap, - const std::vector& height_section, - const framework::Scope& scope, - const platform::Place& place) const { - +inline void prefetch(const std::string& table_name, const std::string& id_name, + const std::string& out_name, + const std::vector& epmap, + const std::vector& height_section, + const framework::Scope& scope, + const platform::Place& place) const { auto local_scope = scope.NewScope(); platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& ctx = *pool.Get(place); distributed::RPCClient* rpc_client = - distributed::RPCClient::GetInstance( - Attr("trainer_id")); + distributed::RPCClient::GetInstance(Attr("trainer_id")); std::vector in_var_names; std::vector out_var_names; @@ -160,7 +158,8 @@ inline void prefetch( } auto splited_ids = SplitIds(id_name, height_section, local_scope); - SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section, splited_ids, local_scope); + SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section, + splited_ids, local_scope); // create output var in local scope for (auto& name : out_var_names) { @@ -171,9 +170,9 @@ inline void prefetch( for (size_t i = 0; i < ins.size(); i++) { if (NeedSend(local_scope, ins[i])) { VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get " - << outs[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar(epmap[i], ctx, local_scope, - in_var_names[i], out_var_names[i])); + << outs[i] << " back"; + rets.push_back(rpc_client->AsyncPrefetchVar( + epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i])); } else { VLOG(30) << "don't send no-initialied variable: " << out_var_names[i]; } @@ -182,11 +181,71 @@ inline void prefetch( PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); } - MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, height_section, plited_ids, scope) + MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, + height_section, plited_ids, scope) - scope.DeleteScope(local_scope); + scope.DeleteScope(local_scope); } +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +constexpr int64_t kNoPadding = -1; + +template +class LookupRemoteTableKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* ids_t = context.Input("Ids"); // int tensor + auto* output_t = context.Output("Out"); // float tensor + auto* table_var = context.InputVar("W"); + + int64_t padding_idx = context.Attr("padding_idx"); + int64_t* ids = const_cast(ids_t->data()); + int64_t ids_numel = ids_t->numel(); + + if (table_var->IsType()) { + auto* table_t = context.Input("W"); + int64_t row_number = table_t->dims()[0]; + int64_t row_width = table_t->dims()[1]; + + auto* table = table_t->data(); + auto* output = output_t->mutable_data(context.GetPlace()); + + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_LT(ids[i], row_number); + PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); + memcpy(output + i * row_width, table + ids[i] * row_width, + row_width * sizeof(T)); + } + } + } else if (table_var->IsType()) { + const auto& table_t = table_var->Get(); + int64_t row_width = table_t.value().dims()[1]; + const auto* table = table_t.value().data(); + auto* output = output_t->mutable_data(context.GetPlace()); + + auto blas = math::GetBlas(context); + for (int64_t i = 0; i < ids_numel; ++i) { + if (padding_idx != kNoPadding && ids[i] == padding_idx) { + memset(output + i * row_width, 0, row_width * sizeof(T)); + } else { + PADDLE_ENFORCE_GE(ids[i], 0); + auto id_index = table_t.Index(ids[i]); + PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); + blas.VCOPY(row_width, table + id_index * row_width, + output + i * row_width); + } + } + } + } +}; + } // namespace distributed } // namespace operators } // namespace paddle From 0c5ed5f6fc2f7d7a8936c70d2005cf3e85c23df6 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Thu, 22 Nov 2018 10:04:10 +0000 Subject: [PATCH 098/684] enable peephole jitcode test=develop --- paddle/fluid/operators/math/jit_code.cc | 28 +++++++++++++++++-- paddle/fluid/operators/math/jit_kernel_rnn.cc | 2 +- 2 files changed, 27 insertions(+), 3 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 03b67238fe7eb..95247ce3099f3 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -221,10 +221,14 @@ void LSTMJitCode::generate() { reg64_t reg_ptr_ct_1 = r9; reg64_t reg_ptr_ct = r10; reg64_t reg_ptr_ht = r11; + reg64_t reg_ptr_wp = r12; mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + if (use_peephole_) { + mov(reg_ptr_wp, ptr[param1 + offsetof(lstm_t, wp)]); + } int offset = 0; int d = num_ * sizeof(float); @@ -235,13 +239,27 @@ void LSTMJitCode::generate() { act(ymm_c, ymm_src, act_cand_); // i vmovups(ymm_src, ptr[reg_ptr_gates + offset + d]); + if (!compute_c1h1_ && use_peephole_) { + ymm_t ymm_wp = ymm_t(2); + ymm_t ymm_ct_1 = ymm_t(3); + vmovups(ymm_wp, ptr[reg_ptr_wp + offset]); + vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); + vmulps(ymm_wp, ymm_ct_1, ymm_wp); + vaddps(ymm_src, ymm_src, ymm_wp); + } act(ymm_i, ymm_src, act_gate_); vmulps(ymm_c, ymm_c, ymm_i); if (!compute_c1h1_) { // f vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * d]); - act(ymm_f, ymm_src, act_gate_); vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]); + if (use_peephole_) { + ymm_t ymm_wp = ymm_t(3); + vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d]); + vmulps(ymm_wp, ymm_i, ymm_wp); + vaddps(ymm_src, ymm_src, ymm_wp); + } + act(ymm_f, ymm_src, act_gate_); vmulps(ymm_f, ymm_f, ymm_i); vaddps(ymm_f, ymm_f, ymm_c); } @@ -250,8 +268,14 @@ void LSTMJitCode::generate() { ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c; ymm_t ymm_tmp = ymm_i; vmovups(ptr[reg_ptr_ct + offset], ymm_ct); // save ct - act(ymm_tmp, ymm_ct, act_cell_); vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * d]); + if (use_peephole_) { + ymm_t ymm_wp = ymm_t(2); + vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d * 2]); + vmulps(ymm_wp, ymm_ct, ymm_wp); + vaddps(ymm_src, ymm_src, ymm_wp); + } + act(ymm_tmp, ymm_ct, act_cell_); act(ymm_o, ymm_src, act_gate_); vmulps(ymm_o, ymm_tmp, ymm_o); vmovups(ptr[reg_ptr_ht + offset], ymm_o); // save ht diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index e571d8adf4af6..85ea95cfcc103 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -108,7 +108,7 @@ class PeepholeKernelImpl : public LSTMKernel { #ifdef PADDLE_WITH_XBYAK template <> bool PeepholeKernelImpl::useJIT(int d) { - return false; // peephole jitcode not ready yet + return gen::LSTMJitCode::init(d); } #endif From 83370576cd8f35e4155d94a789c886c8c264056d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Thu, 22 Nov 2018 18:52:54 +0800 Subject: [PATCH 099/684] Add sqlite3 support in Python3.6 test=develop --- tools/manylinux1/build_scripts/build_utils.sh | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/tools/manylinux1/build_scripts/build_utils.sh b/tools/manylinux1/build_scripts/build_utils.sh index d97745ad2dd80..48cce15a14513 100755 --- a/tools/manylinux1/build_scripts/build_utils.sh +++ b/tools/manylinux1/build_scripts/build_utils.sh @@ -50,6 +50,15 @@ function do_cpython_build { mkdir -p ${prefix}/lib # -Wformat added for https://bugs.python.org/issue17547 on Python 2.6 + if [ $(lex_pyver $py_ver) -eq $(lex_pyver 3.6) ]; then + wget https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz + tar -zxf sqlite-autoconf-3250300.tar.gz + cd sqlite-autoconf-3250300 + ./configure --prefix=/usr/local + make -j8 && make install + cd ../ && rm sqlite-autoconf-3250300.tar.gz + fi + # NOTE --enable-shared for generating libpython shared library needed for # linking of some of the nupic.core test executables. if [ $(lex_pyver $py_ver) -ge $(lex_pyver 3.7) ]; then @@ -59,9 +68,9 @@ function do_cpython_build { make -j8 > /dev/null make altinstall > /dev/null else - CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null - make -j8 > /dev/null - make install > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} CFLAGS="-Wformat" ./configure --prefix=${prefix} --enable-shared $unicode_flags > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make -j8 > /dev/null + LD_LIBRARY_PATH=/usr/local/lib:${LD_LIBRARY_PATH} make install > /dev/null fi popd echo "ZZZ looking for libpython" From 7c8c9dc9bf441ee3360ec416fd71dbf5921ba391 Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 22 Nov 2018 19:15:47 +0800 Subject: [PATCH 100/684] fix unit test cases --- cmake/generic.cmake | 11 ++++++-- .../framework/details/all_reduce_op_handle.cc | 4 +-- .../framework/details/all_reduce_op_handle.h | 6 ++--- .../framework/details/broadcast_op_handle.cc | 2 +- .../framework/details/broadcast_op_handle.h | 6 ++--- .../details/broadcast_op_handle_test.h | 12 ++++----- .../fluid/framework/details/build_strategy.cc | 4 +-- .../fluid/framework/details/build_strategy.h | 4 +-- .../details/data_balance_op_handle.cc | 2 +- .../details/data_balance_op_handle.h | 4 +-- .../details/fused_broadcast_op_handle.h | 4 +-- .../details/fused_broadcast_op_handle_test.cc | 4 +-- .../details/multi_devices_graph_pass.cc | 16 +++++------ .../details/multi_devices_graph_pass.h | 2 +- .../framework/details/reduce_op_handle.cc | 2 +- .../framework/details/reduce_op_handle.h | 4 +-- .../details/reduce_op_handle_test.cc | 12 ++++----- .../fluid/framework/ir/is_test_pass_tester.cc | 5 +++- .../inference/analysis/analyzer_tester.cc | 3 ++- paddle/fluid/inference/api/helper.h | 5 +--- .../inference/tests/api/anakin_rnn1_tester.cc | 1 - .../tests/book/test_inference_nlp.cc | 1 - paddle/fluid/inference/tests/test_helper.h | 1 + paddle/fluid/operators/beam_search_op_test.cc | 16 +++++------ .../operators/distributed/grpc_client.cc | 2 +- .../fluid/operators/distributed/grpc_serde.cc | 2 +- .../fluid/operators/distributed/grpc_serde.h | 3 ++- .../operators/distributed/sendrecvop_utils.cc | 2 +- .../operators/distributed/sendrecvop_utils.h | 2 +- paddle/fluid/operators/math/cpu_vec_test.cc | 2 +- paddle/fluid/operators/math/im2col_test.cc | 2 +- .../fluid/operators/math/jit_kernel_test.cc | 2 +- paddle/fluid/platform/cudnn_helper.h | 2 +- paddle/fluid/platform/dynload/cudnn.h | 14 +++++----- paddle/fluid/platform/gpu_info.cc | 5 +++- .../fluid/platform/stream_callback_manager.h | 2 +- paddle/legacy/cuda/include/hl_warpctc_wrap.h | 3 ++- paddle/legacy/cuda/src/hl_cuda_device.cc | 4 +++ paddle/legacy/utils/ThreadLocal.h | 4 ++- paddle/legacy/utils/Util.h | 27 +++++++++++++++++++ paddle/testing/CMakeLists.txt | 6 +++-- python/paddle/fluid/metrics.py | 4 +-- .../fluid/tests/unittests/CMakeLists.txt | 8 +++--- 43 files changed, 138 insertions(+), 89 deletions(-) diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 111627a932afe..cabef3f713621 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -349,10 +349,17 @@ function(cc_test TARGET_NAME) set(oneValueArgs "") set(multiValueArgs SRCS DEPS ARGS) cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) + if(WIN32) + list(APPEND win32_deps shlwapi) + if("${cc_test_DEPS};" MATCHES "python;") + list(REMOVE_ITEM cc_test_DEPS python) + list(APPEND win32_deps ${PYTHON_LIBRARIES}) + endif() + endif(WIN32) add_executable(${TARGET_NAME} ${cc_test_SRCS}) target_link_libraries(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) if(WIN32) - target_link_libraries(${TARGET_NAME} shlwapi) + target_link_libraries(${TARGET_NAME} ${win32_deps}) endif(WIN32) add_dependencies(${TARGET_NAME} ${cc_test_DEPS} paddle_gtest_main lod_tensor memory gtest gflags glog) add_test(NAME ${TARGET_NAME} @@ -679,7 +686,7 @@ function(py_test TARGET_NAME) set(multiValueArgs SRCS DEPS ARGS ENVS) cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) add_test(NAME ${TARGET_NAME} - COMMAND env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true + COMMAND ${CMAKE_COMMAND} -E env FLAGS_init_allocated_mem=true FLAGS_cudnn_deterministic=true FLAGS_cpu_deterministic=true PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_ENVS} ${PYTHON_EXECUTABLE} -u ${py_test_SRCS} ${py_test_ARGS} diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index b8690156763e4..a003995ae3f8e 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -23,7 +23,7 @@ namespace paddle { namespace framework { namespace details { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, @@ -74,7 +74,7 @@ void AllReduceOpHandle::RunImpl() { } if (platform::is_gpu_place(lod_tensors[0]->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) PADDLE_ENFORCE(nccl_ctxs_, "nccl_ctxs should not be nullptr."); int dtype = -1; size_t numel = 0; diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.h b/paddle/fluid/framework/details/all_reduce_op_handle.h index f6ef3a1367b91..b449796fcaee7 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.h +++ b/paddle/fluid/framework/details/all_reduce_op_handle.h @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,7 +29,7 @@ namespace framework { namespace details { struct AllReduceOpHandle : public OpHandleBase { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) AllReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); @@ -49,7 +49,7 @@ struct AllReduceOpHandle : public OpHandleBase { private: std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; #endif }; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.cc b/paddle/fluid/framework/details/broadcast_op_handle.cc index 8e5e542765938..d98df3bbadd39 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.cc +++ b/paddle/fluid/framework/details/broadcast_op_handle.cc @@ -82,7 +82,7 @@ void BroadcastOpHandle::BroadcastOneVar( }); } } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) VarHandle *out_handle = nullptr; int root_id = boost::get(in_tensor.place()).device; std::vector> broadcast_calls; diff --git a/paddle/fluid/framework/details/broadcast_op_handle.h b/paddle/fluid/framework/details/broadcast_op_handle.h index 72180fac86425..0c75e05f86163 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle.h +++ b/paddle/fluid/framework/details/broadcast_op_handle.h @@ -24,7 +24,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -34,7 +34,7 @@ namespace details { struct BroadcastOpHandle : public OpHandleBase { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) BroadcastOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *nccl_ctxs) @@ -68,7 +68,7 @@ struct BroadcastOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/broadcast_op_handle_test.h b/paddle/fluid/framework/details/broadcast_op_handle_test.h index 4305eb65733a7..df3b3cc9ca012 100644 --- a/paddle/fluid/framework/details/broadcast_op_handle_test.h +++ b/paddle/fluid/framework/details/broadcast_op_handle_test.h @@ -42,7 +42,7 @@ struct TestBroadcastOpHandle { std::vector> nodes_; std::vector place_list_; bool use_gpu_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif @@ -50,7 +50,7 @@ struct TestBroadcastOpHandle { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } @@ -60,7 +60,7 @@ struct TestBroadcastOpHandle { void InitCtxOnGpu(bool use_gpu) { use_gpu_ = use_gpu; if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -84,7 +84,7 @@ struct TestBroadcastOpHandle { place_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_.reset(nullptr); #endif } @@ -106,14 +106,14 @@ struct TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("node0", ir::Node::Type::kOperation)); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not support."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 37202f869508c..70baced0ada33 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -96,7 +96,7 @@ std::unique_ptr BuildStrategy::Apply( const std::string &loss_var_name, const std::unordered_set ¶m_names, const std::vector &local_scopes, -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const { #else const bool use_cuda) const { @@ -118,7 +118,7 @@ std::unique_ptr BuildStrategy::Apply( pass->Erase("local_scopes"); pass->SetNotOwned>("local_scopes", &local_scopes); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) platform::NCCLContextMap *nctx = use_cuda ? nccl_ctxs : nullptr; pass->Erase("nccl_ctxs"); pass->SetNotOwned("nccl_ctxs", nctx); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index fc2641dbd4827..3236c35efdbf1 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -23,7 +23,7 @@ #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/enforce.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -98,7 +98,7 @@ struct BuildStrategy { const std::string &loss_var_name, const std::unordered_set ¶m_names, const std::vector &local_scopes, -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const bool use_cuda, platform::NCCLContextMap *nccl_ctxs) const; #else const bool use_cuda) const; diff --git a/paddle/fluid/framework/details/data_balance_op_handle.cc b/paddle/fluid/framework/details/data_balance_op_handle.cc index 0b772f9b63e2c..cc562c7b102ce 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.cc +++ b/paddle/fluid/framework/details/data_balance_op_handle.cc @@ -20,7 +20,7 @@ namespace paddle { namespace framework { namespace details { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) DataBalanceOpHandle::DataBalanceOpHandle( ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/data_balance_op_handle.h b/paddle/fluid/framework/details/data_balance_op_handle.h index 0462fb6ec713e..2db18a1a7203f 100644 --- a/paddle/fluid/framework/details/data_balance_op_handle.h +++ b/paddle/fluid/framework/details/data_balance_op_handle.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/details/op_handle_base.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/scope.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -29,7 +29,7 @@ namespace details { struct DataBalanceOpHandle : public OpHandleBase { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) DataBalanceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, const platform::NCCLContextMap *ctxs); diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle.h b/paddle/fluid/framework/details/fused_broadcast_op_handle.h index e37259526a5f6..e43d545c9c0d0 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle.h +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle.h @@ -25,7 +25,7 @@ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -35,7 +35,7 @@ namespace details { struct FusedBroadcastOpHandle : public BroadcastOpHandle { public: -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) FusedBroadcastOpHandle(ir::Node *node, const std::vector local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc index 541993c74332c..be0d941c4f9c2 100644 --- a/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc +++ b/paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc @@ -44,14 +44,14 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle { nodes_.emplace_back( ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation)); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else PADDLE_THROW("CUDA is not supported."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_ = new FusedBroadcastOpHandle( nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get()); #else diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index 8c98b781301e8..26666212ae8c4 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -142,7 +142,7 @@ void MultiDevSSAGraphBuilder::Init() const { places_ = Get>(kPlaces); local_scopes_ = Get>(kLocalScopes); strategy_ = Get(kStrategy); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_ = &Get("nccl_ctxs"); #endif @@ -431,7 +431,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( } } bool use_gpu = false; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) use_gpu = nccl_ctxs_ != nullptr; #endif @@ -478,7 +478,7 @@ bool MultiDevSSAGraphBuilder::IsSparseGradient(const std::string &og) const { void MultiDevSSAGraphBuilder::SetCommunicationContext( OpHandleBase *op_handle, const platform::Place &p) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_ == nullptr) { op_handle->SetDeviceContext(p, platform::DeviceContextPool::Instance().Get(p)); @@ -492,7 +492,7 @@ void MultiDevSSAGraphBuilder::SetCommunicationContext( void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, const std::string &p_name, size_t src_dev_id) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new BroadcastOpHandle( result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -522,7 +522,7 @@ void MultiDevSSAGraphBuilder::CreateBroadcastOp(ir::Graph *result, void MultiDevSSAGraphBuilder::CreateFusedBroadcastOp( ir::Graph *result, const std::vector> &bcast_varnames) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto *op_handle = new FusedBroadcastOpHandle( result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_); @@ -568,7 +568,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, const std::string &og) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new AllReduceOpHandle( result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); @@ -597,7 +597,7 @@ void MultiDevSSAGraphBuilder::InsertAllReduceOp(ir::Graph *result, void MultiDevSSAGraphBuilder::InsertDataBalanceOp( ir::Graph *result, const std::vector &datas) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new DataBalanceOpHandle( result->CreateEmptyNode("data_balance", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); @@ -694,7 +694,7 @@ void MultiDevSSAGraphBuilder::CreateComputationalOps(ir::Graph *result, VarHandle *MultiDevSSAGraphBuilder::CreateReduceOp(ir::Graph *result, const std::string &og, int dst_dev_id) const { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) result->Get(kGraphOps).emplace_back(new ReduceOpHandle( result->CreateEmptyNode("reduce", ir::Node::Type::kOperation), local_scopes_, places_, nccl_ctxs_)); diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.h b/paddle/fluid/framework/details/multi_devices_graph_pass.h index f3ec2d2941524..8e462aec7dc7c 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.h +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.h @@ -40,7 +40,7 @@ class MultiDevSSAGraphBuilder : public ir::Pass { size_t device_id) const; void Init() const; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) mutable platform::NCCLContextMap *nccl_ctxs_; #endif diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index 4503123eac810..c9f1107aeab5a 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -125,7 +125,7 @@ void ReduceOpHandle::RunImpl() { } }); } else if (paddle::platform::is_gpu_place(lod_tensors[0]->place())) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) auto pre_in = pre_in_var->Get(); VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var); VariableVisitor::GetMutableTensor(out_var).mutable_data( diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 999828ae457ba..846839029ca65 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -23,7 +23,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/platform/device_context.h" -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) #include "paddle/fluid/platform/nccl_helper.h" #endif @@ -35,7 +35,7 @@ struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; std::vector places_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) const platform::NCCLContextMap *nccl_ctxs_; ReduceOpHandle(ir::Node *node, const std::vector &local_scopes, const std::vector &places, diff --git a/paddle/fluid/framework/details/reduce_op_handle_test.cc b/paddle/fluid/framework/details/reduce_op_handle_test.cc index 72299c0bfa916..6cee4770e6435 100644 --- a/paddle/fluid/framework/details/reduce_op_handle_test.cc +++ b/paddle/fluid/framework/details/reduce_op_handle_test.cc @@ -35,7 +35,7 @@ struct TestReduceOpHandle { std::vector gpu_list_; std::vector> ctxs_; -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) std::unique_ptr nccl_ctxs_; #endif @@ -43,7 +43,7 @@ struct TestReduceOpHandle { for (size_t j = 0; j < ctxs_.size(); ++j) { ctxs_[j]->Wait(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) if (nccl_ctxs_) { nccl_ctxs_->WaitAll(); } @@ -53,7 +53,7 @@ struct TestReduceOpHandle { void InitCtxOnGpu(bool use_gpu) { use_gpu_ = use_gpu; if (use_gpu) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) int count = p::GetCUDADeviceCount(); if (count <= 1) { LOG(WARNING) << "Cannot test multi-gpu Broadcast, because the CUDA " @@ -77,7 +77,7 @@ struct TestReduceOpHandle { gpu_list_.push_back(p); ctxs_.emplace_back(new p::CPUDeviceContext(p)); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) nccl_ctxs_.reset(nullptr); #endif } @@ -99,14 +99,14 @@ struct TestReduceOpHandle { nodes.emplace_back(new ir::Node("node")); if (use_gpu_) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else PADDLE_THROW("CUDA is not support."); #endif } else { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) op_handle_.reset(new ReduceOpHandle(nodes.back().get(), local_scopes_, gpu_list_, nccl_ctxs_.get())); #else diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index cd2cb0c9f8a8e..9696441a21661 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ -15,7 +15,10 @@ #include "paddle/fluid/framework/ir/is_test_pass.h" #include - +#ifdef _WIN32 +#undef FALSE +#undef TRUE +#endif namespace paddle { namespace framework { namespace ir { diff --git a/paddle/fluid/inference/analysis/analyzer_tester.cc b/paddle/fluid/inference/analysis/analyzer_tester.cc index 84a0c3374c66f..7710ed7b613a1 100644 --- a/paddle/fluid/inference/analysis/analyzer_tester.cc +++ b/paddle/fluid/inference/analysis/analyzer_tester.cc @@ -19,6 +19,7 @@ #include "paddle/fluid/inference/analysis/ut_helper.h" #include "paddle/fluid/inference/api/paddle_inference_api.h" #include "paddle/fluid/inference/api/paddle_inference_pass.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace inference { @@ -75,7 +76,7 @@ void TestWord2vecPrediction(const std::string& model_path) { 0.000932706}; const size_t num_elements = outputs.front().data.length() / sizeof(float); // The outputs' buffers are in CPU memory. - for (size_t i = 0; i < std::min(5UL, num_elements); i++) { + for (size_t i = 0; i < std::min((size_t)5UL, num_elements); i++) { LOG(INFO) << "data: " << static_cast(outputs.front().data.data())[i]; PADDLE_ENFORCE(static_cast(outputs.front().data.data())[i], diff --git a/paddle/fluid/inference/api/helper.h b/paddle/fluid/inference/api/helper.h index 6f9d663121004..9a393a61c4b45 100644 --- a/paddle/fluid/inference/api/helper.h +++ b/paddle/fluid/inference/api/helper.h @@ -15,10 +15,6 @@ #pragma once #include -#if !defined(_WIN32) -#include -#else -#endif #include #include // NOLINT @@ -28,6 +24,7 @@ #include #include #include "paddle/fluid/inference/api/paddle_inference_api.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/string/printf.h" namespace paddle { diff --git a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc index c4022225fd452..da42688f29f04 100644 --- a/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc +++ b/paddle/fluid/inference/tests/api/anakin_rnn1_tester.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include #include #include #include diff --git a/paddle/fluid/inference/tests/book/test_inference_nlp.cc b/paddle/fluid/inference/tests/book/test_inference_nlp.cc index cbcfc964c91c3..5c1204b9e6b78 100644 --- a/paddle/fluid/inference/tests/book/test_inference_nlp.cc +++ b/paddle/fluid/inference/tests/book/test_inference_nlp.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include // NOLINT diff --git a/paddle/fluid/inference/tests/test_helper.h b/paddle/fluid/inference/tests/test_helper.h index 2118fcfd4bb15..75fa611c0d701 100644 --- a/paddle/fluid/inference/tests/test_helper.h +++ b/paddle/fluid/inference/tests/test_helper.h @@ -20,6 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/inference/io.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" DECLARE_bool(use_mkldnn); diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index 501807e7f3e04..80fdd22fbbc06 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -30,23 +30,23 @@ using std::endl; void CreateInput(LoDTensor* ids, LoDTensor* scores) { LoD lod; - vector level0({0, 2, 4}); - vector level1({0, 1, 2, 3, 4}); + vector level0{0, 2, 4}; + vector level1{0, 1, 2, 3, 4}; lod.push_back(level0); lod.push_back(level1); ids->set_lod(lod); scores->set_lod(lod); - auto dims = framework::make_ddim(vector({4, 3})); + auto dims = framework::make_ddim(vector{4, 3}); ids->Resize(dims); scores->Resize(dims); CPUPlace place; auto* ids_data = ids->mutable_data(place); auto* scores_data = scores->mutable_data(place); - vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores( - {0.5, 0.3, 0.2, 0.6, 0.3, 0.1, 0.9, 0.5, 0.1, 0.7, 0.5, 0.1}); + vector _ids{4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}; + vector _scores{0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, + 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}; for (int i = 0; i < 12; i++) { ids_data[i] = _ids[i]; @@ -79,8 +79,8 @@ TEST(DISABLED_beam_search_op, run) { ASSERT_EQ(sids.lod(), sscores.lod()); - vector tids({4, 2, 3, 8}); - vector tscores({0.5, 0.6, 0.9, 0.7}); + vector tids{4, 2, 3, 8}; + vector tscores{0.5f, 0.6f, 0.9f, 0.7f}; for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index c28f86146d304..3548d5d9fb774 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include "glog/logging.h" // For VLOG @@ -20,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/grpc_client.h" #include "paddle/fluid/operators/distributed/grpc_serde.h" #include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/grpc_serde.cc b/paddle/fluid/operators/distributed/grpc_serde.cc index f27b70a5a3dd2..e6856676d49e8 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.cc +++ b/paddle/fluid/operators/distributed/grpc_serde.cc @@ -15,7 +15,6 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#include #include // NOLINT #include "google/protobuf/io/coded_stream.h" @@ -26,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/grpc_variable_response.h" #include "paddle/fluid/operators/distributed/proto_encoder_helper.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/platform/profiler.h" namespace paddle { diff --git a/paddle/fluid/operators/distributed/grpc_serde.h b/paddle/fluid/operators/distributed/grpc_serde.h index 7ec489e961630..17290d3fb4478 100644 --- a/paddle/fluid/operators/distributed/grpc_serde.h +++ b/paddle/fluid/operators/distributed/grpc_serde.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include + #include #include #include @@ -25,6 +25,7 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/operators/distributed/send_recv.grpc.pb.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.cc b/paddle/fluid/operators/distributed/sendrecvop_utils.cc index 374fa680e3681..0abebb92401a7 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.cc +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.cc @@ -15,12 +15,12 @@ limitations under the License. */ #ifdef PADDLE_WITH_CUDA #include #endif -#include #include // NOLINT #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/operators/distributed/sendrecvop_utils.h" #include "paddle/fluid/operators/distributed/variable_response.h" +#include "paddle/fluid/platform/port.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/distributed/sendrecvop_utils.h b/paddle/fluid/operators/distributed/sendrecvop_utils.h index 480fc59c4281e..523e56fe3e4ea 100644 --- a/paddle/fluid/operators/distributed/sendrecvop_utils.h +++ b/paddle/fluid/operators/distributed/sendrecvop_utils.h @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once -#include #include #include #include @@ -24,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/var_type.h" +#include "paddle/fluid/platform/port.h" #include "paddle/fluid/operators/distributed/send_recv.pb.h" diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index 18a586f8dd9f0..ad734bae425f3 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include #include #include #include @@ -22,6 +21,7 @@ limitations under the License. */ #include "gtest/gtest.h" #include "paddle/fluid/operators/math/cpu_vec.h" +#include "paddle/fluid/platform/port.h" inline double GetCurrentUS() { struct timeval time; diff --git a/paddle/fluid/operators/math/im2col_test.cc b/paddle/fluid/operators/math/im2col_test.cc index ae2c90b33a429..521cd7801abd6 100644 --- a/paddle/fluid/operators/math/im2col_test.cc +++ b/paddle/fluid/operators/math/im2col_test.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col.h" #include -#include #include #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" +#include "paddle/fluid/platform/port.h" template void testIm2col() { diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index b6c62a26348cd..8662e1c50dea4 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -13,7 +13,6 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/jit_kernel.h" -#include #include // for exp #include // for memcpy #include @@ -22,6 +21,7 @@ limitations under the License. */ #include "gflags/gflags.h" #include "glog/logging.h" #include "gtest/gtest.h" +#include "paddle/fluid/platform/port.h" #ifdef PADDLE_WITH_MKLML #include "paddle/fluid/platform/dynload/mklml.h" diff --git a/paddle/fluid/platform/cudnn_helper.h b/paddle/fluid/platform/cudnn_helper.h index 682b0c0ff39b7..61a25064d1799 100644 --- a/paddle/fluid/platform/cudnn_helper.h +++ b/paddle/fluid/platform/cudnn_helper.h @@ -62,7 +62,7 @@ inline const char* cudnnGetErrorString(cudnnStatus_t status) { #define CUDNN_ENFORCE(condition) \ do { \ - cudnnStatus_t status = condition; \ + auto status = condition; \ if (UNLIKELY(status != CUDNN_STATUS_SUCCESS)) { \ PADDLE_THROW(::paddle::platform::cudnnGetErrorString(status)); \ } \ diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 1a83ac7780a01..db62377898339 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -48,13 +48,13 @@ extern void EnforceCUDNNLoaded(const char* fn_name); #else -#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ - struct DynLoad__##__name { \ - template \ - inline cudnnStatus_t operator()(Args... args) { \ - return ::__name(args...); \ - } \ - }; \ +#define DECLARE_DYNAMIC_LOAD_CUDNN_WRAP(__name) \ + struct DynLoad__##__name { \ + template \ + inline auto operator()(Args... args) { \ + return ::__name(args...); \ + } \ + }; \ extern DynLoad__##__name __name #endif diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c78f159ad25a1..e0d0051ad0d8f 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -19,7 +19,10 @@ limitations under the License. */ #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" -DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, +// fraction_of_gpu_memory_to_use cannot be too high on windows, +// since the win32 graphic sub-system can occupy some GPU memory +// which may lead to insufficient memory left for paddle +DEFINE_double(fraction_of_gpu_memory_to_use, 0.5, "Allocate a trunk of gpu memory that is this fraction of the " "total gpu memory size. Future memory usage will be allocated " "from the trunk. If the trunk doesn't have enough gpu memory, " diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 11c68f3449ee2..8dcfc4e748f27 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -18,7 +18,7 @@ #include #include #include -#include "ThreadPool.h" +#include #include "paddle/fluid/platform/enforce.h" namespace paddle { diff --git a/paddle/legacy/cuda/include/hl_warpctc_wrap.h b/paddle/legacy/cuda/include/hl_warpctc_wrap.h index 0857bd1aa1b3c..09cbd6d450f77 100644 --- a/paddle/legacy/cuda/include/hl_warpctc_wrap.h +++ b/paddle/legacy/cuda/include/hl_warpctc_wrap.h @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#ifndef _WIN32 #ifndef HL_WARPCTC_WRAP_H_ #define HL_WARPCTC_WRAP_H_ - #include "ctc.h" #include "hl_base.h" @@ -91,3 +91,4 @@ extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths, size_t* bytes); #endif // HL_WARPCTC_WRAP_H_ +#endif diff --git a/paddle/legacy/cuda/src/hl_cuda_device.cc b/paddle/legacy/cuda/src/hl_cuda_device.cc index 501e3b0f3be02..a6e27a37ffef9 100644 --- a/paddle/legacy/cuda/src/hl_cuda_device.cc +++ b/paddle/legacy/cuda/src/hl_cuda_device.cc @@ -132,11 +132,15 @@ inline pid_t gettid() { uint64_t tid; pthread_threadid_np(NULL, &tid); #else +#ifndef _WIN32 #ifndef __NR_gettid #define __NR_gettid 224 #endif pid_t tid = syscall(__NR_gettid); #endif +#else // _WIN32 + pid_t tid = _getpid(); +#endif // _WIN32 CHECK_NE((int)tid, -1); return tid; } diff --git a/paddle/legacy/utils/ThreadLocal.h b/paddle/legacy/utils/ThreadLocal.h index c5b07506d3687..6268b73a85540 100644 --- a/paddle/legacy/utils/ThreadLocal.h +++ b/paddle/legacy/utils/ThreadLocal.h @@ -14,10 +14,12 @@ limitations under the License. */ #pragma once +#ifndef _WIN32 #include #include -#include #include +#endif +#include #include #include #include diff --git a/paddle/legacy/utils/Util.h b/paddle/legacy/utils/Util.h index e6f05e30d308b..3a878b2b30127 100644 --- a/paddle/legacy/utils/Util.h +++ b/paddle/legacy/utils/Util.h @@ -14,7 +14,9 @@ limitations under the License. */ #pragma once +#ifndef _WIN32 #include // for syscall() +#endif #include #include #include @@ -40,6 +42,31 @@ inline int rand_r(unsigned int* seedp) { } #endif +#ifdef _WIN32 +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#include + +template +inline int __builtin_clz(const T& value) { + DWORD leadning_zero = 0; + if (_BitScanReverse(&leadning_zero, value)) { + return static_cast(sizeof(T) * 8 - leadning_zero); + } else { + return static_cast(0); + } +} + +inline int __builtin_clzl(const unsigned long& value) { + return __builtin_clz(value); +} + +inline int __builtin_clzll(const unsigned long long& value) { + return __builtin_clz(value); +} + +#define pid_t int +#endif + /** * Loop over the elements in a container * TODO(yuyang18): It's this foreach useful? Why not use C++ 11 foreach, diff --git a/paddle/testing/CMakeLists.txt b/paddle/testing/CMakeLists.txt index 2264481899413..614596958e3c5 100644 --- a/paddle/testing/CMakeLists.txt +++ b/paddle/testing/CMakeLists.txt @@ -3,8 +3,10 @@ if(WITH_TESTING) add_library(paddle_test_main STATIC TestMain.cpp) add_dependencies(paddle_test_main paddle_proto ${external_project_dependencies}) - add_library(paddle_test_util STATIC TestUtil.cpp) - add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) + if(NOT WIN32) + add_library(paddle_test_util STATIC TestUtil.cpp) + add_dependencies(paddle_test_util paddle_proto ${external_project_dependencies}) + endif(NOT WIN32) if(NOT MOBILE_INFERENCE) cc_library(paddle_gtest_main SRCS paddle_gtest_main.cc DEPS device_context memory gtest gflags) endif() diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index f65b37903a35f..829154f1b23d6 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -46,8 +46,8 @@ def _is_numpy_(var): def _is_number_(var): - return isinstance(var, int) or isinstance(var, float) or (isinstance( - var, np.ndarray) and var.shape == (1, )) + return isinstance(var, int) or isinstance(var, np.int64) or isinstance( + var, float) or (isinstance(var, np.ndarray) and var.shape == (1, )) def _is_number_or_matrix_(var): diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 510d0304f04ff..3fc12d584d2fc 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -23,9 +23,11 @@ if(NOT WITH_DISTRIBUTE) LIST(REMOVE_ITEM TEST_OPS test_dist_text_classification) endif(NOT WITH_DISTRIBUTE) -if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) - LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) -endif() +if(WITH_GPU) + if (${CUDNN_MAJOR_VERSION} VERSION_LESS 7) + LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) + endif() +endif(WITH_GPU) list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 From e280c7a4db7f5765e7b3b5b2146204705b348e5b Mon Sep 17 00:00:00 2001 From: peizhilin Date: Thu, 22 Nov 2018 19:52:10 +0800 Subject: [PATCH 101/684] code style fix test=develop --- paddle/fluid/platform/stream_callback_manager.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 8dcfc4e748f27..ed8734c98cb99 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -14,11 +14,11 @@ #pragma once +#include #include #include #include #include -#include #include "paddle/fluid/platform/enforce.h" namespace paddle { From 00b9e9a1357bb3fa6e6adceb4e650d9f6424aa2a Mon Sep 17 00:00:00 2001 From: chengduo Date: Thu, 22 Nov 2018 20:40:56 +0800 Subject: [PATCH 102/684] Refine cublas to support CUBLAS_TENSOR_OP_MATH (#13929) * refine cublase test=develop * code refine * refine cublas * add GEMME_EX * add enable_cublas_tensor_op_math doc and add cublasCall test=develop * fix CublasCall for cuda version test=develop * fix error test=develop * fix GEMM_EX to be compatible with gcc 4.8 test=develop * add GEMM_EX test=develop * to compatiable with gcc4.8 test=develop --- paddle/fluid/operators/math/blas_impl.cu.h | 206 +++++++++++++++++---- paddle/fluid/platform/device_context.h | 47 +++++ paddle/fluid/platform/dynload/cublas.h | 26 ++- paddle/fluid/platform/gpu_info.cc | 20 ++ paddle/fluid/platform/gpu_info.h | 3 + python/paddle/fluid/__init__.py | 3 +- 6 files changed, 256 insertions(+), 49 deletions(-) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index d84c88cb3bc1a..d35073029a344 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -16,6 +16,9 @@ #include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/fluid/platform/gpu_info.h" + +DECLARE_bool(enable_cublas_tensor_op_math); namespace paddle { namespace operators { @@ -42,11 +45,44 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasSgemmStridedBatched(args...)); #else PADDLE_THROW("SgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const float *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const float *beta, void *C, + cudaDataType_t Ctype, int ldc) { + // Because the gcc 4.8 doesn't expand template parameter pack that + // appears in a lambda-expression, I can not use template parameter pack + // here. + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + VLOG(5) << "use_tensor_op_math: " + << (platform::TensorCoreAvailable() ? "True" : "False"); + PADDLE_ENFORCE(platform::dynload::cublasSgemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc)); +#else + PADDLE_THROW("cublasSgemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -69,13 +105,18 @@ struct CUBlas { } template - static void GEMM_BATCH(ARGS... args) { + static void GEMM_STRIDED_BATCH(ARGS... args) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasDgemmStridedBatched(args...)); #else PADDLE_THROW("DgemmStridedBatched is not supported on cuda <= 7.5"); #endif } + + template + static void GEMM_EX(ARGS... args) { + PADDLE_THROW("Currently there are not cublasDgemmEx."); + } }; template <> @@ -96,14 +137,16 @@ struct CUBlas { reinterpret_cast<__half *>(C), ldc)); } - static void GEMM_BATCH(cublasHandle_t handle, cublasOperation_t transa, - cublasOperation_t transb, int m, int n, int k, - const float16 *alpha, const float16 *A, int lda, - long long int strideA, const float16 *B, // NOLINT - int ldb, long long int strideB, // NOLINT - const float16 *beta, float16 *C, int ldc, - long long int strideC, // NOLINT - int batchCount) { + static void GEMM_STRIDED_BATCH(cublasHandle_t handle, + cublasOperation_t transa, + cublasOperation_t transb, int m, int n, int k, + const float16 *alpha, const float16 *A, + int lda, long long int strideA, // NOLINT + const float16 *B, // NOLINT + int ldb, long long int strideB, // NOLINT + const float16 *beta, float16 *C, int ldc, + long long int strideC, // NOLINT + int batchCount) { #if CUDA_VERSION >= 8000 PADDLE_ENFORCE(platform::dynload::cublasHgemmStridedBatched( handle, transa, transb, m, n, k, @@ -114,6 +157,45 @@ struct CUBlas { ldc, strideC, batchCount)); #else PADDLE_THROW("HgemmStridedBatched is not supported on cuda <= 7.5"); +#endif + } + + // NOTES: GEMM_EX can use Tensor Core to accelerate matrix multiply. + // https://docs.nvidia.com/cuda/cublas/index.html#cublassetmathmode + template + static void GEMM_EX(platform::CUDADeviceContext *dev_ctx, + cublasOperation_t transa, cublasOperation_t transb, int m, + int n, int k, const void *alpha, const void *A, + cudaDataType_t Atype, int lda, const void *B, + cudaDataType_t Btype, int ldb, const void *beta, void *C, + cudaDataType_t Ctype, int ldc, + cudaDataType_t computeType) { + auto cublas_call = [&]() { +#if CUDA_VERSION >= 8000 + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; +#if CUDA_VERSION >= 9000 + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); +#endif // CUDA_VERSION >= 9000 + + PADDLE_ENFORCE(platform::dynload::cublasGemmEx( + dev_ctx->cublas_handle(), transa, transb, m, n, k, alpha, A, Atype, + lda, B, Btype, ldb, beta, C, Ctype, ldc, computeType, algo)); +#else + PADDLE_THROW("cublasGemmEx is supported on cuda >= 8.0"); +#endif + }; + +#if CUDA_VERSION >= 9000 + // NOTES: To use Tensor Core, we should change the cublas config, + // but the cublas may be hold by multi-thread. + dev_ctx->CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); +#else + cublas_call(); #endif } }; @@ -133,8 +215,21 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, cublasOperation_t cuTransB = (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, N); +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, N); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, N); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 } template <> @@ -157,30 +252,18 @@ inline void Blas::GEMM( PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53, "cublas fp16 gemm requires GPU compute capability >= 53"); -#if CUDA_VERSION >= 8000 float h_alpha = static_cast(alpha); float h_beta = static_cast(beta); - cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; -#if CUDA_VERSION >= 9000 - if (context_.GetComputeCapability() >= 70) { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH)); - algo = CUBLAS_GEMM_DFALT_TENSOR_OP; - } else { - PADDLE_ENFORCE(platform::dynload::cublasSetMathMode( - context_.cublas_handle(), CUBLAS_DEFAULT_MATH)); - } -#endif // CUDA_VERSION >= 9000 - +#if CUDA_VERSION >= 8000 // cublasHgemm does true FP16 computation which is slow for non-Volta // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation: // input/output in fp16, computation in fp32, which can also be accelerated // using tensor cores in volta GPUs. - PADDLE_ENFORCE(platform::dynload::cublasGemmEx( - context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B, - CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, - CUDA_R_32F, algo)); + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX( + &cuda_ctx, cuTransB, cuTransA, N, M, K, &h_alpha, B, CUDA_R_16F, ldb, A, + CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N, CUDA_R_32F); #else // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, @@ -199,8 +282,38 @@ void Blas::GEMM(bool transA, bool transB, int M, // the cblas convention. cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; - CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, - B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto &cuda_ctx = const_cast(context_); + CUBlas::GEMM_EX(&cuda_ctx, cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, A, CUDA_R_32F, lda, &beta, C, + CUDA_R_32F, ldc); + } else { +#endif // CUDA_VERSION >= 8000 + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, + &alpha, B, ldb, A, lda, &beta, C, ldc); + +#if CUDA_VERSION >= 8000 + } +#endif // CUDA_VERSION >= 8000 +} + +template <> +template <> +inline void Blas::GEMM( + bool transA, bool transB, int M, int N, int K, platform::float16 alpha, + const platform::float16 *A, int lda, const platform::float16 *B, int ldb, + platform::float16 beta, platform::float16 *C, int ldc) const { + // Note that cublas follows fortran order, so the order is different from + // the cblas convention. + cublasOperation_t cuTransA = transA ? CUBLAS_OP_T : CUBLAS_OP_N; + cublasOperation_t cuTransB = transB ? CUBLAS_OP_T : CUBLAS_OP_N; + + CUBlas::GEMM(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, A, lda, &beta, C, + ldc); } template <> @@ -238,9 +351,34 @@ void Blas::BatchedGEMM( (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T; const int64_t strideC = M * N; - CUBlas::GEMM_BATCH(context_.cublas_handle(), cuTransB, cuTransA, N, M, K, - &alpha, B, ldb, strideB, A, lda, strideA, &beta, C, ldc, - strideC, batchCount); +#if CUDA_VERSION >= 9010 + if (FLAGS_enable_cublas_tensor_op_math && std::is_same::value) { + auto cublas_call = [&]() { + cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT; + bool use_tensor_op_math = platform::TensorCoreAvailable(); + if (use_tensor_op_math) { + algo = CUBLAS_GEMM_DFALT_TENSOR_OP; + } + VLOG(5) << "use_tensor_op_math: " + << (use_tensor_op_math ? "True" : "False"); + + PADDLE_ENFORCE(platform::dynload::cublasGemmStridedBatchedEx( + context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, + CUDA_R_32F, ldb, strideB, A, CUDA_R_32F, lda, strideA, &beta, C, + CUDA_R_32F, ldc, strideC, batchCount, CUDA_R_32F, algo)); + }; + auto &dev_ctx = const_cast(context_); + dev_ctx.CublasCall(cublas_call, CUBLAS_TENSOR_OP_MATH); + } else { +#endif // CUDA_VERSION >= 9010 + + CUBlas::GEMM_STRIDED_BATCH(context_.cublas_handle(), cuTransB, cuTransA, + N, M, K, &alpha, B, ldb, strideB, A, lda, + strideA, &beta, C, ldc, strideC, batchCount); + +#if CUDA_VERSION >= 9010 + } +#endif // CUDA_VERSION >= 9010 } } // namespace math diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index 9a9018cdea6a9..3edd727978010 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -143,6 +143,39 @@ class CudnnWorkspaceHandle { std::unique_ptr> guard_; }; +#if CUDA_VERSION >= 9000 +class ScopedCublasMathMode { + public: + ScopedCublasMathMode(cublasHandle_t handle, cublasMath_t new_math_mode) + : handle_(handle) { + need_reset = false; + PADDLE_ENFORCE( + platform::dynload::cublasGetMathMode(handle_, &old_math_mode_), + "Failed to get old cublas math mode"); + if (old_math_mode_ != new_math_mode) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, new_math_mode), + "Failed to set old cublas math mode"); + need_reset = true; + } + } + + ~ScopedCublasMathMode() { + if (need_reset) { + PADDLE_ENFORCE( + platform::dynload::cublasSetMathMode(handle_, old_math_mode_), + "Failed to set old cublas math mode"); + } + } + + private: + cublasHandle_t handle_; + cublasMath_t old_math_mode_; + bool need_reset; +}; + +#endif + class CUDADeviceContext : public DeviceContext { public: explicit CUDADeviceContext(CUDAPlace place); @@ -199,6 +232,18 @@ class CUDADeviceContext : public DeviceContext { callback_manager_->Wait(); } +#if CUDA_VERSION >= 9000 + /*! \brief CublasCall may need to change cublas's config, + * but the cublas may be hold by multi-thread, so we should + * add lock here. */ + template + void CublasCall(Callback callback, cublasMath_t new_math) { + std::lock_guard guard(cublas_mtx_); + ScopedCublasMathMode scoped_cublas_math(cublas_handle_, new_math); + callback(); + } +#endif + private: CUDAPlace place_; @@ -220,6 +265,8 @@ class CUDADeviceContext : public DeviceContext { // If we use mtx_ for StreamCallbackManager, deadlock may occur sometimes mutable std::mutex callback_mtx_; std::unique_ptr callback_manager_; + + mutable std::mutex cublas_mtx_; }; template <> diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index 4ea0cd7283b55..ff80bd525c167 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -61,9 +61,6 @@ extern void *cublas_dso_handle; extern DynLoad__##__name __name #endif -#define DECLARE_DYNAMIC_LOAD_CUBLAS_V2_WRAP(__name) \ - DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) - #define CUBLAS_BLAS_ROUTINE_EACH(__macro) \ __macro(cublasSaxpy_v2); \ __macro(cublasDaxpy_v2); \ @@ -93,22 +90,23 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 -#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ - __macro(cublasGemmEx); \ - __macro(cublasSgemmStridedBatched); \ - __macro(cublasDgemmStridedBatched); \ - __macro(cublasCgemmStridedBatched); \ - __macro(cublasZgemmStridedBatched); \ - __macro(cublasHgemmStridedBatched); - -CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched); #endif // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) __macro(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode); +#endif -CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) +#if CUDA_VERSION >= 9010 +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx); +DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx); #endif #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index c78f159ad25a1..833d48347f490 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -26,6 +26,16 @@ DEFINE_double(fraction_of_gpu_memory_to_use, 0.92, "additional trunks of the same size will be requested from gpu " "until the gpu has no memory left for another trunk."); +DEFINE_bool( + enable_cublas_tensor_op_math, false, + "The enable_cublas_tensor_op_math indicate whether to use Tensor Core, " + "but it may loss precision. Currently, There are two CUDA libraries that" + " use Tensor Cores, cuBLAS and cuDNN. cuBLAS uses Tensor Cores to speed up" + " GEMM computations(the matrices must be either half precision or single " + "precision); cuDNN uses Tensor Cores to speed up both convolutions(the " + "input and output must be half precision) and recurrent neural networks " + "(RNNs)."); + namespace paddle { namespace platform { @@ -64,6 +74,16 @@ int GetCUDADriverVersion(int id) { return driver_version; } +bool TensorCoreAvailable() { +#if CUDA_VERSION >= 9000 + int device = GetCurrentDeviceId(); + int driver_version = GetCUDAComputeCapability(device); + return driver_version >= 70; +#else + return false; +#endif +} + int GetCUDAMultiProcessors(int id) { PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); int count; diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index be44158431ff8..6a0b3c8e02d49 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -35,6 +35,9 @@ int GetCUDARuntimeVersion(int id); //! Get the driver version of the ith GPU int GetCUDADriverVersion(int id); +//! Wheter the current device support TensorCore +bool TensorCoreAvailable(); + //! Get the MultiProcessors of the ith GPU. int GetCUDAMultiProcessors(int i); diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 543acf2d349c7..3c092dee3438c 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -133,7 +133,8 @@ def __bootstrap__(): if core.is_compiled_with_cuda(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', - 'conv_workspace_size_limit', 'cudnn_exhaustive_search' + 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', + 'cudnn_exhaustive_search' ] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) From 1f0fa675718fd5aa58ca194a0aafa89829da877d Mon Sep 17 00:00:00 2001 From: ZhenWang Date: Thu, 22 Nov 2018 21:05:35 +0800 Subject: [PATCH 103/684] add some activation api examples. --- python/paddle/fluid/layers/nn.py | 46 ++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ccd9175b64d46..2891893fde3b4 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6833,6 +6833,13 @@ def elu(x, alpha=1.0, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.elu(x, alpha=0.2) """ helper = LayerHelper('elu', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6856,6 +6863,13 @@ def relu6(x, threshold=6.0, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.relu6(x, threshold=6.0) """ helper = LayerHelper('relu6', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6879,6 +6893,13 @@ def pow(x, factor=1.0, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.pow(x, factor=2.0) """ helper = LayerHelper('pow', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6903,6 +6924,13 @@ def stanh(x, scale_a=2.0 / 3.0, scale_b=1.7159, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.stanh(x, scale_a=0.6667, scale_b=1.7159) """ helper = LayerHelper('stanh', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6928,6 +6956,13 @@ def hard_sigmoid(x, slope=0.2, offset=0.5, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.hard_sigmoid(x, slope=0.3, offset=0.8) """ helper = LayerHelper('hard_sigmoid', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6952,6 +6987,13 @@ def swish(x, beta=1.0, name=None): Returns: output(${out_type}): ${out_comment} + + Examples: + + .. code-block:: python + + x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") + y = fluid.layers.swish(x, beta=1.2) """ helper = LayerHelper('swish', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) @@ -6988,8 +7030,8 @@ def prelu(x, mode, param_attr=None, name=None): .. code-block:: python x = fluid.layers.data(name="x", shape=[10,10], dtype="float32") - mode = 'channel' - output = fluid.layers.prelu(x,mode) + mode = 'channel' + output = fluid.layers.prelu(x,mode) """ helper = LayerHelper('prelu', **locals()) if mode not in ['all', 'channel', 'element']: From 43b9202d9bd57cf11403d1fd8d0189ce8f68e3b3 Mon Sep 17 00:00:00 2001 From: ZhenWang Date: Thu, 22 Nov 2018 21:49:49 +0800 Subject: [PATCH 104/684] test=develop --- python/paddle/fluid/layers/nn.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index 2891893fde3b4..8db6d80aa547d 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -6993,7 +6993,7 @@ def swish(x, beta=1.0, name=None): .. code-block:: python x = fluid.layers.data(name="x", shape=[3,10,32,32], dtype="float32") - y = fluid.layers.swish(x, beta=1.2) + y = fluid.layers.swish(x, beta=2.0) """ helper = LayerHelper('swish', **locals()) out = helper.create_variable_for_type_inference(dtype=x.dtype) From 6cc6bf4074d69c5c0b02af612b94e438d596803a Mon Sep 17 00:00:00 2001 From: Krzysztof Binias Date: Thu, 22 Nov 2018 15:30:43 +0100 Subject: [PATCH 105/684] Bumped MKL-DNN version to 0.17 test=develop --- cmake/external/mkldnn.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake index 785148d4f9f44..b280db23b9b27 100644 --- a/cmake/external/mkldnn.cmake +++ b/cmake/external/mkldnn.cmake @@ -53,7 +53,7 @@ ExternalProject_Add( ${EXTERNAL_PROJECT_LOG_ARGS} DEPENDS ${MKLDNN_DEPENDS} GIT_REPOSITORY "https://github.com/01org/mkl-dnn.git" - GIT_TAG "21fb5f2af1dd14e132af4f1b79160977ee487818" + GIT_TAG "830a10059a018cd2634d94195140cf2d8790a75a" PREFIX ${MKLDNN_SOURCES_DIR} UPDATE_COMMAND "" CMAKE_ARGS -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER} From a902b8b0f811f6837330385b95fa2f552393197c Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 01:07:58 +0800 Subject: [PATCH 106/684] Add sqlite3 support test=develop --- tools/manylinux1/Dockerfile.x64 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/manylinux1/Dockerfile.x64 b/tools/manylinux1/Dockerfile.x64 index e91216a5b89c5..48fd145e5fe67 100644 --- a/tools/manylinux1/Dockerfile.x64 +++ b/tools/manylinux1/Dockerfile.x64 @@ -16,7 +16,7 @@ ENV PKG_CONFIG_PATH=/usr/local/lib/pkgconfig RUN yum install -y sqlite-devel zlib-devel openssl-devel pcre-devel vim tk-devel tkinter libtool xz graphviz COPY build_scripts /build_scripts RUN bash build_scripts/build.sh && \ - bash build_scripts/install_nccl2.sh && rm -r build_scripts + bash build_scripts/install_nccl2.sh && rm -rf build_scripts ENV SSL_CERT_FILE=/opt/_internal/certs.pem From e9be3366a9cde661293e92306b036aea0ee772c1 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 02:49:06 +0000 Subject: [PATCH 107/684] test=develop --- paddle/fluid/operators/hierarchical_sigmoid_op.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 418fe86f69fff..b4a5fe8309190 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -164,7 +164,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { auto pre_out_grad_mat = EigenMatrix::From(pre_out_grad); auto out_grad_mat = EigenMatrix::From(*out_grad); - Eigen::array bcast({{1, static_cast(pre_out_grad.dims()[1])}}); + Eigen::array bcast{1, static_cast(pre_out_grad.dims()[1])}; // softrelu derivative pre_out_grad_mat.device(place) = From 0fca16847c89d1018c32da0e7bbc0b6396d5e104 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 02:52:35 +0000 Subject: [PATCH 108/684] temp --- paddle/fluid/operators/math/matrix_bit_code.cc | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 8baffe1ba1e5f..2967586949846 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -102,6 +102,8 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; size_t weight_width = weight->dims()[1]; + VLOG(30) << "sparse w_grad dims is [" << weight->dims()[0] << " ," + << weight->dims()[1] << " ]"; auto tmat_value = tmat.data(); auto weight_value = weight->data(); auto input_value = input.data(); @@ -127,6 +129,8 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; size_t weight_width = weight->value().dims()[1]; + VLOG(30) << "sparse w_grad dims is: [" << weight->value().dims()[0] << " ," + << weight->value().dims()[1] << " ]"; auto tmat_value = tmat.data(); auto weight_value = weight->mutable_value()->data(); auto input_value = input.data(); From 361cb0e078d1942e06ffcb3586e68be11c465d29 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 23 Nov 2018 10:53:35 +0800 Subject: [PATCH 109/684] lookup remote table can compile --- .../distributed_ops/lookup_remote_table_op.cc | 12 +- .../distributed_ops/lookup_remote_table_op.h | 220 ++++++++++-------- 2 files changed, 133 insertions(+), 99 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc index 06e96a7f98303..5d3a50a44cf18 100644 --- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc +++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.cc @@ -68,6 +68,15 @@ class LookupRemoteTableOpMaker : public framework::OpProtoAndCheckerMaker { "contains the ids to be looked up in W. " "The last dimension size must be 1."); AddOutput("Out", "The lookup results, which have the same type as W."); + AddAttr>("height_sections", + "Height for each output SelectedRows.") + .SetDefault(std::vector({})); + AddAttr("trainer_id", "trainer id from 0 ~ worker_num.").SetDefault(0); + AddAttr>( + "epmap", + "(string vector, default 127.0.0.1:6164)" + "Server endpoints in the order of input variables for mapping") + .SetDefault({"127.0.0.1:6164"}); AddAttr("padding_idx", "(int64, default -1) " "If the value is -1, it makes no effect to lookup. " @@ -98,7 +107,8 @@ or not. And the output only shares the LoD information with input Ids. namespace ops = paddle::operators; REGISTER_OPERATOR(lookup_remote_table, ops::LookupRemoteTableOp, - ops::EmptyGradOpMaker, ops::LookupRemoteTableOpMaker); + paddle::framework::EmptyGradOpMaker, + ops::LookupRemoteTableOpMaker); REGISTER_OP_CPU_KERNEL(lookup_remote_table, ops::LookupRemoteTableKernel, ops::LookupRemoteTableKernel); diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h index 1a383f6d3e6f0..ddf57016dbca5 100644 --- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h +++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h @@ -12,26 +12,32 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#pragma once + #include // NOLINT #include #include +#include #include #include #include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/operators/detail/macros.h" #include "paddle/fluid/operators/distributed_ops/send_recv_util.h" +#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { -namespace distributed { inline size_t GetSectionIndex(int64_t id, const std::vector& abs_sections) { for (size_t i = 1; i < abs_sections.size(); ++i) { - if (row < abs_sections[i]) { + if (id < abs_sections[i]) { return i - 1; } } @@ -62,9 +68,10 @@ inline std::vector> SplitIds( std::vector> splited_ids; splited_ids.resize(height_section.size() + 1); for (auto& id : all_ids) { - auto section_index = GetSectionIndex(id); + auto section_index = GetSectionIndex(id, abs_sections); splited_ids[section_index].push_back(id - abs_sections[section_index]); } + return splited_ids; } inline void SplitIdsIntoMultipleVarsBySection( @@ -82,7 +89,7 @@ inline void SplitIdsIntoMultipleVarsBySection( auto& ids = splited_ids[i]; if (!ids.empty()) { auto* id_tensor_data = id_tensor->mutable_data( - framework::make_ddim({ids.size(), 1}), place); + framework::make_ddim({static_cast(ids.size()), 1}), place); memcpy(id_tensor_data, ids.data(), sizeof(int64_t) * ids.size()); } } @@ -93,8 +100,8 @@ inline void MergeMultipleVarsIntoOnBySection( const std::vector& out_var_names, const std::vector& height_section, const std::vector>& splited_ids, - framework::Scope* scope) { - PADDLE_ENFORCE_EQ(in_var_names.size(), height_section.size() + 1, ""); + const framework::ExecutionContext& context, framework::Scope* scope) { + PADDLE_ENFORCE_EQ(out_var_names.size(), height_section.size() + 1, ""); auto cpu_place = platform::CPUPlace(); @@ -106,15 +113,15 @@ inline void MergeMultipleVarsIntoOnBySection( id_to_offset[id_data[i]].push_back(i); } - auto& out_tensor = scope->Var(out_name)->Get(); - auto* out_tensor_data = out_tensor.mutable_data(); + auto* out_tensor = scope->Var(out_name)->GetMutable(); + auto* out_tensor_data = out_tensor->mutable_data(context.GetPlace()); for (size_t section_idx = 0; section_idx < out_var_names.size(); ++section_idx) { auto& ids_in_this_section = splited_ids[section_idx]; auto& prefetch_out_var = scope->Var(out_var_names[section_idx])->Get(); - const auto* out_var_data = prefetch_out_var.mutable_data(); + const auto* out_var_data = prefetch_out_var.data(); auto& dims = prefetch_out_var.dims(); PADDLE_ENFORCE_EQ(dims.size(), 2, ""); @@ -129,63 +136,64 @@ inline void MergeMultipleVarsIntoOnBySection( for (auto& offset : offsets) { // should support GPU tensor memory::Copy(cpu_place, out_tensor_data + offset * row_numel, cpu_place, - out_var_data + i * grad_row_numel, - sizeof(T) * grad_row_numel); + out_var_data + i * row_numel, sizeof(float) * row_numel); } } } } -inline void prefetch(const std::string& table_name, const std::string& id_name, - const std::string& out_name, - const std::vector& epmap, - const std::vector& height_section, - const framework::Scope& scope, - const platform::Place& place) const { - auto local_scope = scope.NewScope(); - - platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); - auto& ctx = *pool.Get(place); - - distributed::RPCClient* rpc_client = - distributed::RPCClient::GetInstance(Attr("trainer_id")); - - std::vector in_var_names; - std::vector out_var_names; - for (size_t i = 0; i < epmap.size(); ++i) { - in_var_names.push_back(id_name + "@" + epmap[i]); - out_var_names.push_back(out_name + "@" + epmap[i]); - } - - auto splited_ids = SplitIds(id_name, height_section, local_scope); - SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section, - splited_ids, local_scope); - - // create output var in local scope - for (auto& name : out_var_names) { - local_scope.Var(name)->GetMutable(); - } - - std::vector rets; - for (size_t i = 0; i < ins.size(); i++) { - if (NeedSend(local_scope, ins[i])) { - VLOG(30) << "sending " << ins[i] << " to " << epmap[i] << " to get " - << outs[i] << " back"; - rets.push_back(rpc_client->AsyncPrefetchVar( - epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i])); - } else { - VLOG(30) << "don't send no-initialied variable: " << out_var_names[i]; - } - } - for (size_t i = 0; i < rets.size(); i++) { - PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); - } - - MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, - height_section, plited_ids, scope) - - scope.DeleteScope(local_scope); -} +// inline void prefetch(const std::string& table_name, const std::string& +// id_name, +// const std::string& out_name, +// const std::vector& epmap, +// const std::vector& height_section, +// const framework::Scope& scope, +// const platform::Place& place) { +// auto& local_scope = scope.NewScope(); +// +// platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); +// auto& ctx = *pool.Get(place); +// +// distributed::RPCClient* rpc_client = +// distributed::RPCClient::GetInstance(Attr("trainer_id")); +// +// std::vector in_var_names; +// std::vector out_var_names; +// for (size_t i = 0; i < epmap.size(); ++i) { +// in_var_names.push_back(id_name + "@" + epmap[i]); +// out_var_names.push_back(out_name + "@" + epmap[i]); +// } +// +// auto splited_ids = SplitIds(id_name, height_section, &local_scope); +// SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section, +// splited_ids, &local_scope); +// +// // create output var in local scope +// for (auto& name : out_var_names) { +// local_scope.Var(name)->GetMutable(); +// } +// +// std::vector rets; +// for (size_t i = 0; i < in_var_names.size(); i++) { +// if (NeedSend(local_scope, in_var_names[i])) { +// VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i] << " to +// get " +// << out_var_names[i] << " back"; +// rets.push_back(rpc_client->AsyncPrefetchVar( +// epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i])); +// } else { +// VLOG(30) << "don't send no-initialied variable: " << out_var_names[i]; +// } +// } +// for (size_t i = 0; i < rets.size(); i++) { +// PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); +// } +// +// MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, +// height_section, splited_ids, &local_scope); +// +// scope.DeleteScope(&local_scope); +//} using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; @@ -198,54 +206,70 @@ template class LookupRemoteTableKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - auto* ids_t = context.Input("Ids"); // int tensor + std::string id_name = context.Inputs("Ids").front(); + auto* ids_t = context.Input("Ids"); // int tensor + + std::string out_name = context.Outputs("Out").front(); auto* output_t = context.Output("Out"); // float tensor + + std::string table_name = context.Inputs("W").front(); auto* table_var = context.InputVar("W"); int64_t padding_idx = context.Attr("padding_idx"); int64_t* ids = const_cast(ids_t->data()); int64_t ids_numel = ids_t->numel(); - if (table_var->IsType()) { - auto* table_t = context.Input("W"); - int64_t row_number = table_t->dims()[0]; - int64_t row_width = table_t->dims()[1]; - - auto* table = table_t->data(); - auto* output = output_t->mutable_data(context.GetPlace()); - - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_LT(ids[i], row_number); - PADDLE_ENFORCE_GE(ids[i], 0, "ids %d", i); - memcpy(output + i * row_width, table + ids[i] * row_width, - row_width * sizeof(T)); - } - } - } else if (table_var->IsType()) { - const auto& table_t = table_var->Get(); - int64_t row_width = table_t.value().dims()[1]; - const auto* table = table_t.value().data(); - auto* output = output_t->mutable_data(context.GetPlace()); - - auto blas = math::GetBlas(context); - for (int64_t i = 0; i < ids_numel; ++i) { - if (padding_idx != kNoPadding && ids[i] == padding_idx) { - memset(output + i * row_width, 0, row_width * sizeof(T)); - } else { - PADDLE_ENFORCE_GE(ids[i], 0); - auto id_index = table_t.Index(ids[i]); - PADDLE_ENFORCE_GE(id_index, 0, "the input key should be exists."); - blas.VCOPY(row_width, table + id_index * row_width, - output + i * row_width); - } + auto epmap = context.Attr>("epmap"); + auto height_sections = + context.Attr>("height_sections"); + + auto& local_scope = context.scope().NewScope(); + + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(context.GetPlace()); + + distributed::RPCClient* rpc_client = + distributed::RPCClient::GetInstance( + context.Attr("trainer_id")); + + std::vector in_var_names; + std::vector out_var_names; + for (size_t i = 0; i < epmap.size(); ++i) { + in_var_names.push_back(id_name + "@" + epmap[i]); + out_var_names.push_back(out_name + "@" + epmap[i]); + } + + auto splited_ids = SplitIds(id_name, height_sections, &local_scope); + SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_sections, + splited_ids, &local_scope); + + // create output var in local scope + for (auto& name : out_var_names) { + local_scope.Var(name)->GetMutable(); + } + + std::vector rets; + for (size_t i = 0; i < in_var_names.size(); i++) { + if (NeedSend(local_scope, in_var_names[i])) { + VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i] + << " to get " << out_var_names[i] << " back"; + rets.push_back(rpc_client->AsyncPrefetchVar( + epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i])); + } else { + VLOG(30) << "don't send no-initialied variable: " << out_var_names[i]; } } + for (size_t i = 0; i < rets.size(); i++) { + PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); + } + + MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, + height_sections, splited_ids, context, + &local_scope); + + context.scope().DeleteScope(&local_scope); } }; -} // namespace distributed } // namespace operators } // namespace paddle From 1f87f263a2906cb1130fdb3cf3c415197cf0d549 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 23 Nov 2018 10:56:45 +0800 Subject: [PATCH 110/684] clean code --- .../distributed_ops/lookup_remote_table_op.h | 67 ++----------------- 1 file changed, 7 insertions(+), 60 deletions(-) diff --git a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h index ddf57016dbca5..5c53ca69517d3 100644 --- a/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h +++ b/paddle/fluid/operators/distributed_ops/lookup_remote_table_op.h @@ -34,6 +34,13 @@ limitations under the License. */ namespace paddle { namespace operators { +using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; +using DDim = framework::DDim; + +constexpr int64_t kNoPadding = -1; + inline size_t GetSectionIndex(int64_t id, const std::vector& abs_sections) { for (size_t i = 1; i < abs_sections.size(); ++i) { @@ -142,66 +149,6 @@ inline void MergeMultipleVarsIntoOnBySection( } } -// inline void prefetch(const std::string& table_name, const std::string& -// id_name, -// const std::string& out_name, -// const std::vector& epmap, -// const std::vector& height_section, -// const framework::Scope& scope, -// const platform::Place& place) { -// auto& local_scope = scope.NewScope(); -// -// platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); -// auto& ctx = *pool.Get(place); -// -// distributed::RPCClient* rpc_client = -// distributed::RPCClient::GetInstance(Attr("trainer_id")); -// -// std::vector in_var_names; -// std::vector out_var_names; -// for (size_t i = 0; i < epmap.size(); ++i) { -// in_var_names.push_back(id_name + "@" + epmap[i]); -// out_var_names.push_back(out_name + "@" + epmap[i]); -// } -// -// auto splited_ids = SplitIds(id_name, height_section, &local_scope); -// SplitIdsIntoMultipleVarsBySection(id_name, in_var_names, height_section, -// splited_ids, &local_scope); -// -// // create output var in local scope -// for (auto& name : out_var_names) { -// local_scope.Var(name)->GetMutable(); -// } -// -// std::vector rets; -// for (size_t i = 0; i < in_var_names.size(); i++) { -// if (NeedSend(local_scope, in_var_names[i])) { -// VLOG(30) << "sending " << in_var_names[i] << " to " << epmap[i] << " to -// get " -// << out_var_names[i] << " back"; -// rets.push_back(rpc_client->AsyncPrefetchVar( -// epmap[i], ctx, local_scope, in_var_names[i], out_var_names[i])); -// } else { -// VLOG(30) << "don't send no-initialied variable: " << out_var_names[i]; -// } -// } -// for (size_t i = 0; i < rets.size(); i++) { -// PADDLE_ENFORCE(rets[i]->Wait(), "internal error in RPCClient"); -// } -// -// MergeMultipleVarsIntoOnBySection(id_name, out_name, out_var_names, -// height_section, splited_ids, &local_scope); -// -// scope.DeleteScope(&local_scope); -//} - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using SelectedRows = framework::SelectedRows; -using DDim = framework::DDim; - -constexpr int64_t kNoPadding = -1; - template class LookupRemoteTableKernel : public framework::OpKernel { public: From 81bd7eeff4f3581c67cb294f94a14c3b1e97e40d Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 23 Nov 2018 11:04:11 +0800 Subject: [PATCH 111/684] rollback the format --- paddle/fluid/operators/beam_search_op_test.cc | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index 80fdd22fbbc06..6e283866ff59a 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -30,23 +30,23 @@ using std::endl; void CreateInput(LoDTensor* ids, LoDTensor* scores) { LoD lod; - vector level0{0, 2, 4}; - vector level1{0, 1, 2, 3, 4}; + vector level0({0, 2, 4}); + vector level1({0, 1, 2, 3, 4}); lod.push_back(level0); lod.push_back(level1); ids->set_lod(lod); scores->set_lod(lod); - auto dims = framework::make_ddim(vector{4, 3}); + auto dims = framework::make_ddim(vector({4, 3})); ids->Resize(dims); scores->Resize(dims); CPUPlace place; auto* ids_data = ids->mutable_data(place); auto* scores_data = scores->mutable_data(place); - vector _ids{4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}; - vector _scores{0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, - 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}; + vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); + vector _scores({0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, + 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); for (int i = 0; i < 12; i++) { ids_data[i] = _ids[i]; @@ -79,8 +79,8 @@ TEST(DISABLED_beam_search_op, run) { ASSERT_EQ(sids.lod(), sscores.lod()); - vector tids{4, 2, 3, 8}; - vector tscores{0.5f, 0.6f, 0.9f, 0.7f}; + vector tids({4, 2, 3, 8}); + vector tscores({0.5f, 0.6f, 0.9f, 0.7f}); for (int i = 0; i < 4; i++) { ASSERT_EQ(tids[i], sids.data()[i]); From ff2a9786f3f8ba49daad21bd3d4550b394d46ca4 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 03:08:33 +0000 Subject: [PATCH 112/684] test=develop --- python/paddle/fluid/__init__.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 543acf2d349c7..a6416326ed80b 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -91,6 +91,7 @@ def __bootstrap__(): """ import sys import os + import platform from . import core in_test = 'unittest' in sys.modules @@ -110,14 +111,17 @@ def __bootstrap__(): print('PLEASE USE OMP_NUM_THREADS WISELY.', file=sys.stderr) os.environ['OMP_NUM_THREADS'] = str(num_threads) - + sysstr = platform.system() read_env_flags = [ - 'use_pinned_memory', 'check_nan_inf', 'benchmark', 'eager_delete_scope', - 'use_mkldnn', 'use_ngraph', 'initial_cpu_memory_in_mb', - 'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads', - "dist_threadpool_size", 'eager_delete_tensor_gb', 'allocator_strategy', + 'check_nan_inf', 'benchmark', 'eager_delete_scope', 'use_mkldnn', + 'use_ngraph', 'initial_cpu_memory_in_mb', 'init_allocated_mem', + 'free_idle_memory', 'paddle_num_threads', "dist_threadpool_size", + 'eager_delete_tensor_gb', 'allocator_strategy', 'reader_queue_speed_test_mode', 'print_sub_graph_dir' ] + if sysstr != 'Darwin': + read_env_flags.append('use_pinned_memory') + if os.name != 'nt': read_env_flags.append('warpctc_dir') read_env_flags.append('cpu_deterministic') From 9ea1ce63192fee1a211aa5dcc6fecf4758434451 Mon Sep 17 00:00:00 2001 From: Xin Pan Date: Fri, 23 Nov 2018 11:51:07 +0800 Subject: [PATCH 113/684] Update issue templates --- .github/ISSUE_TEMPLATE/---feature-request-.md | 27 +++++++++++++ .github/ISSUE_TEMPLATE/---inference-issue-.md | 40 +++++++++++++++++++ .../ISSUE_TEMPLATE/---installation-issue-.md | 40 +++++++++++++++++++ .github/ISSUE_TEMPLATE/---model-issue-.md | 36 +++++++++++++++++ .github/ISSUE_TEMPLATE/---others-.md | 33 +++++++++++++++ .github/ISSUE_TEMPLATE/---training-issue-.md | 38 ++++++++++++++++++ 6 files changed, 214 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/---feature-request-.md create mode 100644 .github/ISSUE_TEMPLATE/---inference-issue-.md create mode 100644 .github/ISSUE_TEMPLATE/---installation-issue-.md create mode 100644 .github/ISSUE_TEMPLATE/---model-issue-.md create mode 100644 .github/ISSUE_TEMPLATE/---others-.md create mode 100644 .github/ISSUE_TEMPLATE/---training-issue-.md diff --git a/.github/ISSUE_TEMPLATE/---feature-request-.md b/.github/ISSUE_TEMPLATE/---feature-request-.md new file mode 100644 index 0000000000000..57708855dce4f --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---feature-request-.md @@ -0,0 +1,27 @@ +--- +name: 建议(Feature request) +about: 您可以提出您的建议。 You could use this template for reporting a suggestion  issue. + +--- + +欢迎您对PaddlePaddle提出建议,非常感谢您对PaddlePaddle的贡献! +在留下您的建议时,辛苦您同步提供如下信息: +- 版本、环境信息 +1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1 +2)CPU/GPU:您是否使用GPU进行训练,如是,请提供您的CUDA和cuDNN版本号 +3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 建议描述:请您详细描述,您认为需优化的功能 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +Please make sure that this is a feature request. +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +**To Reproduce** +Steps to reproduce the behavior +**Describe the feature and the current behavior/state.** +**Any Other info.** diff --git a/.github/ISSUE_TEMPLATE/---inference-issue-.md b/.github/ISSUE_TEMPLATE/---inference-issue-.md new file mode 100644 index 0000000000000..37bdc8889e272 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---inference-issue-.md @@ -0,0 +1,40 @@ +--- +name: 预测(Inference Issue) +about: 您可以提问预测中报错、应用等问题。 You could use this template for reporting an inference issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准描述您的问题,例如“最新预测库的API文档在哪儿 ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID +    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况 +    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号 +    4)系统环境:请您描述系统类型、版本(如Mac OS 10.14),Python版本 +-预测信息 +    1)C++预测:请您提供预测库安装包的版本信息,及其中的version.txt文件 +    2)CMake包含路径的完整命令 +    3)API信息(如调用请提供) +    4)预测库来源:官网下载/特殊环境(如BCLOUD编译) +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that th +If there is no solution,please make sure that this is an inference issue including the following details : +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Python version +-Cmake orders +-C++version.txt +-API information +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---installation-issue-.md b/.github/ISSUE_TEMPLATE/---installation-issue-.md new file mode 100644 index 0000000000000..ce4ba58932467 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---installation-issue-.md @@ -0,0 +1,40 @@ +--- +name: 安装(Installation Issue) +about: 您可以提问安装、编译出现报错等问题。 You could use this template for reporting an installation +  issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +建立issue时,为快速解决问题,请您根据使用情况给出如下信息: +- 标题:请包含关键词“安装错误”/“编译错误”,例如“Mac编译错误” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号(如1.1)或CommitID +    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况 +    3)GPU:请提供GPU型号,CUDA和CUDNN版本号 +    4)系统环境:请说明系统类型、版本(如Mac OS 10.14)、Python版本 +- 安装方式信息: +1)pip安装/docker安装 +2)本地编译:请提供cmake命令,编译命令 +3)docker编译:请提供docker镜像,编译命令            +  特殊环境请注明:如离线安装等 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in Github in case that there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is an installation issue including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg. Mac OS 10.14) +-Python version +- Install method: pip install/install with docker/build from source(without docker)/build within docker +- Other special cases that you think may be related to this problem, eg. offline install, special internet condition   +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---model-issue-.md b/.github/ISSUE_TEMPLATE/---model-issue-.md new file mode 100644 index 0000000000000..7cb52f37b9026 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---model-issue-.md @@ -0,0 +1,36 @@ +--- +name: 模型(Model Issue) +about: 您可以提问模型、算法、数据集方向的使用报错等问题。You could use this template for reporting a model/ + algorithm/dataset  issue. + +--- + +为使您的问题得到快速解决,在建立Issue前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +建立issue时,为快速解决问题,请您根据使用情况给出如下信息: +- 标题:简洁、精准描述您的问题,例如“ssd 模型前置lstm报错  ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供PaddlePaddle版本号,例如1.1或CommitID +    2)CPU:请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库的使用情况 +    3)GPU:请提供GPU型号,CUDA和CUDNN版本号 +    4)系统环境:请说明系统类型、版本(例如Mac OS 10.14),Python版本 +- 模型信息 +    1)模型名称 2)使用数据集名称 3)使用算法名称 4)模型链接 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github.Probably there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is a issue of models including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Python version +-Name of Models&Dataset/details of operator +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---others-.md b/.github/ISSUE_TEMPLATE/---others-.md new file mode 100644 index 0000000000000..6a291153e43f5 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---others-.md @@ -0,0 +1,33 @@ +--- +name: 其他(Others) +about: 如上述分类未包含您的问题,可在此提出。 You could use this template for reporting other issues + +--- + +为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准概括您的问题 +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID +    2)CPU/GPU:如果您使用GPU训练,请提供GPU驱动版本、CUDA和cuDNN版本号 +    3)系统环境:请您描述系统类型、版本,例如Mac OS 10.14 +    4)Python版本号 +    5)显存信息 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志/代码关键片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +If there is no solution,please provide us with the following details : +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/cuDNN version +-OS Platform and Distribution(eg.Mac OS 10.14) +-Python version +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** diff --git a/.github/ISSUE_TEMPLATE/---training-issue-.md b/.github/ISSUE_TEMPLATE/---training-issue-.md new file mode 100644 index 0000000000000..29e8383d97792 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/---training-issue-.md @@ -0,0 +1,38 @@ +--- +name: 训练(Training issue) +about: 您可以提问训练中报错、应用、出core等问题。 You could use this template for reporting an training +  issue. + +--- + +为使您的问题得到快速解决,在建立Issues前,请您先通过如下方式搜索是否有相似问题:【搜索issue关键字】【使用labels筛选】【官方文档】 + +如果您没有查询到相似问题,为快速解决您的提问,建立issue时请提供如下细节信息: +- 标题:简洁、精准概括您的问题,例如“Insufficient Memory xxx" ” +- 版本、环境信息: +    1)PaddlePaddle版本:请提供您的PaddlePaddle版本号,例如1.1或CommitID +    2)CPU:预测若用CPU,请提供CPU型号,MKL/OpenBlas/MKLDNN/等数学库使用情况 +    3)GPU:预测若用GPU,请提供GPU型号、CUDA和CUDNN版本号 +    4)系统环境:请您描述系统类型、版本,例如Mac OS 10.14,Python版本 +- 训练信息 +    1)单机/多机,单卡/多卡 +    2)显存信息 +    3)Operator信息 +- 复现信息:如为报错,请给出复现环境、复现步骤 +- 问题描述:请详细描述您的问题,同步贴出报错信息、日志、可复现的代码片段 + +Thank you for contributing to PaddlePaddle. +Before submitting the issue, you could search issue in the github in case that there was a similar issue submitted or resolved before. +If there is no solution,please make sure that this is a training issue including the following details: +**System information** +-PaddlePaddle version (eg.1.1)or CommitID +-CPU: including CPUMKL/OpenBlas/MKLDNN version +-GPU: including CUDA/CUDNN version +-OS Platform (eg.Mac OS 10.14) +-Other imformation: Distriuted training/informantion of operator/ +Graphics card storage +**To Reproduce** +Steps to reproduce the behavior +**Describe your current behavior** +**Code to reproduce the issue** +**Other info / logs** From 6a7f83d45df2ff22c49867837c97f0773421ee0c Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 23 Nov 2018 04:11:28 +0000 Subject: [PATCH 114/684] enable gru jitcode and refine act and lstm jitcode test=develop --- paddle/fluid/operators/math/jit_code.cc | 183 ++++++++++-------- paddle/fluid/operators/math/jit_code.h | 90 ++++----- .../fluid/operators/math/jit_kernel_refer.h | 4 +- paddle/fluid/operators/math/jit_kernel_rnn.cc | 6 +- .../fluid/operators/math/jit_kernel_test.cc | 2 + 5 files changed, 149 insertions(+), 136 deletions(-) diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 95247ce3099f3..52cbdf685dee6 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -140,32 +140,10 @@ bool VActJitCode::init(int d, operand_type type) { } void VActJitCode::generate() { - xmm_t xmm_zero = xmm_t(2); - ymm_t ymm_zero = ymm_t(2); - if (type_ == operand_type::relu) { - vxorps(ymm_zero, ymm_zero, ymm_zero); - } int offset = 0; for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { vmovups(ymm_src, ptr[param1 + offset]); - switch (type_) { - case operand_type::relu: - relu_jmm(ymm_dst, ymm_src, ymm_zero); - break; - case operand_type::exp: - exp_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(ymm_dst, ymm_src, 2, 3, 4, 5); - break; - case operand_type::identity: - break; - default: - break; - } + act(ymm_dst, ymm_src, type_); vmovups(ptr[param2 + offset], ymm_dst); offset += sizeof(float) * YMM_FLOAT_BLOCK; } @@ -182,22 +160,7 @@ void VActJitCode::generate() { block = 1; vmovss(xmm_src, ptr[param1 + offset]); } - switch (type_) { - case operand_type::relu: - relu_jmm(xmm_dst, xmm_src, xmm_zero); - break; - case operand_type::exp: - exp_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - case operand_type::sigmoid: - sigmoid_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - case operand_type::tanh: - tanh_jmm(xmm_dst, xmm_src, 2, 3, 4, 5); - break; - default: - break; - } + act(xmm_dst, xmm_src, type_); if (rest >= 4) { vmovups(ptr[param2 + offset], xmm_dst); } else if (rest >= 2) { @@ -233,52 +196,64 @@ void LSTMJitCode::generate() { int offset = 0; int d = num_ * sizeof(float); for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { - /* C_t = C_t-1 * fgated + cand_gated * igated*/ - // c - vmovups(ymm_src, ptr[reg_ptr_gates + offset]); - act(ymm_c, ymm_src, act_cand_); - // i - vmovups(ymm_src, ptr[reg_ptr_gates + offset + d]); - if (!compute_c1h1_ && use_peephole_) { - ymm_t ymm_wp = ymm_t(2); - ymm_t ymm_ct_1 = ymm_t(3); - vmovups(ymm_wp, ptr[reg_ptr_wp + offset]); + /* gates: W_ch, W_ih, W_fh, W_oh */ + ymm_t ymm_c = ymm_t(0); + ymm_t ymm_i = ymm_t(1); + ymm_t ymm_f = ymm_t(2); + ymm_t ymm_o = ymm_t(3); + ymm_t ymm_ct_1 = ymm_t(4); + ymm_t ymm_wp0 = ymm_t(5); + ymm_t ymm_wp1 = ymm_t(6); + ymm_t ymm_wp2 = ymm_t(7); + vmovups(ymm_c, ptr[reg_ptr_gates + offset]); + vmovups(ymm_i, ptr[reg_ptr_gates + offset + d]); + vmovups(ymm_f, ptr[reg_ptr_gates + offset + 2 * d]); + vmovups(ymm_o, ptr[reg_ptr_gates + offset + 3 * d]); + if (!compute_c1h1_) { vmovups(ymm_ct_1, ptr[reg_ptr_ct_1 + offset]); - vmulps(ymm_wp, ymm_ct_1, ymm_wp); - vaddps(ymm_src, ymm_src, ymm_wp); } - act(ymm_i, ymm_src, act_gate_); + if (use_peephole_) { + vmovups(ymm_wp0, ptr[reg_ptr_wp + offset]); + vmovups(ymm_wp1, ptr[reg_ptr_wp + offset + d]); + vmovups(ymm_wp2, ptr[reg_ptr_wp + offset + 2 * d]); + } + /* C_t = act_cand(c) * act_gate(i) + C_t-1 * act_gate(f) */ + // act_cand(c) + act(ymm_c, ymm_c, act_cand_); + // act_gate(i) or act_gate(ct_1 * wp0 + i) + if (!compute_c1h1_ && use_peephole_) { + vmulps(ymm_wp0, ymm_ct_1, ymm_wp0); + vaddps(ymm_i, ymm_i, ymm_wp0); + } + act(ymm_i, ymm_i, act_gate_); vmulps(ymm_c, ymm_c, ymm_i); if (!compute_c1h1_) { - // f - vmovups(ymm_src, ptr[reg_ptr_gates + offset + 2 * d]); - vmovups(ymm_i, ptr[reg_ptr_ct_1 + offset]); + // act_gate(f) or act_gate(ct_1 * wp1 + f) if (use_peephole_) { - ymm_t ymm_wp = ymm_t(3); - vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d]); - vmulps(ymm_wp, ymm_i, ymm_wp); - vaddps(ymm_src, ymm_src, ymm_wp); + vmulps(ymm_wp1, ymm_ct_1, ymm_wp1); + vaddps(ymm_f, ymm_f, ymm_wp1); } - act(ymm_f, ymm_src, act_gate_); - vmulps(ymm_f, ymm_f, ymm_i); + act(ymm_f, ymm_f, act_gate_); + // ct + vmulps(ymm_f, ymm_f, ymm_ct_1); vaddps(ymm_f, ymm_f, ymm_c); } - /* H_t = act_cell(C_t) * ogated */ + /* H_t = act_cell(C_t) * act_gate(o) */ + // act_cell(C_t) ymm_t ymm_ct = compute_c1h1_ ? ymm_c : ymm_f; - ymm_t ymm_o = compute_c1h1_ ? ymm_f : ymm_c; ymm_t ymm_tmp = ymm_i; - vmovups(ptr[reg_ptr_ct + offset], ymm_ct); // save ct - vmovups(ymm_src, ptr[reg_ptr_gates + offset + 3 * d]); + act(ymm_tmp, ymm_ct, act_cell_); + // act_gate(o) or act_gate(ct * wp2 + o) if (use_peephole_) { - ymm_t ymm_wp = ymm_t(2); - vmovups(ymm_wp, ptr[reg_ptr_wp + offset + d * 2]); - vmulps(ymm_wp, ymm_ct, ymm_wp); - vaddps(ymm_src, ymm_src, ymm_wp); + vmulps(ymm_wp2, ymm_ct, ymm_wp2); + vaddps(ymm_o, ymm_o, ymm_wp2); } - act(ymm_tmp, ymm_ct, act_cell_); - act(ymm_o, ymm_src, act_gate_); - vmulps(ymm_o, ymm_tmp, ymm_o); - vmovups(ptr[reg_ptr_ht + offset], ymm_o); // save ht + act(ymm_o, ymm_o, act_gate_); + // ht + vmulps(ymm_o, ymm_o, ymm_tmp); + // save ct and ht + vmovups(ptr[reg_ptr_ct + offset], ymm_ct); + vmovups(ptr[reg_ptr_ht + offset], ymm_o); offset += sizeof(float) * YMM_FLOAT_BLOCK; } @@ -293,13 +268,61 @@ bool GRUJitCode::init(int d) { return MayIUse(avx) && d % 8 == 0; } void GRUJitCode::generate() { reg64_t reg_ptr_gates = rax; - reg64_t reg_ptr_ct_1 = r9; - reg64_t reg_ptr_ct = r10; - reg64_t reg_ptr_ht = r11; - mov(reg_ptr_gates, ptr[param1 + offsetof(lstm_t, gates)]); - mov(reg_ptr_ct_1, ptr[param1 + offsetof(lstm_t, ct_1)]); - mov(reg_ptr_ct, ptr[param1 + offsetof(lstm_t, ct)]); - mov(reg_ptr_ht, ptr[param1 + offsetof(lstm_t, ht)]); + reg64_t reg_ptr_ht_1 = r9; + reg64_t reg_ptr_ht = r10; + mov(reg_ptr_gates, ptr[param1 + offsetof(gru_t, gates)]); + mov(reg_ptr_ht_1, ptr[param1 + offsetof(gru_t, ht_1)]); + mov(reg_ptr_ht, ptr[param1 + offsetof(gru_t, ht)]); + ymm_t ymm_one = ymm_t(0); + + if (id_ == 2) { + reg64_t reg_ptr_tmp = r11; + mov(reg_ptr_tmp, reinterpret_cast(exp_float_consts)); + vmovaps(ymm_one, ptr[reg_ptr_tmp + OFFSET_EXP_ONE]); + } + int offset = 0; + int d = num_ * sizeof(float); + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + ymm_t ymm_u = ymm_t(1); + ymm_t ymm_r = ymm_t(2); + ymm_t ymm_s = ymm_t(3); + ymm_t ymm_ht_1 = ymm_t(4); + // W: {W_update, W_reset; W_state} + if (id_ == 0 || id_ == 2) { + vmovups(ymm_u, ptr[reg_ptr_gates + offset]); + vmovups(ymm_s, ptr[reg_ptr_gates + offset + 2 * d]); + } + if (id_ == 1) { + vmovups(ymm_r, ptr[reg_ptr_gates + offset + d]); + } + if (id_ == 1 || id_ == 2) { + vmovups(ymm_ht_1, ptr[reg_ptr_ht_1 + offset]); + } + + if (id_ == 0) { + // ht = act_gate(u) * act_cand(s) + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_s); + } else if (id_ == 1) { + // ht = act_gate(r) * ht_1 + act(ymm_r, ymm_r, act_gate_); + vmulps(ymm_r, ymm_r, ymm_ht_1); + vmovups(ptr[reg_ptr_ht + offset], ymm_r); + } else if (id_ == 2) { + // ht = act_gate(u) * act_cand(s) + (1-act_gate(u)) * ht_1 + ymm_t ymm_one_inner = ymm_t(ymm_one.getIdx()); + act(ymm_u, ymm_u, act_gate_); + act(ymm_s, ymm_s, act_cand_); + vmulps(ymm_s, ymm_s, ymm_u); + vsubps(ymm_u, ymm_one_inner, ymm_u); + vmulps(ymm_u, ymm_ht_1, ymm_u); + vaddps(ymm_u, ymm_s, ymm_u); + vmovups(ptr[reg_ptr_ht + offset], ymm_u); + } + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } ret(); } diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index 403cea399102c..a9214621295a7 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -169,31 +169,34 @@ class VActJitCode : public JitCode { protected: // compute relu with ymm, xmm template - void relu_jmm(JMM& dst, JMM& src, JMM& zero) { // NOLINT + void relu_jmm(JMM& dst, JMM& src, int zero_idx = 15) { // NOLINT + JMM zero = JMM(zero_idx); + vxorps(zero, zero, zero); vmaxps(dst, src, zero); } // compute exp with ymm, xmm template - void exp_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT - int mask_idx = 4, int tmp_idx = 5) { - using namespace platform::jit; // NOLINT - assert(src.getIdx() != dst.getIdx()); // TODO(TJ): use enfore + void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT + int fy_idx = 13, int mask_idx = 14, int tmp_idx = 15) { + using namespace platform::jit; // NOLINT // check all idx can not equal + JMM jmm_src = JMM(src_idx); JMM jmm_fx = JMM(fx_idx); JMM jmm_fy = JMM(fy_idx); JMM jmm_mask = JMM(mask_idx); JMM jmm_tmp = JMM(tmp_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_HIG]); - vminps(src, src, jmm_tmp); + vminps(jmm_src, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOW]); - vmaxps(src, src, jmm_tmp); + vmaxps(jmm_src, jmm_src, jmm_tmp); // express exp(x) as exp(g + n*log(2)) vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_LOG2EF]); - vmulps(jmm_fx, src, jmm_tmp); + vmulps(jmm_fx, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_0P5]); vaddps(jmm_fx, jmm_fx, jmm_tmp); vroundps(jmm_fy, jmm_fx, 0x01); @@ -207,21 +210,21 @@ class VActJitCode : public JitCode { vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_C2]); JMM ymm_z = JMM(jmm_mask.getIdx()); vmulps(ymm_z, jmm_fx, jmm_tmp); - vsubps(src, src, jmm_fy); - vsubps(src, src, ymm_z); - vmulps(ymm_z, src, src); + vsubps(jmm_src, jmm_src, jmm_fy); + vsubps(jmm_src, jmm_src, ymm_z); + vmulps(ymm_z, jmm_src, jmm_src); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P0]); - vmulps(dst, src, jmm_tmp); + vmulps(dst, jmm_src, jmm_tmp); for (size_t i = OFFSET_EXP_P1; i < OFFSET_EXP_P5; i += (YMM_FLOAT_BLOCK * sizeof(float))) { vmovaps(jmm_tmp, ptr[reg_ptr_global + i]); // P1~P4 vaddps(dst, dst, jmm_tmp); - vmulps(dst, dst, src); + vmulps(dst, dst, jmm_src); } vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_P5]); vaddps(dst, dst, jmm_tmp); vmulps(dst, dst, ymm_z); - vaddps(dst, dst, src); + vaddps(dst, dst, jmm_src); vmovaps(jmm_tmp, ptr[reg_ptr_global]); vaddps(dst, dst, jmm_tmp); // build 2^n @@ -258,20 +261,23 @@ class VActJitCode : public JitCode { // compute sigmoid with ymm, xmm template - void sigmoid_jmm(JMM& dst, JMM& src, int fx_idx = 2, // NOLINT - int fy_idx = 3, int mask_idx = 4, int tmp_idx = 5) { + void sigmoid_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { // y = 1 / (1 + e^-x) JMM jmm_tmp = JMM(tmp_idx); + JMM jmm_src = JMM(src_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MAX]); - vminps(src, src, jmm_tmp); + vminps(jmm_src, jmm_src, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_SIGMOID_MIN]); - vmaxps(src, src, jmm_tmp); + vmaxps(jmm_src, jmm_src, jmm_tmp); vxorps(jmm_tmp, jmm_tmp, jmm_tmp); - vsubps(src, jmm_tmp, src); - exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vsubps(jmm_src, jmm_tmp, jmm_src); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(dst, dst, jmm_tmp); vdivps(dst, jmm_tmp, dst); @@ -280,19 +286,22 @@ class VActJitCode : public JitCode { // compute tanh with ymm, xmm template - void tanh_jmm(JMM& dst, JMM& src, int fx_idx = 2, int fy_idx = 3, // NOLINT - int mask_idx = 4, int tmp_idx = 5) { + void tanh_jmm(JMM& dst, JMM& src, int src_idx = 11, // NOLINT + int fx_idx = 12, int fy_idx = 13, int mask_idx = 14, + int tmp_idx = 15) { // y = 2 / (1 + e^(-2x)) - 1 + JMM jmm_src = JMM(src_idx); JMM jmm_tmp = JMM(tmp_idx); JMM jmm_zero = JMM(mask_idx); reg64_t reg_ptr_global = rax; push(reg_ptr_global); + vmovaps(jmm_src, src); mov(reg_ptr_global, reinterpret_cast(exp_float_consts)); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); vxorps(jmm_zero, jmm_zero, jmm_zero); vsubps(jmm_tmp, jmm_zero, jmm_tmp); - vmulps(src, src, jmm_tmp); - exp_jmm(dst, src, fx_idx, fy_idx, mask_idx, tmp_idx); + vmulps(jmm_src, jmm_src, jmm_tmp); + exp_jmm(dst, jmm_src, src_idx, fx_idx, fy_idx, mask_idx, tmp_idx); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_ONE]); vaddps(dst, dst, jmm_tmp); vmovaps(jmm_tmp, ptr[reg_ptr_global + OFFSET_EXP_TWO]); @@ -304,23 +313,19 @@ class VActJitCode : public JitCode { template void act(JMM& dst, JMM& src, operand_type type) { // NOLINT - // use 15 - JMM zero = JMM(15); - if (type_ == operand_type::relu) { - vxorps(zero, zero, zero); - } + // use 11~15 switch (type) { case operand_type::relu: - relu_jmm(dst, src, zero); + relu_jmm(dst, src, 15); break; case operand_type::exp: - exp_jmm(dst, src, 2, 3, 4, 5); + exp_jmm(dst, src, 11, 12, 13, 14, 15); break; case operand_type::sigmoid: - sigmoid_jmm(dst, src, 2, 3, 4, 5); + sigmoid_jmm(dst, src, 11, 12, 13, 14, 15); break; case operand_type::tanh: - tanh_jmm(dst, src, 2, 3, 4, 5); + tanh_jmm(dst, src, 11, 12, 13, 14, 15); break; case operand_type::identity: break; @@ -414,15 +419,6 @@ class LSTMJitCode : public VActJitCode { operand_type act_cand_; operand_type act_cell_; reg64_t param1{abi_param1}; - xmm_t xmm_src = xmm_t(0); - xmm_t xmm_c = xmm_t(1); - xmm_t xmm_i = xmm_t(6); - xmm_t xmm_f = xmm_t(7); - - ymm_t ymm_src = ymm_t(0); - ymm_t ymm_c = ymm_t(1); // 2~5 for act - ymm_t ymm_i = ymm_t(6); - ymm_t ymm_f = ymm_t(7); }; class GRUJitCode : public VActJitCode { @@ -492,16 +488,6 @@ class GRUJitCode : public VActJitCode { operand_type act_gate_; operand_type act_cand_; reg64_t param1{abi_param1}; - - xmm_t xmm_src = xmm_t(0); - xmm_t xmm_c = xmm_t(1); - xmm_t xmm_i = xmm_t(6); - xmm_t xmm_f = xmm_t(7); - - ymm_t ymm_src = ymm_t(0); - ymm_t ymm_c = ymm_t(1); - ymm_t ymm_i = ymm_t(6); - ymm_t ymm_f = ymm_t(7); }; #ifdef PADDLE_WITH_MKLDNN diff --git a/paddle/fluid/operators/math/jit_kernel_refer.h b/paddle/fluid/operators/math/jit_kernel_refer.h index 2e1a7f22db95a..bcb6615df8409 100644 --- a/paddle/fluid/operators/math/jit_kernel_refer.h +++ b/paddle/fluid/operators/math/jit_kernel_refer.h @@ -206,7 +206,7 @@ void GRUHtPart1(gru_t* step, const gru_attr_t* attr) { T* ht = reinterpret_cast(step->ht); const T* ht_1 = reinterpret_cast(step->ht_1); auto act_gate = getActFunc(attr->act_gate); - act_gate(gates, gates, attr->d * 2); + act_gate(gates + attr->d, gates + attr->d, attr->d); VMul(ht_1, gates + attr->d, ht, attr->d); } @@ -215,9 +215,11 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr) { T* gates = reinterpret_cast(step->gates); T* ht = reinterpret_cast(step->ht); const T* ht_1 = reinterpret_cast(step->ht_1); + auto act_gate = getActFunc(attr->act_gate); auto act_cand = getActFunc(attr->act_cand); int d = attr->d; T* y = gates + d * 2; + act_gate(gates, gates, d); act_cand(y, y, d); // out = zt*ht~ + (1-zt)*ht_1 for (int i = 0; i < d; ++i) { diff --git a/paddle/fluid/operators/math/jit_kernel_rnn.cc b/paddle/fluid/operators/math/jit_kernel_rnn.cc index 85ea95cfcc103..2db3274a45610 100644 --- a/paddle/fluid/operators/math/jit_kernel_rnn.cc +++ b/paddle/fluid/operators/math/jit_kernel_rnn.cc @@ -177,7 +177,7 @@ class GRUKernelImpl : public GRUKernel { explicit GRUKernelImpl(const gru_attr_t& attr) : GRUKernel() { #ifdef PADDLE_WITH_XBYAK if (useJIT(attr.d)) { - size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 84 * 8; // should change + size_t sz = 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; jitcode0_.reset(new gen::GRUJitCode(0, attr, sz > 4096 ? sz : 4096)); this->ComputeH1 = jitcode0_->getCode(); @@ -188,7 +188,7 @@ class GRUKernelImpl : public GRUKernel { jitcode2_.reset(new gen::GRUJitCode(2, attr, sz > 4096 ? sz : 4096)); this->ComputeHtPart2 = - jitcode1_->getCode(); + jitcode2_->getCode(); return; } #endif @@ -207,7 +207,7 @@ class GRUKernelImpl : public GRUKernel { #ifdef PADDLE_WITH_XBYAK template <> bool GRUKernelImpl::useJIT(int d) { - return false; // jitcode not ready yet + return gen::GRUJitCode::init(d); } #endif diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index 1cbe1b5d952f0..cc8a5d4d862cc 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -714,6 +714,8 @@ TEST(JitKernel, pool) { std::string act_gate = "sigmoid", act_cand = "tanh", act_cell = "tanh"; jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); + // empty call it to avoid unknown flag 'use_pinned_memory' on Mac + paddle::platform::jit::MayIUse(paddle::platform::jit::avx); const auto& plstm1 = jit::KernelPool::Instance() .template Get, const jit::lstm_attr_t&>(attr); From 36f08eef3b466001f339e2c33f47dac60bbc6821 Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Fri, 23 Nov 2018 13:04:41 +0800 Subject: [PATCH 115/684] CUDA kernel for density_prior_box_op. (#14513) * CUDA kernel for density_prior_box_op. * Support flatten to 2D. --- paddle/fluid/API.spec | 2 +- paddle/fluid/framework/op_desc.cc | 6 + .../fluid/operators/detection/CMakeLists.txt | 2 +- .../detection/density_prior_box_op.cc | 36 ++-- .../detection/density_prior_box_op.cu | 170 ++++++++++++++++++ .../detection/density_prior_box_op.h | 73 ++++---- python/paddle/fluid/layers/detection.py | 43 +++-- python/paddle/fluid/tests/test_detection.py | 60 ++++--- .../unittests/test_density_prior_box_op.py | 30 ++-- 9 files changed, 305 insertions(+), 117 deletions(-) create mode 100644 paddle/fluid/operators/detection/density_prior_box_op.cu diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 541c4db1fa091..50114bf3df0ac 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -276,7 +276,7 @@ paddle.fluid.layers.hard_shrink ArgSpec(args=['x', 'threshold'], varargs=None, k paddle.fluid.layers.cumsum ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.thresholded_relu ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.prior_box ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)) -paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, None)) +paddle.fluid.layers.density_prior_box ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)) paddle.fluid.layers.multi_box_head ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)) paddle.fluid.layers.bipartite_match ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) paddle.fluid.layers.target_assign ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)) diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index fbaa169df6324..362cda3f2329b 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -252,6 +252,12 @@ void OpDesc::SetAttr(const std::string &name, const Attribute &v) { this->attrs_[name] = std::vector(); break; } + case proto::AttrType::LONGS: { + VLOG(110) << "SetAttr: " << Type() << ", " << name + << " from LONGS to LONGS"; + this->attrs_[name] = std::vector(); + break; + } case proto::AttrType::FLOATS: { VLOG(110) << "SetAttr: " << Type() << ", " << name << " from INTS to FLOATS"; diff --git a/paddle/fluid/operators/detection/CMakeLists.txt b/paddle/fluid/operators/detection/CMakeLists.txt index 58f6f48467310..6c85f1577e0c4 100644 --- a/paddle/fluid/operators/detection/CMakeLists.txt +++ b/paddle/fluid/operators/detection/CMakeLists.txt @@ -22,7 +22,7 @@ iou_similarity_op.cu) detection_library(mine_hard_examples_op SRCS mine_hard_examples_op.cc) detection_library(multiclass_nms_op SRCS multiclass_nms_op.cc poly_util.cc gpc.cc) detection_library(prior_box_op SRCS prior_box_op.cc prior_box_op.cu) -detection_library(density_prior_box_op SRCS density_prior_box_op.cc) +detection_library(density_prior_box_op SRCS density_prior_box_op.cc density_prior_box_op.cu) detection_library(anchor_generator_op SRCS anchor_generator_op.cc anchor_generator_op.cu) detection_library(target_assign_op SRCS target_assign_op.cc diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cc b/paddle/fluid/operators/detection/density_prior_box_op.cc index 99df15c3226b4..1012ba3652ddf 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.cc +++ b/paddle/fluid/operators/detection/density_prior_box_op.cc @@ -39,24 +39,27 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { auto fixed_sizes = ctx->Attrs().Get>("fixed_sizes"); auto fixed_ratios = ctx->Attrs().Get>("fixed_ratios"); auto densities = ctx->Attrs().Get>("densities"); + bool flatten = ctx->Attrs().Get("flatten_to_2d"); PADDLE_ENFORCE_EQ(fixed_sizes.size(), densities.size(), "The number of fixed_sizes and densities must be equal."); size_t num_priors = 0; - if ((fixed_sizes.size() > 0) && (densities.size() > 0)) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + if (!flatten) { + std::vector dim_vec(4); + dim_vec[0] = input_dims[2]; + dim_vec[1] = input_dims[3]; + dim_vec[2] = num_priors; + dim_vec[3] = 4; + ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); + ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); + } else { + int64_t dim0 = input_dims[2] * input_dims[3] * num_priors; + ctx->SetOutputDim("Boxes", {dim0, 4}); + ctx->SetOutputDim("Variances", {dim0, 4}); } - std::vector dim_vec(4); - dim_vec[0] = input_dims[2]; - dim_vec[1] = input_dims[3]; - dim_vec[2] = num_priors; - dim_vec[3] = 4; - ctx->SetOutputDim("Boxes", framework::make_ddim(dim_vec)); - ctx->SetOutputDim("Variances", framework::make_ddim(dim_vec)); } protected: @@ -64,7 +67,7 @@ class DensityPriorBoxOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), - platform::CPUPlace()); + ctx.GetPlace()); } }; @@ -101,7 +104,10 @@ class DensityPriorBoxOpMaker : public framework::OpProtoAndCheckerMaker { }); AddAttr("clip", "(bool) Whether to clip out-of-boundary boxes.") .SetDefault(true); - + AddAttr("flatten_to_2d", + "(bool) Whether to flatten to 2D and " + "the second dim is 4.") + .SetDefault(false); AddAttr( "step_w", "Density prior boxes step across width, 0.0 for auto calculation.") diff --git a/paddle/fluid/operators/detection/density_prior_box_op.cu b/paddle/fluid/operators/detection/density_prior_box_op.cu new file mode 100644 index 0000000000000..3b7c781795f02 --- /dev/null +++ b/paddle/fluid/operators/detection/density_prior_box_op.cu @@ -0,0 +1,170 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/detection/density_prior_box_op.h" + +namespace paddle { +namespace operators { + +template +static __device__ inline T Clip(T in) { + return min(max(in, 0.), 1.); +} + +template +static __global__ void GenDensityPriorBox( + const int height, const int width, const int im_height, const int im_width, + const T offset, const T step_width, const T step_height, + const int num_priors, const T* ratios_shift, bool is_clip, const T var_xmin, + const T var_ymin, const T var_xmax, const T var_ymax, T* out, T* var) { + int gidx = blockIdx.x * blockDim.x + threadIdx.x; + int gidy = blockIdx.y * blockDim.y + threadIdx.y; + int step_x = blockDim.x * gridDim.x; + int step_y = blockDim.y * gridDim.y; + + const T* width_ratio = ratios_shift; + const T* height_ratio = ratios_shift + num_priors; + const T* width_shift = ratios_shift + 2 * num_priors; + const T* height_shift = ratios_shift + 3 * num_priors; + + for (int j = gidy; j < height; j += step_y) { + for (int i = gidx; i < width * num_priors; i += step_x) { + int h = j; + int w = i / num_priors; + int k = i % num_priors; + + T center_x = (w + offset) * step_width; + T center_y = (h + offset) * step_height; + + T center_x_temp = center_x + width_shift[k]; + T center_y_temp = center_y + height_shift[k]; + + T box_width_ratio = width_ratio[k] / 2.; + T box_height_ratio = height_ratio[k] / 2.; + + T xmin = max((center_x_temp - box_width_ratio) / im_width, 0.); + T ymin = max((center_y_temp - box_height_ratio) / im_height, 0.); + T xmax = min((center_x_temp + box_width_ratio) / im_width, 1.); + T ymax = min((center_y_temp + box_height_ratio) / im_height, 1.); + + int out_offset = (j * width * num_priors + i) * 4; + out[out_offset] = is_clip ? Clip(xmin) : xmin; + out[out_offset + 1] = is_clip ? Clip(ymin) : ymin; + out[out_offset + 2] = is_clip ? Clip(xmax) : xmax; + out[out_offset + 3] = is_clip ? Clip(ymax) : ymax; + + var[out_offset] = var_xmin; + var[out_offset + 1] = var_ymin; + var[out_offset + 2] = var_xmax; + var[out_offset + 3] = var_ymax; + } + } +} + +template +class DensityPriorBoxOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("Input"); + auto* image = ctx.Input("Image"); + auto* boxes = ctx.Output("Boxes"); + auto* vars = ctx.Output("Variances"); + + auto variances = ctx.Attr>("variances"); + auto is_clip = ctx.Attr("clip"); + + auto fixed_sizes = ctx.Attr>("fixed_sizes"); + auto fixed_ratios = ctx.Attr>("fixed_ratios"); + auto densities = ctx.Attr>("densities"); + + T step_w = static_cast(ctx.Attr("step_w")); + T step_h = static_cast(ctx.Attr("step_h")); + T offset = static_cast(ctx.Attr("offset")); + + auto img_width = image->dims()[3]; + auto img_height = image->dims()[2]; + + auto feature_width = input->dims()[3]; + auto feature_height = input->dims()[2]; + + T step_width, step_height; + if (step_w == 0 || step_h == 0) { + step_width = static_cast(img_width) / feature_width; + step_height = static_cast(img_height) / feature_height; + } else { + step_width = step_w; + step_height = step_h; + } + + int num_priors = 0; + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); + } + int step_average = static_cast((step_width + step_height) * 0.5); + + framework::Tensor h_temp; + T* tdata = h_temp.mutable_data({num_priors * 4}, platform::CPUPlace()); + int idx = 0; + for (size_t s = 0; s < fixed_sizes.size(); ++s) { + auto fixed_size = fixed_sizes[s]; + int density = densities[s]; + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = shift / 2. + dj * shift - step_average / 2.; + float center_y_temp = shift / 2. + di * shift - step_average / 2.; + tdata[idx] = box_width_ratio; + tdata[num_priors + idx] = box_height_ratio; + tdata[2 * num_priors + idx] = center_x_temp; + tdata[3 * num_priors + idx] = center_y_temp; + idx++; + } + } + } + } + + boxes->mutable_data(ctx.GetPlace()); + vars->mutable_data(ctx.GetPlace()); + + framework::Tensor d_temp; + framework::TensorCopySync(h_temp, ctx.GetPlace(), &d_temp); + + // At least use 32 threads, at most 512 threads. + // blockx is multiple of 32. + int blockx = std::min(((feature_width * num_priors + 31) >> 5) << 5, 512L); + int gridx = (feature_width * num_priors + blockx - 1) / blockx; + dim3 threads(blockx, 1); + dim3 grids(gridx, feature_height); + + auto stream = + ctx.template device_context().stream(); + GenDensityPriorBox<<>>( + feature_height, feature_width, img_height, img_width, offset, + step_width, step_height, num_priors, d_temp.data(), is_clip, + variances[0], variances[1], variances[2], variances[3], + boxes->data(), vars->data()); + } +}; // namespace operators + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(density_prior_box, + ops::DensityPriorBoxOpCUDAKernel, + ops::DensityPriorBoxOpCUDAKernel); diff --git a/paddle/fluid/operators/detection/density_prior_box_op.h b/paddle/fluid/operators/detection/density_prior_box_op.h index 9a52077e9cf90..ed2f5df80cf4d 100644 --- a/paddle/fluid/operators/detection/density_prior_box_op.h +++ b/paddle/fluid/operators/detection/density_prior_box_op.h @@ -1,4 +1,4 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at @@ -52,18 +52,16 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { step_height = step_h; } int num_priors = 0; - if (fixed_sizes.size() > 0 && densities.size() > 0) { - for (size_t i = 0; i < densities.size(); ++i) { - if (fixed_ratios.size() > 0) { - num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); - } - } + for (size_t i = 0; i < densities.size(); ++i) { + num_priors += (fixed_ratios.size()) * (pow(densities[i], 2)); } boxes->mutable_data(ctx.GetPlace()); vars->mutable_data(ctx.GetPlace()); - auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); + auto box_dim = vars->dims(); + boxes->Resize({feature_height, feature_width, num_priors, 4}); + auto e_boxes = framework::EigenTensor::From(*boxes).setConstant(0.0); int step_average = static_cast((step_width + step_height) * 0.5); for (int h = 0; h < feature_height; ++h) { @@ -76,36 +74,34 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { auto fixed_size = fixed_sizes[s]; int density = densities[s]; // Generate density prior boxes with fixed ratios. - if (fixed_ratios.size() > 0) { - for (size_t r = 0; r < fixed_ratios.size(); ++r) { - float ar = fixed_ratios[r]; - int shift = step_average / density; - float box_width_ratio = fixed_size * sqrt(ar); - float box_height_ratio = fixed_size / sqrt(ar); - for (int di = 0; di < density; ++di) { - for (int dj = 0; dj < density; ++dj) { - float center_x_temp = - center_x - step_average / 2. + shift / 2. + dj * shift; - float center_y_temp = - center_y - step_average / 2. + shift / 2. + di * shift; - e_boxes(h, w, idx, 0) = - (center_x_temp - box_width_ratio / 2.) / img_width >= 0 - ? (center_x_temp - box_width_ratio / 2.) / img_width - : 0; - e_boxes(h, w, idx, 1) = - (center_y_temp - box_height_ratio / 2.) / img_height >= 0 - ? (center_y_temp - box_height_ratio / 2.) / img_height - : 0; - e_boxes(h, w, idx, 2) = - (center_x_temp + box_width_ratio / 2.) / img_width <= 1 - ? (center_x_temp + box_width_ratio / 2.) / img_width - : 1; - e_boxes(h, w, idx, 3) = - (center_y_temp + box_height_ratio / 2.) / img_height <= 1 - ? (center_y_temp + box_height_ratio / 2.) / img_height - : 1; - idx++; - } + for (size_t r = 0; r < fixed_ratios.size(); ++r) { + float ar = fixed_ratios[r]; + int shift = step_average / density; + float box_width_ratio = fixed_size * sqrt(ar); + float box_height_ratio = fixed_size / sqrt(ar); + for (int di = 0; di < density; ++di) { + for (int dj = 0; dj < density; ++dj) { + float center_x_temp = + center_x - step_average / 2. + shift / 2. + dj * shift; + float center_y_temp = + center_y - step_average / 2. + shift / 2. + di * shift; + e_boxes(h, w, idx, 0) = + (center_x_temp - box_width_ratio / 2.) / img_width >= 0 + ? (center_x_temp - box_width_ratio / 2.) / img_width + : 0; + e_boxes(h, w, idx, 1) = + (center_y_temp - box_height_ratio / 2.) / img_height >= 0 + ? (center_y_temp - box_height_ratio / 2.) / img_height + : 0; + e_boxes(h, w, idx, 2) = + (center_x_temp + box_width_ratio / 2.) / img_width <= 1 + ? (center_x_temp + box_width_ratio / 2.) / img_width + : 1; + e_boxes(h, w, idx, 3) = + (center_y_temp + box_height_ratio / 2.) / img_height <= 1 + ? (center_y_temp + box_height_ratio / 2.) / img_height + : 1; + idx++; } } } @@ -139,6 +135,7 @@ class DensityPriorBoxOpKernel : public framework::OpKernel { e_vars = var_et.broadcast(Eigen::DSizes(box_num, 1)); vars->Resize(var_dim); + boxes->Resize(box_dim); } }; // namespace operators diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 3f17400a1432b..4843af8340310 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -1029,6 +1029,7 @@ def density_prior_box(input, clip=False, steps=[0.0, 0.0], offset=0.5, + flatten_to_2d=False, name=None): """ **Density Prior Box Operator** @@ -1065,22 +1066,24 @@ def density_prior_box(input, height/weight of the input will be automatically calculated. Default: [0., 0.] offset(float): Prior boxes center offset. Default: 0.5 + flatten_to_2d(bool): Whether to flatten output prior boxes and variance + to 2D shape, the second dim is 4. Default: False. name(str): Name of the density prior box op. Default: None. Returns: tuple: A tuple with two Variable (boxes, variances) boxes: the output density prior boxes of PriorBox. - The layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input, - num_priors is the total - box count of each position of input. + The layout is [H, W, num_priors, 4] when flatten_to_2d is False. + The layout is [H * W * num_priors, 4] when flatten_to_2d is True. + H is the height of input, W is the width of input, + num_priors is the total box count of each position of input. variances: the expanded variances of PriorBox. - The layout is [H, W, num_priors, 4]. - H is the height of input, W is the width of input - num_priors is the total - box count of each position of input + The layout is [H, W, num_priors, 4] when flatten_to_2d is False. + The layout is [H * W * num_priors, 4] when flatten_to_2d is True. + H is the height of input, W is the width of input + num_priors is the total box count of each position of input. Examples: @@ -1089,14 +1092,11 @@ def density_prior_box(input, box, var = fluid.layers.density_prior_box( input=conv1, image=images, - min_sizes=[100.], - max_sizes=[200.], - aspect_ratios=[1.0, 1.0 / 2.0, 2.0], - densities=[3, 4], - fixed_sizes=[50., 60.], - fixed_ratios=[1.0, 3.0, 1.0 / 3.0], - flip=True, - clip=True) + densities=[4, 2, 1], + fixed_sizes=[32.0, 64.0, 128.0], + fixed_ratios=[1.], + clip=True, + flatten_to_2d=True) """ helper = LayerHelper("density_prior_box", **locals()) dtype = helper.input_dtype() @@ -1127,14 +1127,11 @@ def _is_list_or_tuple_(data): 'step_w': steps[0], 'step_h': steps[1], 'offset': offset, + 'densities': densities, + 'fixed_sizes': fixed_sizes, + 'fixed_ratios': fixed_ratios, + 'flatten_to_2d': flatten_to_2d, } - if densities is not None and len(densities) > 0: - attrs['densities'] = densities - if fixed_sizes is not None and len(fixed_sizes) > 0: - attrs['fixed_sizes'] = fixed_sizes - if fixed_ratios is not None and len(fixed_ratios) > 0: - attrs['fixed_ratios'] = fixed_ratios - box = helper.create_variable_for_type_inference(dtype) var = helper.create_variable_for_type_inference(dtype) helper.append_op( diff --git a/python/paddle/fluid/tests/test_detection.py b/python/paddle/fluid/tests/test_detection.py index 982d29180141d..a2eca5541a152 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -112,38 +112,42 @@ def test_ssd_loss(self): class TestPriorBox(unittest.TestCase): def test_prior_box(self): - data_shape = [3, 224, 224] - images = fluid.layers.data( - name='pixel', shape=data_shape, dtype='float32') - conv1 = fluid.layers.conv2d(images, 3, 3, 2) - box, var = layers.prior_box( - input=conv1, - image=images, - min_sizes=[100.0], - aspect_ratios=[1.], - flip=True, - clip=True) - assert len(box.shape) == 4 - assert box.shape == var.shape - assert box.shape[3] == 4 + program = Program() + with program_guard(program): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.prior_box( + input=conv1, + image=images, + min_sizes=[100.0], + aspect_ratios=[1.], + flip=True, + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[3] == 4 class TestDensityPriorBox(unittest.TestCase): def test_density_prior_box(self): - data_shape = [3, 224, 224] - images = fluid.layers.data( - name='pixel', shape=data_shape, dtype='float32') - conv1 = fluid.layers.conv2d(images, 3, 3, 2) - box, var = layers.density_prior_box( - input=conv1, - image=images, - densities=[3, 4], - fixed_sizes=[50., 60.], - fixed_ratios=[1.0], - clip=True) - assert len(box.shape) == 4 - assert box.shape == var.shape - assert box.shape[3] == 4 + program = Program() + with program_guard(program): + data_shape = [3, 224, 224] + images = fluid.layers.data( + name='pixel', shape=data_shape, dtype='float32') + conv1 = fluid.layers.conv2d(images, 3, 3, 2) + box, var = layers.density_prior_box( + input=conv1, + image=images, + densities=[3, 4], + fixed_sizes=[50., 60.], + fixed_ratios=[1.0], + clip=True) + assert len(box.shape) == 4 + assert box.shape == var.shape + assert box.shape[-1] == 4 class TestAnchorGenerator(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py index 79d1fd3d7171e..4b0bc1dcf85fb 100644 --- a/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py +++ b/python/paddle/fluid/tests/unittests/test_density_prior_box_op.py @@ -36,7 +36,8 @@ def set_data(self): 'offset': self.offset, 'densities': self.densities, 'fixed_sizes': self.fixed_sizes, - 'fixed_ratios': self.fixed_ratios + 'fixed_ratios': self.fixed_ratios, + 'flatten_to_2d': self.flatten_to_2d } self.outputs = {'Boxes': self.out_boxes, 'Variances': self.out_var} @@ -48,16 +49,17 @@ def setUp(self): self.set_data() def set_density(self): - self.densities = [] - self.fixed_sizes = [] - self.fixed_ratios = [] + self.densities = [4, 2, 1] + self.fixed_sizes = [32.0, 64.0, 128.0] + self.fixed_ratios = [1.0] + self.layer_w = 17 + self.layer_h = 17 + self.image_w = 533 + self.image_h = 533 + self.flatten_to_2d = False def init_test_params(self): - self.layer_w = 32 - self.layer_h = 32 - - self.image_w = 40 - self.image_h = 40 + self.set_density() self.step_w = float(self.image_w) / float(self.layer_w) self.step_h = float(self.image_h) / float(self.layer_h) @@ -69,8 +71,6 @@ def init_test_params(self): self.variances = [0.1, 0.1, 0.2, 0.2] self.variances = np.array(self.variances, dtype=np.float).flatten() - self.set_density() - self.clip = True self.num_priors = 0 if len(self.fixed_sizes) > 0 and len(self.densities) > 0: @@ -129,6 +129,9 @@ def init_test_output(self): (self.layer_h, self.layer_w, self.num_priors, 1)) self.out_boxes = out_boxes.astype('float32') self.out_var = out_var.astype('float32') + if self.flatten_to_2d: + self.out_boxes = self.out_boxes.reshape((-1, 4)) + self.out_var = self.out_var.reshape((-1, 4)) class TestDensityPriorBox(TestDensityPriorBoxOp): @@ -136,6 +139,11 @@ def set_density(self): self.densities = [3, 4] self.fixed_sizes = [1.0, 2.0] self.fixed_ratios = [1.0] + self.layer_w = 32 + self.layer_h = 32 + self.image_w = 40 + self.image_h = 40 + self.flatten_to_2d = True if __name__ == '__main__': From e950ce7148759472d67856d7c1ba7ec35fe364cd Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 05:54:20 +0000 Subject: [PATCH 116/684] test for mac --- python/paddle/fluid/__init__.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index a6416326ed80b..ac2a7ea47e3dd 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -119,7 +119,8 @@ def __bootstrap__(): 'eager_delete_tensor_gb', 'allocator_strategy', 'reader_queue_speed_test_mode', 'print_sub_graph_dir' ] - if sysstr != 'Darwin': + if 'Darwin' not in sysstr: + print("aaaaa") read_env_flags.append('use_pinned_memory') if os.name != 'nt': From 61c5f13fcf92c18f30c05a90e3d3badd884f9340 Mon Sep 17 00:00:00 2001 From: sabreshao Date: Fri, 23 Nov 2018 14:27:39 +0800 Subject: [PATCH 117/684] Fix cmake for AMDGPU platform (#13801) * HIP cmake. Enable whole archieve build for pybind library. Disable two warning. Rollback to C++11. Link RCCL to WA gpu kernel loading issue. Update eigen to fix build failure. Add more include directories. Fix O3 build failure. Update eigen. fix tensor_util_test segment fault issue add more macro check in hip.cmake. we may consider refine hip.cmake to inherit all add_definitions() in parrent scope, in the future. Fix rocRAND load. Update eigen to fix gru_unit_op and reduce_op. Add HIP support to testing. Update eigen to support int16 and int8 in arg min and arg max. * add rocprim as cub library used by nv implementation * Reduce build time in rocprim. * Add rocprim introduction, remove useless cmake code. * Remove useless flags and format cmake file. --- CMakeLists.txt | 1 + cmake/external/eigen.cmake | 2 +- cmake/external/rocprim.cmake | 44 +++++++++++++++++++++++++++++ cmake/flags.cmake | 3 ++ cmake/generic.cmake | 26 +++++++++-------- cmake/hip.cmake | 32 +++++++++++++++++---- paddle/fluid/pybind/CMakeLists.txt | 4 +-- paddle/testing/paddle_gtest_main.cc | 2 +- 8 files changed, 94 insertions(+), 20 deletions(-) create mode 100644 cmake/external/rocprim.cmake diff --git a/CMakeLists.txt b/CMakeLists.txt index 3059ab7e0e412..8dcf9786e36fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -204,6 +204,7 @@ include(external/eigen) # download eigen3 include(external/pybind11) # download pybind11 include(external/cares) include(external/cub) +include(external/rocprim) include(external/xxhash) # download xxhash include(external/dlpack) include(external/snappy) # download snappy diff --git a/cmake/external/eigen.cmake b/cmake/external/eigen.cmake index 573ad5e5f06a9..6aef97f21244e 100644 --- a/cmake/external/eigen.cmake +++ b/cmake/external/eigen.cmake @@ -17,7 +17,7 @@ if(WITH_AMD_GPU) extern_eigen3 ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/sabreshao/hipeigen.git" - GIT_TAG 0cba03ff9f8f9f70bbd92ac5857b031aa8fed6f9 + GIT_TAG 7cb2b6e5a4b4a1efe658abb215cd866c6fb2275e PREFIX ${EIGEN_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" diff --git a/cmake/external/rocprim.cmake b/cmake/external/rocprim.cmake new file mode 100644 index 0000000000000..914c064918905 --- /dev/null +++ b/cmake/external/rocprim.cmake @@ -0,0 +1,44 @@ +if (NOT WITH_AMD_GPU) + return() +endif() + +# rocprim is "ROCm Parallel Primitives" for short. +# It is a header-only library providing HIP and HC parallel primitives +# for developing performant GPU-accelerated code on AMD ROCm platform. + +if("x${HCC_HOME}" STREQUAL "x") + set(HCC_HOME "/opt/rocm/hcc") +endif() + +INCLUDE(ExternalProject) + +SET(ROCPRIM_SOURCE_DIR ${THIRD_PARTY_PATH}/rocprim) +SET(ROCPRIM_INSTALL_DIR ${THIRD_PARTY_PATH}/install/rocprim) +SET(ROCPRIM_INCLUDE_DIR ${ROCPRIM_INSTALL_DIR}/include) + +ExternalProject_Add( + extern_rocprim + GIT_REPOSITORY "https://github.com/ROCmSoftwarePlatform/rocPRIM.git" + GIT_TAG 5bd41b96ab8d8343330fb2c3e1b96775bde3b3fc + PREFIX ${ROCPRIM_SOURCE_DIR} + UPDATE_COMMAND "" + CMAKE_ARGS -DCMAKE_CXX_COMPILER=${HCC_HOME}/bin/hcc + CMAKE_ARGS -DONLY_INSTALL=ON + CMAKE_ARGS -DBUILD_TEST=OFF + CMAKE_ARGS -DCMAKE_INSTALL_PREFIX=${ROCPRIM_INSTALL_DIR} + + INSTALL_DIR ${ROCPRIM_INSTALL_DIR} + ${EXTERNAL_PROJECT_LOG_ARGS} +) + +INCLUDE_DIRECTORIES(${ROCPRIM_INCLUDE_DIR}) + +if (${CMAKE_VERSION} VERSION_LESS "3.3.0") + set(dummyfile ${CMAKE_CURRENT_BINARY_DIR}/rocprim_dummy.c) + file(WRITE ${dummyfile} "const char *dummy_rocprim = \"${dummyfile}\";") + add_library(rocprim STATIC ${dummyfile}) +else() + add_library(rocprim INTERFACE) +endif() + +add_dependencies(rocprim extern_rocprim) diff --git a/cmake/flags.cmake b/cmake/flags.cmake index 343e44ab4bc21..c4472040cef87 100644 --- a/cmake/flags.cmake +++ b/cmake/flags.cmake @@ -129,6 +129,9 @@ set(COMMON_FLAGS -Wno-error=parentheses-equality # Warnings in pybind11 -Wno-error=ignored-attributes # Warnings in Eigen, gcc 6.3 -Wno-error=terminate # Warning in PADDLE_ENFORCE + -Wno-error=int-in-bool-context # Warning in Eigen gcc 7.2 + -Wimplicit-fallthrough=0 # Warning in tinyformat.h + -Wno-error=maybe-uninitialized # Warning in boost gcc 7.2 ) set(GPU_COMMON_FLAGS diff --git a/cmake/generic.cmake b/cmake/generic.cmake index 111627a932afe..7d803d00ef45c 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -454,25 +454,29 @@ function(hip_library TARGET_NAME) else() add_library(${TARGET_NAME} STATIC ${_cmake_options} ${_generated_files} ${_sources}) set_target_properties(${TARGET_NAME} PROPERTIES LINKER_LANGUAGE CXX) - target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a) - find_fluid_modules(${TARGET_NAME}) + target_link_libraries(${TARGET_NAME} /opt/rocm/hip/lib/libhip_hcc.so /opt/rocm/hip/lib/libhip_device.a /opt/rocm/rccl/lib/librccl.so /opt/rocm/hiprand/lib/libhiprand.so) + find_fluid_modules(${TARGET_NAME}) endif() - if (hip_library_DEPS) - add_dependencies(${TARGET_NAME} ${hip_library_DEPS}) - target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + if("${hip_library_DEPS}" MATCHES "ARCHIVE_START") + # Support linking flags: --whole-archive (Linux) / -force_load (MacOS). + # WARNING: Please don't use ARCHIVE_START&ARCHIVE_END if TARGET_NAME will be linked by other libraries. + target_circle_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) + list(REMOVE_ITEM hip_library_DEPS ARCHIVE_START ARCHIVE_END) + else() + target_link_libraries(${TARGET_NAME} ${hip_library_DEPS}) endif() # cpplint code style foreach(source_file ${hip_library_SRCS}) - string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) - if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) - endif() + string(REGEX REPLACE "\\.[^.]*$" "" source ${source_file}) + if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + list(APPEND hip_library_HEADERS ${CMAKE_CURRENT_SOURCE_DIR}/${source}.h) + endif() endforeach() else(hip_library_SRCS) if (hip_library_DEPS) - merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) + merge_static_libs(${TARGET_NAME} ${hip_library_DEPS}) else() - message(FATAL "Please specify source file or library in nv_library.") + message(FATAL "Please specify source file or library in nv_library.") endif() endif(hip_library_SRCS) endif() diff --git a/cmake/hip.cmake b/cmake/hip.cmake index bfe491bd6b760..4276bc5b08cd8 100644 --- a/cmake/hip.cmake +++ b/cmake/hip.cmake @@ -3,6 +3,8 @@ if(NOT WITH_AMD_GPU) endif() include_directories("/opt/rocm/include") +include_directories("/opt/rocm/hip/include") +include_directories("/opt/rocm/miopen/include") include_directories("/opt/rocm/hipblas/include") include_directories("/opt/rocm/hiprand/include") include_directories("/opt/rocm/rocrand/include") @@ -11,20 +13,40 @@ include_directories("/opt/rocm/thrust") list(APPEND EXTERNAL_LIBS "-L/opt/rocm/lib/ -lhip_hcc") -set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++14" ) +set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -fPIC -DPADDLE_WITH_HIP -std=c++11" ) if(WITH_DSO) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_USE_DSO") endif(WITH_DSO) -if(WITH_DOUBLE) - set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_TYPE_DOUBLE") -endif(WITH_DOUBLE) - if(WITH_TESTING) set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_TESTING") endif(WITH_TESTING) +if(WITH_DISTRIBUTE) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_DISTRIBUTE") +endif(WITH_DISTRIBUTE) + +if(WITH_GRPC) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_GRPC") +endif(WITH_GRPC) + +if(NOT WITH_GOLANG) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITHOUT_GOLANG") +endif(NOT WITH_GOLANG) + +if(WITH_MKLDNN) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_WITH_MKLDNN") +endif(WITH_MKLDNN) + +set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DANY_IMPL_ANY_CAST_MOVEABLE") + +if(NOT WITH_RDMA) + set(HIP_HCC_FLAGS "${HIP_HCC_FLAGS} -DPADDLE_DISABLE_RDMA") +endif(NOT WITH_RDMA) + + + if(CMAKE_BUILD_TYPE STREQUAL "Debug") list(APPEND HIP_HCC_FLAGS ${CMAKE_CXX_FLAGS_DEBUG}) elseif(CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo") diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index fb6ee2f4a5392..25d241d9768c1 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -5,8 +5,8 @@ if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} - DEPS ${PYBIND_DEPS} - ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS}) + DEPS ARCHIVE_START ${PYBIND_DEPS} + ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} ARCHIVE_END) else() cc_library(paddle_pybind SHARED SRCS ${PYBIND_SRCS} diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 598f435461b40..babb862122a0e 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -28,7 +28,7 @@ int main(int argc, char** argv) { for (int i = 0; i < argc; ++i) { new_argv.push_back(argv[i]); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); #else From 445fff24dcbdce6c4b98b5631bc6c34831276fca Mon Sep 17 00:00:00 2001 From: peizhilin Date: Fri, 23 Nov 2018 14:40:04 +0800 Subject: [PATCH 118/684] add the bigobj option to NVCC compile fix code style --- cmake/cuda.cmake | 4 ++-- paddle/fluid/operators/beam_search_op_test.cc | 4 ++-- paddle/fluid/platform/stream_callback_manager.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake index 964d5fd45b350..4c7e0fd3f6ca9 100644 --- a/cmake/cuda.cmake +++ b/cmake/cuda.cmake @@ -200,9 +200,9 @@ elseif(CMAKE_BUILD_TYPE STREQUAL "MinSizeRel") endif() else(NOT WIN32) if(CMAKE_BUILD_TYPE STREQUAL "Debug") - list(APPEND CUDA_NVCC_FLAGS "-g -G") + list(APPEND CUDA_NVCC_FLAGS "-g -G --compiler-options;/bigobj") elseif(CMAKE_BUILD_TYPE STREQUAL "Release") - list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG") + list(APPEND CUDA_NVCC_FLAGS "-O3 -DNDEBUG --compiler-options;/bigobj") else() message(FATAL "Windows only support Release or Debug build now. Please set visual studio build type to Release/Debug, x64 build.") endif() diff --git a/paddle/fluid/operators/beam_search_op_test.cc b/paddle/fluid/operators/beam_search_op_test.cc index 6e283866ff59a..40b46781daa98 100644 --- a/paddle/fluid/operators/beam_search_op_test.cc +++ b/paddle/fluid/operators/beam_search_op_test.cc @@ -45,8 +45,8 @@ void CreateInput(LoDTensor* ids, LoDTensor* scores) { auto* ids_data = ids->mutable_data(place); auto* scores_data = scores->mutable_data(place); vector _ids({4, 2, 5, 2, 1, 3, 3, 5, 2, 8, 2, 1}); - vector _scores({0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, - 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); + vector _scores( + {0.5f, 0.3f, 0.2f, 0.6f, 0.3f, 0.1f, 0.9f, 0.5f, 0.1f, 0.7f, 0.5f, 0.1f}); for (int i = 0; i < 12; i++) { ids_data[i] = _ids[i]; diff --git a/paddle/fluid/platform/stream_callback_manager.h b/paddle/fluid/platform/stream_callback_manager.h index 8dcfc4e748f27..ed8734c98cb99 100644 --- a/paddle/fluid/platform/stream_callback_manager.h +++ b/paddle/fluid/platform/stream_callback_manager.h @@ -14,11 +14,11 @@ #pragma once +#include #include #include #include #include -#include #include "paddle/fluid/platform/enforce.h" namespace paddle { From 42470f14b77e71a53c25cf318c69c4ca019bb593 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 06:43:42 +0000 Subject: [PATCH 119/684] test=develop --- paddle/fluid/framework/selected_rows.cc | 52 ------------------- paddle/fluid/framework/selected_rows.h | 50 +++++++++++++++++- .../fluid/operators/math/matrix_bit_code.cc | 2 +- 3 files changed, 50 insertions(+), 54 deletions(-) diff --git a/paddle/fluid/framework/selected_rows.cc b/paddle/fluid/framework/selected_rows.cc index f4f2b769d5e47..7262f8cc052ab 100644 --- a/paddle/fluid/framework/selected_rows.cc +++ b/paddle/fluid/framework/selected_rows.cc @@ -140,58 +140,6 @@ bool SelectedRows::HasKey(int64_t key) const { : true; } -int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown, - bool is_test) { - if (is_test) { - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - return -1; - } else { - return iter->second; - } - } - - rwlock_->RDLock(); - auto iter = id_to_index_.find(key); - if (iter == id_to_index_.end()) { - rwlock_->UNLock(); - if (!auto_grown) { - PADDLE_THROW("key %d not found", key); - } - rwlock_->WRLock(); - auto map_size = id_to_index_.size(); - auto vector_size = rows_.size(); - if (map_size != vector_size) { - rwlock_->UNLock(); - PADDLE_THROW( - "id_to_index_ size %d should have the same size with rows_ %d", - map_size, vector_size); - } - auto write_iter = id_to_index_.find(key); - if (write_iter == id_to_index_.end()) { - int row_num = rows_.size(); - if (row_num == value_->dims()[0]) { - rwlock_->UNLock(); - PADDLE_THROW("selected rows is full, then length exceed %d", row_num); - } - // key logic to put a key into id_to_index_ - rows_.push_back(key); - auto index = static_cast(rows_.size() - 1); - id_to_index_[key] = index; - rwlock_->UNLock(); - return index; - } else { - auto index = write_iter->second; - rwlock_->UNLock(); - return index; - } - } else { - auto index = iter->second; - rwlock_->UNLock(); - return index; - } -} - void SelectedRows::SyncIndex() { rwlock_->WRLock(); id_to_index_.clear(); diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index d3e0f2168b7e9..6c31dada686b9 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -118,7 +118,55 @@ class SelectedRows { * * @return index of the key. */ - int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false); + int64_t AutoGrownIndex(int64_t key, bool auto_grown, bool is_test = false) { + if (is_test) { + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + return -1; + } else { + return iter->second; + } + } + rwlock_->RDLock(); + auto iter = id_to_index_.find(key); + if (iter == id_to_index_.end()) { + rwlock_->UNLock(); + if (!auto_grown) { + PADDLE_THROW("key %d not found", key); + } + rwlock_->WRLock(); + auto map_size = id_to_index_.size(); + auto vector_size = rows_.size(); + if (map_size != vector_size) { + rwlock_->UNLock(); + PADDLE_THROW( + "id_to_index_ size %d should have the same size with rows_ %d", + map_size, vector_size); + } + auto write_iter = id_to_index_.find(key); + if (write_iter == id_to_index_.end()) { + int row_num = rows_.size(); + if (row_num == value_->dims()[0]) { + rwlock_->UNLock(); + PADDLE_THROW("selected rows is full, then length exceed %d", row_num); + } + // key logic to put a key into id_to_index_ + rows_.push_back(key); + auto index = static_cast(rows_.size() - 1); + id_to_index_[key] = index; + rwlock_->UNLock(); + return index; + } else { + auto index = write_iter->second; + rwlock_->UNLock(); + return index; + } + } else { + auto index = iter->second; + rwlock_->UNLock(); + return index; + } + } void SyncIndex(); /* diff --git a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 2967586949846..9a0cf8701fbb6 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -142,7 +142,7 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::LoDTensor& tmat, for (size_t k = 0; k < input_width; ++k) { int64_t row_index = - weight->AutoGrownIndex(static_cast(index), false); + weight->AutoGrownIndex(static_cast(index), false, true); weight_value[row_index * weight_width + k] += tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; From 5cd2fc9fd020444257ec6522504a3b244134439c Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 07:05:48 +0000 Subject: [PATCH 120/684] just for test --- paddle/testing/paddle_gtest_main.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/paddle/testing/paddle_gtest_main.cc b/paddle/testing/paddle_gtest_main.cc index 598f435461b40..6f10a51d18cbf 100644 --- a/paddle/testing/paddle_gtest_main.cc +++ b/paddle/testing/paddle_gtest_main.cc @@ -31,6 +31,11 @@ int main(int argc, char** argv) { #ifdef PADDLE_WITH_CUDA new_argv.push_back( strdup("--tryfromenv=fraction_of_gpu_memory_to_use,allocator_strategy")); +#elif __clang__ + new_argv.push_back( + strdup("--tryfromenv=use_mkldnn,initial_cpu_memory_in_" + "mb,allocator_strategy")); + new_argv.push_back(strdup("--undefok=use_mkldnn,initial_cpu_memory_in_mb")); #else new_argv.push_back( strdup("--tryfromenv=use_pinned_memory,use_mkldnn,initial_cpu_memory_in_" From 9851a534780471b5eefed15fed8846e25a319149 Mon Sep 17 00:00:00 2001 From: Qiao Longfei Date: Fri, 23 Nov 2018 15:18:24 +0800 Subject: [PATCH 121/684] add prefetch part in pserver --- .../operators/distributed/grpc_server.cc | 1 + .../operators/distributed/request_handler.h | 3 +- .../distributed/request_handler_impl.cc | 24 +++++++---- .../distributed/request_handler_impl.h | 40 +++++++++++++++---- .../operators/distributed/send_recv.proto.in | 1 + .../operators/distributed/variable_response.h | 1 + 6 files changed, 54 insertions(+), 16 deletions(-) diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index ffd2b1707bea6..d5295dc63da5c 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -181,6 +181,7 @@ class RequestPrefetch final : public RequestBase { // prefetch process... std::string in_var_name = request_->Varname(); std::string out_var_name = request_->OutVarname(); + std::string table_name = request_->TableName(); int trainer_id = request_->GetTrainerId(); VLOG(40) << "RequestPrefetch, in_var_name: " << in_var_name << " out_var_name: " << out_var_name; diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 3bcc59a47ba5f..f29b2bf7d6855 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -191,7 +191,8 @@ class RequestHandler { virtual bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name = "") = 0; + const std::string& out_var_name = "", + const std::string& table_name = "") = 0; protected: const bool sync_mode_; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index dae56cc8436c2..0f1264ee96eeb 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -37,7 +37,8 @@ bool RequestSendHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { + const std::string& out_var_name, + const std::string& table_name) { VLOG(40) << "RequestSendHandler:" << varname; // Sync @@ -77,7 +78,8 @@ bool RequestGetHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { + const std::string& out_var_name, + const std::string& table_name) { VLOG(40) << "RequestGetHandler:" << varname; if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { @@ -114,14 +116,21 @@ bool RequestPrefetchHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { + const std::string& out_var_name, + const std::string& table_name) { VLOG(40) << "RequestPrefetchHandler " << varname; auto var_desc = program_->Block(0).FindVar(out_var_name); InitializeVariable(*outvar, var_desc->GetType()); - executor_->RunPreparedContext( - (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); - + if (table_name.empty()) { + executor_->RunPreparedContext( + (*prefetch_var_name_to_prepared_ctx_)[varname].get(), scope); + } else { + auto lookup_table_op = + BuildLookupTableOp(table_name, varname, out_var_name); + paddle::platform::CPUPlace cpu_place; + lookup_table_op->Run(*scope, cpu_place); + } return true; } @@ -130,7 +139,8 @@ bool RequestCheckpointHandler::Handle(const std::string& varname, framework::Variable* invar, framework::Variable** outvar, const int trainer_id, - const std::string& out_var_name) { + const std::string& out_var_name, + const std::string& table_name) { PADDLE_ENFORCE( checkpoint_notify_id != -1, "when checkpoint_notify_id = -1, there should be no RPC invoke."); diff --git a/paddle/fluid/operators/distributed/request_handler_impl.h b/paddle/fluid/operators/distributed/request_handler_impl.h index c1afda9dd2445..5e0b25c5c2ce1 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.h +++ b/paddle/fluid/operators/distributed/request_handler_impl.h @@ -24,6 +24,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/executor.h" #include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" @@ -43,8 +44,8 @@ class RequestSendHandler final : public RequestHandler { virtual ~RequestSendHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; private: bool enable_dc_asgd_; @@ -59,21 +60,44 @@ class RequestGetHandler final : public RequestHandler { virtual ~RequestGetHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; private: bool enable_dc_asgd_; }; +static inline void BuildVar(const std::string& param_name, + std::initializer_list arguments, + paddle::framework::proto::OpDesc::Var* var) { + var->set_parameter(param_name); + for (auto& arg_name : arguments) { + *var->mutable_arguments()->Add() = arg_name; + } +} + class RequestPrefetchHandler final : public RequestHandler { public: explicit RequestPrefetchHandler(bool sync_mode) : RequestHandler(sync_mode) {} virtual ~RequestPrefetchHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; + + private: + std::unique_ptr BuildLookupTableOp( + const std::string& table_name, const std::string& id_name, + const std::string& out_name) { + paddle::framework::proto::OpDesc op_desc; + op_desc.set_type("lookup_table"); + BuildVar("W", {table_name.data()}, op_desc.add_inputs()); + BuildVar("Ids", {id_name.data()}, op_desc.add_inputs()); + BuildVar("Out", {out_name.data()}, op_desc.add_outputs()); + + auto op = paddle::framework::OpRegistry::CreateOp(op_desc); + return op; + } }; class RequestCheckpointHandler final : public RequestHandler { @@ -85,8 +109,8 @@ class RequestCheckpointHandler final : public RequestHandler { virtual ~RequestCheckpointHandler() {} bool Handle(const std::string& varname, framework::Scope* scope, framework::Variable* var, framework::Variable** outvar, - const int trainer_id, - const std::string& out_var_name = "") override; + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override; private: int checkpoint_notify_id; diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index 55820c980e813..7b7d069f17fd0 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -80,6 +80,7 @@ message VariableMessage { // when profile switches from 1 to 2. int64 profile = 11; int64 trainer_id = 12; + string table_name = 13; } message VoidMessage {} diff --git a/paddle/fluid/operators/distributed/variable_response.h b/paddle/fluid/operators/distributed/variable_response.h index 4c7fcbbdfb305..a4324f67bb99b 100644 --- a/paddle/fluid/operators/distributed/variable_response.h +++ b/paddle/fluid/operators/distributed/variable_response.h @@ -85,6 +85,7 @@ class VariableResponse { inline framework::Scope* GetMutableLocalScope() const { return local_scope_; } inline std::string Varname() const { return meta_.varname(); } inline std::string OutVarname() const { return meta_.out_varname(); } + inline std::string TableName() const { return meta_.table_name(); } // should call parse first. framework::Variable* GetVar() { From e21edb26f6e7fb364597c31a26f128c3c2710516 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 22 Nov 2018 17:53:13 +0800 Subject: [PATCH 122/684] add Set/GetCPUNumThreads api --- paddle/fluid/inference/api/analysis_config.cc | 1 + paddle/fluid/inference/api/analysis_predictor.cc | 3 +-- paddle/fluid/inference/api/api_impl.cc | 3 +-- paddle/fluid/inference/api/paddle_api.h | 9 +++++++++ .../inference/tests/api/analyzer_resnet50_tester.cc | 1 + paddle/fluid/inference/tests/api/config_printer.h | 2 ++ paddle/fluid/inference/tests/api/tester_helper.h | 1 + paddle/fluid/operators/math/fc_compute.h | 4 +--- paddle/fluid/platform/cpu_helper.cc | 2 +- 9 files changed, 18 insertions(+), 8 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 5ccd2dc5ab353..100ee0c9d3798 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -46,6 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_num_threads_ = other.cpu_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index cb14d2a260280..9162ccefd8c6e 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -35,7 +35,6 @@ #include "paddle/fluid/platform/profiler.h" DECLARE_bool(profile); -DECLARE_int32(paddle_num_threads); namespace paddle { @@ -67,7 +66,7 @@ bool AnalysisPredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); if (!PrepareScope(parent_scope)) { return false; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index fcbc3803d04de..c3d17edea43a3 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -28,7 +28,6 @@ limitations under the License. */ #include "paddle/fluid/platform/profiler.h" DEFINE_bool(profile, false, "Turn on profiler for fluid"); -DECLARE_int32(paddle_num_threads); namespace paddle { namespace { @@ -76,7 +75,7 @@ bool NativePaddlePredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(FLAGS_paddle_num_threads); + paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index 0a2a2a1a23401..b7f7781d06481 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -186,6 +186,15 @@ struct NativeConfig : public PaddlePredictor::Config { // Specify the variable's name of each input if input tensors don't follow the // `feeds` and `fetches` of the phase `save_inference_model`. bool specify_input_name{false}; + + // Set and get the number of cpu threads. + void SetCPUNumThreads(int cpu_num_threads) { + cpu_num_threads_ = cpu_num_threads; + } + int GetCPUNumThreads() const { return cpu_num_threads_; } + + protected: + int cpu_num_threads_{1}; // number of cpu threads for each instance. }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 2b936175ed3f8..308a794ca3d1a 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -27,6 +27,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; + cfg->SetCPUNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index aa0c4b1d049bc..a803f5b3f4166 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -53,6 +53,8 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; + os << GenSpaces(num_spaces) + << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 7b686045a59c9..fdadd59049022 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -42,6 +42,7 @@ DEFINE_bool(use_analysis, true, "Running the inference program in analysis mode."); DECLARE_bool(profile); +DECLARE_int32(paddle_num_threads); namespace paddle { namespace inference { diff --git a/paddle/fluid/operators/math/fc_compute.h b/paddle/fluid/operators/math/fc_compute.h index b072b4c20a171..5b9953a5aa9a2 100644 --- a/paddle/fluid/operators/math/fc_compute.h +++ b/paddle/fluid/operators/math/fc_compute.h @@ -17,8 +17,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/jit_kernel.h" -DECLARE_int32(paddle_num_threads); - namespace paddle { namespace operators { namespace math { @@ -43,7 +41,7 @@ inline void FCCompute(const BlasT& blas, const int M, .template Get>(N); #ifdef PADDLE_WITH_MKLML -#pragma omp parallel for if (FLAGS_paddle_num_threads > 1) +#pragma omp parallel for #endif for (int i = 0; i < M; i++) { T* dst = Y + i * N; diff --git a/paddle/fluid/platform/cpu_helper.cc b/paddle/fluid/platform/cpu_helper.cc index f2d691b2931f5..b737a6c38d044 100644 --- a/paddle/fluid/platform/cpu_helper.cc +++ b/paddle/fluid/platform/cpu_helper.cc @@ -41,7 +41,7 @@ void SetNumThreads(int num_threads) { #elif defined(PADDLE_WITH_MKLML) int real_num_threads = num_threads > 1 ? num_threads : 1; platform::dynload::MKL_Set_Num_Threads(real_num_threads); - omp_set_num_threads(num_threads); + omp_set_num_threads(real_num_threads); #else PADDLE_ENFORCE(false, "To be implemented."); #endif From a5c4b463c962bed48fba89d459adf82f4899d6c3 Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 22 Nov 2018 18:37:33 +0800 Subject: [PATCH 123/684] add SetMKLDNNThreadId api --- paddle/fluid/inference/api/analysis_predictor.cc | 8 ++++++++ paddle/fluid/inference/api/analysis_predictor.h | 2 ++ paddle/fluid/inference/api/paddle_analysis_config.h | 2 +- paddle/fluid/inference/tests/api/tester_helper.h | 9 ++++++--- 4 files changed, 17 insertions(+), 4 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 9162ccefd8c6e..4633a75e5ec35 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -159,6 +159,14 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } +void AnalysisPredictor::SetMKLDNNThreadId(int tid) { +#ifdef PADDLE_WITH_MKLDNN + platform::set_cur_thread_id(tid); +#else + LOG(ERROR) << "Please compile with MKLDNN first to use MKLDNN"; +#endif +} + bool AnalysisPredictor::Run(const std::vector &inputs, std::vector *output_data, int batch_size) { diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index cf81b7db738d8..9191970a3ae07 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -69,6 +69,8 @@ class AnalysisPredictor : public PaddlePredictor { framework::Scope *scope() { return scope_.get(); } framework::ProgramDesc &program() { return *inference_program_; } + void SetMKLDNNThreadId(int tid); + protected: bool PrepareProgram(const std::shared_ptr &program); bool PrepareScope(const std::shared_ptr &parent_scope); diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index 2ac736df7ccd5..a09bd1cac2aa3 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -51,9 +51,9 @@ struct AnalysisConfig : public NativeConfig { int max_batch_size = 1); bool use_tensorrt() const { return use_tensorrt_; } + void EnableMKLDNN(); // NOTE this is just for internal development, please not use it. // NOT stable yet. - void EnableMKLDNN(); bool use_mkldnn() const { return use_mkldnn_; } friend class ::paddle::AnalysisPredictor; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index fdadd59049022..72703bc80b493 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -216,13 +216,16 @@ void TestMultiThreadPrediction( size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { threads.emplace_back([&, tid]() { -#ifdef PADDLE_WITH_MKLDNN - platform::set_cur_thread_id(static_cast(tid) + 1); -#endif // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. std::vector outputs_tid; auto &predictor = predictors[tid]; +#ifdef PADDLE_WITH_MKLDNN + if (use_analysis) { + static_cast(predictor.get()) + ->SetMKLDNNThreadId(static_cast(tid) + 1); + } +#endif // warmup run LOG(INFO) << "Running thread " << tid << ", warm up run..."; From e66b4c6bff74231898cbbb013627b0eb86eced0f Mon Sep 17 00:00:00 2001 From: luotao1 Date: Thu, 22 Nov 2018 18:49:59 +0800 Subject: [PATCH 124/684] adjust tester_helper to make multi-instance multi-thread work test=develop --- paddle/fluid/inference/tests/api/tester_helper.h | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index 72703bc80b493..d21567ac19758 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -207,11 +207,7 @@ void TestMultiThreadPrediction( int batch_size = FLAGS_batch_size; int num_times = FLAGS_repeat; std::vector threads; - std::vector> predictors; - predictors.emplace_back(CreateTestPredictor(config, use_analysis)); - for (int tid = 1; tid < num_threads; ++tid) { - predictors.emplace_back(predictors.front()->Clone()); - } + auto main_predictor = CreateTestPredictor(config, use_analysis); size_t total_time{0}; for (int tid = 0; tid < num_threads; ++tid) { @@ -219,7 +215,9 @@ void TestMultiThreadPrediction( // Each thread should have local inputs and outputs. // The inputs of each thread are all the same. std::vector outputs_tid; - auto &predictor = predictors[tid]; + // To ensure the thread binding correctly, + // please clone inside the threadpool. + auto predictor = main_predictor->Clone(); #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) From 116979a40adf7fe7788f8cd50b9f03c57bcbba7b Mon Sep 17 00:00:00 2001 From: luotao1 Date: Fri, 23 Nov 2018 16:17:56 +0800 Subject: [PATCH 125/684] refine api name test=develop --- paddle/fluid/inference/api/analysis_config.cc | 3 ++- paddle/fluid/inference/api/analysis_predictor.cc | 4 ++-- paddle/fluid/inference/api/analysis_predictor.h | 2 +- paddle/fluid/inference/api/api_impl.cc | 2 +- paddle/fluid/inference/api/paddle_api.h | 14 +++++++++----- .../tests/api/analyzer_resnet50_tester.cc | 2 +- paddle/fluid/inference/tests/api/config_printer.h | 2 +- paddle/fluid/inference/tests/api/tester_helper.h | 2 +- 8 files changed, 18 insertions(+), 13 deletions(-) diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index 100ee0c9d3798..dd75f0d9a6540 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -46,7 +46,7 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; - cpu_num_threads_ = other.cpu_num_threads_; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; @@ -73,6 +73,7 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { prog_file = other.prog_file; param_file = other.param_file; specify_input_name = other.specify_input_name; + cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; use_feed_fetch_ops = other.use_feed_fetch_ops; diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 4633a75e5ec35..c132ce326c6b2 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -66,7 +66,7 @@ bool AnalysisPredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (!PrepareScope(parent_scope)) { return false; @@ -159,7 +159,7 @@ bool AnalysisPredictor::PrepareExecutor() { return true; } -void AnalysisPredictor::SetMKLDNNThreadId(int tid) { +void AnalysisPredictor::SetMkldnnThreadID(int tid) { #ifdef PADDLE_WITH_MKLDNN platform::set_cur_thread_id(tid); #else diff --git a/paddle/fluid/inference/api/analysis_predictor.h b/paddle/fluid/inference/api/analysis_predictor.h index 9191970a3ae07..db57812bc3ba8 100644 --- a/paddle/fluid/inference/api/analysis_predictor.h +++ b/paddle/fluid/inference/api/analysis_predictor.h @@ -69,7 +69,7 @@ class AnalysisPredictor : public PaddlePredictor { framework::Scope *scope() { return scope_.get(); } framework::ProgramDesc &program() { return *inference_program_; } - void SetMKLDNNThreadId(int tid); + void SetMkldnnThreadID(int tid); protected: bool PrepareProgram(const std::shared_ptr &program); diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index c3d17edea43a3..66a8e513961d7 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -75,7 +75,7 @@ bool NativePaddlePredictor::Init( #endif // no matter with or without MKLDNN - paddle::platform::SetNumThreads(config_.GetCPUNumThreads()); + paddle::platform::SetNumThreads(config_.cpu_math_library_num_threads()); if (config_.use_gpu) { place_ = paddle::platform::CUDAPlace(config_.device); diff --git a/paddle/fluid/inference/api/paddle_api.h b/paddle/fluid/inference/api/paddle_api.h index b7f7781d06481..1513a4b3b4f66 100644 --- a/paddle/fluid/inference/api/paddle_api.h +++ b/paddle/fluid/inference/api/paddle_api.h @@ -187,14 +187,18 @@ struct NativeConfig : public PaddlePredictor::Config { // `feeds` and `fetches` of the phase `save_inference_model`. bool specify_input_name{false}; - // Set and get the number of cpu threads. - void SetCPUNumThreads(int cpu_num_threads) { - cpu_num_threads_ = cpu_num_threads; + // Set and get the number of cpu math library threads. + void SetCpuMathLibraryNumThreads(int cpu_math_library_num_threads) { + cpu_math_library_num_threads_ = cpu_math_library_num_threads; + } + int cpu_math_library_num_threads() const { + return cpu_math_library_num_threads_; } - int GetCPUNumThreads() const { return cpu_num_threads_; } protected: - int cpu_num_threads_{1}; // number of cpu threads for each instance. + // number of cpu math library (such as MKL, OpenBlas) threads for each + // instance. + int cpu_math_library_num_threads_{1}; }; // A factory to help create different predictors. diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc index 308a794ca3d1a..abc63577b7913 100644 --- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc @@ -27,7 +27,7 @@ void SetConfig(AnalysisConfig *cfg) { cfg->device = 0; cfg->enable_ir_optim = true; cfg->specify_input_name = true; - cfg->SetCPUNumThreads(FLAGS_paddle_num_threads); + cfg->SetCpuMathLibraryNumThreads(FLAGS_paddle_num_threads); } void SetInput(std::vector> *inputs) { diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index a803f5b3f4166..4231eef722073 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -54,7 +54,7 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; os << GenSpaces(num_spaces) - << "cpu_num_threads: " << config.GetCPUNumThreads() << "\n"; + << "cpu_num_threads: " << config.cpu_math_library_num_threads() << "\n"; num_spaces--; os << GenSpaces(num_spaces) << "}\n"; return os; diff --git a/paddle/fluid/inference/tests/api/tester_helper.h b/paddle/fluid/inference/tests/api/tester_helper.h index d21567ac19758..1dc1678406783 100644 --- a/paddle/fluid/inference/tests/api/tester_helper.h +++ b/paddle/fluid/inference/tests/api/tester_helper.h @@ -221,7 +221,7 @@ void TestMultiThreadPrediction( #ifdef PADDLE_WITH_MKLDNN if (use_analysis) { static_cast(predictor.get()) - ->SetMKLDNNThreadId(static_cast(tid) + 1); + ->SetMkldnnThreadID(static_cast(tid) + 1); } #endif From 47c4e65d608083cb4b75222bcd14c1db8bc40333 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Fri, 23 Nov 2018 08:34:31 +0000 Subject: [PATCH 126/684] test=develop --- paddle/fluid/memory/allocation/retry_allocator_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/memory/allocation/retry_allocator_test.cc b/paddle/fluid/memory/allocation/retry_allocator_test.cc index a0ce2875cb833..f0b215dac2524 100644 --- a/paddle/fluid/memory/allocation/retry_allocator_test.cc +++ b/paddle/fluid/memory/allocation/retry_allocator_test.cc @@ -41,7 +41,7 @@ TEST(RetryAllocator, RetryAllocator) { size_t thread_num = 32; size_t sleep_time = 40; - size_t extra_time = 2; + size_t extra_time = 10; // Reserve to perform more tests in the future std::vector> allocators; From c35bf3d34b43dd6cc5b96e963f8990d60a68d749 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 16:36:54 +0800 Subject: [PATCH 127/684] Fix multiclass_nms_op unit test fail in python3.6 test=develop --- .../fluid/tests/unittests/test_multiclass_nms_op.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index df0562dcc79cb..e35be54b63877 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -145,10 +145,16 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, lod.append(nmsed_num) if nmsed_num == 0: continue + tmp_det_out = [] for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + tmp_det_out.append( + [c, scores[n][c][idx], xmin, ymin, xmax, ymax]) + sorted_det_out = sorted( + tmp_det_out, key=lambda tup: tup[0], reverse=False) + det_outs.extend(sorted_det_out) return det_outs, lod @@ -210,7 +216,7 @@ def test_check_output(self): class TestMulticlassNMSOpNoOutput(TestMulticlassNMSOp): def set_argument(self): # Here set 2.0 to test the case there is no outputs. - # In practical use, 0.0 < score_threshold < 1.0 + # In practical use, 0.0 < score_threshold < 1.0 self.score_threshold = 2.0 From 5ca56cad1f3fdb50f7d019ae5e658b538f98aecc Mon Sep 17 00:00:00 2001 From: sneaxiy Date: Fri, 23 Nov 2018 08:54:09 +0000 Subject: [PATCH 128/684] test=develop --- python/paddle/fluid/layers/control_flow.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 9730fbf510cbe..05138bf94598f 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -896,9 +896,10 @@ def array_to_lod_tensor(x, table): def increment(x, value=1.0, in_place=True): """ - This function performs an operation that increments each value in the + This function performs an operation that increments the value in the input :math:`x` by an amount: :math:`value` as mentioned in the input - parameter. This operation is performed in-place by default. + parameter. This operation is performed in-place by default. Notice that + the number of elements in :math:`x` must be equal to 1. Args: x (Variable|list): The tensor that has the input values. @@ -911,7 +912,8 @@ def increment(x, value=1.0, in_place=True): Examples: .. code-block:: python - data = fluid.layers.data(name='data', shape=[32, 32], dtype='float32') + data = fluid.layers.data(name='data', shape=[1], dtype='float32', + append_batch_size=False) data = fluid.layers.increment(x=data, value=3.0, in_place=True) """ helper = LayerHelper("increment", **locals()) From f7847ca6a304a649982c04bff4f3eec846a06c5d Mon Sep 17 00:00:00 2001 From: chengduozh Date: Fri, 23 Nov 2018 17:09:14 +0800 Subject: [PATCH 129/684] fix cublas warp error test=develop --- paddle/fluid/platform/dynload/cublas.cc | 3 +++ paddle/fluid/platform/dynload/cublas.h | 30 ++++++++++++++++--------- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/paddle/fluid/platform/dynload/cublas.cc b/paddle/fluid/platform/dynload/cublas.cc index 361d3439b844e..41648c32fe6f9 100644 --- a/paddle/fluid/platform/dynload/cublas.cc +++ b/paddle/fluid/platform/dynload/cublas.cc @@ -32,6 +32,9 @@ CUBLAS_BLAS_ROUTINE_EACH_R2(DEFINE_WRAP); CUBLAS_BLAS_ROUTINE_EACH_R3(DEFINE_WRAP); #endif +#ifdef CUBLAS_BLAS_ROUTINE_EACH_R4 +CUBLAS_BLAS_ROUTINE_EACH_R4(DEFINE_WRAP); +#endif } // namespace dynload } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/dynload/cublas.h b/paddle/fluid/platform/dynload/cublas.h index ff80bd525c167..ced789b90d067 100644 --- a/paddle/fluid/platform/dynload/cublas.h +++ b/paddle/fluid/platform/dynload/cublas.h @@ -90,23 +90,33 @@ CUBLAS_BLAS_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) // APIs available after CUDA 8.0 #if CUDA_VERSION >= 8000 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmEx); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasDgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasCgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasZgemmStridedBatched); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasHgemmStridedBatched); +#define CUBLAS_BLAS_ROUTINE_EACH_R2(__macro) \ + __macro(cublasGemmEx); \ + __macro(cublasSgemmStridedBatched); \ + __macro(cublasDgemmStridedBatched); \ + __macro(cublasCgemmStridedBatched); \ + __macro(cublasZgemmStridedBatched); \ + __macro(cublasHgemmStridedBatched); + +CUBLAS_BLAS_ROUTINE_EACH_R2(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif // APIs available after CUDA 9.0 #if CUDA_VERSION >= 9000 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasSetMathMode); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGetMathMode); +#define CUBLAS_BLAS_ROUTINE_EACH_R3(__macro) \ + __macro(cublasSetMathMode); \ + __macro(cublasGetMathMode); + +CUBLAS_BLAS_ROUTINE_EACH_R3(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif +// APIs available after CUDA 9.1 #if CUDA_VERSION >= 9010 -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmBatchedEx); -DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(cublasGemmStridedBatchedEx); +#define CUBLAS_BLAS_ROUTINE_EACH_R4(__macro) \ + __macro(cublasGemmBatchedEx); \ + __macro(cublasGemmStridedBatchedEx); + +CUBLAS_BLAS_ROUTINE_EACH_R4(DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP) #endif #undef DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP From 5431d5c471d32a5ea3be049a339e57262bd3b483 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 18:15:06 +0800 Subject: [PATCH 130/684] Polish code test=develop --- python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py index e35be54b63877..9778bd694de4b 100644 --- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py +++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py @@ -149,7 +149,6 @@ def batched_multiclass_nms(boxes, scores, background, score_threshold, for c, indices in nmsed_outs.items(): for idx in indices: xmin, ymin, xmax, ymax = boxes[n][idx][:] - det_outs.append([c, scores[n][c][idx], xmin, ymin, xmax, ymax]) tmp_det_out.append( [c, scores[n][c][idx], xmin, ymin, xmax, ymax]) sorted_det_out = sorted( From 3d100b0c927b6326e75b3e493d545ee2b0ff4f4b Mon Sep 17 00:00:00 2001 From: minqiyang Date: Fri, 23 Nov 2018 19:14:46 +0800 Subject: [PATCH 131/684] Add Python3.6 Python3.7 compile process test=develop --- Dockerfile | 75 +++++++++++++++++++++++++++++------------------------- 1 file changed, 40 insertions(+), 35 deletions(-) diff --git a/Dockerfile b/Dockerfile index b36102175c42f..eb7bb2549e495 100644 --- a/Dockerfile +++ b/Dockerfile @@ -33,44 +33,49 @@ RUN apt-get update && \ automake locales clang-format swig cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ + build-essential checkinstall \ + libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y -# Install Go and glide -RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - tar -xz -C /usr/local && \ - mkdir /root/gopath && \ - mkdir /root/gopath/bin && \ - mkdir /root/gopath/src -ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin -# install glide -RUN curl -s -q https://glide.sh/get | sh - -# Install TensorRT -# following TensorRT.tar.gz is not the default official one, we do two miny changes: -# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, -# and its size is only one-third of the official one. -# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. -# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - tar -xz -C /usr/local && \ - cp -rf /usr/local/TensorRT/include /usr && \ - cp -rf /usr/local/TensorRT/lib /usr - -# git credential to skip password typing -RUN git config --global credential.helper store - -# Fix locales to en_US.UTF-8 -RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter -# version util jupyter fixes this issue. - -# specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# version(1.7.1 for now), which causes building documentation failed. +COPY tools/manylinux1/build_scripts/* /root/python/ +RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0 + +# # Install Go and glide +# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + # tar -xz -C /usr/local && \ + # mkdir /root/gopath && \ + # mkdir /root/gopath/bin && \ + # mkdir /root/gopath/src +# ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# # install glide +# RUN curl -s -q https://glide.sh/get | sh + +# # Install TensorRT +# # following TensorRT.tar.gz is not the default official one, we do two miny changes: +# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# # and its size is only one-third of the official one. +# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. +# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ + # tar -xz -C /usr/local && \ + # cp -rf /usr/local/TensorRT/include /usr && \ + # cp -rf /usr/local/TensorRT/lib /usr + +# # git credential to skip password typing +# RUN git config --global credential.helper store + +# # Fix locales to en_US.UTF-8 +# RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# # version util jupyter fixes this issue. + +# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# # version(1.7.1 for now), which causes building documentation failed. # RUN pip3 install -U wheel && \ # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ From 64ca3d176cc1348f0735e3e6f4fd2c18e902f43b Mon Sep 17 00:00:00 2001 From: qingqing01 Date: Fri, 23 Nov 2018 19:18:20 +0800 Subject: [PATCH 132/684] Add bias_attr in sequence_conv_pool API. (#14553) --- paddle/fluid/API.spec | 2 +- python/paddle/fluid/nets.py | 9 ++++++++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 50114bf3df0ac..8397ae093ba1b 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -342,7 +342,7 @@ paddle.fluid.transpiler.RoundRobin.dispatch ArgSpec(args=['self', 'varlist'], va paddle.fluid.transpiler.RoundRobin.reset ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.transpiler.DistributeTranspilerConfig.__init__ paddle.fluid.nets.simple_img_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'pool_size', 'pool_stride', 'pool_padding', 'pool_type', 'global_pooling', 'conv_stride', 'conv_padding', 'conv_dilation', 'conv_groups', 'param_attr', 'bias_attr', 'act', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, 'max', False, 1, 0, 1, 1, None, None, None, True)) -paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max')) +paddle.fluid.nets.sequence_conv_pool ArgSpec(args=['input', 'num_filters', 'filter_size', 'param_attr', 'act', 'pool_type', 'bias_attr'], varargs=None, keywords=None, defaults=(None, 'sigmoid', 'max', None)) paddle.fluid.nets.glu ArgSpec(args=['input', 'dim'], varargs=None, keywords=None, defaults=(-1,)) paddle.fluid.nets.scaled_dot_product_attention ArgSpec(args=['queries', 'keys', 'values', 'num_heads', 'dropout_rate'], varargs=None, keywords=None, defaults=(1, 0.0)) paddle.fluid.nets.img_conv_group ArgSpec(args=['input', 'conv_num_filter', 'pool_size', 'conv_padding', 'conv_filter_size', 'conv_act', 'param_attr', 'conv_with_batchnorm', 'conv_batchnorm_drop_rate', 'pool_stride', 'pool_type', 'use_cudnn'], varargs=None, keywords=None, defaults=(1, 3, None, None, False, 0.0, 1, 'max', True)) diff --git a/python/paddle/fluid/nets.py b/python/paddle/fluid/nets.py index 00d33b36fcc32..fb75ef62d01ca 100644 --- a/python/paddle/fluid/nets.py +++ b/python/paddle/fluid/nets.py @@ -250,7 +250,8 @@ def sequence_conv_pool(input, filter_size, param_attr=None, act="sigmoid", - pool_type="max"): + pool_type="max", + bias_attr=None): """ The sequence_conv_pool is composed with Sequence Convolution and Pooling. @@ -266,6 +267,11 @@ def sequence_conv_pool(input, pool_type (str): Pooling type can be :math:`max` for max-pooling, :math:`average` for average-pooling, :math:`sum` for sum-pooling, :math:`sqrt` for sqrt-pooling. Default :math:`max`. + bias_attr (ParamAttr|bool|None): The parameter attribute for the bias of sequence_conv. + If it is set to False, no bias will be added to the output units. + If it is set to None or one attribute of ParamAttr, sequence_conv + will create ParamAttr as bias_attr. If the Initializer of the bias_attr + is not set, the bias is initialized zero. Default: None. Return: Variable: The final result after Sequence Convolution and Pooling. @@ -289,6 +295,7 @@ def sequence_conv_pool(input, num_filters=num_filters, filter_size=filter_size, param_attr=param_attr, + bias_attr=bias_attr, act=act) pool_out = layers.sequence_pool(input=conv_out, pool_type=pool_type) From e8a8f2626cc65b8dc3a91507eb233581e8e7e0e2 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 16:58:57 +0800 Subject: [PATCH 133/684] Add Python3.6 and Python3.7 support in Ubuntu Dockerfile test=develop --- Dockerfile | 200 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 118 insertions(+), 82 deletions(-) diff --git a/Dockerfile b/Dockerfile index eb7bb2549e495..6f45c79f3a12d 100644 --- a/Dockerfile +++ b/Dockerfile @@ -22,6 +22,27 @@ ENV HOME /root # Add bash enhancements COPY ./paddle/scripts/docker/root/ /root/ +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ + wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + RUN apt-get update && \ apt-get install -y --allow-downgrades patchelf \ python3 python3-dev python3-pip \ @@ -34,88 +55,103 @@ RUN apt-get update && \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ build-essential checkinstall \ - libreadline-gplv2-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ + libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y -COPY tools/manylinux1/build_scripts/* /root/python/ -RUN cd /root/python/ && source build_utils && MY_DIR=/root/python/ build_cpythons 3.6.0 3.7.0 - -# # Install Go and glide -# RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ - # tar -xz -C /usr/local && \ - # mkdir /root/gopath && \ - # mkdir /root/gopath/bin && \ - # mkdir /root/gopath/src -# ENV GOROOT=/usr/local/go GOPATH=/root/gopath -# # should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. -# ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin -# # install glide -# RUN curl -s -q https://glide.sh/get | sh - -# # Install TensorRT -# # following TensorRT.tar.gz is not the default official one, we do two miny changes: -# # 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, -# # and its size is only one-third of the official one. -# # 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. -# # See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. -# RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ - # tar -xz -C /usr/local && \ - # cp -rf /usr/local/TensorRT/include /usr && \ - # cp -rf /usr/local/TensorRT/lib /usr - -# # git credential to skip password typing -# RUN git config --global credential.helper store - -# # Fix locales to en_US.UTF-8 -# RUN localedef -i en_US -f UTF-8 en_US.UTF-8 - -# # FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter -# # version util jupyter fixes this issue. - -# # specify sphinx version as 1.5.6 and remove -U option for [pip install -U -# # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest -# # version(1.7.1 for now), which causes building documentation failed. -# RUN pip3 install -U wheel && \ - # pip3 install -U docopt PyYAML sphinx==1.5.6 && \ - # pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ - # easy_install -U pip && \ - # pip install -U pip setuptools wheel && \ - # pip install -U docopt PyYAML sphinx==1.5.6 && \ - # pip install sphinx-rtd-theme==0.1.9 recommonmark - -# RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - # pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - # pip3 install opencv-python && \ - # pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - # pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - # pip install opencv-python - -# #For docstring checker -# RUN pip3 install pylint pytest astroid isort -# RUN pip install pylint pytest astroid isort LinkChecker - -# COPY ./python/requirements.txt /root/ -# RUN pip3 install -r /root/requirements.txt -# RUN pip install -r /root/requirements.txt - -# # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use -# # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 -# RUN apt-get install -y libssl-dev libffi-dev -# RUN pip3 install certifi urllib3[secure] -# RUN pip install certifi urllib3[secure] - - -# # Install woboq_codebrowser to /woboq -# RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ - # (cd /woboq \ - # cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ - # -DCMAKE_BUILD_TYPE=Release . \ - # make) - -# # Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service -# RUN mkdir /var/run/sshd -# RUN echo 'root:root' | chpasswd -# RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config -# RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config -# EXPOSE 22 +# Install Go and glide +RUN wget -qO- https://storage.googleapis.com/golang/go1.8.1.linux-amd64.tar.gz | \ + tar -xz -C /usr/local && \ + mkdir /root/gopath && \ + mkdir /root/gopath/bin && \ + mkdir /root/gopath/src +ENV GOROOT=/usr/local/go GOPATH=/root/gopath +# should not be in the same line with GOROOT definition, otherwise docker build could not find GOROOT. +ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin +# install glide +RUN curl -s -q https://glide.sh/get | sh + +# Install TensorRT +# following TensorRT.tar.gz is not the default official one, we do two miny changes: +# 1. Remove the unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now, +# and its size is only one-third of the official one. +# 2. Manually add ~IPluginFactory() in IPluginFactory class of NvInfer.h, otherwise, it couldn't work in paddle. +# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details. +RUN wget -qO- http://paddlepaddledeps.cdn.bcebos.com/TensorRT-4.0.0.3.Ubuntu-16.04.4.x86_64-gnu.cuda-8.0.cudnn7.0.tar.gz | \ + tar -xz -C /usr/local && \ + cp -rf /usr/local/TensorRT/include /usr && \ + cp -rf /usr/local/TensorRT/lib /usr + +# git credential to skip password typing +RUN git config --global credential.helper store + +# Fix locales to en_US.UTF-8 +RUN localedef -i en_US -f UTF-8 en_US.UTF-8 + +# FIXME: due to temporary ipykernel dependency issue, specify ipykernel jupyter +# version util jupyter fixes this issue. + +# specify sphinx version as 1.5.6 and remove -U option for [pip install -U +# sphinx-rtd-theme] since -U option will cause sphinx being updated to newest +# version(1.7.1 for now), which causes building documentation failed. +RUN pip3.5 install -U wheel && \ + pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.6 install -U wheel && \ + pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ + pip3.7 install -U wheel && \ + pip3.7 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3.7 install sphinx-rtd-theme==0.1.9 recommonmark && \ + easy_install -U pip && \ + pip install -U pip setuptools wheel && \ + pip install -U docopt PyYAML sphinx==1.5.6 && \ + pip install sphinx-rtd-theme==0.1.9 recommonmark + +RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.5 install opencv-python && \ + pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.6 install opencv-python && \ + pip3.7 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3.7 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3.7 install opencv-python && \ + pip install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip install opencv-python + +#For docstring checker +RUN pip3.5 install pylint pytest astroid isort +RUN pip3.6 install pylint pytest astroid isort +RUN pip3.7 install pylint pytest astroid isort +RUN pip install pylint pytest astroid isort LinkChecker + +COPY ./python/requirements.txt /root/ +RUN pip3.5 install -r /root/requirements.txt +RUN pip3.6 install -r /root/requirements.txt +RUN pip3.7 install -r /root/requirements.txt +RUN pip install -r /root/requirements.txt + +# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use +# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 +RUN apt-get install -y libssl-dev libffi-dev +RUN pip3.5 install certifi urllib3[secure] +RUN pip3.6 install certifi urllib3[secure] +RUN pip3.7 install certifi urllib3[secure] +RUN pip install certifi urllib3[secure] + + +# Install woboq_codebrowser to /woboq +RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \ + (cd /woboq \ + cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \ + -DCMAKE_BUILD_TYPE=Release . \ + make) + +# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service +RUN mkdir /var/run/sshd +RUN echo 'root:root' | chpasswd +RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config +RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config +EXPOSE 22 From 155a0f78e6f8688a68dae765dee4d25e08bc614b Mon Sep 17 00:00:00 2001 From: Min Date: Sat, 24 Nov 2018 17:17:14 +0800 Subject: [PATCH 134/684] Polish code test=develop --- Dockerfile | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/Dockerfile b/Dockerfile index 6f45c79f3a12d..9459552890fac 100644 --- a/Dockerfile +++ b/Dockerfile @@ -34,13 +34,13 @@ RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-a ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.6 --enable-shared > /dev/null && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null # Install Python3.7 RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ - CFLAGS="-Wformat" ./configure --prefix=/usr/local/python3.7 --enable-shared > /dev/null && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ make -j8 > /dev/null && make altinstall > /dev/null RUN apt-get update && \ @@ -54,8 +54,6 @@ RUN apt-get update && \ automake locales clang-format swig cmake \ liblapack-dev liblapacke-dev \ clang-3.8 llvm-3.8 libclang-3.8-dev \ - build-essential checkinstall \ - libreadline-dev libncursesw5-dev libssl-dev libsqlite3-dev tk-dev libgdbm-dev libc6-dev libbz2-dev \ net-tools libtool ccache && \ apt-get clean -y @@ -94,9 +92,9 @@ RUN localedef -i en_US -f UTF-8 en_US.UTF-8 # specify sphinx version as 1.5.6 and remove -U option for [pip install -U # sphinx-rtd-theme] since -U option will cause sphinx being updated to newest # version(1.7.1 for now), which causes building documentation failed. -RUN pip3.5 install -U wheel && \ - pip3.5 install -U docopt PyYAML sphinx==1.5.6 && \ - pip3.5 install sphinx-rtd-theme==0.1.9 recommonmark && \ +RUN pip3 install -U wheel && \ + pip3 install -U docopt PyYAML sphinx==1.5.6 && \ + pip3 install sphinx-rtd-theme==0.1.9 recommonmark && \ pip3.6 install -U wheel && \ pip3.6 install -U docopt PyYAML sphinx==1.5.6 && \ pip3.6 install sphinx-rtd-theme==0.1.9 recommonmark && \ @@ -108,9 +106,9 @@ RUN pip3.5 install -U wheel && \ pip install -U docopt PyYAML sphinx==1.5.6 && \ pip install sphinx-rtd-theme==0.1.9 recommonmark -RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ - pip3.5 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ - pip3.5 install opencv-python && \ +RUN pip3 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ + pip3 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ + pip3 install opencv-python && \ pip3.6 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip3.6 install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \ pip3.6 install opencv-python && \ @@ -122,13 +120,13 @@ RUN pip3.5 install 'pre-commit==1.10.4' 'ipython==5.3.0' && \ pip install opencv-python #For docstring checker -RUN pip3.5 install pylint pytest astroid isort +RUN pip3 install pylint pytest astroid isort RUN pip3.6 install pylint pytest astroid isort RUN pip3.7 install pylint pytest astroid isort RUN pip install pylint pytest astroid isort LinkChecker COPY ./python/requirements.txt /root/ -RUN pip3.5 install -r /root/requirements.txt +RUN pip3 install -r /root/requirements.txt RUN pip3.6 install -r /root/requirements.txt RUN pip3.7 install -r /root/requirements.txt RUN pip install -r /root/requirements.txt @@ -136,7 +134,7 @@ RUN pip install -r /root/requirements.txt # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2 RUN apt-get install -y libssl-dev libffi-dev -RUN pip3.5 install certifi urllib3[secure] +RUN pip3 install certifi urllib3[secure] RUN pip3.6 install certifi urllib3[secure] RUN pip3.7 install certifi urllib3[secure] RUN pip install certifi urllib3[secure] From 05e6a7141717f1eb1e73836c7aec67faea4d4db5 Mon Sep 17 00:00:00 2001 From: Min Date: Sat, 24 Nov 2018 17:19:22 +0800 Subject: [PATCH 135/684] Polish code test=develop --- .dockerignore | 1 - 1 file changed, 1 deletion(-) diff --git a/.dockerignore b/.dockerignore index 49adfe4f0accc..2b2e74053d33c 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,6 +1,5 @@ *.DS_Store build/ -build* *.user .vscode .idea From 8038cd10a93c66405bc7221f3d6cf1605c25df0d Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 19:19:44 +0800 Subject: [PATCH 136/684] Upgrade pybind11 to v2.2.4 to support Python3.7 test=develop --- cmake/external/pybind11.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/external/pybind11.cmake b/cmake/external/pybind11.cmake index c885877a2bcd6..3a10ea945d3d1 100644 --- a/cmake/external/pybind11.cmake +++ b/cmake/external/pybind11.cmake @@ -26,7 +26,7 @@ ExternalProject_Add( extern_pybind ${EXTERNAL_PROJECT_LOG_ARGS} GIT_REPOSITORY "https://github.com/pybind/pybind11.git" - GIT_TAG "v2.1.1" + GIT_TAG "v2.2.4" PREFIX ${PYBIND_SOURCE_DIR} UPDATE_COMMAND "" CONFIGURE_COMMAND "" From bc4923321ce286f357792c5a28884a665fcb87b7 Mon Sep 17 00:00:00 2001 From: Jiabin Yang Date: Sat, 24 Nov 2018 19:32:46 +0800 Subject: [PATCH 137/684] Update __init__.py --- python/paddle/fluid/__init__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index ac2a7ea47e3dd..ff651db043e85 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -120,7 +120,6 @@ def __bootstrap__(): 'reader_queue_speed_test_mode', 'print_sub_graph_dir' ] if 'Darwin' not in sysstr: - print("aaaaa") read_env_flags.append('use_pinned_memory') if os.name != 'nt': From 81994e84e055cba8b4d3fe0b1ecb94b12d731661 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 19:37:37 +0800 Subject: [PATCH 138/684] Change the include files because the version changes of pybind11 test=develop --- paddle/fluid/pybind/tensor_py.h | 1 - paddle/scripts/paddle_build.sh | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/paddle/fluid/pybind/tensor_py.h b/paddle/fluid/pybind/tensor_py.h index b39323f843f8d..02a75236f6c7c 100644 --- a/paddle/fluid/pybind/tensor_py.h +++ b/paddle/fluid/pybind/tensor_py.h @@ -21,7 +21,6 @@ limitations under the License. */ #include "paddle/fluid/memory/memcpy.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" -#include "pybind11/common.h" #include "pybind11/numpy.h" #include "pybind11/pybind11.h" diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 9632eaec005df..86925b26e7e89 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -149,7 +149,7 @@ function cmake_gen() { elif [ "$1" == "cp37-cp37m" ]; then export LD_LIBRARY_PATH=/opt/_internal/cpython-3.7.0/lib/:${LD_LIBRARY_PATH} export PATH=/opt/_internal/cpython-3.7.0/bin/:${PATH} - export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3 + export PYTHON_FLAGS="-DPYTHON_EXECUTABLE:FILEPATH=/opt/_internal/cpython-3.7.0/bin/python3.7 -DPYTHON_INCLUDE_DIR:PATH=/opt/_internal/cpython-3.7.0/include/python3.7m -DPYTHON_LIBRARIES:FILEPATH=/opt/_internal/cpython-3.7.0/lib/libpython3.so" fi From b67229187e67b04f6f6517cf8c0ceb7fcd8629f4 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 19:47:50 +0800 Subject: [PATCH 139/684] Change to PYBIND11_MODULE because the deprecation of PYBIND11_PLUGIN test=develop --- paddle/fluid/pybind/pybind.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index 795800fd51763..bf86b83d4e136 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -86,7 +86,7 @@ bool IsCompiledWithDIST() { #endif } -PYBIND11_PLUGIN(core) { +PYBIND11_MODULE(core) { // Not used, just make sure cpu_info.cc is linked. paddle::platform::CpuTotalPhysicalMemory(); From d2045260a5cd907238e594483daf0d2fbfa51314 Mon Sep 17 00:00:00 2001 From: minqiyang Date: Sat, 24 Nov 2018 20:07:53 +0800 Subject: [PATCH 140/684] Change visibilities of variant_visitor of pybind11 test=develop --- paddle/fluid/pybind/protobuf.cc | 13 +++++++------ paddle/fluid/pybind/pybind.cc | 5 ++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/paddle/fluid/pybind/protobuf.cc b/paddle/fluid/pybind/protobuf.cc index 586e92c2b3146..0443ff3fc3d05 100644 --- a/paddle/fluid/pybind/protobuf.cc +++ b/paddle/fluid/pybind/protobuf.cc @@ -30,11 +30,12 @@ namespace pybind11 { namespace detail { // Can be replaced by a generic lambda in C++14 -struct variant_caster_visitor : public boost::static_visitor { +struct __attribute__((visibility("hidden"))) paddle_variant_caster_visitor + : public boost::static_visitor { return_value_policy policy; handle parent; - variant_caster_visitor(return_value_policy policy, handle parent) + paddle_variant_caster_visitor(return_value_policy policy, handle parent) : policy(policy), parent(parent) {} template @@ -44,10 +45,10 @@ struct variant_caster_visitor : public boost::static_visitor { }; template -struct variant_caster; +struct paddle_variant_caster; template