diff --git a/CMakeLists.txt b/CMakeLists.txt index bd9605a1abb3d..f24513d605c49 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -335,6 +335,8 @@ set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG") +add_definitions(-DPADDLE_DLL_EXPORT) + if(ON_INFER) # you can trun off the paddle fluid and inference lib by set ON_INFER=OFF message(STATUS "On inference mode, will take place some specific optimization.") diff --git a/paddle/fluid/distributed/service/communicator.cc b/paddle/fluid/distributed/service/communicator.cc index 1b05e3e72bc4e..aea758a717b2d 100644 --- a/paddle/fluid/distributed/service/communicator.cc +++ b/paddle/fluid/distributed/service/communicator.cc @@ -372,6 +372,7 @@ void Communicator::SendGlobalStep(const CommContext &ctx, int batches, if (batches == 0) { return; } + platform::RecordEvent record_event("Communicator->SendGlobalStep"); auto &table_id = ctx.table_id; size_t request_call_num = _worker_ptr->get_server_nums(); @@ -775,6 +776,7 @@ void SyncCommunicator::BarrierRecv() { void GeoCommunicator::Send(const std::vector &var_names, const framework::Scope &scope) { + platform::RecordEvent record_event("GeoCommunicator->Send"); waiting_ = false; auto before_send = GetCurrentUS(); auto table_name = var_names[0]; @@ -1011,6 +1013,7 @@ void GeoCommunicator::InitSparse(const std::string &var_name, int table_id) { std::vector GeoCommunicator::MergeSparseIds( const std::string &send_varname) { + platform::RecordEvent record_event("GeoCommunicator->MergeSparseIds"); size_t merge_num = 0, wait_times = 0; std::unordered_set sparse_ids; while (merge_num < static_cast(max_merge_var_num_)) { diff --git a/paddle/fluid/extension/include/all.h b/paddle/fluid/extension/include/all.h index 5aa61f8203e75..e2a3bc38c5f4a 100644 --- a/paddle/fluid/extension/include/all.h +++ b/paddle/fluid/extension/include/all.h @@ -18,6 +18,12 @@ 
limitations under the License. */ #error C++11 or later compatible compiler is required to use Paddle. #endif +#ifdef _WIN32 +#ifndef NOMINMAX +#define NOMINMAX // msvc max/min macro conflict with std::min/max +#endif +#endif + #include "paddle/fluid/extension/include/dispatch.h" #include "paddle/fluid/extension/include/dtype.h" #include "paddle/fluid/extension/include/op_meta_info.h" diff --git a/paddle/fluid/extension/include/dispatch.h b/paddle/fluid/extension/include/dispatch.h index a782b2b132113..c22971039521c 100644 --- a/paddle/fluid/extension/include/dispatch.h +++ b/paddle/fluid/extension/include/dispatch.h @@ -18,6 +18,8 @@ limitations under the License. */ namespace paddle { +///////// Basic Marco /////////// + #define PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, HINT, ...) \ case enum_type: { \ using HINT = type; \ @@ -28,19 +30,139 @@ namespace paddle { #define PD_PRIVATE_CASE_TYPE(NAME, enum_type, type, ...) \ PD_PRIVATE_CASE_TYPE_USING_HINT(NAME, enum_type, type, data_t, __VA_ARGS__) -#define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) \ - [&] { \ - const auto& dtype = TYPE; \ - switch (dtype) { \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ - __VA_ARGS__) \ - PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ - __VA_ARGS__) \ - default: \ - throw std::runtime_error("function not implemented for this type."); \ - } \ +///////// Floating Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + default: \ + throw std::runtime_error("function " #NAME \ + " not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + +///////// Integral Dispatch Marco /////////// + +#define PD_DISPATCH_INTEGRAL_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \ + __VA_ARGS__) \ + default: \ + throw std::runtime_error("function " #NAME \ + " not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + +///////// Complex Dispatch Marco /////////// + +#define PD_DISPATCH_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + throw std::runtime_error("function " #NAME \ + " not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + +///////// Floating and Integral Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \ + __VA_ARGS__) \ + default: \ + throw std::runtime_error("function " #NAME \ + " not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + +///////// Floating and Complex Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES(TYPE, NAME, ...) \ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + throw std::runtime_error("function " #NAME \ + " not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ + }() + +///////// Floating, Integral and Complex Dispatch Marco /////////// + +#define PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES(TYPE, NAME, ...) 
\ + [&] { \ + const auto& __dtype__ = TYPE; \ + switch (__dtype__) { \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT32, float, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::FLOAT64, double, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT32, int, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT64, int64_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT8, int8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::UINT8, uint8_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::INT16, int16_t, \ + __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX64, \ + ::paddle::complex64, __VA_ARGS__) \ + PD_PRIVATE_CASE_TYPE(NAME, ::paddle::DataType::COMPLEX128, \ + ::paddle::complex128, __VA_ARGS__) \ + default: \ + throw std::runtime_error("function " #NAME \ + " not implemented for data type `" + \ + ::paddle::ToString(__dtype__) + "`"); \ + } \ }() -// TODD(chenweihang): implement other DISPATH macros in next PR +// TODO(chenweihang): Add more Marcos in the future if needed } // namespace paddle diff --git a/paddle/fluid/extension/include/dll_decl.h b/paddle/fluid/extension/include/dll_decl.h new file mode 100644 index 0000000000000..3dbea5e6dffc2 --- /dev/null +++ b/paddle/fluid/extension/include/dll_decl.h @@ -0,0 +1,27 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#if defined(_WIN32) +#ifndef PD_DLL_DECL +#ifdef PADDLE_DLL_EXPORT +#define PD_DLL_DECL __declspec(dllexport) +#else +#define PD_DLL_DECL __declspec(dllimport) +#endif // PADDLE_DLL_EXPORT +#endif // PD_DLL_DECL +#else +#define PD_DLL_DECL +#endif // _WIN32 diff --git a/paddle/fluid/extension/include/dtype.h b/paddle/fluid/extension/include/dtype.h index ec8d76a391c1e..c5d2e0f820555 100644 --- a/paddle/fluid/extension/include/dtype.h +++ b/paddle/fluid/extension/include/dtype.h @@ -14,27 +14,90 @@ limitations under the License. */ #pragma once +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/complex128.h" +#include "paddle/fluid/platform/complex64.h" +#include "paddle/fluid/platform/float16.h" + namespace paddle { -struct complex128; -struct complex64; -struct float16; -struct bfloat16; +using float16 = paddle::platform::float16; +using bfloat16 = paddle::platform::bfloat16; +using complex64 = paddle::platform::complex64; +using complex128 = paddle::platform::complex128; enum DataType { + BOOL, + INT8, + UINT8, + INT16, + INT32, + INT64, + FLOAT16, + BFLOAT16, FLOAT32, FLOAT64, - BFLOAT16, - COMPLEX128, COMPLEX64, - FLOAT16, - INT64, - INT32, - INT16, - UINT8, - INT8, - BOOL, + COMPLEX128, // TODO(JiabinYang) support more data types if needed. 
}; +inline std::string ToString(DataType dtype) { + switch (dtype) { + case DataType::BOOL: + return "bool"; + case DataType::INT8: + return "int8_t"; + case DataType::UINT8: + return "uint8_t"; + case DataType::INT16: + return "int16_t"; + case DataType::INT32: + return "int32_t"; + case DataType::INT64: + return "int64_t"; + case DataType::FLOAT16: + return "float16"; + case DataType::BFLOAT16: + return "bfloat16"; + case DataType::FLOAT32: + return "float"; + case DataType::FLOAT64: + return "double"; + case DataType::COMPLEX64: + return "complex64"; + case DataType::COMPLEX128: + return "complex128"; + default: + throw std::runtime_error("Unsupported paddle enum data type."); + } +} + +#define PD_FOR_EACH_DATA_TYPE(_) \ + _(bool, DataType::BOOL) \ + _(int8_t, DataType::INT8) \ + _(uint8_t, DataType::UINT8) \ + _(int16_t, DataType::INT16) \ + _(int, DataType::INT32) \ + _(int64_t, DataType::INT64) \ + _(float16, DataType::FLOAT16) \ + _(bfloat16, DataType::BFLOAT16) \ + _(float, DataType::FLOAT32) \ + _(double, DataType::FLOAT64) \ + _(complex64, DataType::COMPLEX64) \ + _(complex128, DataType::COMPLEX128) + +template +struct DataTypeToCPPType; + +#define PD_SPECIALIZE_DataTypeToCPPType(cpp_type, data_type) \ + template <> \ + struct DataTypeToCPPType { \ + using type = cpp_type; \ + }; + +PD_FOR_EACH_DATA_TYPE(PD_SPECIALIZE_DataTypeToCPPType) + +#undef PD_SPECIALIZE_DataTypeToCPPType + } // namespace paddle diff --git a/paddle/fluid/extension/include/op_meta_info.h b/paddle/fluid/extension/include/op_meta_info.h index d02954dc61eb8..c16f61374f7cb 100644 --- a/paddle/fluid/extension/include/op_meta_info.h +++ b/paddle/fluid/extension/include/op_meta_info.h @@ -14,12 +14,14 @@ limitations under the License. */ #pragma once +#include #include #include #include #include +#include "paddle/fluid/extension/include/dll_decl.h" #include "paddle/fluid/extension/include/tensor.h" /** @@ -31,7 +33,7 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -class OpMetaInfoHelper; +class PD_DLL_DECL OpMetaInfoHelper; } // namespace framework using Tensor = paddle::Tensor; @@ -43,6 +45,26 @@ using Tensor = paddle::Tensor; classname& operator=(const classname&) = delete; \ classname& operator=(classname&&) = delete +#if defined _WIN32 +#define HANDLE_THE_ERROR try { +#define END_HANDLE_THE_ERROR \ + } \ + catch (const std::exception& e) { \ + std::cerr << e.what() << std::endl; \ + throw e; \ + } +#else +#define HANDLE_THE_ERROR +#define END_HANDLE_THE_ERROR +#endif + +#define PD_THROW(err_msg) \ + do { \ + HANDLE_THE_ERROR \ + throw std::runtime_error(err_msg); \ + END_HANDLE_THE_ERROR \ + } while (0) + ///////////////// Util Define and Function //////////////// inline std::string Grad(const std::string& var_name) { @@ -106,7 +128,7 @@ struct KernelFuncImpl { attr_idx + 1>( inputs, attrs, pargs..., arg); } catch (boost::bad_any_cast&) { - throw std::runtime_error( + PD_THROW( "Attribute cast error in custom operator. Expected int value."); } } @@ -220,7 +242,7 @@ struct InferDtypeFuncImpl { ////////////////////// Op Meta Info ////////////////////// -class OpMetaInfo { +class PD_DLL_DECL OpMetaInfo { public: explicit OpMetaInfo(const std::string& op_name) : name_(op_name) {} OpMetaInfo& Inputs(std::vector&& inputs); @@ -246,7 +268,7 @@ class OpMetaInfo { //////////////// Op Meta Info Map ///////////////// -class OpMetaInfoMap { +class PD_DLL_DECL OpMetaInfoMap { public: // this function's impl should keep in header file. 
// if move to cc file, meta info can not be added @@ -270,14 +292,14 @@ class OpMetaInfoMap { //////////////// Op Meta Info Builder ///////////////// -class OpMetaInfoBuilder { +class PD_DLL_DECL OpMetaInfoBuilder { public: explicit OpMetaInfoBuilder(std::string&& name); OpMetaInfoBuilder& Inputs(std::vector&& inputs); OpMetaInfoBuilder& Outputs(std::vector&& outputs); - OpMetaInfoBuilder& SetKernelFn(KernelFunc&& func); - OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc&& func); - OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc&& func); + OpMetaInfoBuilder& SetKernelFn(KernelFunc func); + OpMetaInfoBuilder& SetInferShapeFn(InferShapeFunc func); + OpMetaInfoBuilder& SetInferDtypeFn(InferDtypeFunc func); OpMetaInfoBuilder& SetBackwardOp(const std::string& bwd_op_name); private: @@ -300,10 +322,15 @@ void LoadCustomOperatorLib(const std::string& dso_name); /////////////////////// Op register Macro ///////////////////////// -#define PD_BUILD_OP(op_name) \ - static ::paddle::OpMetaInfoBuilder __op_meta_info_##__COUNTER__##__ = \ +#define PD_BUILD_OP_WITH_COUNTER(op_name, counter) \ + static ::paddle::OpMetaInfoBuilder __op_meta_info_##counter##__ = \ ::paddle::OpMetaInfoBuilder(op_name) +#define PD_BUILD_OP_INNER(op_name, counter) \ + PD_BUILD_OP_WITH_COUNTER(op_name, counter) + +#define PD_BUILD_OP(op_name) PD_BUILD_OP_INNER(op_name, __COUNTER__) + } // namespace paddle ///////////////////// C API /////////////////// @@ -312,8 +339,12 @@ void LoadCustomOperatorLib(const std::string& dso_name); extern "C" { #endif +#if defined(_WIN32) // C-API to get global OpMetaInfoMap. 
-paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap(); +__declspec(dllexport) inline paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap() { + return paddle::OpMetaInfoMap::Instance(); +} +#endif // _WIN32 #ifdef __cplusplus } diff --git a/paddle/fluid/extension/include/tensor.h b/paddle/fluid/extension/include/tensor.h index a5ce0d1a5858b..47af4dc70a15f 100644 --- a/paddle/fluid/extension/include/tensor.h +++ b/paddle/fluid/extension/include/tensor.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include "paddle/fluid/extension/include/dll_decl.h" #include "paddle/fluid/extension/include/dtype.h" #include "paddle/fluid/extension/include/place.h" @@ -23,7 +24,7 @@ namespace paddle { namespace framework { class CustomTensorUtils; } // namespace framework -class Tensor { +class PD_DLL_DECL Tensor { public: /// \brief Construct a Tensor on target Place for CustomOp. /// Generally it's only used for user to create Tensor. diff --git a/paddle/fluid/extension/src/op_meta_info.cc b/paddle/fluid/extension/src/op_meta_info.cc index f31723e5ac836..0273dfd5d07a6 100644 --- a/paddle/fluid/extension/src/op_meta_info.cc +++ b/paddle/fluid/extension/src/op_meta_info.cc @@ -78,17 +78,17 @@ OpMetaInfoBuilder& OpMetaInfoBuilder::Outputs( return *this; } -OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc&& func) { +OpMetaInfoBuilder& OpMetaInfoBuilder::SetKernelFn(KernelFunc func) { info_ptr_->SetKernelFn(std::forward(func)); return *this; } -OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc&& func) { +OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferShapeFn(InferShapeFunc func) { info_ptr_->SetInferShapeFn(std::forward(func)); return *this; } -OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc&& func) { +OpMetaInfoBuilder& OpMetaInfoBuilder::SetInferDtypeFn(InferDtypeFunc func) { info_ptr_->SetInferDtypeFn(std::forward(func)); return *this; } @@ -114,10 +114,17 @@ void LoadCustomOperatorLib(const std::string& dso_name) { } } // 
namespace paddle +#ifdef __cplusplus extern "C" { +#endif +#ifndef _WIN32 +// C-API to get global OpMetaInfoMap. paddle::OpMetaInfoMap& PD_GetOpMetaInfoMap() { return paddle::OpMetaInfoMap::Instance(); } +#endif +#ifdef __cplusplus } // end extern "C" +#endif diff --git a/paddle/fluid/extension/src/tensor.cc b/paddle/fluid/extension/src/tensor.cc index 34ca57d75bf03..39ed274864110 100644 --- a/paddle/fluid/extension/src/tensor.cc +++ b/paddle/fluid/extension/src/tensor.cc @@ -17,11 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/custom_tensor_utils.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/complex128.h" -#include "paddle/fluid/platform/complex64.h" #include "paddle/fluid/platform/enforce.h" -#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/transform.h" namespace paddle { @@ -211,73 +207,87 @@ Tensor Tensor::copy_to(const PlaceType &target_place) const { return target; } -template Tensor Tensor::copy_to( +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor Tensor::copy_to( const PlaceType &target_place) const; -template Tensor Tensor::copy_to( +template PD_DLL_DECL Tensor Tensor::copy_to( const PlaceType &target_place) const; -template Tensor Tensor::copy_to( +template PD_DLL_DECL Tensor Tensor::copy_to( const PlaceType &target_place) const; -template Tensor Tensor::copy_to( - const PlaceType &target_place) const; -template Tensor Tensor::copy_to(const PlaceType &target_place) const; -template Tensor Tensor::copy_to(const PlaceType &target_place) const; -template Tensor Tensor::copy_to(const PlaceType &target_place) const; -template Tensor Tensor::copy_to(const PlaceType &target_place) const; -template Tensor Tensor::copy_to(const PlaceType &target_place) const; -template Tensor Tensor::copy_to(const PlaceType &target_place) 
const; -template Tensor Tensor::copy_to(const PlaceType &target_place) const; -template Tensor Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; +template PD_DLL_DECL Tensor +Tensor::copy_to(const PlaceType &target_place) const; -template float *Tensor::data() const; -template double *Tensor::data() const; -template int64_t *Tensor::data() const; -template int32_t *Tensor::data() const; -template uint8_t *Tensor::data() const; -template int8_t *Tensor::data() const; -template paddle::platform::float16 *Tensor::data() - const; -template paddle::platform::bfloat16 *Tensor::data() - const; -template paddle::platform::complex128 * +template PD_DLL_DECL float *Tensor::data() const; +template PD_DLL_DECL double *Tensor::data() const; +template PD_DLL_DECL int64_t *Tensor::data() const; +template PD_DLL_DECL int32_t *Tensor::data() const; +template PD_DLL_DECL uint8_t *Tensor::data() const; +template PD_DLL_DECL int8_t *Tensor::data() const; +template PD_DLL_DECL paddle::platform::float16 * +Tensor::data() const; +template PD_DLL_DECL paddle::platform::bfloat16 * +Tensor::data() const; +template PD_DLL_DECL paddle::platform::complex128 * Tensor::data() const; -template paddle::platform::complex64 * +template PD_DLL_DECL paddle::platform::complex64 * Tensor::data() const; -template int16_t *Tensor::data() const; -template bool *Tensor::data() const; +template PD_DLL_DECL int16_t 
*Tensor::data() const; +template PD_DLL_DECL bool *Tensor::data() const; -template float *Tensor::mutable_data(); -template double *Tensor::mutable_data(); -template int64_t *Tensor::mutable_data(); -template int32_t *Tensor::mutable_data(); -template uint8_t *Tensor::mutable_data(); -template int8_t *Tensor::mutable_data(); -template paddle::platform::float16 * +template PD_DLL_DECL float *Tensor::mutable_data(); +template PD_DLL_DECL double *Tensor::mutable_data(); +template PD_DLL_DECL int64_t *Tensor::mutable_data(); +template PD_DLL_DECL int32_t *Tensor::mutable_data(); +template PD_DLL_DECL uint8_t *Tensor::mutable_data(); +template PD_DLL_DECL int8_t *Tensor::mutable_data(); +template PD_DLL_DECL paddle::platform::float16 * Tensor::mutable_data(); -template paddle::platform::bfloat16 * +template PD_DLL_DECL paddle::platform::bfloat16 * Tensor::mutable_data(); -template paddle::platform::complex128 * +template PD_DLL_DECL paddle::platform::complex128 * Tensor::mutable_data(); -template paddle::platform::complex64 * +template PD_DLL_DECL paddle::platform::complex64 * Tensor::mutable_data(); -template int16_t *Tensor::mutable_data(); -template bool *Tensor::mutable_data(); +template PD_DLL_DECL int16_t *Tensor::mutable_data(); +template PD_DLL_DECL bool *Tensor::mutable_data(); -template float *Tensor::mutable_data(const PlaceType &place); -template double *Tensor::mutable_data(const PlaceType &place); -template int64_t *Tensor::mutable_data(const PlaceType &place); -template int32_t *Tensor::mutable_data(const PlaceType &place); -template uint8_t *Tensor::mutable_data(const PlaceType &place); -template int8_t *Tensor::mutable_data(const PlaceType &place); -template paddle::platform::float16 * +template PD_DLL_DECL float *Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL double *Tensor::mutable_data( + const PlaceType &place); +template PD_DLL_DECL int64_t *Tensor::mutable_data( + const PlaceType &place); +template PD_DLL_DECL int32_t 
*Tensor::mutable_data( + const PlaceType &place); +template PD_DLL_DECL uint8_t *Tensor::mutable_data( + const PlaceType &place); +template PD_DLL_DECL int8_t *Tensor::mutable_data( + const PlaceType &place); +template PD_DLL_DECL paddle::platform::float16 * Tensor::mutable_data(const PlaceType &place); -template paddle::platform::bfloat16 * +template PD_DLL_DECL paddle::platform::bfloat16 * Tensor::mutable_data(const PlaceType &place); -template paddle::platform::complex128 * +template PD_DLL_DECL paddle::platform::complex128 * Tensor::mutable_data(const PlaceType &place); -template paddle::platform::complex64 * +template PD_DLL_DECL paddle::platform::complex64 * Tensor::mutable_data(const PlaceType &place); -template int16_t *Tensor::mutable_data(const PlaceType &place); -template bool *Tensor::mutable_data(const PlaceType &place); +template PD_DLL_DECL int16_t *Tensor::mutable_data( + const PlaceType &place); +template PD_DLL_DECL bool *Tensor::mutable_data(const PlaceType &place); std::vector Tensor::shape() const { GET_CASTED_TENSOR diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index 14179172db229..b037c11186545 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ b/paddle/fluid/framework/CMakeLists.txt @@ -345,9 +345,12 @@ if (LINUX) endif() if (WIN32) + set(FLUID_FRAMEWORK_IMPORT_LIB + ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_framework.lib + CACHE INTERNAL "Fluid framework lib") set(FLUID_FRAMEWORK_SHARED_LIB - ${PADDLE_BINARY_DIR}/paddle/fluid/framework/libpaddle_framework.dll - CACHE INTERNAL "Fluid framework lib") + ${PADDLE_BINARY_DIR}/paddle/fluid/framework/${CMAKE_BUILD_TYPE}/paddle_framework.dll + CACHE INTERNAL "Fluid framework dll") endif() if(APPLE) diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 6e1e7146ba673..83e812de39cef 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ 
b/paddle/fluid/framework/parallel_executor.cc @@ -1122,8 +1122,6 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { VLOG(3) << "enter ParallelExecutor Run"; - platform::RecordEvent parallel_executor_event( - "ParallelExecutor::Run", paddle::platform::EventRole::kSpecial); #ifdef WITH_GPERFTOOLS if (gProfileStarted) { ProfilerFlush(); diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt index 22b30403a6204..a24c0ac09c758 100644 --- a/paddle/fluid/imperative/CMakeLists.txt +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -9,10 +9,15 @@ cc_library(basic_engine SRCS basic_engine.cc DEPS layer gradient_accumulator) cc_library(engine SRCS basic_engine.cc partial_grad_engine.cc DEPS layer gradient_accumulator) cc_library(imperative_profiler SRCS profiler.cc) if(NOT WIN32) - if(WITH_NCCL) + if(WITH_NCCL OR WITH_RCCL) cc_library(imperative_all_reduce SRCS all_reduce.cc DEPS collective_helper device_context selected_rows tensor) cc_library(nccl_context SRCS nccl_context.cc DEPS collective_helper device_context imperative_all_reduce var_type_traits) - nv_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) + if(WITH_NCCL) + nv_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) + endif() + if(WITH_RCCL) + hip_library(reducer SRCS reducer.cc reducer.cu DEPS layer imperative_all_reduce) + endif() endif() if(WITH_XPU_BKCL) cc_library(bkcl_context SRCS bkcl_context.cc DEPS collective_helper device_context tensor var_type_traits) diff --git a/paddle/fluid/imperative/all_reduce.cc b/paddle/fluid/imperative/all_reduce.cc index 3b018374f4fde..b922811b4f104 100644 --- a/paddle/fluid/imperative/all_reduce.cc +++ b/paddle/fluid/imperative/all_reduce.cc @@ -12,11 +12,17 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/imperative/all_reduce.h" +#ifdef PADDLE_WITH_NCCL #include +#endif + +#ifdef PADDLE_WITH_RCCL +#include +#endif #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" @@ -46,7 +52,7 @@ static const platform::Place &GetVarPlace(const framework::Variable &src) { } static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, - const cudaStream_t stream, + const gpuStream_t stream, const platform::NCCLComm *comm) { const auto &place = src.place(); PADDLE_ENFORCE_EQ( @@ -67,7 +73,7 @@ static void AllReduce(const framework::Tensor &src, framework::Tensor *dst, static void AllReduce(const framework::SelectedRows &src, framework::SelectedRows *dst, const ParallelStrategy &strategy, - const cudaStream_t stream, + const gpuStream_t stream, const platform::NCCLComm *comm) { VLOG(3) << "SelectedRows AllReduce start"; const auto &src_tensor = src.value(); @@ -99,7 +105,11 @@ static void AllReduce(const framework::SelectedRows &src, comm->comm(), stream)); if (!use_calc_stream) { +#ifdef PADDLE_WITH_RCCL + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); +#endif } const auto *cpu_rows_num_ptr = rows_num_vector.data(); @@ -176,7 +186,7 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, platform::DeviceContextPool::Instance().Get(place)); platform::NCCLComm *comm = platform::NCCLCommContext::Instance().Get(ring_id, place); - cudaStream_t stream = (use_calc_stream ? dev_ctx->stream() : comm->stream()); + gpuStream_t stream = (use_calc_stream ? 
dev_ctx->stream() : comm->stream()); if (src.IsType()) { if (!dst->IsType()) { @@ -199,8 +209,12 @@ void AllReduce(const framework::Variable &src, framework::Variable *dst, AllReduce(src.Get(), tmp_dst.GetMutable(), strategy, stream, comm); - // stream must synchronize to ensure accuracy of the move operation +// stream must synchronize to ensure accuracy of the move operation +#ifdef PADDLE_WITH_RCCL + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamSynchronize(stream)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream)); +#endif *dst = std::move(tmp_dst); } #endif diff --git a/paddle/fluid/imperative/all_reduce.h b/paddle/fluid/imperative/all_reduce.h index 2185c19b696a2..6ef528025b04d 100644 --- a/paddle/fluid/imperative/all_reduce.h +++ b/paddle/fluid/imperative/all_reduce.h @@ -14,7 +14,7 @@ #pragma once -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) namespace paddle { namespace framework { diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index ff8494a388817..deb504a1b657e 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -99,7 +99,7 @@ class TensorAddFunctor : public boost::static_visitor<> { } #endif -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) void operator()(const platform::CUDAPlace& place) { platform::CUDADeviceContext* ctx = dynamic_cast( @@ -186,7 +186,7 @@ void TensorAdd(const framework::Variable& src, framework::Variable* dst) { if (data_type == framework::proto::VarType::FP16) { if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) return TensorAddImpl( src_tensor, dst_tensor, place); #else @@ -224,7 +224,7 @@ void SelectedRowsAddToTensor(const framework::Variable& src, return; \ } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if 
(paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CUDADeviceContext, double); @@ -232,7 +232,7 @@ void SelectedRowsAddToTensor(const framework::Variable& src, #endif PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TO_TENSOR(platform::CPUDeviceContext, double); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -267,7 +267,7 @@ static void SelectedRowsAddTensor( return; \ } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CUDADeviceContext, double); @@ -275,7 +275,7 @@ static void SelectedRowsAddTensor( #endif PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, float); PADDLE_SELECTED_ROWS_ADD_TENSOR(platform::CPUDeviceContext, double); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -314,7 +314,7 @@ std::shared_ptr SelectedRowsMerge( return dst_var; \ } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, float); PADDLE_SELECTED_ROWS_ADD(platform::CUDADeviceContext, double); @@ -322,7 +322,7 @@ std::shared_ptr SelectedRowsMerge( #endif PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, float); PADDLE_SELECTED_ROWS_ADD(platform::CPUDeviceContext, double); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif @@ -518,7 +518,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, } } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (paddle::platform::is_gpu_place(place)) { // sum selected rows firstly for (auto& var_info 
: tmp_grad_vars_) { @@ -579,7 +579,7 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var, // Increase count IncreaseCurCnt(); } -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } #endif tmp_grad_vars_.clear(); diff --git a/paddle/fluid/imperative/nccl_context.cc b/paddle/fluid/imperative/nccl_context.cc index 4ec23e4b7d6e2..eb0135d15e074 100644 --- a/paddle/fluid/imperative/nccl_context.cc +++ b/paddle/fluid/imperative/nccl_context.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/imperative/nccl_context.h" -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/imperative/all_reduce.h" #include "paddle/fluid/platform/collective_helper.h" #include "paddle/fluid/platform/gen_comm_id_helper.h" @@ -31,7 +31,7 @@ class Variable; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void NCCLParallelContext::BcastNCCLId( std::vector &nccl_ids, // NOLINT @@ -113,9 +113,14 @@ void NCCLParallelContext::WaitCompute(int ring_id) { platform::NCCLCommContext::Instance().Get(ring_id, place_)->stream(); auto event = compute_events_[ring_id].get(); - // compute_stream-->event-->comm_stream +// compute_stream-->event-->comm_stream +#ifdef PADDLE_WITH_HIP + PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, compute_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(comm_stream, event, 0)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, compute_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(comm_stream, event, 0)); +#endif } void NCCLParallelContext::WaitComm(int ring_id) { @@ -134,9 +139,14 @@ void NCCLParallelContext::WaitComm(int ring_id) { platform::NCCLCommContext::Instance().Get(ring_id, place_)->stream(); auto event = comm_events_[ring_id].get(); - // comm_stream-->event-->compute_stream +// comm_stream-->event-->compute_stream +#ifdef PADDLE_WITH_HIP + 
PADDLE_ENFORCE_CUDA_SUCCESS(hipEventRecord(event, comm_stream)); + PADDLE_ENFORCE_CUDA_SUCCESS(hipStreamWaitEvent(compute_stream, event, 0)); +#else PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, comm_stream)); PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamWaitEvent(compute_stream, event, 0)); +#endif } #endif diff --git a/paddle/fluid/imperative/nccl_context.h b/paddle/fluid/imperative/nccl_context.h index 1a93f897526d6..51e5743aebdc3 100644 --- a/paddle/fluid/imperative/nccl_context.h +++ b/paddle/fluid/imperative/nccl_context.h @@ -17,11 +17,18 @@ #include #include -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/cuda_resource_pool.h" +#endif + +#ifdef PADDLE_WITH_NCCL #include "paddle/fluid/platform/dynload/nccl.h" #endif +#ifdef PADDLE_WITH_RCCL +#include "paddle/fluid/platform/dynload/rccl.h" +#endif + #include "paddle/fluid/imperative/parallel_context.h" namespace paddle { @@ -33,7 +40,7 @@ class Variable; namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) class NCCLParallelContext : public ParallelContext { public: explicit NCCLParallelContext(const ParallelStrategy& strategy, diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc index 2289d6600f5df..f8740940d041a 100644 --- a/paddle/fluid/imperative/reducer.cc +++ b/paddle/fluid/imperative/reducer.cc @@ -27,7 +27,8 @@ namespace paddle { namespace imperative { -#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) // div the nranks void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { framework::Tensor *tensor = @@ -37,7 +38,7 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) { : dense_contents_.GetMutable(); if (platform::is_gpu_place(tensor->place())) { -#if 
defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) DivNRanks(tensor, nranks, context); #endif } else if (platform::is_cpu_place(tensor->place())) { @@ -206,7 +207,7 @@ void SplitTensorsWithType( void Group::ConcatTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ConcatTensorsWithType( static_cast(context), dense_tensors_, &dense_contents_, dtype_); @@ -238,7 +239,7 @@ void Group::ConcatTensors(const platform::DeviceContext &context) { void Group::SplitTensors(const platform::DeviceContext &context) { auto place = context.GetPlace(); if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_NCCL +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) SplitTensorsWithType( static_cast(context), &dense_contents_, &dense_tensors_, dtype_); diff --git a/paddle/fluid/imperative/reducer.cu b/paddle/fluid/imperative/reducer.cu index 96e1de5b3d10b..ca233292b3470 100644 --- a/paddle/fluid/imperative/reducer.cu +++ b/paddle/fluid/imperative/reducer.cu @@ -17,7 +17,7 @@ namespace paddle { namespace imperative { -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void Group::DivNRanks(framework::Tensor *tensor, int64_t nranks, const platform::DeviceContext &context) { framework::VisitDataTypeSmall( diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h index 1ac9f155a0029..f352ad17fda5d 100644 --- a/paddle/fluid/imperative/reducer.h +++ b/paddle/fluid/imperative/reducer.h @@ -47,7 +47,8 @@ class VariableWrapper; namespace paddle { namespace imperative { -#if (defined PADDLE_WITH_NCCL) || (defined PADDLE_WITH_XPU_BKCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \ + defined(PADDLE_WITH_XPU_BKCL) template struct DivNRanksFunctor { diff --git a/paddle/fluid/imperative/tests/CMakeLists.txt 
b/paddle/fluid/imperative/tests/CMakeLists.txt index 353c137fbf915..adb560df77c78 100644 --- a/paddle/fluid/imperative/tests/CMakeLists.txt +++ b/paddle/fluid/imperative/tests/CMakeLists.txt @@ -1,7 +1,7 @@ if(WIN32) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS device_context) else() - if (WITH_NCCL) + if (WITH_NCCL OR WITH_RCCL) cc_test(nccl_context_test SRCS nccl_context_test.cc DEPS nccl_context) endif() if (WITH_XPU_BKCL) @@ -16,6 +16,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s cc_test(test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy) cc_test(test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy) -if (WITH_NCCL OR WITH_XPU_BKCL) +if (WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL) cc_test(test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy) endif() diff --git a/paddle/fluid/imperative/tests/nccl_context_test.cc b/paddle/fluid/imperative/tests/nccl_context_test.cc index ab4d4add06909..4967df5341d35 100644 --- a/paddle/fluid/imperative/tests/nccl_context_test.cc +++ b/paddle/fluid/imperative/tests/nccl_context_test.cc @@ -33,7 +33,7 @@ imperative::ParallelStrategy GetStrategy(int local_rank) { return strategy; } -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) void BcastNCCLId(int local_rank, std::vector* nccl_ids) { auto strategy = GetStrategy(local_rank); platform::CUDAPlace gpu(local_rank); diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc index c394ce07df3c3..cb4ab2e79cb99 100644 --- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc +++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc @@ -53,7 +53,7 @@ int TensorddTest(Place place, T t1, T t2) { sizeof(T) * src_data.size()); 
paddle::memory::Copy(place, dst_mutable, src_place, dst_data.data(), sizeof(T) * dst_data.size()); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) } else { paddle::memory::Copy(place, src_mutable, src_place, src_data.data(), sizeof(T) * src_data.size(), 0); @@ -74,7 +74,7 @@ int TensorddTest(Place place, T t1, T t2) { } TEST(test_add_functor, add_functor) { -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::CUDAPlace gpu_place(0); #endif platform::CPUPlace cpu_place; @@ -88,7 +88,7 @@ TEST(test_add_functor, add_functor) { cpu_res = TensorddTest(cpu_place, static_cast(1.0), static_cast(2.0)); EXPECT_EQ(cpu_res, 0); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) int gpu_res = 1; gpu_res = TensorddTest(gpu_place, 1.0, 0.0); EXPECT_EQ(gpu_res, 0); @@ -107,7 +107,7 @@ TEST(test_add_functor, execption) { platform::CPUPlace cpu_place; ASSERT_ANY_THROW(TensorddTest(cpu_place, 1, 0)); -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, 1.0, 0.0)); ASSERT_ANY_THROW(TensorddTest(cuda_pinned_place, static_cast(1.0), @@ -358,7 +358,7 @@ TEST(test_gradient_accumulator, test_unchange_input) { for (auto sort_gradient : {false, true}) { TestGradientAccumulatorTestUnchangeInput(platform::CPUPlace(), sort_gradient); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TestGradientAccumulatorTestUnchangeInput(platform::CUDAPlace(0), sort_gradient); #endif diff --git a/paddle/fluid/imperative/tests/test_group.cc b/paddle/fluid/imperative/tests/test_group.cc index 60814dcb6cc1c..0c058038968be 100644 --- a/paddle/fluid/imperative/tests/test_group.cc +++ b/paddle/fluid/imperative/tests/test_group.cc @@ -73,7 +73,7 @@ void GroupConcatSplit(Place place, size_t size) { } if (std::is_same::value) { -#if defined(PADDLE_WITH_NCCL) +#if 
defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) paddle::memory::Copy(place, data, cpu_place, value.data(), sizeof(T) * value.size(), 0); #endif @@ -133,7 +133,7 @@ void GroupConcatSplit(Place place, size_t size) { } } -#if defined(PADDLE_WITH_NCCL) +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) TEST(TestGroup, TestConcatSplit) { platform::CUDAPlace cuda_place(0); platform::CPUPlace cpu_place; diff --git a/paddle/fluid/imperative/tests/test_prepare_op.cc b/paddle/fluid/imperative/tests/test_prepare_op.cc index ea009a4f5a4fc..7d6882a4ee7d0 100644 --- a/paddle/fluid/imperative/tests/test_prepare_op.cc +++ b/paddle/fluid/imperative/tests/test_prepare_op.cc @@ -106,7 +106,7 @@ TEST(test_prepare_op, test_get_tensor_from_var) { ASSERT_TRUE(ts != nullptr); } -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(test_prepare_op, test_prepare_data) { std::shared_ptr vin( new imperative::VarBase(false, "vin")); diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc index c2ead38e4c1dc..e3b5ff670368a 100644 --- a/paddle/fluid/imperative/tests/test_tracer.cc +++ b/paddle/fluid/imperative/tests/test_tracer.cc @@ -195,7 +195,7 @@ TEST(test_tracer, test_track_backward_input) { ASSERT_EQ(y_in->GradVarBase()->GradOpNum(), 0UL); ASSERT_EQ(vout->GradVarBase()->GradOpNum(), 1UL); } -#if defined(PADDLE_WITH_CUDA) +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TEST(test_tracer, test_trace_op_with_multi_device_inputs) { // Doing an mul imperative::Tracer tracer; @@ -521,7 +521,7 @@ static void TestVarOpDestructionMain(const platform::Place& place, TEST(test_tracer, test_var_op_destruction) { TestVarOpDestructionMain(platform::CPUPlace()); -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) TestVarOpDestructionMain(platform::CUDAPlace(0)); #endif } diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc index 
7003e569d19e9..3c20c1f647ac6 100644 --- a/paddle/fluid/imperative/tracer.cc +++ b/paddle/fluid/imperative/tracer.cc @@ -201,7 +201,7 @@ void Tracer::TraceOp(const std::string& type, const NameVarBaseMap& ins, void Tracer::SetExpectedPlace(platform::Place place) { // NOTE(wangxi): set device id before launch device kernel if (platform::is_gpu_place(place)) { -#ifdef PADDLE_WITH_CUDA +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) platform::SetDeviceId(BOOST_GET_CONST(platform::CUDAPlace, place).device); #else PADDLE_THROW(platform::errors::PreconditionNotMet( diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index c8488eefb984f..46dd91fed6cbc 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -55,7 +55,7 @@ __global__ void IndexSampleGrad(const IndexT* index, T* in_grad, platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), out_grad[sample_idx]); } else { - in_grad[in_idx - index_i + sample_idx] = out_grad[sample_idx]; + in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; } } } diff --git a/paddle/fluid/platform/dynload/dynamic_loader.cc b/paddle/fluid/platform/dynload/dynamic_loader.cc index c347d82d1d10e..6669d18f75cc6 100644 --- a/paddle/fluid/platform/dynload/dynamic_loader.cc +++ b/paddle/fluid/platform/dynload/dynamic_loader.cc @@ -416,9 +416,6 @@ void* GetOpDsoHandle(const std::string& dso_name) { #if defined(__APPLE__) || defined(__OSX__) PADDLE_THROW(platform::errors::Unimplemented( "Create custom cpp op outside framework do not support Apple.")); -#elif defined(_WIN32) && defined(PADDLE_WITH_CUDA) - PADDLE_THROW(platform::errors::Unimplemented( - "Create custom cpp op outside framework do not support Windows.")); #else return GetDsoHandleFromSearchPath(FLAGS_op_dir, dso_name); #endif diff --git a/paddle/scripts/paddle_build.bat b/paddle/scripts/paddle_build.bat index f1ec86cf38549..c1230d26c1672 100644 --- 
a/paddle/scripts/paddle_build.bat +++ b/paddle/scripts/paddle_build.bat @@ -121,23 +121,24 @@ rem ------pre install python requirement---------- where python where pip pip install wheel --user -pip install -r %work_dir%\python\requirements.txt --user -pip install -r %work_dir%\python\unittest_py\requirements.txt --user +pip install --force-reinstall -r %work_dir%\python\requirements.txt --user +pip install --force-reinstall -r %work_dir%\python\unittest_py\requirements.txt --user if %ERRORLEVEL% NEQ 0 ( echo pip install requirements.txt failed! exit /b 7 ) rem ------pre install clcache and init config---------- -pip install clcache --user +rem pip install clcache --user +pip uninstall -y clcache :: set USE_CLCACHE to enable clcache -set USE_CLCACHE=1 +rem set USE_CLCACHE=1 :: In some scenarios, CLCACHE_HARDLINK can save one file copy. -set CLCACHE_HARDLINK=1 +rem set CLCACHE_HARDLINK=1 :: If it takes more than 1000s to obtain the right to use the cache, an error will be reported -set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 +rem set CLCACHE_OBJECT_CACHE_TIMEOUT_MS=1000000 :: set maximum cache size to 20G -clcache.exe -M 21474836480 +rem clcache.exe -M 21474836480 rem ------show summary of current environment---------- cmake --version @@ -318,7 +319,7 @@ echo Build third_party successfully! 
set build_times=1 :build_paddle :: reset clcache zero stats for collect PR's actual hit rate -clcache.exe -z +rem clcache.exe -z echo Build Paddle the %build_times% time: if "%WITH_CLCACHE%"=="OFF" ( @@ -342,7 +343,7 @@ echo 0 > %cache_dir%\error_code.txt type %cache_dir%\error_code.txt :: ci will collect clcache hit rate -goto :collect_clcache_hits +rem goto :collect_clcache_hits goto:eof @@ -385,13 +386,14 @@ set /p PADDLE_WHL_FILE_WIN=< whl_file.txt @ECHO ON pip uninstall -y paddlepaddle pip uninstall -y paddlepaddle-gpu -pip install -U %PADDLE_WHL_FILE_WIN% --user +pip install %PADDLE_WHL_FILE_WIN% --user if %ERRORLEVEL% NEQ 0 ( call paddle_winci\Scripts\deactivate.bat 2>NUL echo pip install whl package failed! exit /b 1 ) + set CUDA_VISIBLE_DEVICES=0 python %work_dir%\paddle\scripts\installation_validate.py goto:eof diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index 72302d81d65a2..c3ee11ff5d906 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -46,7 +46,7 @@ class ParamAttr(object): initializer (Initializer, optional): The method to initial this parameter. Default None, meaning that the weight parameter is initialized by Xavier initializer, and the bias parameter is initialized by 0. - learning_rate (float): The parameter's learning rate. The learning rate when + learning_rate (float, optional): The parameter's learning rate. The learning rate when optimize is the global learning rates times the parameter's learning rate times the factor of learning rate scheduler. Default 1.0. regularizer (WeightDecayRegularizer, optional): Regularization strategy. There are two method: @@ -54,10 +54,13 @@ class ParamAttr(object): regularizer is also set in ``optimizer`` (such as :ref:`api_paddle_optimizer_SGD` ), that regularizer setting in optimizer will be ignored. Default None, meaning there is no regularization. - trainable (bool): Whether this parameter is trainable. Default True. 
- do_model_average (bool): Whether this parameter should do model average + trainable (bool, optional): Whether this parameter is trainable. Default True. + do_model_average (bool, optional): Whether this parameter should do model average when model average is enabled. Only used in ExponentialMovingAverage. Default True. - need_clip (bool): Whether the parameter gradient need to be cliped in optimizer. Default is True. + need_clip (bool, optional): Whether the parameter gradient need to be cliped in optimizer. Default is True. + + Returns: + ParamAttr Object. Examples: .. code-block:: python diff --git a/python/paddle/fluid/tests/CMakeLists.txt b/python/paddle/fluid/tests/CMakeLists.txt index bee49945f0074..60be92b892fbe 100644 --- a/python/paddle/fluid/tests/CMakeLists.txt +++ b/python/paddle/fluid/tests/CMakeLists.txt @@ -9,7 +9,14 @@ endforeach() add_subdirectory(unittests) add_subdirectory(book) -if(NOT APPLE AND NOT WIN32) +# TODO: support New Custom OP on Mac +if(Linux) add_subdirectory(custom_op) endif() + +# Windows CPU machine doesn't have CUDA, can't compile .cu file +# if(WIN32 AND WITH_GPU) +# add_subdirectory(custom_op) +# endif() + set_tests_properties(test_beam_search_decoder PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/custom_op/CMakeLists.txt b/python/paddle/fluid/tests/custom_op/CMakeLists.txt index 1d6304cd6409d..0daf662f551ec 100644 --- a/python/paddle/fluid/tests/custom_op/CMakeLists.txt +++ b/python/paddle/fluid/tests/custom_op/CMakeLists.txt @@ -1,3 +1,36 @@ +# New custom OP can support Windows/Linux now +# 'test_simple_custom_op_jit/test_simple_custom_op_setup' compile .cc and .cu file +py_test(test_simple_custom_op_setup SRCS test_simple_custom_op_setup.py) +py_test(test_simple_custom_op_jit SRCS test_simple_custom_op_jit.py) + +# Compiling shared library will cost some time, but running process is very fast. 
+set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250) +set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180) + +py_test(test_sysconfig SRCS test_sysconfig.py) + +# 'test_dispatch' compile .cc file +py_test(test_dispatch SRCS test_dispatch.py) +set_tests_properties(test_dispatch PROPERTIES TIMEOUT 180) + +if(NOT Linux) + return() +endif() + +# TODO(zhouwei): support test_check_abi and abi check on Windows +py_test(test_check_abi SRCS test_check_abi.py) + +# Old custom OP only support Linux, only run on Linux +py_test(test_custom_op SRCS test_custom_op.py) +py_test(test_jit_load SRCS test_jit_load.py) +py_test(test_setup_install SRCS test_setup_install.py) +py_test(test_setup_build SRCS test_setup_build.py) + +set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180) +set_tests_properties(test_setup_install PROPERTIES TIMEOUT 180) +set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180) + + if(WITH_ROCM) hip_library(relu_op_shared SHARED SRCS relu_op.cc relu_op.cu DEPS paddle_framework_shared) elseif(WITH_GPU) @@ -18,18 +51,3 @@ get_target_property(TARGET_LIBRARIES relu_op_shared LINK_LIBRARIES) LIST(REMOVE_ITEM TARGET_LIBRARIES glog) LIST(REMOVE_ITEM TARGET_LIBRARIES gflags) set_property(TARGET relu_op_shared PROPERTY LINK_LIBRARIES ${TARGET_LIBRARIES} ) - -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() - -# Compiling .so will cost some time, but running process is very fast. 
-set_tests_properties(test_jit_load PROPERTIES TIMEOUT 180) -set_tests_properties(test_setup_install PROPERTIES TIMEOUT 180) -set_tests_properties(test_setup_build PROPERTIES TIMEOUT 180) - -set_tests_properties(test_simple_custom_op_setup PROPERTIES TIMEOUT 250) -set_tests_properties(test_simple_custom_op_jit PROPERTIES TIMEOUT 180) diff --git a/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc new file mode 100644 index 0000000000000..e09ac2f87c806 --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/dispatch_test_op.cc @@ -0,0 +1,138 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include +#include + +#include "paddle/extension.h" + +template +void assign_cpu_kernel(const data_t* x_data, + data_t* out_data, + int64_t x_numel) { + for (int i = 0; i < x_numel; ++i) { + out_data[i] = x_data[i]; + } +} + +std::vector> InferShape(std::vector x_shape) { + return {x_shape}; +} + +std::vector InferDType(paddle::DataType x_dtype) { + return {x_dtype}; +} + +std::vector DispatchTestInteger(const paddle::Tensor& x) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_INTEGRAL_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + return {out}; +} + +PD_BUILD_OP("dispatch_test_integer") + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(DispatchTestInteger)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); + +std::vector DispatchTestComplex(const paddle::Tensor& x) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_COMPLEX_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + return {out}; +} + +PD_BUILD_OP("dispatch_test_complex") + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(DispatchTestComplex)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); + +std::vector DispatchTestFloatAndInteger( + const paddle::Tensor& x) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + return {out}; +} + +PD_BUILD_OP("dispatch_test_float_and_integer") + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(DispatchTestFloatAndInteger)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); + 
+std::vector DispatchTestFloatAndComplex( + const paddle::Tensor& x) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + return {out}; +} + +PD_BUILD_OP("dispatch_test_float_and_complex") + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(DispatchTestFloatAndComplex)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); + +std::vector DispatchTestFloatAndIntegerAndComplex( + const paddle::Tensor& x) { + auto out = paddle::Tensor(paddle::PlaceType::kCPU); + out.reshape(x.shape()); + + PD_DISPATCH_FLOATING_AND_INTEGRAL_AND_COMPLEX_TYPES( + x.type(), "assign_cpu_kernel", ([&] { + assign_cpu_kernel( + x.data(), out.mutable_data(), x.size()); + })); + + return {out}; +} + +PD_BUILD_OP("dispatch_test_float_and_integer_and_complex") + .Inputs({"X"}) + .Outputs({"Out"}) + .SetKernelFn(PD_KERNEL(DispatchTestFloatAndIntegerAndComplex)) + .SetInferShapeFn(PD_INFER_SHAPE(InferShape)) + .SetInferDtypeFn(PD_INFER_DTYPE(InferDType)); diff --git a/python/paddle/fluid/tests/custom_op/test_dispatch.py b/python/paddle/fluid/tests/custom_op/test_dispatch.py new file mode 100644 index 0000000000000..aaca7333561ee --- /dev/null +++ b/python/paddle/fluid/tests/custom_op/test_dispatch.py @@ -0,0 +1,89 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import unittest +import paddle +import numpy as np +from paddle.utils.cpp_extension import load, get_build_directory +from utils import paddle_includes, extra_compile_args +from paddle.utils.cpp_extension.extension_utils import run_cmd + +# Because the shared lib already exists in the cache dir, +# it will not be compiled again unless the cache dir is cleared. +if os.name == 'nt': + cmd = 'rmdir {} /s/q'.format(get_build_directory()) +else: + cmd = 'rm -rf {}'.format(get_build_directory()) + +run_cmd(cmd, True) + +dispatch_op = load( + name='dispatch_op', + sources=['dispatch_test_op.cc'], + extra_include_paths=paddle_includes, # add for Coverage CI + extra_cflags=extra_compile_args) # add for Coverage CI + + +class TestJitDispatch(unittest.TestCase): + def setUp(self): + paddle.set_device('cpu') + + def run_dispatch_test(self, func, dtype): + np_x = np.ones([2, 2]).astype(dtype) + x = paddle.to_tensor(np_x) + out = func(x) + np_x = x.numpy() + np_out = out.numpy() + self.assertTrue(dtype in str(np_out.dtype)) + self.assertTrue( + np.array_equal(np_x, np_out), + "custom op x: {},\n custom op out: {}".format(np_x, np_out)) + + def test_dispatch_integer(self): + dtypes = ["int32", "int64", "int8", "uint8", "int16"] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_integer, dtype) + + def test_dispatch_complex(self): + dtypes = ["complex64", "complex128"] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_complex, dtype) + + def test_dispatch_float_and_integer(self): + dtypes = [ + "float32", "float64", "int32", "int64", "int8", "uint8", "int16" + ] + for dtype in dtypes: + self.run_dispatch_test(dispatch_op.dispatch_test_float_and_integer, + dtype) + + def test_dispatch_float_and_complex(self): + dtypes = ["float32", "float64", "complex64", "complex128"] + for dtype in dtypes: + 
self.run_dispatch_test(dispatch_op.dispatch_test_float_and_complex, + dtype) + + def test_dispatch_float_and_integer_and_complex(self): + dtypes = [ + "float32", "float64", "int32", "int64", "int8", "uint8", "int16", + "complex64", "complex128" + ] + for dtype in dtypes: + self.run_dispatch_test( + dispatch_op.dispatch_test_float_and_integer_and_complex, dtype) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py index 2c0dc1a4ca6a1..2832e8070d142 100644 --- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py +++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_jit.py @@ -13,13 +13,24 @@ # limitations under the License. import os +import subprocess import unittest import paddle import numpy as np -from paddle.utils.cpp_extension import load +from paddle.utils.cpp_extension import load, get_build_directory +from paddle.utils.cpp_extension.extension_utils import run_cmd from utils import paddle_includes, extra_compile_args from test_simple_custom_op_setup import relu2_dynamic, relu2_static +# Because the shared lib already exists in the cache dir, +# it will not be compiled again unless the cache dir is cleared. +if os.name == 'nt': + cmd = 'rmdir {} /s/q'.format(get_build_directory()) +else: + cmd = 'rm -rf {}'.format(get_build_directory()) + +run_cmd(cmd, True) + # Compile and load custom op Just-In-Time. 
custom_module = load( name='simple_jit_relu2', diff --git a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py index cfa2db0ba24a4..f312508d39320 100644 --- a/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py +++ b/python/paddle/fluid/tests/custom_op/test_simple_custom_op_setup.py @@ -91,7 +91,12 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase): def setUp(self): cur_dir = os.path.dirname(os.path.abspath(__file__)) # compile, install the custom op egg into site-packages under background - cmd = 'cd {} && python setup_install_simple.py install'.format(cur_dir) + if os.name == 'nt': + cmd = 'cd /d {} && python setup_install_simple.py install'.format( + cur_dir) + else: + cmd = 'cd {} && python setup_install_simple.py install'.format( + cur_dir) run_cmd(cmd) # NOTE(Aurelius84): Normally, it's no need to add following codes for users. @@ -99,7 +104,11 @@ def setUp(self): # sys.path has been updated. So we update it manually. # See: https://stackoverflow.com/questions/56974185/import-runtime-installed-module-using-pip-in-python-3 - site_dir = site.getsitepackages()[0] + if os.name == 'nt': + # NOTE(zhouwei25): getsitepackages on windows will return a list: [python install dir, site packages dir] + site_dir = site.getsitepackages()[1] + else: + site_dir = site.getsitepackages()[0] custom_egg_path = [ x for x in os.listdir(site_dir) if 'simple_setup_relu2' in x ] diff --git a/python/paddle/fluid/tests/custom_op/utils.py b/python/paddle/fluid/tests/custom_op/utils.py index f293c751942cd..52b294dc72b4b 100644 --- a/python/paddle/fluid/tests/custom_op/utils.py +++ b/python/paddle/fluid/tests/custom_op/utils.py @@ -23,8 +23,8 @@ # paddle include directory. Because the following path is generated after insalling # PaddlePaddle whl. So here we specific `include_dirs` to avoid errors in CI. 
paddle_includes = [ - os.path.join(site_packages_path, 'paddle/include'), - os.path.join(site_packages_path, 'paddle/include/third_party') + os.path.join(site_packages_path, 'paddle', 'include'), + os.path.join(site_packages_path, 'paddle', 'include', 'third_party') ] # TODO(Aurelius84): Memory layout is different if build paddle with PADDLE_WITH_MKLDNN=ON, diff --git a/python/paddle/utils/cpp_extension/cpp_extension.py b/python/paddle/utils/cpp_extension/cpp_extension.py index 121c1626125af..8c0893b16cf88 100644 --- a/python/paddle/utils/cpp_extension/cpp_extension.py +++ b/python/paddle/utils/cpp_extension/cpp_extension.py @@ -17,16 +17,25 @@ import sys import textwrap import copy +import re import setuptools from setuptools.command.easy_install import easy_install from setuptools.command.build_ext import build_ext from .extension_utils import find_cuda_home, normalize_extension_kwargs, add_compile_flag, bootstrap_context -from .extension_utils import is_cuda_file, prepare_unix_cflags, add_std_without_repeat, get_build_directory +from .extension_utils import is_cuda_file, prepare_unix_cflags, prepare_win_cflags, add_std_without_repeat, get_build_directory from .extension_utils import _import_module_from_library, CustomOpInfo, _write_setup_file, _jit_compile, parse_op_name_from -from .extension_utils import check_abi_compatibility, log_v, IS_WINDOWS -from .extension_utils import use_new_custom_op_load_method +from .extension_utils import check_abi_compatibility, log_v, IS_WINDOWS, OS_NAME +from .extension_utils import use_new_custom_op_load_method, MSVC_COMPILE_FLAGS + +# Note(zhouwei): On windows, it will export function 'PyInit_[name]' by default, +# The solution is: 1.User add function PyInit_[name] 2. 
set not to export +# refer to https://stackoverflow.com/questions/34689210/error-exporting-symbol-when-building-python-c-extension-in-windows +if IS_WINDOWS and six.PY3: + from distutils.command.build_ext import build_ext as _du_build_ext + from unittest.mock import Mock + _du_build_ext.get_export_symbols = Mock(return_value=None) CUDA_HOME = find_cuda_home() @@ -112,7 +121,7 @@ def CppExtension(sources, *args, **kwargs): sources(list[str]): The C++/CUDA source file names args(list[options]): list of config options used to compile shared library kwargs(dict[option]): dict of config options used to compile shared library - + Returns: Extension: An instance of setuptools.Extension """ @@ -137,7 +146,7 @@ def CUDAExtension(sources, *args, **kwargs): sources(list[str]): The C++/CUDA source file names args(list[options]): list of config options used to compile shared library kwargs(dict[option]): dict of config options used to compile shared library - + Returns: Extension: An instance of setuptools.Extension """ @@ -191,12 +200,12 @@ def __init__(self, *args, **kwargs): def __init__(self, *args, **kwargs): """ Attributes is initialized with following oreder: - + 1. super(self).__init__() 2. initialize_options(self) 3. the reset of current __init__() 4. finalize_options(self) - + So, it is recommended to set attribute value in `finalize_options`. """ super(BuildExtension, self).__init__(*args, **kwargs) @@ -225,15 +234,17 @@ def build_extensions(self): for compiler in ['cxx', 'nvcc']: if compiler not in extension.extra_compile_args: extension.extra_compile_args[compiler] = [] - # add determine compile flags - add_compile_flag(extension, '-std=c++11') # Consider .cu, .cu.cc as valid source extensions. self.compiler.src_extensions += ['.cu', '.cu.cc'] # Save the original _compile method for later. 
- if self.compiler.compiler_type == 'msvc' or IS_WINDOWS: - raise NotImplementedError("Not support on MSVC currently.") + if self.compiler.compiler_type == 'msvc': + self.compiler._cpp_extensions += ['.cu', '.cuh'] + original_compile = self.compiler.compile + original_spawn = self.compiler.spawn else: + # add determine compile flags + add_compile_flag(extension, '-std=c++11') original_compile = self.compiler._compile def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, @@ -268,6 +279,81 @@ def unix_custom_single_compiler(obj, src, ext, cc_args, extra_postargs, # restore original_compiler self.compiler.compiler_so = original_compiler + def win_custom_single_compiler(sources, + output_dir=None, + macros=None, + include_dirs=None, + debug=0, + extra_preargs=None, + extra_postargs=None, + depends=None): + + self.cflags = copy.deepcopy(extra_postargs) + extra_postargs = None + + def win_custom_spawn(cmd): + # Using regex to modify compile options + compile_options = self.compiler.compile_options + for i in range(len(cmd)): + if re.search('/MD', cmd[i]) is not None: + cmd[i] = '/MT' + if re.search('/W[1-4]', cmd[i]) is not None: + cmd[i] = '/W0' + + # Using regex to match src, obj and include files + src_regex = re.compile('/T(p|c)(.*)') + src_list = [ + m.group(2) for m in (src_regex.match(elem) for elem in cmd) + if m + ] + + obj_regex = re.compile('/Fo(.*)') + obj_list = [ + m.group(1) for m in (obj_regex.match(elem) for elem in cmd) + if m + ] + + include_regex = re.compile(r'((\-|\/)I.*)') + include_list = [ + m.group(1) + for m in (include_regex.match(elem) for elem in cmd) if m + ] + + assert len(src_list) == 1 and len(obj_list) == 1 + src = src_list[0] + obj = obj_list[0] + if is_cuda_file(src): + assert CUDA_HOME is not None + nvcc_cmd = os.path.join(CUDA_HOME, 'bin', 'nvcc') + if isinstance(self.cflags, dict): + cflags = self.cflags['nvcc'] + elif isinstance(self.cflags, list): + cflags = self.cflags + else: + cflags = [] + + cflags = 
prepare_win_cflags(cflags) + ['--use-local-env'] + for flag in MSVC_COMPILE_FLAGS: + cflags = ['-Xcompiler', flag] + cflags + cmd = [nvcc_cmd, '-c', src, '-o', obj + ] + include_list + cflags + elif isinstance(self.cflags, dict): + cflags = MSVC_COMPILE_FLAGS + self.cflags['cxx'] + cmd += cflags + elif isinstance(self.cflags, list): + cflags = MSVC_COMPILE_FLAGS + self.cflags + cmd += cflags + + return original_spawn(cmd) + + try: + self.compiler.spawn = win_custom_spawn + return original_compile(sources, output_dir, macros, + include_dirs, debug, extra_preargs, + extra_postargs, depends) + finally: + self.compiler.spawn = original_spawn + def object_filenames_with_cuda(origina_func, build_directory): """ Decorated the function to add customized naming machanism. @@ -280,10 +366,13 @@ def wrapper(source_filenames, strip_dir=0, output_dir=''): objects = origina_func(source_filenames, strip_dir, output_dir) for i, source in enumerate(source_filenames): - # modify xx.o -> xx.cu.o + # modify xx.o -> xx.cu.o/xx.cu.obj if is_cuda_file(source): old_obj = objects[i] - objects[i] = old_obj[:-1] + 'cu.o' + if self.compiler.compiler_type == 'msvc': + objects[i] = old_obj[:-3] + 'cu.obj' + else: + objects[i] = old_obj[:-1] + 'cu.o' # if user set build_directory, output objects there. 
if build_directory is not None: objects = [ @@ -300,10 +389,13 @@ def wrapper(source_filenames, strip_dir=0, output_dir=''): return wrapper # customized compile process - self.compiler._compile = unix_custom_single_compiler + if self.compiler.compiler_type == 'msvc': + self.compiler.compile = win_custom_single_compiler + else: + self.compiler._compile = unix_custom_single_compiler + self.compiler.object_filenames = object_filenames_with_cuda( self.compiler.object_filenames, self.build_lib) - self._record_op_info() print("Compiling user custom op, it will cost a few seconds.....") @@ -333,15 +425,21 @@ def _check_abi(self): compiler = self.compiler.compiler_cxx[0] elif IS_WINDOWS: compiler = os.environ.get('CXX', 'cl') - raise NotImplementedError("We don't support Windows Currently.") else: compiler = os.environ.get('CXX', 'c++') check_abi_compatibility(compiler) + # Warn user if VC env is activated but `DISTUILS_USE_SDK` is not set. + if IS_WINDOWS and 'VSCMD_ARG_TGT_ARCH' in os.environ and 'DISTUTILS_USE_SDK' not in os.environ: + msg = ( + 'It seems that the VC environment is activated but DISTUTILS_USE_SDK is not set.' + 'This may lead to multiple activations of the VC env.' + 'Please set `DISTUTILS_USE_SDK=1` and try again.') + raise UserWarning(msg) def _record_op_info(self): """ - Record custum op inforomation. + Record custum op inforomation. """ # parse shared library abs path outputs = self.get_outputs() @@ -380,7 +478,13 @@ def run(self, *args, **kwargs): # .so shared library to another name. 
for egg_file in self.outputs: filename, ext = os.path.splitext(egg_file) - if ext == '.so': + will_rename = False + if OS_NAME.startswith('linux') and ext == '.so': + will_rename = True + elif IS_WINDOWS and ext == '.pyd': + will_rename = True + + if will_rename: new_so_path = filename + "_pd_" + ext if not os.path.exists(new_so_path): os.rename(r'%s' % egg_file, r'%s' % new_so_path) @@ -425,7 +529,7 @@ def load(name, extra_include_paths(list[str]): additional include path used to search header files. Default None. build_directory(str): specific directory path to put shared library file. If set None, - it will use `PADDLE_EXTENSION_DIR` from os.environ. Use + it will use `PADDLE_EXTENSION_DIR` from os.environ. Use `paddle.utils.cpp_extension.get_build_directory()` to see the location. interpreter(str): alias or full interpreter path to specific which one to use if have installed multiple. If set None, will use `python` as default interpreter. @@ -448,6 +552,10 @@ def load(name, # ensure to use abs path build_directory = os.path.abspath(build_directory) + # Will load shared library from 'path' on windows + if IS_WINDOWS: + os.environ['path'] = build_directory + ';' + os.environ['path'] + log_v("build_directory: {}".format(build_directory), verbose) file_path = os.path.join(build_directory, "setup.py") @@ -460,7 +568,7 @@ def load(name, log_v("additonal compile_flags: [{}]".format(' '.join(compile_flags)), verbose) - # write setup.py file and compile it + # write setup.py file and compile it _write_setup_file(name, sources, file_path, extra_include_paths, compile_flags, extra_ldflags, verbose) _jit_compile(file_path, interpreter, verbose) diff --git a/python/paddle/utils/cpp_extension/extension_utils.py b/python/paddle/utils/cpp_extension/extension_utils.py index 52c17d77bd477..f4a801fe3ec47 100644 --- a/python/paddle/utils/cpp_extension/extension_utils.py +++ b/python/paddle/utils/cpp_extension/extension_utils.py @@ -38,9 +38,19 @@ OS_NAME = sys.platform IS_WINDOWS 
= OS_NAME.startswith('win') -NVCC_COMPILE_FLAGS = [ - '-ccbin', 'cc', '-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU', '-DPADDLE_USE_DSO', - '-Xcompiler', '-fPIC', '-w', '--expt-relaxed-constexpr', '-O3', '-DNVCC' + +MSVC_COMPILE_FLAGS = [ + '/MT', '/wd4819', '/wd4251', '/wd4244', '/wd4267', '/wd4275', '/wd4018', + '/wd4190', '/EHsc', '/w', '/DPADDLE_WITH_CUDA', '/DEIGEN_USE_GPU', + '/DNDEBUG' +] + +MSVC_LINK_FLAGS = [ + '/MACHINE:X64', 'paddle_framework.lib', 'cudadevrt.lib', 'cudart_static.lib' +] + +COMMON_NVCC_FLAGS = [ + '-DPADDLE_WITH_CUDA', '-DEIGEN_USE_GPU', '-DPADDLE_USE_DSO', '-O3' ] GCC_MINI_VERSION = (5, 4, 0) @@ -81,8 +91,8 @@ USING_NEW_CUSTOM_OP_LOAD_METHOD = True -# NOTE(chenweihang): In order to be compatible with -# the two custom op define method, after removing +# NOTE(chenweihang): In order to be compatible with +# the two custom op define method, after removing # old method, we can remove them together def use_new_custom_op_load_method(*args): global USING_NEW_CUSTOM_OP_LOAD_METHOD @@ -210,7 +220,21 @@ def prepare_unix_cflags(cflags): """ Prepare all necessary compiled flags for nvcc compiling CUDA files. """ - cflags = NVCC_COMPILE_FLAGS + cflags + get_cuda_arch_flags(cflags) + cflags = COMMON_NVCC_FLAGS + [ + '-ccbin', 'cc', '-Xcompiler', '-fPIC', '-w', '--expt-relaxed-constexpr', + '-DNVCC' + ] + cflags + get_cuda_arch_flags(cflags) + + return cflags + + +def prepare_win_cflags(cflags): + """ + Prepare all necessary compiled flags for nvcc compiling CUDA files. + """ + cflags = COMMON_NVCC_FLAGS + [ + '-DGOOGLE_GLOG_DLL_DECL', '-DBOOST_HAS_STATIC_ASSERT', '-w' + ] + cflags + get_cuda_arch_flags(cflags) return cflags @@ -238,7 +262,7 @@ def get_cuda_arch_flags(cflags): def normalize_extension_kwargs(kwargs, use_cuda=False): - """ + """ Normalize include_dirs, library_dir and other attributes in kwargs. 
""" assert isinstance(kwargs, dict) @@ -252,52 +276,36 @@ def normalize_extension_kwargs(kwargs, use_cuda=False): library_dirs.extend(find_paddle_libraries(use_cuda)) kwargs['library_dirs'] = library_dirs - # add runtime library dirs - runtime_library_dirs = kwargs.get('runtime_library_dirs', []) - runtime_library_dirs.extend(find_paddle_libraries(use_cuda)) - kwargs['runtime_library_dirs'] = runtime_library_dirs + if IS_WINDOWS: + # TODO(zhouwei): may append compile flags in future + pass + # append link flags + extra_link_args = kwargs.get('extra_link_args', []) + extra_link_args.extend(MSVC_LINK_FLAGS) + kwargs['extra_link_args'] = extra_link_args + else: + # append compile flags + extra_compile_args = kwargs.get('extra_compile_args', []) + extra_compile_args.extend(['-g', '-w']) # diable warnings + kwargs['extra_compile_args'] = extra_compile_args - # append compile flags - extra_compile_args = kwargs.get('extra_compile_args', []) - extra_compile_args.extend(['-g', '-w']) # diable warnings - kwargs['extra_compile_args'] = extra_compile_args + # append link flags + extra_link_args = kwargs.get('extra_link_args', []) + extra_link_args.append('-lpaddle_framework') + if use_cuda: + extra_link_args.append('-lcudart') - # append link flags - extra_link_args = kwargs.get('extra_link_args', []) - extra_link_args.append('-lpaddle_framework') - if use_cuda: - extra_link_args.append('-lcudart') + kwargs['extra_link_args'] = extra_link_args - kwargs['extra_link_args'] = extra_link_args + # add runtime library dirs + runtime_library_dirs = kwargs.get('runtime_library_dirs', []) + runtime_library_dirs.extend(find_paddle_libraries(use_cuda)) + kwargs['runtime_library_dirs'] = runtime_library_dirs kwargs['language'] = 'c++' return kwargs -def find_paddle_includes(use_cuda=False): - """ - Return Paddle necessary include dir path. 
- """ - # pythonXX/site-packages/paddle/include - paddle_include_dir = get_include() - third_party_dir = os.path.join(paddle_include_dir, 'third_party') - - include_dirs = [paddle_include_dir, third_party_dir] - - return include_dirs - - -def find_cuda_includes(): - - cuda_home = find_cuda_home() - if cuda_home is None: - raise ValueError( - "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it." - ) - - return [os.path.join(cuda_home, 'lib64')] - - def find_cuda_home(): """ Use heuristic method to find cuda path @@ -315,19 +323,22 @@ def find_cuda_home(): if six.PY3: nvcc_path = nvcc_path.decode() nvcc_path = nvcc_path.rstrip('\r\n') + log_v(nvcc_path) # for example: /usr/local/cuda/bin/nvcc cuda_home = os.path.dirname(os.path.dirname(nvcc_path)) except: if IS_WINDOWS: # search from default NVIDIA GPU path candidate_paths = glob.glob( - 'C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v*.*') + 'C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v*.*' + ) if len(candidate_paths) > 0: cuda_home = candidate_paths[0] else: cuda_home = "/usr/local/cuda" # step 3. check whether path is valid - if not os.path.exists(cuda_home) and core.is_compiled_with_cuda(): + if cuda_home and not os.path.exists( + cuda_home) and core.is_compiled_with_cuda(): cuda_home = None warnings.warn( "Not found CUDA runtime, please use `export CUDA_HOME= XXX` to specific it." @@ -336,15 +347,65 @@ def find_cuda_home(): return cuda_home +def find_cuda_includes(): + """ + Use heuristic method to find cuda include path + """ + cuda_home = find_cuda_home() + if cuda_home is None: + raise ValueError( + "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it." + ) + + return [os.path.join(cuda_home, 'include')] + + +def find_paddle_includes(use_cuda=False): + """ + Return Paddle necessary include dir path. 
+ """ + # pythonXX/site-packages/paddle/include + paddle_include_dir = get_include() + third_party_dir = os.path.join(paddle_include_dir, 'third_party') + include_dirs = [paddle_include_dir, third_party_dir] + + #TODO(zhouwei): because eigen need cuda_runtime.h + #So, extend cuda_include_dir always + cuda_include_dir = find_cuda_includes() + include_dirs.extend(cuda_include_dir) + + return include_dirs + + +def find_cuda_libraries(): + """ + Use heuristic method to find cuda static lib path + """ + cuda_home = find_cuda_home() + if cuda_home is None: + raise ValueError( + "Not found CUDA runtime, please use `export CUDA_HOME=XXX` to specific it." + ) + if IS_WINDOWS: + cuda_lib_dir = [os.path.join(cuda_home, 'lib', 'x64')] + else: + cuda_lib_dir = [os.path.join(cuda_home, 'lib64')] + + return cuda_lib_dir + + def find_paddle_libraries(use_cuda=False): """ Return Paddle necessary library dir path. """ # pythonXX/site-packages/paddle/libs paddle_lib_dirs = [get_lib()] - if use_cuda: - cuda_dirs = find_cuda_includes() - paddle_lib_dirs.extend(cuda_dirs) + + #TODO(zhouwei): because eigen need cuda_runtime.h + #So, extend cuda_lib_dir always + cuda_lib_dir = find_cuda_libraries() + paddle_lib_dirs.extend(cuda_lib_dir) + return paddle_lib_dirs @@ -374,12 +435,14 @@ def get_build_directory(verbose=False): root_extensions_directory = os.environ.get('PADDLE_EXTENSION_DIR') if root_extensions_directory is None: dir_name = "paddle_extensions" - if OS_NAME.startswith('linux'): - root_extensions_directory = os.path.join( - os.path.expanduser('~/.cache'), dir_name) - else: - # TODO(Aurelius84): consider wind32/macOs - raise NotImplementedError("Only support Linux now.") + root_extensions_directory = os.path.join( + os.path.expanduser('~/.cache'), dir_name) + if IS_WINDOWS: + root_extensions_directory = os.path.normpath( + root_extensions_directory) + elif OS_NAME.startswith('darwin'): + # TODO(Aurelius84): consider macOs + raise NotImplementedError("Not support Mac now.") 
log_v("$PADDLE_EXTENSION_DIR is not set, using path: {} by default.". format(root_extensions_directory), verbose) @@ -410,10 +473,13 @@ def parse_op_info(op_name): def _import_module_from_library(module_name, build_directory, verbose=False): """ - Load .so shared library and import it as callable python module. + Load shared library and import it as callable python module. """ - # TODO(Aurelius84): Consider file suffix is .dll on Windows Platform. - ext_path = os.path.join(build_directory, module_name + '.so') + if IS_WINDOWS: + dynamic_suffix = '.pyd' + else: + dynamic_suffix = '.so' + ext_path = os.path.join(build_directory, module_name + dynamic_suffix) if not os.path.exists(ext_path): raise FileNotFoundError("Extension path: {} does not exist.".format( ext_path)) @@ -565,12 +631,12 @@ def _write_setup_file(name, def list2str(args): """ - Convert list[str] into string. For example: [x, y] -> "['x', 'y']" + Convert list[str] into string. For example: ['x', 'y'] -> "['x', 'y']" """ if args is None: return '[]' assert isinstance(args, (list, tuple)) - args = ["'{}'".format(arg) for arg in args] - return '[' + ','.join(args) + ']' + args = ["{}".format(arg) for arg in args] + return repr(args) def _jit_compile(file_path, interpreter=None, verbose=False): @@ -583,7 +649,8 @@ def _jit_compile(file_path, interpreter=None, verbose=False): if interpreter is None: interpreter = 'python' try: - py_path = subprocess.check_output(['which', interpreter]) + which = 'where' if IS_WINDOWS else 'which' + py_path = subprocess.check_output([which, interpreter]) py_version = subprocess.check_output([interpreter, '-V']) if six.PY3: py_path = py_path.decode() @@ -596,8 +663,13 @@ def _jit_compile(file_path, interpreter=None, verbose=False): 'Failed to check Python interpreter with `{}`, errors: {}'.format( interpreter, error)) - compile_cmd = 'cd {} && {} {} build'.format(ext_dir, interpreter, - setup_file) + if IS_WINDOWS: + compile_cmd = 'cd /d {} && {} {} build'.format(ext_dir, 
interpreter, + setup_file) + else: + compile_cmd = 'cd {} && {} {} build'.format(ext_dir, interpreter, + setup_file) + print("Compiling user custom op, it will cost a few seconds.....") run_cmd(compile_cmd, verbose) @@ -682,7 +754,7 @@ def check_abi_compatibility(compiler, verbose=False): try: if OS_NAME.startswith('linux'): version_info = subprocess.check_output( - [compiler, '-dumpfullversion']) + [compiler, '-dumpfullversion', '-dumpversion']) if six.PY3: version_info = version_info.decode() version = version_info.strip().split('.') @@ -694,8 +766,8 @@ def check_abi_compatibility(compiler, verbose=False): warnings.warn( ABI_INCOMPATIBILITY_WARNING.format( user_compiler=compiler, version=version_info.strip())) - # TODO(Aurelius84): check version compatibility on windows elif IS_WINDOWS: + # TODO(zhouwei): support check abi compatibility on windows warnings.warn("We don't support Windows now.") except Exception: _, error, _ = sys.exc_info() @@ -714,7 +786,7 @@ def _expected_compiler_current_platform(): return expect_compilers -def log_v(info, verbose): +def log_v(info, verbose=True): """ Print log information on stdout. 
""" diff --git a/python/requirements.txt b/python/requirements.txt index 77232f4fd7183..e89b3ede94fd4 100644 --- a/python/requirements.txt +++ b/python/requirements.txt @@ -3,7 +3,8 @@ numpy>=1.13, <=1.16.4 ; python_version<"3.5" numpy>=1.13 ; python_version>="3.5" and platform_system != "Windows" numpy>=1.13, <=1.19.3 ; python_version>="3.5" and platform_system == "Windows" protobuf>=3.1.0 -gast>=0.3.3 +gast>=0.3.3 ; platform_system != "Windows" +gast==0.3.3 ; platform_system == "Windows" Pillow six decorator diff --git a/python/setup.py.in b/python/setup.py.in index d5c098aa9e350..43a74d191d804 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -335,11 +335,16 @@ if '${WITH_XPU_BKCL}' == 'ON': shutil.copy('${XPU_BKCL_LIB}', libs_path) package_data['paddle.libs']+=['${XPU_BKCL_LIB_NAME}'] -# copy libfuild_framework.so to libs -if os.name != 'nt' and sys.platform != 'darwin': - paddle_framework_lib='${FLUID_FRAMEWORK_SHARED_LIB}' - shutil.copy(paddle_framework_lib, libs_path) - package_data['paddle.libs'] += [('libpaddle_framework' if os.name != 'nt' else 'paddle_framework') + ext_name] +# copy libpaddle_framework.so to libs on linux +if sys.platform.startswith('linux'): + shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path) + package_data['paddle.libs'] += ['libpaddle_framework.so'] + +# copy paddle_framework.lib/paddle_framework.dll to libs on windows +if os.name == 'nt': + shutil.copy('${FLUID_FRAMEWORK_IMPORT_LIB}', libs_path) + shutil.copy('${FLUID_FRAMEWORK_SHARED_LIB}', libs_path) + package_data['paddle.libs'] += ['paddle_framework.lib', 'paddle_framework.dll'] # remove unused paddle/libs/__init__.py if os.path.isfile(libs_path+'/__init__.py'): @@ -410,9 +415,9 @@ if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON': class InstallCommand(InstallCommandBase): def finalize_options(self): ret = InstallCommandBase.finalize_options(self) - self.install_headers = os.path.join(self.install_purelib, 'paddle', - 'include') self.install_lib = 
self.install_platlib + self.install_headers = os.path.join(self.install_platlib, 'paddle', + 'include') return ret @@ -463,11 +468,6 @@ class InstallHeaders(Command): return self.copy_file(header, install_dir) def run(self): - # only copy third_party/cudaErrorMessage.pb for cudaErrorMessage on mac or windows - if os.name == 'nt' or sys.platform == 'darwin': - if '${WITH_GPU}' == 'ON' or '${WITH_ROCM}' == 'ON': - self.mkdir_and_copy_file('${cudaerror_INCLUDE_DIR}/cudaErrorMessage.pb') - return hdrs = self.distribution.headers if not hdrs: return