PaddlePaddle · risemeup1 · Jun 13, 2023 · Jun 7, 2023 · Jun 7, 2023 · Jun 7, 2023
# cmake/cuda.cmake: raise the CUDA language standard (CMAKE_CUDA_STANDARD) from 14 to 17.
diff --git a/cmake/cuda.cmake b/cmake/cuda.cmake
@@ -295,7 +295,7 @@ message(STATUS "NVCC_FLAGS_EXTRA: ${NVCC_FLAGS_EXTRA}")
 set(CUDA_PROPAGATE_HOST_FLAGS OFF)
 # Release/Debug flags set by cmake. Such as -O3 -g -DNDEBUG etc.
 # So, don't set these flags here.
-set(CMAKE_CUDA_STANDARD 14)
+set(CMAKE_CUDA_STANDARD 17)
 
 # (Note) For windows, if delete /W[1-4], /W1 will be added defaultly and conflic with -w
 # So replace /W[1-4] with /W0

# cmake/flags.cmake: inside if(NOT WIN32), the non-CINN branch now appends -std=c++17
# to CMAKE_CXX_FLAGS instead of -std=c++14.
# NOTE(review): the WITH_CINN branch (unchanged context) still appends -std=c++14, while
# the else() branch below sets CMAKE_CXX_STANDARD 17 -- confirm that CINN builds are
# intentionally left on C++14.
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
@@ -38,7 +38,7 @@ if(NOT WIN32)
   if(WITH_CINN)
     set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
   else()
-    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++14")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++17")
   endif()
 else()
   set(CMAKE_CXX_STANDARD 17)

# fleet_executor/CMakeLists.txt: replace the if(WITH_XPU OR WITH_ROCM)/else() pair of
# fleet_executor cc_library definitions with a single unconditional definition.
# The SRCS lists of the two removed branches are identical. Their DEPS lists differ only
# in that the WITH_XPU/WITH_ROCM branch also lists naive_executor and in the relative
# order of op_registry and executor_gc_helper. The unified target uses the DEPS list of
# the former WITH_XPU/WITH_ROCM branch, i.e. the one that includes naive_executor.
diff --git a/paddle/fluid/distributed/fleet_executor/CMakeLists.txt b/paddle/fluid/distributed/fleet_executor/CMakeLists.txt
@@ -25,66 +25,35 @@ cc_library(
   task_loop_thread_pool
   SRCS task_loop_thread_pool.cc task_loop_thread.cc task_loop.cc
   DEPS enforce glog)
-if(WITH_XPU OR WITH_ROCM)
-  cc_library(
-    fleet_executor
-    SRCS fleet_executor.cc
-         carrier.cc
-         task_node.cc
-         runtime_graph.cc
-         dist_model.cc
-         interceptor.cc
-         compute_interceptor.cc
-         amplifier_interceptor.cc
-         cond_interceptor.cc
-         start_interceptor.cc
-         source_interceptor.cc
-         sink_interceptor.cc
-         message_service.cc
-         message_bus.cc
-         dist_model_tensor_wrapper.cc
-    DEPS naive_executor
-         proto_desc
-         standalone_executor
-         fleet_executor_desc_proto
-         interceptor_message_proto
-         task_loop_thread_pool
-         collective_helper
-         executor_gc_helper
-         op_registry
-         phi
-         glog
-         ${BRPC_DEPS})
-else()
-  cc_library(
-    fleet_executor
-    SRCS fleet_executor.cc
-         carrier.cc
-         task_node.cc
-         runtime_graph.cc
-         dist_model.cc
-         interceptor.cc
-         compute_interceptor.cc
-         amplifier_interceptor.cc
-         cond_interceptor.cc
-         start_interceptor.cc
-         source_interceptor.cc
-         sink_interceptor.cc
-         message_service.cc
-         message_bus.cc
-         dist_model_tensor_wrapper.cc
-    DEPS proto_desc
-         standalone_executor
-         fleet_executor_desc_proto
-         interceptor_message_proto
-         task_loop_thread_pool
-         collective_helper
-         op_registry
-         executor_gc_helper
-         phi
-         glog
-         ${BRPC_DEPS})
-endif()
+cc_library(
+  fleet_executor
+  SRCS fleet_executor.cc
+       carrier.cc
+       task_node.cc
+       runtime_graph.cc
+       dist_model.cc
+       interceptor.cc
+       compute_interceptor.cc
+       amplifier_interceptor.cc
+       cond_interceptor.cc
+       start_interceptor.cc
+       source_interceptor.cc
+       sink_interceptor.cc
+       message_service.cc
+       message_bus.cc
+       dist_model_tensor_wrapper.cc
+  DEPS naive_executor
+       proto_desc
+       standalone_executor
+       fleet_executor_desc_proto
+       interceptor_message_proto
+       task_loop_thread_pool
+       collective_helper
+       executor_gc_helper
+       op_registry
+       phi
+       glog
+       ${BRPC_DEPS})
 if(WITH_DISTRIBUTE)
   set(DISTRIBUTE_COMPILE_FLAGS
       "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor"

# framework/CMakeLists.txt: remove fleet_executor from five DEPS lists. Going by the hunk
# headers, three of them sit under if(WITH_DISTRIBUTE), one under elseif(WITH_PSLIB) and
# one under the final else().
# NOTE(review): the names of the targets that own these DEPS lists are outside the
# visible context -- confirm against the full file.
diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt
@@ -729,8 +729,7 @@ if(WITH_DISTRIBUTE)
           section_worker.cc
           device_worker_factory.cc
           data_set.cc
-      DEPS fleet_executor
-           fleet_wrapper
+      DEPS fleet_wrapper
           recurrent_op_helper
           op_registry
           device_context
@@ -837,7 +836,6 @@ if(WITH_DISTRIBUTE)
           fleet
           heter_server
           brpc
-           fleet_executor
           phi)
    set(DISTRIBUTE_COMPILE_FLAGS "")
    if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
@@ -907,8 +905,7 @@ if(WITH_DISTRIBUTE)
           graph_to_program_pass
           variable_helper
           timer
-           monitor
-           fleet_executor)
+           monitor)
  endif()
 elseif(WITH_PSLIB)
   set(DISTRIBUTE_COMPILE_FLAGS "")
@@ -969,7 +966,6 @@ elseif(WITH_PSLIB)
         variable_helper
         timer
         monitor
-         fleet_executor
         ${BRPC_DEP})
 else()
   cc_library(
@@ -1017,8 +1013,7 @@ else()
         graph_to_program_pass
         variable_helper
         timer
-         monitor
-         fleet_executor)
+         monitor)
 endif()
 
 target_link_libraries(executor while_op_helper executor_gc_helper

# paddle2cinn/CMakeLists.txt:
#   * add parallel_executor to the DEPS list of a cc_library (its name is outside the
#     visible context);
#   * under if(WITH_TESTING), add fleet_executor as the first DEPS entry of the tests
#     built from build_cinn_pass_test.cc and cinn_zero_tensor_trick_pass_test.cc.
diff --git a/paddle/fluid/framework/paddle2cinn/CMakeLists.txt b/paddle/fluid/framework/paddle2cinn/CMakeLists.txt
@@ -40,6 +40,7 @@ cc_library(
       cinn_graph_symbolization
       cinn
       cinn_launch_context
+       parallel_executor
       python
       pybind)
 
@@ -56,6 +57,7 @@ if(WITH_TESTING)
    SRCS
    build_cinn_pass_test.cc
    DEPS
+    fleet_executor
    build_cinn_pass
    cinn_compiler
    op_registry
@@ -72,6 +74,7 @@ if(WITH_TESTING)
    SRCS
    cinn_zero_tensor_trick_pass_test.cc
    DEPS
+    fleet_executor
    build_cinn_pass
    cinn_compiler
    op_registry

# inference/api/CMakeLists.txt: append fleet_executor to the DEPS list on both sides of
# the if(WITH_ONNXRUNTIME)/else() split. In the else() branch (analysis_predictor) the
# DEPS list is also reflowed to one entry per line; no other entry changes.
diff --git a/paddle/fluid/inference/api/CMakeLists.txt b/paddle/fluid/inference/api/CMakeLists.txt
@@ -80,14 +80,20 @@ if(WITH_ONNXRUNTIME)
         infer_io_utils
         model_utils
         onnxruntime
-         paddle2onnx)
+         paddle2onnx
+         fleet_executor)
 else()
   cc_library(
     analysis_predictor
     SRCS analysis_predictor.cc resource_manager.cc infer_context.cc
          ${mkldnn_quantizer_src}
-    DEPS ${inference_deps} zero_copy_tensor ir_pass_manager op_compatible_info
-         infer_io_utils model_utils)
+    DEPS ${inference_deps}
+         zero_copy_tensor
+         ir_pass_manager
+         op_compatible_info
+         infer_io_utils
+         model_utils
+         fleet_executor)
 endif()
 
 if(WITH_ONNXRUNTIME AND WIN32)

# inference/tensorrt/CMakeLists.txt: define test_tensorrt_engine in three variants that
# share the same SRCS and differ only in DEPS:
#   * WIN32      -- the original DEPS, unchanged;
#   * WITH_CINN  -- adds fleet_executor, cinn_compiler and python;
#   * otherwise  -- adds fleet_executor and python.
diff --git a/paddle/fluid/inference/tensorrt/CMakeLists.txt b/paddle/fluid/inference/tensorrt/CMakeLists.txt
@@ -28,10 +28,23 @@ nv_test(
   test_tensorrt
   SRCS test_tensorrt.cc
   DEPS dynload_cuda device_context dynamic_loader)
-nv_test(
-  test_tensorrt_engine
-  SRCS test_engine.cc test_dynamic_engine.cc
-  DEPS dynload_cuda tensorrt_engine tensorrt_plugin)
+if(WIN32)
+  nv_test(
+    test_tensorrt_engine
+    SRCS test_engine.cc test_dynamic_engine.cc
+    DEPS dynload_cuda tensorrt_engine tensorrt_plugin)
+elseif(WITH_CINN)
+  nv_test(
+    test_tensorrt_engine
+    SRCS test_engine.cc test_dynamic_engine.cc
+    DEPS fleet_executor cinn_compiler dynload_cuda tensorrt_engine
+         tensorrt_plugin python)
+else()
+  nv_test(
+    test_tensorrt_engine
+    SRCS test_engine.cc test_dynamic_engine.cc
+    DEPS fleet_executor dynload_cuda tensorrt_engine tensorrt_plugin python)
+endif()
 nv_test(
   test_arg_mapping_context
   SRCS test_arg_mapping_context.cc

# inference/tensorrt/convert/CMakeLists.txt: define test_custom_plugin_creater in three
# variants that share the same SRCS and differ only in DEPS:
#   * WIN32      -- the original DEPS, unchanged;
#   * WITH_CINN  -- appends fleet_executor, cinn_compiler and python;
#   * otherwise  -- appends fleet_executor and python.
# The blank line that used to precede the test definition is removed as well.
diff --git a/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt b/paddle/fluid/inference/tensorrt/convert/CMakeLists.txt
@@ -137,11 +137,35 @@ nv_test(
   SRCS test_op_converter.cc
   DEPS paddle_framework ${GLOB_OPERATOR_DEPS} tensorrt_engine
        tensorrt_converter)
-
-nv_test(
-  test_custom_plugin_creater
-  SRCS test_custom_plugin_creater.cc
-  DEPS paddle_framework tensorrt_converter phi custom_operator init_phi)
+if(WIN32)
+  nv_test(
+    test_custom_plugin_creater
+    SRCS test_custom_plugin_creater.cc
+    DEPS paddle_framework tensorrt_converter phi custom_operator init_phi)
+elseif(WITH_CINN)
+  nv_test(
+    test_custom_plugin_creater
+    SRCS test_custom_plugin_creater.cc
+    DEPS paddle_framework
+         tensorrt_converter
+         phi
+         custom_operator
+         init_phi
+         fleet_executor
+         cinn_compiler
+         python)
+else()
+  nv_test(
+    test_custom_plugin_creater
+    SRCS test_custom_plugin_creater.cc
+    DEPS paddle_framework
+         tensorrt_converter
+         phi
+         custom_operator
+         init_phi
+         fleet_executor
+         python)
+endif()
 
 if(WITH_ONNXRUNTIME AND WIN32)
   # Copy onnxruntime for some c++ test in Windows, since the test will

# inference/utils/CMakeLists.txt: rewrite the one-line cc_test_old(infer_io_utils_tester ...)
# call as a multi-line call preceded by a blank line, and add fleet_executor and python
# to its DEPS. The SRCS entry and the existing infer_io_utils dependency are unchanged.
diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt
@@ -11,7 +11,15 @@ cc_library(
   model_utils
   SRCS model_utils.cc
   DEPS proto_desc enforce)
-cc_test_old(infer_io_utils_tester SRCS io_utils_tester.cc DEPS infer_io_utils)
+
+cc_test_old(
+  infer_io_utils_tester
+  SRCS
+  io_utils_tester.cc
+  DEPS
+  infer_io_utils
+  fleet_executor
+  python)
 
 if(WITH_ONNXRUNTIME AND WIN32)
   # Copy onnxruntime for some c++ test in Windows, since the test will

# operators/CMakeLists.txt: add processgroup_comm_utils as a dependency in three places:
#   * appended after ${OP_HEADER_DEPS} in the DEPS of the register_operators(...) call;
#   * as DEPS of op_library(sync_batch_norm_op), which previously had none;
#   * as DEPS of op_library(sparse_attention_op), which previously had none.
diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt
@@ -93,7 +93,7 @@ endif()
 set(OP_HEADER_DEPS ${OP_HEADER_DEPS} phi phi_utils static_prim_api get_expected_kernel_func)
 
 register_operators(EXCLUDES py_func_op dgc_op generated_op1 generated_op2 generated_op3 generated_op4 load_combine_op lstm_op run_program_op quantize_linear_op
-        recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS})
+        recurrent_op save_combine_op sparse_attention_op sync_batch_norm_op activation_op ${OP_MKL_DEPS} DEPS ${OP_HEADER_DEPS} processgroup_comm_utils)
 
 op_library(generated_op UNITY SRCS generated_op1.cc generated_op2.cc generated_op3.cc generated_op4.cc DEPS ${OP_HEADER_DEPS})
 op_library(run_program_op DEPS executor_cache ${OP_HEADER_DEPS})
@@ -119,9 +119,9 @@ else()
 endif()
 
 if (WITH_GPU OR WITH_ROCM)
-    op_library(sync_batch_norm_op)
+    op_library(sync_batch_norm_op DEPS processgroup_comm_utils)
     if ((NOT WIN32) AND (NOT WITH_ROCM) AND (NOT PADDLE_WITH_ARM) AND (NOT ${CMAKE_CUDA_COMPILER_VERSION} VERSION_LESS 11.3) )
-        op_library(sparse_attention_op)
+        op_library(sparse_attention_op DEPS processgroup_comm_utils)
     endif()
 endif()
 

# operators/tensorrt/CMakeLists.txt:
#   * move the tensorrt_engine_op dependency list into the variable
#     tensorrt_engine_op_deps (same four entries) and pass it to op_library();
#   * wrap test_tensorrt_engine_op in if(NOT WIN32)/else(): both branches add
#     fleet_executor to DEPS, and the non-Windows branch additionally adds python.
diff --git a/paddle/fluid/operators/tensorrt/CMakeLists.txt b/paddle/fluid/operators/tensorrt/CMakeLists.txt
@@ -1,9 +1,18 @@
-op_library(tensorrt_engine_op DEPS tensorrt_engine tensorrt_converter
-           infer_io_utils analysis_helper)
-nv_test(
-  test_tensorrt_engine_op
-  SRCS tensorrt_engine_op_test.cc
-  DEPS tensorrt_engine_op analysis)
+set(tensorrt_engine_op_deps tensorrt_engine tensorrt_converter infer_io_utils
+                            analysis_helper)
+
+op_library(tensorrt_engine_op DEPS ${tensorrt_engine_op_deps})
+if(NOT WIN32)
+  nv_test(
+    test_tensorrt_engine_op
+    SRCS tensorrt_engine_op_test.cc
+    DEPS tensorrt_engine_op analysis fleet_executor python)
+else()
+  nv_test(
+    test_tensorrt_engine_op
+    SRCS tensorrt_engine_op_test.cc
+    DEPS tensorrt_engine_op analysis fleet_executor)
+endif()
 
 if(WITH_ONNXRUNTIME AND WIN32)
   # Copy onnxruntime for some c++ test in Windows, since the test will

# NOTE(review): no "diff --git" header precedes this hunk, and its payload is shell
# script (ccache, setup.py invocation), not CMake -- it presumably belongs to a separate
# build script whose file header is missing from this view; TODO confirm the path.
# The hunk itself only removes the blank line that followed `cd ..`.
@@ -3750,7 +3750,6 @@ EOF
 
     ccache -z
     cd ..
-
     if [ "${PYTHON_EXECUTABLE}" != "" ];then
         if [ "$SYSTEM" == "Darwin" ]; then
             ${PYTHON_EXECUTABLE} setup.py $2 --plat-name=macosx_10_9_x86_64;build_error=$?
@@ -3766,7 +3765,7 @@ EOF
     fi
     # ci will collect ccache hit rate
     collect_ccache_hits
-
+
     if [ "$build_error" != 0 ];then
         exit 7;
     fi