From 4265c10d7a3edd02202a142013659e395c2a3558 Mon Sep 17 00:00:00 2001 From: Billy Robert O'Neal III Date: Wed, 29 Jul 2020 18:53:41 -0700 Subject: [PATCH] Implement execution::unseq. Resolves GH-44. * Add unsequenced_policy and unseq. * Mark unsequenced_policy as being an execution policy. * Add detection for this new policy to std::for_each and std::for_each_n, and use #pragma loop(ivdep) when supplied. We are not marking other algorithms because all other algorithms have something that makes the operative loop body not actually independent and the docs for #pragma loop(ivdep) suggest that is not allowed. * Remove #pragma loop(ivdep) from std::transform because transform is callable such that _Dest == _First1 or _Dest == _First2. * Mark proposal as implemented and change __cpp_lib_execution when C++20 is turned on. instantiate_algorithms.hpp: * Add unseq to execution policy matrices. P0024R2_parallel_algorithms_for_each: * Add testing for unseq. VSO_0157762_feature_test_macros: * Update test for new value of __cpp_lib_execution. 
--- stl/inc/execution | 83 +++++++++++-------- stl/inc/yvals_core.h | 18 +++- tests/std/include/instantiate_algorithms.hpp | 18 ++++ .../test.cpp | 36 +++++--- .../VSO_0157762_feature_test_macros/test.cpp | 10 ++- 5 files changed, 112 insertions(+), 53 deletions(-) diff --git a/stl/inc/execution b/stl/inc/execution index 18ccbc1d496..13ef0f734cc 100644 --- a/stl/inc/execution +++ b/stl/inc/execution @@ -75,42 +75,71 @@ constexpr size_t _Still_active = static_cast<size_t>(-1); // EXECUTION POLICIES namespace execution { - class sequenced_policy { // request for sequential execution with termination + class sequenced_policy { + // indicates support for only sequential execution, and requests termination on exceptions public: using _Standard_execution_policy = int; static constexpr bool _Parallelize = false; + static constexpr bool _Ivdep = false; }; inline constexpr sequenced_policy seq{/* unspecified */}; - class parallel_policy { // request for parallel execution with termination + class parallel_policy { + // indicates support by element access functions for execution with parallel forward progress guarantees and + // requests termination on exceptions public: using _Standard_execution_policy = int; static constexpr bool _Parallelize = true; + static constexpr bool _Ivdep = true; }; inline constexpr parallel_policy par{/* unspecified */}; class parallel_unsequenced_policy { - // request for parallel execution without thread identity with termination + // indicates support by element access functions for parallel execution with weakly parallel forward progress + // guarantees, and requests termination on exceptions + // // (at this time, equivalent to parallel_policy) public: using _Standard_execution_policy = int; static constexpr bool _Parallelize = true; + static constexpr bool _Ivdep = true; }; inline constexpr parallel_unsequenced_policy par_unseq{/* unspecified */}; + +#if _HAS_CXX20 + class unsequenced_policy { + // indicates support by element access functions for 
weakly parallel forward progress guarantees, and for + // executing interleaved on the same thread, and requests termination on exceptions + // + // (at this time, equivalent to sequenced_policy except for the for_each family) + public: + using _Standard_execution_policy = int; + static constexpr bool _Parallelize = false; + static constexpr bool _Ivdep = true; + }; + + inline constexpr unsequenced_policy unseq{/* unspecified */}; +#endif // _HAS_CXX20 + } // namespace execution +// All of the above are execution policies: template <> -struct is_execution_policy<execution::sequenced_policy> : true_type {}; // sequenced_policy is an execution policy +struct is_execution_policy<execution::sequenced_policy> : true_type {}; template <> -struct is_execution_policy<execution::parallel_policy> : true_type {}; // parallel_policy is an execution policy +struct is_execution_policy<execution::parallel_policy> : true_type {}; template <> -struct is_execution_policy<execution::parallel_unsequenced_policy> : true_type { -}; // parallel_unsequenced_policy is an execution policy +struct is_execution_policy<execution::parallel_unsequenced_policy> : true_type {}; + +#if _HAS_CXX20 +template <> +struct is_execution_policy<execution::unsequenced_policy> : true_type {}; +#endif // _HAS_CXX20 // STRUCT _Parallelism_resources_exhausted struct _Parallelism_resources_exhausted : exception { @@ -1216,6 +1245,8 @@ void for_each(_ExPo&&, _FwdIt _First, _FwdIt _Last, _Fn _Func) noexcept /* termi } } + _For_each_ivdep(_UFirst, _ULast, _Pass_fn(_Func)); + } else if constexpr (remove_reference_t<_ExPo>::_Ivdep) { _For_each_ivdep(_UFirst, _ULast, _Pass_fn(_Func)); } else { for (; _UFirst != _ULast; ++_UFirst) { @@ -1258,6 +1289,8 @@ _FwdIt for_each_n(_ExPo&&, _FwdIt _First, const _Diff _Count_raw, _Fn _Func) noe _CATCH_END } + _Seek_wrapped(_First, _For_each_n_ivdep(_UFirst, _Count, _Pass_fn(_Func))); + } else if constexpr (remove_reference_t<_ExPo>::_Ivdep) { _Seek_wrapped(_First, _For_each_n_ivdep(_UFirst, _Count, _Pass_fn(_Func))); } else { for (; 0 < _Count; --_Count, (void) ++_UFirst) { @@ -2281,17 +2314,6 @@ _NODISCARD _FwdIt search_n(_ExPo&&, const _FwdIt _First, _FwdIt _Last, const _Di } // PARALLEL 
FUNCTION TEMPLATE transform -template -_FwdIt2 _Transform_ivdep(_FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _Dest, _Fn _Func) { - // unary op transform with independent loop bodies -#pragma loop(ivdep) - for (; _First != _Last; ++_First, (void) ++_Dest) { - *_Dest = _Func(*_First); - } - - return _Dest; -} - template struct _Static_partitioned_unary_transform2 { using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2>; @@ -2311,7 +2333,7 @@ struct _Static_partitioned_unary_transform2 { const auto _Key = _Team._Get_next_key(); if (_Key) { const auto _Source = _Source_basis._Get_chunk(_Key); - _Transform_ivdep(_Source._First, _Source._Last, _Dest_basis._Get_chunk(_Key)._First, _Func); + _STD transform(_Source._First, _Source._Last, _Dest_basis._Get_chunk(_Key)._First, _Func); return _Cancellation_status::_Running; } @@ -2349,12 +2371,12 @@ _FwdIt2 transform(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _D _CATCH_END } - _Seek_wrapped(_Dest, _Transform_ivdep(_UFirst, _ULast, _UDest, _Pass_fn(_Func))); + _Seek_wrapped(_Dest, _STD transform(_UFirst, _ULast, _UDest, _Pass_fn(_Func))); return _Dest; } else { _Seek_wrapped( - _Dest, _Transform_ivdep(_UFirst, _ULast, - _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), _Pass_fn(_Func))); + _Dest, _STD transform(_UFirst, _ULast, _Get_unwrapped_n(_Dest, _Idl_distance<_FwdIt1>(_UFirst, _ULast)), + _Pass_fn(_Func))); return _Dest; } } else { @@ -2364,17 +2386,6 @@ _FwdIt2 transform(_ExPo&&, const _FwdIt1 _First, const _FwdIt1 _Last, _FwdIt2 _D } } -template -_FwdIt3 _Transform_ivdep(_FwdIt1 _First1, const _FwdIt1 _Last1, _FwdIt2 _First2, _FwdIt3 _Dest, _Fn _Func) { - // binary op transform with independent loop bodies -#pragma loop(ivdep) - for (; _First1 != _Last1; ++_First1, (void) ++_First2, ++_Dest) { - *_Dest = _Func(*_First1, *_First2); - } - - return _Dest; -} - template struct _Static_partitioned_binary_transform2 { using _Diff = _Common_diff_t<_FwdIt1, _FwdIt2, _FwdIt3>; @@ -2396,7 +2407,7 @@ struct 
_Static_partitioned_binary_transform2 { const auto _Key = _Team._Get_next_key(); if (_Key) { const auto _Source1 = _Source1_basis._Get_chunk(_Key); - _Transform_ivdep(_Source1._First, _Source1._Last, _Source2_basis._Get_chunk(_Key)._First, + _STD transform(_Source1._First, _Source1._Last, _Source2_basis._Get_chunk(_Key)._First, _Dest_basis._Get_chunk(_Key)._First, _Func); return _Cancellation_status::_Running; } @@ -2442,11 +2453,11 @@ _FwdIt3 transform(_ExPo&&, const _FwdIt1 _First1, const _FwdIt1 _Last1, const _F _CATCH_END } - _Seek_wrapped(_Dest, _Transform_ivdep(_UFirst1, _ULast1, _UFirst2, _UDest, _Pass_fn(_Func))); + _Seek_wrapped(_Dest, _STD transform(_UFirst1, _ULast1, _UFirst2, _UDest, _Pass_fn(_Func))); return _Dest; } else { const auto _Count = _Idl_distance<_FwdIt1>(_UFirst1, _ULast1); - _Seek_wrapped(_Dest, _Transform_ivdep(_UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Count), + _Seek_wrapped(_Dest, _STD transform(_UFirst1, _ULast1, _Get_unwrapped_n(_First2, _Count), _Get_unwrapped_n(_Dest, _Count), _Pass_fn(_Func))); return _Dest; } diff --git a/stl/inc/yvals_core.h b/stl/inc/yvals_core.h index 61292ad030b..da3d708dbad 100644 --- a/stl/inc/yvals_core.h +++ b/stl/inc/yvals_core.h @@ -174,6 +174,7 @@ // (partially implemented, missing noop coroutines) // P0919R3 Heterogeneous Lookup For Unordered Containers // P0966R1 string::reserve() Should Not Shrink +// P1001R2 execution::unseq // P1006R1 constexpr For pointer_traits::pointer_to() // P1023R0 constexpr For std::array Comparisons // P1024R3 Enhancing span Usability @@ -225,6 +226,10 @@ // C++ allows an implementation to implement parallel algorithms as calls to the serial algorithms. // This implementation parallelizes several common algorithm calls, but not all. 
// +// std::execution::unseq has no direct analogue for any optimizer we target as of 2020-07-29, +// though we will map it to #pragma loop(ivdep) for the for_each algorithms only as these are the only algorithms where +// the library does not need to introduce inter-loop-body dependencies to accomplish the algorithm's goals. +// // The following algorithms are parallelized. // * adjacent_difference // * adjacent_find @@ -1091,10 +1096,7 @@ #if _HAS_STD_BYTE #define __cpp_lib_byte 201603L #endif // _HAS_STD_BYTE -#define __cpp_lib_clamp 201603L -#ifndef _M_CEE -#define __cpp_lib_execution 201603L -#endif // _M_CEE +#define __cpp_lib_clamp 201603L #define __cpp_lib_filesystem 201703L #define __cpp_lib_gcd_lcm 201606L #define __cpp_lib_hardware_interference_size 201703L @@ -1184,6 +1186,14 @@ #define __cpp_lib_unwrap_ref 201811L #endif // _HAS_CXX20 +#ifndef _M_CEE +#if _HAS_CXX20 +#define __cpp_lib_execution 201902L // P1001R2 execution::unseq +#elif _HAS_CXX17 +#define __cpp_lib_execution 201603L // P0024R2 Parallel Algorithms +#endif // language mode +#endif // _M_CEE + #if _HAS_CXX20 #define __cpp_lib_array_constexpr 201811L // P1032R1 Miscellaneous constexpr #elif _HAS_CXX17 // ^^^ _HAS_CXX20 / _HAS_CXX17 vvv diff --git a/tests/std/include/instantiate_algorithms.hpp b/tests/std/include/instantiate_algorithms.hpp index 47d81e69976..5d19929cb36 100644 --- a/tests/std/include/instantiate_algorithms.hpp +++ b/tests/std/include/instantiate_algorithms.hpp @@ -415,6 +415,9 @@ namespace std_testing { test_exec_fwd1_fwd2(std::execution::seq, fwd1, fwd2); test_exec_fwd1_fwd2(std::execution::par, fwd1, fwd2); test_exec_fwd1_fwd2(std::execution::par_unseq, fwd1, fwd2); +#if _HAS_CXX20 + test_exec_fwd1_fwd2(std::execution::unseq, fwd1, fwd2); +#endif // _HAS_CXX20 #endif // HAS_PARALLEL_ALGORITHMS (void) std::find_end(fwd1, fwd1, fwd2, fwd2); @@ -503,6 +506,9 @@ namespace std_testing { test_exec_fwd1(std::execution::seq, fwd1); test_exec_fwd1(std::execution::par, fwd1); 
test_exec_fwd1(std::execution::par_unseq, fwd1); +#if _HAS_CXX20 + test_exec_fwd1(std::execution::unseq, fwd1); +#endif // _HAS_CXX20 #endif // HAS_PARALLEL_ALGORITHMS test_fwd1_fwd2(fwd1, FWDIT); @@ -592,6 +598,9 @@ namespace std_testing { test_exec_bid1_bid2_xxx_backward(std::execution::seq, bid1, bid2); test_exec_bid1_bid2_xxx_backward(std::execution::par, bid1, bid2); test_exec_bid1_bid2_xxx_backward(std::execution::par_unseq, bid1, bid2); +#if _HAS_CXX20 + test_exec_bid1_bid2_xxx_backward(std::execution::unseq, bid1, bid2); +#endif // _HAS_CXX20 #endif // HAS_PARALLEL_ALGORITHMS std::copy_backward(bid1, bid1, bid2); @@ -615,6 +624,9 @@ namespace std_testing { test_exec_bid1_fwd1(std::execution::seq, bid1, fwd1); test_exec_bid1_fwd1(std::execution::par, bid1, fwd1); test_exec_bid1_fwd1(std::execution::par_unseq, bid1, fwd1); +#if _HAS_CXX20 + test_exec_bid1_fwd1(std::execution::unseq, bid1, fwd1); +#endif // _HAS_CXX20 } template @@ -653,6 +665,9 @@ namespace std_testing { test_exec_bid1(std::execution::seq, bid1); test_exec_bid1(std::execution::par, bid1); test_exec_bid1(std::execution::par_unseq, bid1); +#if _HAS_CXX20 + test_exec_bid1(std::execution::unseq, bid1); +#endif // _HAS_CXX20 #endif // HAS_PARALLEL_ALGORITHMS std::reverse(bid1, bid1); @@ -700,6 +715,9 @@ namespace std_testing { test_exec_ran(std::execution::seq, ran); test_exec_ran(std::execution::par, ran); test_exec_ran(std::execution::par_unseq, ran); +#if _HAS_CXX20 + test_exec_ran(std::execution::unseq, ran); +#endif // _HAS_CXX20 #endif // HAS_PARALLEL_ALGORITHMS #if _HAS_AUTO_PTR_ETC diff --git a/tests/std/tests/P0024R2_parallel_algorithms_for_each/test.cpp b/tests/std/tests/P0024R2_parallel_algorithms_for_each/test.cpp index bf168a6522d..dfc69437179 100644 --- a/tests/std/tests/P0024R2_parallel_algorithms_for_each/test.cpp +++ b/tests/std/tests/P0024R2_parallel_algorithms_for_each/test.cpp @@ -47,27 +47,39 @@ const auto call_only_once = [](atomic& b) { assert(!b.exchange(true)); }; const 
auto atomic_identity = [](atomic& b) { return b.load(); }; -template