[PROTOTYPE] Generalized two pass algorithm and copy_if #1700

Merged
Changes from all commits (33 commits)
9d6b54a
removing vestigial __is_full_block
danhoeflinger Jul 11, 2024
ba409fb
first draft generalized
danhoeflinger Jul 11, 2024
d182e7b
adding copy_if
danhoeflinger Jul 12, 2024
eeb062f
templated size_type
danhoeflinger Jul 12, 2024
329272b
switch to __result_and_scratch_space
danhoeflinger Jul 12, 2024
d819481
copy_if non-range (TODO fix tuple stuff)
danhoeflinger Jul 12, 2024
10bfffd
copy_if changes (currently broken)
danhoeflinger Jul 12, 2024
70dd06d
bugfixes for copy_if
danhoeflinger Jul 14, 2024
c958e5b
real bug fix for last element
danhoeflinger Jul 15, 2024
56ece5e
type fixes
danhoeflinger Jul 15, 2024
edbae46
fixes for block carry-forward
danhoeflinger Jul 15, 2024
2b73cdd
removing now-erroneous inclusive/exclusive differences
danhoeflinger Jul 15, 2024
b5118d7
fixing size of scratch
danhoeflinger Jul 15, 2024
fd4af6b
avoid launching empty groups
danhoeflinger Jul 16, 2024
d9fbc4c
fixing single wg copy_if
danhoeflinger Jul 16, 2024
4b6cc1c
shrinking single wg span
danhoeflinger Jul 16, 2024
de55743
clang formatting
danhoeflinger Jul 16, 2024
da8574e
minimizing usage of __scan_pred
danhoeflinger Jul 16, 2024
ca23212
adding comments for generalized scan helpers
danhoeflinger Jul 16, 2024
dc16033
Revert "minimizing usage of __scan_pred"
danhoeflinger Jul 16, 2024
fc5a931
reverting changes to range API (will handle later with changes to usa…
danhoeflinger Jul 16, 2024
70763ad
improve type deduction with invoke_result
danhoeflinger Jul 16, 2024
891b802
_FinalOp -> _WriteOp
danhoeflinger Jul 16, 2024
07bcd8b
Address Reviewer Comments
danhoeflinger Jul 16, 2024
559e9d5
restoring decay_t lost in invoke_result change
danhoeflinger Jul 16, 2024
6e71ad8
bugfix for zip iterator inputs
danhoeflinger Jul 17, 2024
42056c7
type fixes to adhere to specification
danhoeflinger Jul 17, 2024
4b39c0b
using internal tuple type
danhoeflinger Jul 17, 2024
70413da
[PROTOTYPE] Fallback to legacy scan implementation if underlying init…
mmichel11 Jul 17, 2024
530ec43
improving comments
danhoeflinger Jul 17, 2024
d072968
type / variable naming and clang-format
danhoeflinger Jul 17, 2024
c787091
remove redundant future (parallel_transform_reduce_then_scan returns …
danhoeflinger Jul 17, 2024
82c87c9
Revert "remove redundant future (parallel_transform_reduce_then_scan …
danhoeflinger Jul 17, 2024
3 changes: 2 additions & 1 deletion include/oneapi/dpl/internal/exclusive_scan_by_segment_impl.h
@@ -161,7 +161,8 @@ exclusive_scan_by_segment_impl(__internal::__hetero_tag<_BackendTag>, Policy&& p
transform_inclusive_scan(::std::move(policy2), make_zip_iterator(_temp.get(), _flags.get()),
make_zip_iterator(_temp.get(), _flags.get()) + n, make_zip_iterator(result, _flags.get()),
internal::segmented_scan_fun<ValueType, FlagType, Operator>(binary_op),
oneapi::dpl::__internal::__no_op(), ::std::make_tuple(init, FlagType(1)));
oneapi::dpl::__internal::__no_op(),
oneapi::dpl::__internal::make_tuple(init, FlagType(1)));
return result + n;
}

188 changes: 145 additions & 43 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -577,7 +577,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
template <typename _Policy, typename _InRng, typename _OutRng, typename _InitType, typename _BinaryOperation,
typename _UnaryOp>
auto
operator()(const _Policy& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init,
operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init,
_BinaryOperation __bin_op, _UnaryOp __unary_op)
{
using _ValueType = ::std::uint16_t;
@@ -589,7 +589,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W

constexpr ::std::uint32_t __elems_per_wg = _ElemsPerItem * _WGSize;

sycl::buffer<_Size> __res(sycl::range<1>(1));
__result_and_scratch_storage<_Policy, _Size> __result{__policy, 0};

auto __event = __policy.queue().submit([&](sycl::handler& __hdl) {
oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng);
@@ -598,10 +598,12 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
// predicate on each element of the input range. The second half stores the index of the output
// range to copy elements of the input range.
auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg * 2}, __hdl);
auto __res_acc = __res.template get_access<access_mode::write>(__hdl);
auto __res_acc = __result.__get_result_acc(__hdl);

__hdl.parallel_for<_ScanKernelName...>(
sycl::nd_range<1>(_WGSize, _WGSize), [=](sycl::nd_item<1> __self_item) {
auto __res_ptr =
__result_and_scratch_storage<_Policy, _Size>::__get_usm_or_buffer_accessor_ptr(__res_acc);
const auto& __group = __self_item.get_group();
const auto& __subgroup = __self_item.get_sub_group();
// This kernel is only launched for sizes less than 2^16
@@ -656,11 +658,11 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
if (__item_id == 0)
{
// Add predicate of last element to account for the scan's exclusivity
__res_acc[0] = __lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1];
__res_ptr[0] = __lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1];
}
});
});
return __future(__event, __res);
return __future(__event, __result);
}
};
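
The write to __res_ptr[0] above recovers the number of selected elements from the exclusive scan held in local memory: the scanned offset of the last element plus that element's own predicate value. A serial model of the same bookkeeping, as a sketch in plain C++ without SYCL (hypothetical helper name, not the oneDPL implementation):

#include <cstddef>
#include <vector>

// Serial sketch: exclusive scan of a 0/1 mask plus the copy_if count rule
// used by the kernel above. `out` must have room for the selected elements.
template <typename T, typename Pred>
std::size_t copy_if_count_model(const std::vector<T>& in, std::vector<T>& out, Pred pred)
{
    const std::size_t n = in.size();
    std::vector<std::size_t> mask(n), offset(n);
    for (std::size_t i = 0; i < n; ++i)
        mask[i] = pred(in[i]) ? 1 : 0;

    // Exclusive scan: offset[i] is where in[i] lands if it is selected.
    std::size_t running = 0;
    for (std::size_t i = 0; i < n; ++i)
    {
        offset[i] = running;
        running += mask[i];
    }

    for (std::size_t i = 0; i < n; ++i)
        if (mask[i])
            out[offset[i]] = in[i];

    // Total copied = last exclusive-scan value + last mask value,
    // mirroring "__lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1]".
    return n == 0 ? 0 : offset[n - 1] + mask[n - 1];
}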

@@ -774,6 +776,82 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, ::std::size_t __n, ::std::s
return (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size);
}

template <typename _UnaryOp>
struct __gen_transform_input
{
template <typename InRng>
auto
operator()(InRng&& __in_rng, std::size_t __idx) const
{
using _ValueType = oneapi::dpl::__internal::__value_t<InRng>;
using _OutValueType = oneapi::dpl::__internal::__decay_with_tuple_specialization_t<typename std::invoke_result<_UnaryOp, _ValueType>::type>;
return _OutValueType{__unary_op(__in_rng[__idx])};
}
_UnaryOp __unary_op;
};

struct __simple_write_to_idx
{
template <typename _OutRng, typename ValueType>
void
operator()(_OutRng&& __out, std::size_t __idx, const ValueType& __v) const
{
__out[__idx] = __v;
}
};

template <typename _Predicate>
struct __gen_count_pred
{
template <typename _InRng, typename _SizeType>
_SizeType
operator()(_InRng&& __in_rng, _SizeType __idx) const
{
return __pred(__in_rng[__idx]) ? _SizeType{1} : _SizeType{0};
}
_Predicate __pred;
};

template <typename _Predicate>
struct __gen_expand_count_pred
{
template <typename _InRng, typename _SizeType>
auto
operator()(_InRng&& __in_rng, _SizeType __idx) const
{
// Explicitly creating this element type is necessary to avoid modifying the input data when _InRng is a
// zip_iterator which will return a tuple of references when dereferenced. With this explicit type, we copy
// the values of the zipped input types rather than their references.
using _ElementType =
oneapi::dpl::__internal::__decay_with_tuple_specialization_t<oneapi::dpl::__internal::__value_t<_InRng>>;
_ElementType ele = __in_rng[__idx];
bool mask = __pred(ele);
return std::tuple(mask ? _SizeType{1} : _SizeType{0}, mask, ele);
}
_Predicate __pred;
};
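
The comment inside __gen_expand_count_pred is the subtle point: dereferencing a zip_iterator-backed range yields a tuple of references, so materializing a decayed element type is what keeps the predicate and the later write from aliasing the input. A minimal illustration of that distinction, with std::tuple standing in for the internal tuple type (sketch only):

#include <cassert>
#include <tuple>
#include <vector>

int main()
{
    std::vector<int> keys{1, 2, 3};
    std::vector<int> vals{10, 20, 30};

    // What dereferencing a zip_iterator conceptually yields: references into the inputs.
    std::tuple<int&, int&> ref_view{keys[0], vals[0]};

    // What __gen_expand_count_pred materializes instead: a decayed copy of the values.
    std::tuple<int, int> value_copy = ref_view;
    std::get<0>(value_copy) = 99; // modifies only the copy

    assert(keys[0] == 1);               // the input range is untouched
    assert(std::get<0>(ref_view) == 1); // the reference view still sees the original
    return 0;
}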

struct __get_zeroth_element
{
template <typename _Tp>
auto&
operator()(_Tp&& __a) const
{
return std::get<0>(std::forward<_Tp>(__a));
}
};

struct __write_to_idx_if
{
template <typename _OutRng, typename _SizeType, typename ValueType>
void
operator()(_OutRng&& __out, _SizeType __idx, const ValueType& __v) const
{
if (std::get<1>(__v))
__out[std::get<0>(__v) - 1] = std::get<2>(__v);
}
};
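
Taken together, these callables form the customization surface of the generalized two-pass algorithm: one generator feeds the reduction, another feeds the scan, a projection selects the scanned component, and a write op commits the result. A serial model of how copy_if plugs into that interface, as a host-only sketch with simplified types (not the device implementation):

#include <cstddef>
#include <tuple>
#include <vector>

// Serial sketch of the generalized scan skeleton specialized for copy_if.
// oneDPL runs this as a two-pass reduce-then-scan on the device; a single
// serial pass is enough to show how the callables compose.
template <typename T, typename Pred>
std::size_t copy_if_model(const std::vector<T>& in, std::vector<T>& out, Pred pred)
{
    std::size_t running = 0; // carries the inclusive scan of the counts
    for (std::size_t i = 0; i < in.size(); ++i)
    {
        // __gen_expand_count_pred: produce (count, mask, element) per input.
        T ele = in[i];
        bool mask = pred(ele);
        std::tuple<std::size_t, bool, T> expanded{static_cast<std::size_t>(mask ? 1 : 0), mask, ele};

        // __get_zeroth_element: the scanned component is the count.
        running += std::get<0>(expanded);
        std::get<0>(expanded) = running;

        // __write_to_idx_if: write only when the mask is set, at (scanned - 1).
        if (std::get<1>(expanded))
            out[std::get<0>(expanded) - 1] = std::get<2>(expanded);
    }
    return running; // number of elements copied
}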

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
typename _BinaryOperation, typename _Inclusive>
auto
@@ -782,39 +860,62 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
_InitType __init, _BinaryOperation __binary_op, _Inclusive)
{
using _Type = typename _InitType::__value_type;

// Next power of 2 greater than or equal to __n
auto __n_uniform = __n;
if ((__n_uniform & (__n_uniform - 1)) != 0)
__n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(__n) << 1;

// TODO: can we reimplement this with support fort non-identities as well? We can then use in reduce-then-scan
// for the last block if it is sufficiently small
constexpr bool __can_use_group_scan = unseq_backend::__has_known_identity<_BinaryOperation, _Type>::value;
if constexpr (__can_use_group_scan)
// Reduce-then-scan is dependent on sycl::shift_group_right which requires the underlying type to be trivially
// copyable. If this is not met, then we must fallback to the legacy implementation. The single work-group implementation
// requires a fundamental type which must also be trivially copyable.
if constexpr (std::is_trivially_copyable_v<_Type>)
{
if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform))
// Next power of 2 greater than or equal to __n
auto __n_uniform = __n;
if ((__n_uniform & (__n_uniform - 1)) != 0)
__n_uniform = oneapi::dpl::__internal::__dpl_bit_floor(__n) << 1;

// TODO: can we reimplement this with support for non-identities as well? We can then use in reduce-then-scan
// for the last block if it is sufficiently small
constexpr bool __can_use_group_scan = unseq_backend::__has_known_identity<_BinaryOperation, _Type>::value;
if constexpr (__can_use_group_scan)
{
return __parallel_transform_scan_single_group(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng),
::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
if (__group_scan_fits_in_slm<_Type>(__exec.queue(), __n, __n_uniform))
{
return __parallel_transform_scan_single_group(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng),
::std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
}
}
oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation> __gen_transform{__unary_op};
return __future(__parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform,
oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{})
.event());
}
else
{
using _Assigner = unseq_backend::__scan_assigner;
using _NoAssign = unseq_backend::__scan_no_assign;
using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>;
using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;

_Assigner __assign_op;
_NoAssign __no_assign_op;
_NoOpFunctor __get_data_op;

return __future(
__parallel_transform_scan_base(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
std::forward<_Range2>(__out_rng), __binary_op, __init,
// local scan
unseq_backend::__scan<_Inclusive, _ExecutionPolicy, _BinaryOperation, _UnaryFunctor, _Assigner,
_Assigner, _NoOpFunctor, _InitType>{__binary_op, _UnaryFunctor{__unary_op},
__assign_op, __assign_op, __get_data_op},
// scan between groups
unseq_backend::__scan</*inclusive=*/std::true_type, _ExecutionPolicy, _BinaryOperation, _NoOpFunctor,
_NoAssign, _Assigner, _NoOpFunctor, unseq_backend::__no_init_value<_Type>>{
__binary_op, _NoOpFunctor{}, __no_assign_op, __assign_op, __get_data_op},
// global scan
unseq_backend::__global_scan_functor<_Inclusive, _BinaryOperation, _InitType>{__binary_op, __init})
.event());
}

// TODO: Reintegrate once support has been added
//// Either we can't use group scan or this input is too big for one workgroup
//using _Assigner = unseq_backend::__scan_assigner;
//using _NoAssign = unseq_backend::__scan_no_assign;
//using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>;
//using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;

//_Assigner __assign_op;
//_NoAssign __no_assign_op;
//_NoOpFunctor __get_data_op;
return __future(__parallel_transform_reduce_then_scan(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec),
::std::forward<_Range1>(__in_rng), ::std::forward<_Range2>(__out_rng),
__binary_op, __unary_op, __init, _Inclusive{})
.event());
}
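
The __n_uniform computation near the top of this hunk rounds __n up to the next power of two: a value that is already a power of two is left unchanged, otherwise the largest power of two below __n is doubled. A stand-alone sketch of that rounding, assuming C++20 std::bit_floor as a stand-in for __dpl_bit_floor:

#include <bit>
#include <cassert>
#include <cstddef>

// Round n up to the next power of two, matching the __n_uniform logic above.
std::size_t round_up_pow2(std::size_t n)
{
    std::size_t n_uniform = n;
    if ((n_uniform & (n_uniform - 1)) != 0)   // not already a power of two
        n_uniform = std::bit_floor(n) << 1;   // double the largest power of two <= n
    return n_uniform;
}

int main()
{
    assert(round_up_pow2(1024) == 1024); // already a power of two
    assert(round_up_pow2(1000) == 1024); // bit_floor(1000) = 512, doubled
    assert(round_up_pow2(1) == 1);
    return 0;
}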

template <typename _SizeType>
@@ -915,15 +1016,14 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
// The kernel stores n integers for the predicate and another n integers for the offsets
const auto __req_slm_size = sizeof(::std::uint16_t) * __n_uniform * 2;

constexpr ::std::uint16_t __single_group_upper_limit = 16384;
constexpr ::std::uint16_t __single_group_upper_limit = 2048;

::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);

if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size &&
__max_wg_size >= _SingleGroupInvoker::__targeted_wg_size)
{
using _SizeBreakpoints =
::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384>;
using _SizeBreakpoints = ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>;

return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch(
_SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng),
@@ -932,13 +1032,15 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
else
{
using _ReduceOp = ::std::plus<_Size>;
using CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign,
/*inclusive*/ ::std::true_type, 1>;

return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec),
::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n,
CreateOp{__pred}, CopyOp{});
return __parallel_transform_reduce_then_scan(
__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
std::forward<_OutRng>(__out_rng), oneapi::dpl::__par_backend_hetero::__gen_count_pred<_Pred>{__pred},
_ReduceOp{}, oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred<_Pred>{__pred},
oneapi::dpl::__par_backend_hetero::__get_zeroth_element{},
oneapi::dpl::__par_backend_hetero::__write_to_idx_if{},
oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
/*_Inclusive=*/std::true_type{});
}
}
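
The dispatch above is the user-visible effect of this path: inputs of at most 2048 elements whose scratch fits in SLM keep the single-work-group kernel, while larger inputs now route through the generalized reduce-then-scan instead of __parallel_scan_copy. A minimal caller sketch of the public entry point, assuming a SYCL queue and USM shared allocations:

#include <cstddef>
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <sycl/sycl.hpp>

int main()
{
    sycl::queue q;
    constexpr std::size_t n = 1 << 20;

    int* in  = sycl::malloc_shared<int>(n, q);
    int* out = sycl::malloc_shared<int>(n, q);
    for (std::size_t i = 0; i < n; ++i)
        in[i] = static_cast<int>(i);

    // copy_if with a device policy; with n > 2048 this exercises the
    // reduce-then-scan path rather than the single-work-group kernel.
    auto policy = oneapi::dpl::execution::make_device_policy(q);
    int* out_end = oneapi::dpl::copy_if(policy, in, in + n, out,
                                        [](int x) { return x % 2 == 0; });

    std::size_t copied = out_end - out; // n / 2 even values

    sycl::free(in, q);
    sycl::free(out, q);
    return copied == n / 2 ? 0 : 1;
}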
