[PROTOTYPE] Generalized two pass algorithm and copy_if #1700

Merged

Changes from 21 commits

Commits (33)
9d6b54a
removing vestigial __is_full_block
danhoeflinger Jul 11, 2024
ba409fb
first draft generalized
danhoeflinger Jul 11, 2024
d182e7b
adding copy_if
danhoeflinger Jul 12, 2024
eeb062f
templated size_type
danhoeflinger Jul 12, 2024
329272b
switch to __result_and_scratch_space
danhoeflinger Jul 12, 2024
d819481
copy_if non-range (TODO fix tuple stuff)
danhoeflinger Jul 12, 2024
10bfffd
copy_if changes (currently broken)
danhoeflinger Jul 12, 2024
70dd06d
bugfixes for copy_if
danhoeflinger Jul 14, 2024
c958e5b
real bug fix for last element
danhoeflinger Jul 15, 2024
56ece5e
type fixes
danhoeflinger Jul 15, 2024
edbae46
fixes for block carry-forward
danhoeflinger Jul 15, 2024
2b73cdd
removing now-erroneous inclusive/exclusive differences
danhoeflinger Jul 15, 2024
b5118d7
fixing size of scratch
danhoeflinger Jul 15, 2024
fd4af6b
avoid launching empty groups
danhoeflinger Jul 16, 2024
d9fbc4c
fixing single wg copy_if
danhoeflinger Jul 16, 2024
4b6cc1c
shrinking single wg span
danhoeflinger Jul 16, 2024
de55743
clang formatting
danhoeflinger Jul 16, 2024
da8574e
minimizing usage of __scan_pred
danhoeflinger Jul 16, 2024
ca23212
adding comments for generalized scan helpers
danhoeflinger Jul 16, 2024
dc16033
Revert "minimizing usage of __scan_pred"
danhoeflinger Jul 16, 2024
fc5a931
reverting changes to range API (will handle later with changes to usa…
danhoeflinger Jul 16, 2024
70763ad
improve type deduction with invoke_result
danhoeflinger Jul 16, 2024
891b802
_FinalOp -> _WriteOp
danhoeflinger Jul 16, 2024
07bcd8b
Address Reviewer Comments
danhoeflinger Jul 16, 2024
559e9d5
restoring decay_t lost in invoke_result change
danhoeflinger Jul 16, 2024
6e71ad8
bugfix for zip iterator inputs
danhoeflinger Jul 17, 2024
42056c7
type fixes to adhere to specification
danhoeflinger Jul 17, 2024
4b39c0b
using internal tuple type
danhoeflinger Jul 17, 2024
70413da
[PROTOTYPE] Fallback to legacy scan implementation if underlying init…
mmichel11 Jul 17, 2024
530ec43
improving comments
danhoeflinger Jul 17, 2024
d072968
type / variable naming and clang-format
danhoeflinger Jul 17, 2024
c787091
remove redundant future (parallel_transform_reduce_then_scan returns …
danhoeflinger Jul 17, 2024
82c87c9
Revert "remove redundant future (parallel_transform_reduce_then_scan …
danhoeflinger Jul 17, 2024
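
The diff below replaces the dedicated scan plumbing with a generalized two-pass "reduce then scan" scheme and routes `copy_if` through it: a first kernel reduces each input tile to a partial sum, an exclusive scan of those sums gives every tile its carry-in, and a second kernel rescans each tile seeded with that carry. A minimal sequential sketch of the structure, for orientation only (the free-function shape and names are assumptions, not the oneDPL API):

```cpp
#include <algorithm>
#include <cstddef>
#include <numeric>
#include <vector>

// Sequential model of the two-pass reduce-then-scan scheme: __gen(i) produces
// the i-th value to scan, __op combines values, and __write(i, v) stores the
// inclusive scan result for position i.
template <typename _Tp, typename _Gen, typename _BinaryOp, typename _WriteOp>
void
reduce_then_scan_sketch(std::size_t __n, std::size_t __tile, _Gen __gen, _BinaryOp __op, _WriteOp __write, _Tp __init)
{
    const std::size_t __n_tiles = (__n + __tile - 1) / __tile;
    std::vector<_Tp> __carries(__n_tiles);
    // Pass 1: reduce each tile of generated values to one partial sum.
    for (std::size_t __t = 0; __t < __n_tiles; ++__t)
    {
        _Tp __sum = __gen(__t * __tile);
        for (std::size_t __i = __t * __tile + 1; __i < std::min(__n, (__t + 1) * __tile); ++__i)
            __sum = __op(__sum, __gen(__i));
        __carries[__t] = __sum;
    }
    // An exclusive scan of the tile sums yields each tile's carry-in.
    std::exclusive_scan(__carries.begin(), __carries.end(), __carries.begin(), __init);
    // Pass 2: rescan each tile seeded with its carry-in and write results.
    for (std::size_t __t = 0; __t < __n_tiles; ++__t)
    {
        _Tp __running = __carries[__t];
        for (std::size_t __i = __t * __tile; __i < std::min(__n, (__t + 1) * __tile); ++__i)
        {
            __running = __op(__running, __gen(__i));
            __write(__i, __running);
        }
    }
}
```

The input is read twice, but the second pass needs no inter-work-group communication beyond the carry-ins, which is the trade the two-pass design makes.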
124 changes: 96 additions & 28 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -577,7 +577,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
template <typename _Policy, typename _InRng, typename _OutRng, typename _InitType, typename _BinaryOperation,
typename _UnaryOp>
auto
- operator()(const _Policy& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init,
+ operator()(_Policy&& __policy, _InRng&& __in_rng, _OutRng&& __out_rng, ::std::size_t __n, _InitType __init,
_BinaryOperation __bin_op, _UnaryOp __unary_op)
{
using _ValueType = ::std::uint16_t;
@@ -589,7 +589,7 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W

constexpr ::std::uint32_t __elems_per_wg = _ElemsPerItem * _WGSize;

- sycl::buffer<_Size> __res(sycl::range<1>(1));
+ __result_and_scratch_storage<_Policy, _Size> __result{__policy, 0};

auto __event = __policy.queue().submit([&](sycl::handler& __hdl) {
oneapi::dpl::__ranges::__require_access(__hdl, __in_rng, __out_rng);
@@ -598,10 +598,12 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
// predicate on each element of the input range. The second half stores the index of the output
// range to copy elements of the input range.
auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg * 2}, __hdl);
- auto __res_acc = __res.template get_access<access_mode::write>(__hdl);
+ auto __res_acc = __result.__get_result_acc(__hdl);

__hdl.parallel_for<_ScanKernelName...>(
sycl::nd_range<1>(_WGSize, _WGSize), [=](sycl::nd_item<1> __self_item) {
+ auto __res_ptr =
+     __result_and_scratch_storage<_Policy, _Size>::__get_usm_or_buffer_accessor_ptr(__res_acc);
const auto& __group = __self_item.get_group();
const auto& __subgroup = __self_item.get_sub_group();
// This kernel is only launched for sizes less than 2^16
@@ -656,11 +658,11 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
if (__item_id == 0)
{
// Add predicate of last element to account for the scan's exclusivity
- __res_acc[0] = __lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1];
+ __res_ptr[0] = __lacc[__elems_per_wg + __n - 1] + __lacc[__n - 1];
}
});
});
- return __future(__event, __res);
+ return __future(__event, __result);
}
};
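Note on the hunk above: the one-element `sycl::buffer` result is replaced by oneDPL's internal `__result_and_scratch_storage`, which can back the result with a USM device allocation where supported, while `__get_usm_or_buffer_accessor_ptr` collapses the USM-or-buffer distinction to a raw pointer inside the kernel. Condensed, the pattern is (comments are mine; reading the `0` constructor argument as "no extra scratch beyond the result slot" is an assumption):

```cpp
// Host side: result storage tied to the policy's queue.
__result_and_scratch_storage<_Policy, _Size> __result{__policy, 0};
auto __res_acc = __result.__get_result_acc(__hdl); // inside the command group
// Device side: one raw pointer regardless of the backing store.
auto __res_ptr =
    __result_and_scratch_storage<_Policy, _Size>::__get_usm_or_buffer_accessor_ptr(__res_acc);
__res_ptr[0] = /* count of selected elements */ 0;
// The future carries the storage so the caller can fetch the value later.
return __future(__event, __result);
```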

@@ -774,6 +776,77 @@ __group_scan_fits_in_slm(const sycl::queue& __queue, ::std::size_t __n, ::std::s
return (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size);
}

+ template <typename _ValueType, typename _UnaryOp>
+ struct __gen_transform_input
+ {
+     using __out_value_type = std::decay_t<decltype(::std::declval<_UnaryOp>()(::std::declval<_ValueType>()))>;
+     template <typename _InRng>
+     auto
+     operator()(_InRng&& __in_rng, std::size_t __idx) const
+     {
+         return __unary_op(__in_rng[__idx]);
+     }
+     _UnaryOp __unary_op;
+ };

+ struct __simple_write_to_idx
+ {
+     template <typename _OutRng, typename _ValueType>
+     void
+     operator()(_OutRng&& __out, std::size_t __idx, const _ValueType& __v) const
+     {
+         __out[__idx] = __v;
+     }
+ };

+ template <typename _SizeType, typename _Predicate>
+ struct __gen_count_pred
+ {
+     using __out_value_type = _SizeType;
+     template <typename _InRng>
+     _SizeType
+     operator()(_InRng&& __in_rng, _SizeType __idx) const
+     {
+         return __pred(__in_rng[__idx]) ? _SizeType{1} : _SizeType{0};
+     }
+     _Predicate __pred;
+ };

+ template <typename _SizeType, typename _Predicate>
+ struct __gen_expand_count_pred
+ {
+     template <typename _InRng>
+     auto
+     operator()(_InRng&& __in_rng, _SizeType __idx) const
+     {
+         auto __ele = __in_rng[__idx];
+         bool __mask = __pred(__ele);
+         return std::tuple(__mask ? _SizeType{1} : _SizeType{0}, __mask, __ele);
+     }
+     _Predicate __pred;
+ };

+ struct __get_zeroth_element
+ {
+     template <typename _Tp>
+     auto&
+     operator()(_Tp&& __a) const
+     {
+         return std::get<0>(std::forward<_Tp>(__a));
+     }
+ };

+ struct __write_to_idx_if
+ {
+     template <typename _OutRng, typename _SizeType, typename _ValueType>
+     void
+     operator()(_OutRng&& __out, _SizeType __idx, const _ValueType& __v) const
+     {
+         if (std::get<1>(__v))
+             __out[std::get<0>(__v) - 1] = std::get<2>(__v);
+     }
+ };
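Taken together, these helpers decompose `copy_if` onto the generalized interface: `__gen_count_pred` feeds the reduction pass, `__gen_expand_count_pred` feeds the scan pass, `__get_zeroth_element` selects the tuple component the scan combines over, and `__write_to_idx_if` does the conditional scatter. A sequential sketch of the semantics (hypothetical helper, not part of the PR):

```cpp
#include <cstddef>
#include <tuple>

// Sequential model of copy_if as the two-pass machinery sees it.
template <typename _InRng, typename _OutRng, typename _Pred>
std::size_t
copy_if_semantics(const _InRng& __in, _OutRng& __out, std::size_t __n, _Pred __pred)
{
    std::size_t __count = 0; // running inclusive scan of predicate counts
    for (std::size_t __i = 0; __i < __n; ++__i)
    {
        // __gen_expand_count_pred: produce (count, mask, element).
        auto __ele = __in[__i];
        bool __mask = __pred(__ele);
        auto __v = std::tuple(__mask ? std::size_t{1} : std::size_t{0}, __mask, __ele);
        // The scan combines over the zeroth component (__get_zeroth_element).
        __count += std::get<0>(__v);
        std::get<0>(__v) = __count;
        // __write_to_idx_if: scatter to __out[scanned - 1] only when selected.
        if (std::get<1>(__v))
            __out[std::get<0>(__v) - 1] = std::get<2>(__v);
    }
    return __count; // number of elements written
}
```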

template <typename _ExecutionPolicy, typename _Range1, typename _Range2, typename _UnaryOperation, typename _InitType,
typename _BinaryOperation, typename _Inclusive>
auto
@@ -801,20 +874,14 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
}
}

- // TODO: Reintegrate once support has been added
- //// Either we can't use group scan or this input is too big for one workgroup
- //using _Assigner = unseq_backend::__scan_assigner;
- //using _NoAssign = unseq_backend::__scan_no_assign;
- //using _UnaryFunctor = unseq_backend::walk_n<_ExecutionPolicy, _UnaryOperation>;
- //using _NoOpFunctor = unseq_backend::walk_n<_ExecutionPolicy, oneapi::dpl::__internal::__no_op>;
-
- //_Assigner __assign_op;
- //_NoAssign __no_assign_op;
- //_NoOpFunctor __get_data_op;
- return __future(__parallel_transform_reduce_then_scan(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec),
-                                                       ::std::forward<_Range1>(__in_rng), ::std::forward<_Range2>(__out_rng),
-                                                       __binary_op, __unary_op, __init, _Inclusive{})
-            .event());
+ oneapi::dpl::__par_backend_hetero::__gen_transform_input<oneapi::dpl::__internal::__value_t<_Range1>,
+                                                          _UnaryOperation>
+     __gen_transform{__unary_op};
+ return __future(__parallel_transform_reduce_then_scan(
+                     __backend_tag, ::std::forward<_ExecutionPolicy>(__exec), ::std::forward<_Range1>(__in_rng),
+                     ::std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform,
+                     oneapi::dpl::__internal::__no_op{}, __simple_write_to_idx{}, __init, _Inclusive{})
+            .event());
}
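With this wiring, `__gen_transform_input` serves as the generator for both passes, `__no_op` is the scan-to-value selector, and `__simple_write_to_idx` stores each scanned value at its own index. In terms of the earlier sketch (hypothetical glue code, not the oneDPL call site):

```cpp
#include <cstddef>
#include <functional>
#include <vector>

void
transform_scan_example()
{
    std::vector<int> __in{1, 2, 3, 4};
    std::vector<int> __out(4);
    auto __unary = [](int __x) { return 2 * __x; };
    reduce_then_scan_sketch<int>(
        __in.size(), /*__tile=*/2,
        [&](std::size_t __i) { return __unary(__in[__i]); }, // __gen_transform_input
        std::plus<int>{},                                     // __binary_op
        [&](std::size_t __i, int __v) { __out[__i] = __v; },  // __simple_write_to_idx
        /*__init=*/0);
    // __out == {2, 6, 12, 20}
}
```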

template <typename _SizeType>
@@ -915,15 +982,14 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
// The kernel stores n integers for the predicate and another n integers for the offsets
const auto __req_slm_size = sizeof(::std::uint16_t) * __n_uniform * 2;

- constexpr ::std::uint16_t __single_group_upper_limit = 16384;
+ constexpr ::std::uint16_t __single_group_upper_limit = 2048;

::std::size_t __max_wg_size = oneapi::dpl::__internal::__max_work_group_size(__exec);

if (__n <= __single_group_upper_limit && __max_slm_size >= __req_slm_size &&
__max_wg_size >= _SingleGroupInvoker::__targeted_wg_size)
{
- using _SizeBreakpoints =
-     ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384>;
+ using _SizeBreakpoints = ::std::integer_sequence<::std::uint16_t, 16, 32, 64, 128, 256, 512, 1024, 2048>;

return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch(
_SingleGroupInvoker{}, __n, ::std::forward<_ExecutionPolicy>(__exec), __n, ::std::forward<_InRng>(__in_rng),
@@ -932,13 +998,15 @@ __parallel_copy_if(oneapi::dpl::__internal::__device_backend_tag __backend_tag,
else
{
using _ReduceOp = ::std::plus<_Size>;
- using CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
- using CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, oneapi::dpl::__internal::__pstl_assign,
-                                               /*inclusive*/ ::std::true_type, 1>;
-
- return __parallel_scan_copy(__backend_tag, ::std::forward<_ExecutionPolicy>(__exec),
-                             ::std::forward<_InRng>(__in_rng), ::std::forward<_OutRng>(__out_rng), __n,
-                             CreateOp{__pred}, CopyOp{});
+ return __parallel_transform_reduce_then_scan(
+     __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
+     std::forward<_OutRng>(__out_rng), oneapi::dpl::__par_backend_hetero::__gen_count_pred<_Size, _Pred>{__pred},
+     _ReduceOp{}, oneapi::dpl::__par_backend_hetero::__gen_expand_count_pred<_Size, _Pred>{__pred},
+     oneapi::dpl::__par_backend_hetero::__get_zeroth_element{},
+     oneapi::dpl::__par_backend_hetero::__write_to_idx_if{},
+     oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
+     /*_Inclusive=*/std::true_type{});
}
}
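A quick check of the tightened dispatch bound (worked numbers; assumes `__n_uniform` rounds `__n` up to the next power of two, as computed just above this hunk):

```cpp
// At the new upper limit:
//   __n            = 2048
//   __n_uniform    = 2048 (already a power of two)
//   __req_slm_size = sizeof(std::uint16_t) * 2048 * 2 = 8192 bytes (8 KiB)
// That comfortably fits typical device SLM, so sizes up to 2048 keep the
// single-work-group kernel; larger inputs now take the generalized
// reduce-then-scan path instead of __parallel_scan_copy.
```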
