diff --git a/include/dlaf/factorization/qr.h b/include/dlaf/factorization/qr.h index ac8f8db9ea..26fdd5d16e 100644 --- a/include/dlaf/factorization/qr.h +++ b/include/dlaf/factorization/qr.h @@ -96,7 +96,7 @@ void computeTFactor(matrix::Panel& hh_panel, pika::shared_future> taus, pika::future> t, common::Pipeline& mpi_col_task_chain) { - QR_TfactorDistributed::call(hh_panel, taus, std::move(t), mpi_col_task_chain); + QR_Tfactor::call(hh_panel, taus, std::move(t), mpi_col_task_chain); } } diff --git a/include/dlaf/factorization/qr/api.h b/include/dlaf/factorization/qr/api.h index 2c1f9a6dfe..79add6fbce 100644 --- a/include/dlaf/factorization/qr/api.h +++ b/include/dlaf/factorization/qr/api.h @@ -31,6 +31,10 @@ struct QR_Tfactor { static void call(matrix::Panel& panel_view, pika::shared_future> taus, pika::future> t); + static void call(matrix::Panel& hh_panel, + pika::shared_future> taus, + pika::future> t, + common::Pipeline& mpi_col_task_chain); }; #ifdef DLAF_WITH_GPU @@ -39,21 +43,16 @@ struct QR_Tfactor { static void call(matrix::Panel& panel_view, pika::shared_future> taus, pika::future> t); -}; -#endif - -template -struct QR_TfactorDistributed { - static void call(matrix::Panel& hh_panel, + static void call(matrix::Panel& hh_panel, pika::shared_future> taus, - pika::future> t, + pika::future> t, common::Pipeline& mpi_col_task_chain); }; +#endif /// ---- ETI #define DLAF_FACTORIZATION_QR_TFACTOR_ETI(KWORD, BACKEND, DEVICE, DATATYPE) \ - KWORD template struct QR_Tfactor; \ - KWORD template struct QR_TfactorDistributed; + KWORD template struct QR_Tfactor; DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, float) DLAF_FACTORIZATION_QR_TFACTOR_ETI(extern, Backend::MC, Device::CPU, double) diff --git a/include/dlaf/factorization/qr/t_factor_impl.h b/include/dlaf/factorization/qr/t_factor_impl.h index 7fbc0c2b6a..4c1dd4aefe 100644 --- a/include/dlaf/factorization/qr/t_factor_impl.h +++ b/include/dlaf/factorization/qr/t_factor_impl.h @@ -393,11 
+393,14 @@ void QR_Tfactor::call(matrix::Panel -void QR_TfactorDistributed::call(matrix::Panel& hh_panel, - pika::shared_future> taus, - pika::future> t, - common::Pipeline& mpi_col_task_chain) { +template +void QR_Tfactor::call( + matrix::Panel& hh_panel, + pika::shared_future> taus, pika::future> t, + common::Pipeline& mpi_col_task_chain) { + constexpr auto B = Backend::MC; + constexpr auto D = Device::CPU; + namespace ex = pika::execution::experimental; // Fast return in case of no reflectors @@ -495,4 +498,68 @@ void QR_TfactorDistributed::call(matrix::Panel& hh_pa }); })); } + +#ifdef DLAF_WITH_GPU +template +void QR_Tfactor::call( + matrix::Panel& hh_panel, + pika::shared_future> taus, pika::future> t, + common::Pipeline& mpi_col_task_chain) { + constexpr auto B = Backend::GPU; + constexpr auto D = Device::GPU; + + namespace ex = pika::execution::experimental; + + using Helpers = tfactor_l::Helpers; + + // Fast return in case of no reflectors + if (hh_panel.getWidth() == 0) + return; + + const auto v_start = hh_panel.offsetElement(); + auto dist = hh_panel.parentDistribution(); + + ex::unique_any_sender> t_local = Helpers::set0(std::move(t)); + + // Note: + // T factor is an upper triangular square matrix, built column by column + // with taus values on the diagonal + // + // T(j,j) = tau(j) + // + // and in the upper triangular part the following formula applies + // + // T(0:j, j) = T(0:j, 0:j) . -tau(j) . V(j:, 0:j)* . V(j:, j) + // + // + // The result is achieved in two main steps: + // 1) t = -tau(j) . V(j:, 0:j)* . V(j:, j) + // 2) T(0:j, j) = T(0:j, 0:j) . t + + // 1st step: compute the column partial result `t` + // First we compute the matrix vector multiplication for each column + // -tau(j) . V(j:, 0:j)* . 
V(j:, j) + for (const auto& v_i_loc : hh_panel.iteratorLocal()) { + const SizeType v_i = dist.template globalTileFromLocalTile(v_i_loc.row()); + const SizeType first_row_tile = std::max(0, v_i * dist.blockSize().rows() - v_start); + + // TODO + // Note: + // Since we are always writing to the same t, the gemv are serialized + // A possible solution to this would be to have multiple places where to store partial + // results, and then locally reduce them just before the reduce over ranks + t_local = Helpers::gemvColumnT(first_row_tile, hh_panel.read(v_i_loc), taus, std::move(t_local)); + } + + // at this point each rank has its partial result for each column + // so, let's reduce the results (on all ranks, so that everyone can independently compute T factor) + if (true) // TODO if the column communicator has more than 1 tile...but I just have the pipeline + t_local = dlaf::comm::scheduleAllReduceInPlace(mpi_col_task_chain(), MPI_SUM, std::move(t_local)); + + // 2nd step: compute the T factor, by performing the last step on each column + // each column depends on the previous part (all reflectors that come before) + // so it is performed sequentially + ex::start_detached(Helpers::trmvUpdateColumn(std::move(t_local))); +} +#endif }