TriSolver (dist): move sorting permutation from CPU to GPU #1118

Merged · 20 commits · Jun 21, 2024
include/dlaf/eigensolver/tridiag_solver/impl.h · 6 changes: 5 additions & 1 deletion
@@ -228,6 +228,7 @@ void TridiagSolver<B, D, T>::call(Matrix<T, Device::CPU>& tridiag, Matrix<T, D>&
Matrix<T, D>(vec_size, vec_tile_size), // z1
Matrix<SizeType, D>(vec_size, vec_tile_size), // i2
Matrix<SizeType, D>(vec_size, vec_tile_size), // i5
Matrix<SizeType, D>(vec_size, vec_tile_size), // i5b
Matrix<SizeType, D>(vec_size, vec_tile_size)}; // i6

WorkSpaceHost<T> ws_h{Matrix<T, Device::CPU>(vec_size, vec_tile_size), // d0
@@ -398,6 +399,7 @@ void TridiagSolver<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, Device
// Auxiliary matrix used for the D&C algorithm
const matrix::Distribution& dist_evecs = evecs.distribution();
const matrix::Distribution& dist_evals = evals.distribution();
const matrix::Distribution dist_local({dist_evecs.local_size().cols(), 1}, dist_evecs.tile_size());

WorkSpace<T, D> ws{Matrix<T, D>(dist_evecs), // e0
Matrix<T, D>(dist_evecs), // e1
@@ -407,6 +409,7 @@ void TridiagSolver<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, Device
Matrix<T, D>(dist_evals), // z1
Matrix<SizeType, D>(dist_evals), // i2
Matrix<SizeType, D>(dist_evals), // i5
Matrix<SizeType, D>(dist_local), // i5b
Matrix<SizeType, D>(dist_evals)}; // i6

WorkSpaceHost<T> ws_h{Matrix<T, Device::CPU>(dist_evals), // d0
@@ -419,7 +422,8 @@ void TridiagSolver<B, D, T>::call(comm::CommunicatorGrid& grid, Matrix<T, Device
DistWorkSpaceHostMirror<T, D> ws_hm{initMirrorMatrix(ws.e0), initMirrorMatrix(ws.e2),
initMirrorMatrix(ws.d1), initMirrorMatrix(ws.z0),
initMirrorMatrix(ws.z1), initMirrorMatrix(ws.i2),
initMirrorMatrix(ws.i5), initMirrorMatrix(ws.i6)};
initMirrorMatrix(ws.i5), initMirrorMatrix(ws.i5b),
initMirrorMatrix(ws.i6)};

// Set `ws.e0` to `zero` (needed for Given's rotation to make sure no random values are picked up)
matrix::util::set0<B, T, D>(thread_priority::normal, ws.e0);
include/dlaf/eigensolver/tridiag_solver/merge.h · 73 changes: 43 additions & 30 deletions
@@ -126,6 +126,7 @@ struct WorkSpace {

Matrix<SizeType, D> i2;
Matrix<SizeType, D> i5;
Matrix<SizeType, D> i5b;
Matrix<SizeType, D> i6;
};

@@ -169,6 +170,7 @@ struct DistWorkSpaceHostMirror {

HostMirrorMatrix<SizeType, D> i2;
HostMirrorMatrix<SizeType, D> i5;
HostMirrorMatrix<SizeType, D> i5b;
HostMirrorMatrix<SizeType, D> i6;
};

@@ -387,7 +389,8 @@ auto stablePartitionIndexForDeflationArrays(const SizeType n, const ColType* typ
//
// Both permutations are represented using global indices, but:
// - @p index_sorted sorts "globally", i.e. considering all evecs across ranks
// - @p index_sorted_coltype sorts "locally", i.e. consdering just evecs from the same rank
// - @p index_sorted_coltype sorts "locally", i.e. considering just evecs from the same rank (global indices)
// - @p i5_lc sorts "locally", i.e. considering just evecs from current rank (local indices)
//
// In addition, even if all ranks have the full permutation, it is important to highlight that
// thanks to how it is built, i.e. rank-independent permutations, @p index_sorted_coltype can be used as
@@ -397,23 +400,28 @@ auto stablePartitionIndexForDeflationArrays(const SizeType n, const ColType* typ
// rank | 0 | 1 | 0 |
// initial | 0U| 1L| 2X| 3U| 4U| 5X| 6L| 7L| 8X|
// index_sorted | 0U| 1L| 3U| 4U| 6L| 7L| 2X| 5X| 8X| -> sort(non-deflated) | sort(deflated)
// index_sorted_col_type | 0U| 1L| 6L| 3U| 4U| 5X| 7L| 2X| 8X| -> rank0(ULLLXX) - rank1(UUX)
// index_sorted_col_type | 0U| 1L| 6L| 3U| 4U| 5X| 7L| 2X| 8X| -> rank0(ULLLXX) - rank1(UUX) (global indices)
// i5_lc | 0U| 1L| 3L|...........| 4L| 2X| 5X| -> rank0(ULLLXX) (local indices)
// i5_lc |...........| 0U| 1U| 2X|...........| -> rank1(UUX) (local indices)
//
// index_sorted_col_type can be used "locally":
// on rank0 | 0U| 1L| 6L| --| --| --| 7L| 2X| 8X| -> ULLLXX
// on rank1 | --| --| --| 3U| 4U| 5X| --| --| --| -> UUX
//
// i5_lc is just local to the current rank, and it is a view over index_sorted_col_type where indices are local.
//
// where U: Upper, D: Dense, L: Lower, X: Deflated
//
// The permutations will allow to keep the mapping between sorted eigenvalues and unsorted eigenvectors,
// which is useful since eigenvectors are more expensive to permute, so we can keep them in their
// initial order.
//
// @param types array[n] column type of each eigenvector after deflation (initial order)
// @param evals array[n] of eigenvalues sorted as perm_sorted
// @param perm_sorted array[n] current -> initial (i.e. evals[i] -> types[perm_sorted[i]])
// @param index_sorted array[n] global(sort(non-deflated)|sort(deflated))) -> initial
// @param index_sorted_coltype array[n] local(sort(upper)|sort(dense)|sort(lower)|sort(deflated))) -> initial
// @param types array[n] column type of each eigenvector after deflation (initial order)
// @param evals array[n] of eigenvalues sorted as perm_sorted
// @param perm_sorted array[n] current -> initial (i.e. evals[i] -> types[perm_sorted[i]])
// @param index_sorted array[n] global(sort(non-deflated)|sort(deflated))) -> initial
// @param index_sorted_coltype array[n] local(sort(upper)|sort(dense)|sort(lower)|sort(deflated))) -> initial
// @param i5_lc array[n_lc] local(sort(upper)|sort(dense)|sort(lower)|sort(deflated))) -> initial
Comment (Collaborator, PR author): note-to-self: specify that they are local indices, while in index_sorted_coltype they are global indices
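To make the global-vs-local distinction concrete, here is a minimal, self-contained sketch (plain C++, not the DLA-Future API) reproducing the worked example above: 9 columns, tile size 3, tiles dealt column-block-cyclically to ranks {0, 1}. The owner/to_local helpers are toy stand-ins for the Distribution conversions used in the real code.

#include <cstdio>
#include <vector>

int main() {
  const int n = 9, nb = 3, nranks = 2;
  // index_sorted_coltype from the example above (global indices)
  const std::vector<int> i5{0, 1, 6, 3, 4, 5, 7, 2, 8};

  auto owner = [&](int j) { return (j / nb) % nranks; };  // column -> owning rank
  auto to_local = [&](int j) { return (j / (nb * nranks)) * nb + j % nb; };

  for (int rank = 0; rank < nranks; ++rank) {
    std::printf("rank %d i5_lc:", rank);
    for (int jjj = 0; jjj < n; ++jjj)
      if (owner(jjj) == rank)                   // this slot is ours, and by
        std::printf(" %d", to_local(i5[jjj]));  // construction so is its value
    std::printf("\n");
  }
  // prints: rank 0 i5_lc: 0 1 3 4 2 5
  //         rank 1 i5_lc: 0 1 2
}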

//
// @return k number of non-deflated eigenvectors
// @return k_local number of local non-deflated eigenvectors
@@ -422,7 +430,7 @@ template <class T>
auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub, const ColType* types,
const T* evals, SizeType* perm_sorted,
SizeType* index_sorted, SizeType* index_sorted_coltype,
SizeType* i4, SizeType* i6) {
SizeType* i5_lc, SizeType* i4, SizeType* i6) {
const SizeType n = dist_sub.size().cols();
const SizeType k = std::count_if(types, types + n,
[](const ColType coltype) { return ColType::Deflated != coltype; });
@@ -506,6 +514,12 @@ auto stablePartitionIndexForDeflationArrays(const matrix::Distribution& dist_sub
index_sorted_coltype[to_sizet(jjj_el)] = jj_el;
}

// This is the local version of the previous one, just for the current rank
for (SizeType i_lc = 0; i_lc < dist_sub.local_size().cols(); ++i_lc) {
const SizeType i = dist_sub.global_element_from_local_element<Coord::Col>(i_lc);
i5_lc[i_lc] = dist_sub.local_element_from_global_element<Coord::Col>(index_sorted_coltype[i]);
}

std::array<SizeType, 3> n_udl = [&]() {
SizeType first_dense;
for (first_dense = 0; first_dense < n; ++first_dense) {
@@ -611,8 +625,8 @@ auto stablePartitionIndexForDeflation(
const matrix::Distribution& dist_evecs, const SizeType i_begin, const SizeType i_end,
Matrix<const ColType, Device::CPU>& c, Matrix<const T, Device::CPU>& evals,
Matrix<SizeType, Device::CPU>& in, Matrix<SizeType, Device::CPU>& out,
Matrix<SizeType, Device::CPU>& out_by_coltype, Matrix<SizeType, Device::CPU>& i4,
Matrix<SizeType, Device::CPU>& i6) {
Matrix<SizeType, Device::CPU>& out_by_coltype, Matrix<SizeType, Device::CPU>& i5_lc,
Matrix<SizeType, Device::CPU>& i4, Matrix<SizeType, Device::CPU>& i6) {
namespace ex = pika::execution::experimental;
namespace di = dlaf::internal;
using pika::execution::thread_stacksize;
@@ -624,25 +638,32 @@ auto stablePartitionIndexForDeflation(

auto part_fn = [dist_evecs_sub](const auto& c_tiles, const auto& evals_tiles, const auto& in_tiles,
const auto& out_tiles, const auto& out_coltype_tiles,
const auto& i4_tiles, const auto& i6_tiles) {
const auto& i5_lc_tiles, const auto& i4_tiles, const auto& i6_tiles) {
const TileElementIndex zero_idx(0, 0);
const ColType* c_ptr = c_tiles[0].get().ptr(zero_idx);
const T* evals_ptr = evals_tiles[0].get().ptr(zero_idx);
SizeType* in_ptr = in_tiles[0].ptr(zero_idx);
SizeType* out_ptr = out_tiles[0].ptr(zero_idx);
SizeType* out_coltype_ptr = out_coltype_tiles[0].ptr(zero_idx);
SizeType* i5_lc_ptr = i5_lc_tiles.size() > 0 ? i5_lc_tiles[0].ptr(zero_idx) : nullptr;
SizeType* i4_ptr = i4_tiles[0].ptr(zero_idx);
SizeType* i6_ptr = i6_tiles[0].ptr(zero_idx);

return stablePartitionIndexForDeflationArrays(dist_evecs_sub, c_ptr, evals_ptr, in_ptr, out_ptr,
out_coltype_ptr, i4_ptr, i6_ptr);
out_coltype_ptr, i5_lc_ptr, i4_ptr, i6_ptr);
};

const SizeType i_begin_lc = dist_evecs.next_local_tile_from_global_tile<Coord::Col>(i_begin);
const SizeType i_end_lc = dist_evecs.next_local_tile_from_global_tile<Coord::Col>(i_end);
auto i5_lc_snd =
select(i5_lc, common::iterate_range2d(LocalTileIndex(i_begin_lc, 0), LocalTileIndex(i_end_lc, 1)));

TileCollector tc{i_begin, i_end};
return ex::when_all(ex::when_all_vector(tc.read(c)), ex::when_all_vector(tc.read(evals)),
ex::when_all_vector(tc.readwrite(in)), ex::when_all_vector(tc.readwrite(out)),
ex::when_all_vector(tc.readwrite(out_by_coltype)),
ex::when_all_vector(tc.readwrite(i4)), ex::when_all_vector(tc.readwrite(i6))) |
ex::when_all_vector(std::move(i5_lc_snd)), ex::when_all_vector(tc.readwrite(i4)),
ex::when_all_vector(tc.readwrite(i6))) |
di::transform(di::Policy<Backend::MC>(thread_stacksize::nostack), std::move(part_fn));
}
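
The "sort(non-deflated) | sort(deflated)" ordering that index_sorted encodes (described in the comment block above) reduces to a stable partition. A minimal sketch, with illustrative enumerator names; the real routine additionally orders the deflated block and produces the col-type-grouped and local variants in a single pass:

#include <algorithm>
#include <numeric>
#include <vector>

enum class ColType { Upper, Dense, Lower, Deflated };

// Build a permutation with non-deflated entries first (stable) and
// deflated entries last (stable), as in the example matrices above.
std::vector<int> make_index_sorted(const std::vector<ColType>& types) {
  std::vector<int> idx(types.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::stable_partition(idx.begin(), idx.end(),
                        [&](int i) { return types[i] != ColType::Deflated; });
  return idx;
}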

@@ -1802,12 +1823,11 @@ void mergeDistSubproblems(comm::CommunicatorPipeline<comm::CommunicatorType::Ful

const GlobalElementIndex sub_offset{i_begin * dist.tile_size().rows(),
i_begin * dist.tile_size().cols()};
const matrix::Distribution dist_sub(
dist, {sub_offset,
{
global_tile_element_distance<Coord::Row>(dist, i_begin, i_end),
global_tile_element_distance<Coord::Col>(dist, i_begin, i_end),
}});
const GlobalElementSize sub_size{
global_tile_element_distance<Coord::Row>(dist, i_begin, i_end),
global_tile_element_distance<Coord::Col>(dist, i_begin, i_end),
};
const matrix::Distribution dist_sub(dist, {sub_offset, sub_size});

// Calculate the size of the upper subproblem
const SizeType n_upper = global_tile_element_distance<Coord::Row>(dist, i_begin, i_split);
@@ -1882,22 +1902,15 @@ void mergeDistSubproblems(comm::CommunicatorPipeline<comm::CommunicatorType::Ful
// - set deflated diagonal entries of `U` to 1 (temporary solution until optimized GEMM is implemented)
auto [k_unique, k_lc_unique, n_udl] =
ex::split_tuple(stablePartitionIndexForDeflation(dist, i_begin, i_end, ws_h.c, ws_h.d0, ws_hm.i2,
ws_h.i3, ws_hm.i5, ws_h.i4, ws_hm.i6));
ws_h.i3, ws_hm.i5, ws_hm.i5b, ws_h.i4, ws_hm.i6));

auto k = ex::split(std::move(k_unique));
auto k_lc = ex::split(std::move(k_lc_unique));

// Reorder Eigenvectors
using dlaf::permutations::internal::permuteJustLocal;
if constexpr (Backend::MC == B) {
copy(idx_begin_tiles_vec, sz_tiles_vec, ws_hm.i5, ws.i5);
permuteJustLocal<T, Coord::Col>(i_begin, i_end, ws.i5, ws.e0, ws.e1);
}
else {
copy(idx_loc_begin, sz_loc_tiles, ws.e0, ws_hm.e0);
permuteJustLocal<T, Coord::Col>(i_begin, i_end, ws_hm.i5, ws_hm.e0, ws_hm.e2);
copy(idx_loc_begin, sz_loc_tiles, ws_hm.e2, ws.e1);
}
copy(LocalTileIndex(idx_loc_begin.col(), 0), LocalTileSize(idx_loc_end.col() - idx_loc_begin.col(), 1),
ws_hm.i5b, ws.i5b);
permutations::permute<B, D, T, Coord::Col>(i_begin, i_end, ws.i5b, ws.e0, ws.e1);

// Reorder Eigenvalues
applyIndex(i_begin, i_end, ws_h.i3, ws_h.d0, ws_hm.d1);
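With the eigenvector-reordering change in this file, both backends now copy the local-index vector ws.i5b to the compute device and permute there, removing the MC-only shortcut and the host round-trip the removed GPU branch performed. In spirit, the out-of-place column permutation applied by permutations::permute looks like the sketch below (single-rank, column-major, plain C++; the direction shown, where output column j takes input column perm[j], is my reading of the index-map docs, and the real implementation works tile-by-tile):

// out[:, j] = in[:, perm[j]] for each local column j of the sub-problem.
void permute_cols(int m, int n_local, const int* perm, const double* in,
                  double* out) {
  for (int j = 0; j < n_local; ++j)
    for (int i = 0; i < m; ++i)
      out[i + j * m] = in[i + perm[j] * m];  // column-major storage
}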
include/dlaf/permutations/general.h · 19 changes: 11 additions & 8 deletions
@@ -28,7 +28,7 @@ namespace dlaf::permutations {
/// perms[i_begin:i_end].
///
/// @param perms is the index map of permutations represented as a tiled column vector. Indices are in
/// the range [0, n) where `n` is the size of the submatrix (i.e. the indices are local to the
/// the range [0, n) where `n` is the local size of the submatrix (i.e. the indices are local to the
/// submatrix, they are not global). Only tiles whose row tile coords are in the range
/// [i_begin,i_end) are accessed in read-only mode.
/// @pre @p perms is not distributed
@@ -37,7 +37,6 @@
///
/// @param mat_in is the input matrix. Only tiles whose both row and col tile coords are in
/// the range [i_begin,i_end) are accessed in read-only mode.
/// @pre @p mat_in is not distributed
/// @pre @p mat_in has size (N x N)
/// @pre @p mat_in has blocksize (NB x NB)
/// @pre @p mat_in has tilesize (NB x NB)
@@ -51,15 +50,14 @@ template <Backend B, Device D, class T, Coord coord>
void permute(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
Matrix<const T, D>& mat_in, Matrix<T, D>& mat_out) {
DLAF_ASSERT(matrix::local_matrix(perms), perms);
DLAF_ASSERT(matrix::local_matrix(mat_in), mat_in);
DLAF_ASSERT(matrix::local_matrix(mat_out), mat_out);
DLAF_ASSERT(matrix::same_process_grid(mat_in, mat_out), mat_in, mat_out);

// Note:
// These are not implementation constraints, but more logic constraints. Indeed, these ensure that
// the range [i_begin, i_end] is square in terms of elements (it would not make sense to have it square
// in terms of number of tiles). Moreover, by requiring mat_in and mat_out matrices to have the same
// shape, it is ensured that range [i_begin, i_end] is actually the same on both sides.
Comment (Collaborator) on lines 55 to 59: Constraints should be revised (in a different PR).

DLAF_ASSERT(square_size(mat_in), mat_in);
DLAF_ASSERT(matrix::square_size(mat_in), mat_in);
DLAF_ASSERT(matrix::square_blocksize(mat_in), mat_in);
DLAF_ASSERT(matrix::equal_size(mat_in, mat_out), mat_in);
DLAF_ASSERT(matrix::equal_blocksize(mat_in, mat_out), mat_in);
@@ -68,12 +66,17 @@ void permute(SizeType i_begin, SizeType i_end, Matrix<const SizeType, D>& perms,
DLAF_ASSERT(matrix::single_tile_per_block(mat_in), mat_in);
DLAF_ASSERT(matrix::single_tile_per_block(mat_out), mat_out);

DLAF_ASSERT(perms.size().rows() == mat_in.size().rows(), perms, mat_in);
DLAF_ASSERT(perms.block_size().rows() == mat_in.block_size().rows(), mat_in, perms);

// Note:
// perms is a column vector with a number of elements equal to the local part of matrix involved
// in the permutation, i.e. [i_begin, i_end), along coord axis
DLAF_ASSERT(perms.size().cols() == 1, perms);
DLAF_ASSERT(perms.blockSize().rows() == mat_in.blockSize().rows(), mat_in, perms);
DLAF_ASSERT(perms.size().rows() == mat_in.distribution().local_size().template get<coord>(), perms,
mat_in);

DLAF_ASSERT(i_begin >= 0 && i_begin <= i_end, i_begin, i_end);
DLAF_ASSERT(i_end <= perms.nrTiles().rows(), i_end, perms);
DLAF_ASSERT(i_end <= mat_in.nr_tiles().rows(), i_end, perms);

internal::Permutations<B, D, T, coord>::call(i_begin, i_end, perms, mat_in, mat_out);
}
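As a sanity check on the revised contract asserted above, a self-contained sketch (same toy layout as the earlier example, plain C++; local_cols is a hypothetical helper, not a DLA-Future function) showing that perms now needs one row per local column along the permuted coord rather than one per global column:

#include <cassert>

// Columns a rank owns under a 1D column-block-cyclic layout.
int local_cols(int n, int nb, int nranks, int rank) {
  int count = 0;
  for (int j = 0; j < n; ++j)
    if ((j / nb) % nranks == rank)
      ++count;
  return count;
}

int main() {
  // 9 columns, tile 3, 2 ranks: rank 0 owns tiles 0 and 2, rank 1 owns tile 1,
  // so perms has 6 rows on rank 0 and 3 rows on rank 1 (not 9 on both).
  assert(local_cols(9, 3, 2, 0) == 6);
  assert(local_cols(9, 3, 2, 1) == 3);
}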