From fe2cf93a43eb32a91be20324809d0e66db4ecba2 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Mon, 1 Jul 2024 09:40:41 +0200 Subject: [PATCH 01/18] Add sequential strength graph computation --- PartitionedSolvers/src/amg.jl | 99 ++++++++++++++++++++++ PartitionedSolvers/test/amg_tests.jl | 118 ++++++++++++++++++++++++++- 2 files changed, 215 insertions(+), 2 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 499a5230..981db5bf 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -306,6 +306,105 @@ function smoothed_aggregation(; (coarsen, coarsen!) end +function smoothed_aggregation_with_block_size(; + epsilon = 0, + approximate_omega = omega_for_1d_laplace, + tentative_prolongator = tentative_prolongator_for_laplace, + repartition_threshold = 2000, + block_size = 1, + ) + function coarsen(A,B) + # build strength graph + G = strength_graph(A, block_size=block_size, theta=epsilon) + diagG = dense_diag(G) + node_to_aggregate, node_aggregates = aggregate(G,diagG;epsilon) + # compute dof_to_aggregate, dof_aggregates + n_nullspace_vecs = length(B) + P0 = constant_prolongator(dof_to_aggregate, dof_aggregates,n_nullspace_vecs) + P0,Bc = tentative_prolongator(P0,B) # changes needed here + P = smoothed_prolongator(A,P0,diagG;approximate_omega) #changes needed here + R = transpose(P) + Ac,cache = rap(R,A,P;reuse=true) + Ac,Bc,R,P,cache,repartition_threshold = enhance_coarse_partition(A,Ac,Bc,R,P,cache,repartition_threshold)#maybe changes here + Ac,Bc,R,P,cache + end + function coarsen!(A,Ac,R,P,cache) + rap!(Ac,R,A,P,cache) + Ac,R,P,cache + end + (coarsen, coarsen!) +end + +function getblock!(B,A,ids_i,ids_j) + for (j,J) in enumerate(ids_j) + for (i,I) in enumerate(ids_i) + B[i,j] = A[I,J] + end + end +end + +function strength_graph(A::AbstractSparseMatrix, block_size::Integer; theta = 0) + + if block_size < 1 + error("Block size must be equal to or larger than 1.") + end + + if A.n != A.m + error("Matrix must be square.") + end + + if A.n % block_size != 0 + error("Matrix size must be multiple of block size.") + end + + if block_size == 1 + return A + end + + if theta < 0 + error("Expected a positive theta.") + end + + n_dofs = A.m + nnodes = Integer(n_dofs/block_size) + B = zeros(block_size,block_size) + diag_norms = zeros(nnodes) + + I = Int[] + J = Int[] + V = Float64[] + + for i_node in 1:nnodes + i_dofs = ((i_node-1)*block_size+1) : i_node*block_size + getblock!(B,A,i_dofs,i_dofs) + diag_norms[i_node] = norm(B) + if theta <= 1 + push!(I, i_node) + push!(J, i_node) + push!(V, 1.0) + end + end + + for j_node in 1:nnodes + for i_node in 1:nnodes + if j_node != i_node + i_dofs = ((i_node-1)*block_size+1) : i_node*block_size + j_dofs = ((j_node-1)*block_size+1) : j_node*block_size + getblock!(B,A,i_dofs,j_dofs) + # Calculate strength according to https://github.com/pyamg/pyamg/blob/e1fe54c93be1029c02ddcf84c2338a607b088703/pyamg/strength.py#L275 + if norm(B) >= theta * sqrt(diag_norms[i_node] * diag_norms[j_node]) + push!(I, i_node) + push!(J, j_node) + push!(V, 1.0) + end + end + end + end + + G = sparse(I, J, V, nnodes, nnodes) + G +end + function amg_level_params(; pre_smoother = additive_schwarz(gauss_seidel(;iters=1);iters=1), coarsening = smoothed_aggregation(;), diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index 125a584b..fbd0c626 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -6,6 +6,121 @@ using PartitionedSolvers using 
LinearAlgebra using Test using IterativeSolvers: cg! +using SparseArrays + +# Test strength graph computation +# First with Psparse matrix +nrows = 18 +ngrid = 9 +nrowgrid = 3 +ncols = nrows +p = 4 +ranks = DebugArray(LinearIndices((p,))) +row_partition = uniform_partition(ranks, nrows) + +IJV = map(row_partition) do row_indices + I, J, V = Int[], Int[], Float64[] + for i in local_to_global(row_indices) + push!(V, 9) + push!(I, i) + push!(J, i) + + grid_point = 0 + if (i % 2) == 1 + # 1st dimension + push!(V, -1) + push!(I, i) + push!(J, i+1) + grid_point = div((i+1),2) + else + # 2nd dimension + push!(V, -1) + push!(I, i) + push!(J, i-1) + grid_point = div(i,2) + end + north = grid_point - nrowgrid + south = grid_point + nrowgrid + east = grid_point - 1 + west = grid_point + 1 + if north >= 1 + push!(V, -1) + push!(I, i) + push!(J, 2*north - 1) + push!(V, -1) + push!(I, i) + push!(J, 2*north) + end + if south <= ngrid + push!(V, -1) + push!(I, i) + push!(J, 2*south -1) + push!(V, -1) + push!(I, i) + push!(J, 2*south) + end + if grid_point % nrowgrid != 1 + push!(V, -1) + push!(I, i) + push!(J, 2*east-1) + push!(V, -1) + push!(I, i) + push!(J, 2*east) + end + if grid_point % nrowgrid != 0 + push!(V, -1) + push!(I, i) + push!(J, 2*west-1) + push!(V, -1) + push!(I, i) + push!(J, 2*west) + end + end + I,J,V +end + +I,J,V = tuple_of_arrays(IJV) +col_partition = row_partition +A = psparse(I,J,V,row_partition, col_partition) |> fetch +# TODO: implement and test psparse matrix +#R = PartitionedSolvers.strength_graph(A, 2) + +# Now with CSC sparse matrix +# Test sample matrix with block size 2 +theta = 0.02 +A = centralize(A) + +# build solution +I = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, + 6, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9] +J = [1, 2, 4, + 1, 2, 3, 5, + 2, 3, 6, + 1, 4, 5, 7, + 2, 4, 5, 6, 8, + 3, 5, 6, 9, + 4, 7, 8, + 5, 7, 8, 9, + 6, 8, 9] +V = ones(length(I)) + +solution = sparse(I, J, V, ngrid, ngrid) +R = PartitionedSolvers.strength_graph(A, 2, theta=theta) +@test solution ≈ R + +# Another test with 3 dims +M = rand([-2.0, -1, 1, 2], (3, 3)) +M = sparse(M) +A = blockdiag(M, M, M) +solution = sparse([1, 2, 3], [1, 2, 3], fill(1.0, 3), 3, 3) +R = PartitionedSolvers.strength_graph(A, 3, theta=theta) +@test solution ≈ R + +# Test with minimal matrix size +R = PartitionedSolvers.strength_graph(M, 3, theta=theta) +solution = sparse([1], [1], 1.0, 1, 1) +@test solution ≈ R + # First with a sequential matrix nodes_per_dir = (100,100) @@ -124,5 +239,4 @@ x .= 0 Pl = setup(amg(),x,A,b) _, history = cg!(x,A,b;Pl,log=true) display(history) - -end +end \ No newline at end of file From 9d93e310d7a3ad629c722528ef3cbfed532060d8 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Fri, 19 Jul 2024 08:51:10 +0200 Subject: [PATCH 02/18] Add strength graph and tentative prolongator with block size --- PartitionedSolvers/src/amg.jl | 308 ++++++++++++++++++++++++--- PartitionedSolvers/test/amg_tests.jl | 131 +++++++++--- 2 files changed, 381 insertions(+), 58 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 981db5bf..6f848413 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -145,7 +145,6 @@ function constant_prolongator(node_to_aggregate,aggregates,n_nullspace_vecs) nnodes = length(node_to_aggregate) pending = typeof_aggregate(0) isolated = typeof_aggregate(-1) - nnodes = length(node_to_aggregate) naggregates = length(aggregates) aggregate_to_nodes_ptrs = zeros(Int,naggregates+1) for node in 1:nnodes @@ -198,6 
+197,79 @@ function constant_prolongator(node_to_aggregate::PVector,aggregates::PRange,n_nu P0 end +function collect_nodes_in_aggregate(node_to_aggregate,aggregates) + typeof_aggregate = eltype(node_to_aggregate) + nnodes = length(node_to_aggregate) + pending = typeof_aggregate(0) + isolated = typeof_aggregate(-1) + nnodes = length(node_to_aggregate) + naggregates = length(aggregates) + aggregate_to_nodes_ptrs = zeros(Int,naggregates+1) + for node in 1:nnodes + agg = node_to_aggregate[node] + if agg == pending || agg == isolated + continue + end + aggregate_to_nodes_ptrs[agg+1] += 1 + end + length_to_ptrs!(aggregate_to_nodes_ptrs) + ndata = aggregate_to_nodes_ptrs[end]-1 + aggregate_to_nodes_data = zeros(Int,ndata) + for node in 1:nnodes + agg = node_to_aggregate[node] + if agg == pending || agg == isolated + continue + end + p = aggregate_to_nodes_ptrs[agg] + aggregate_to_nodes_data[p] = node + aggregate_to_nodes_ptrs[agg] += 1 + end + rewind_ptrs!(aggregate_to_nodes_ptrs) + aggregate_to_nodes = jagged_array(aggregate_to_nodes_data,aggregate_to_nodes_ptrs) + aggregate_to_nodes +end + +function remove_singleton_aggregates(aggregate_to_nodes_old) + typeof_nodes = eltype(aggregate_to_nodes_old[1]) + n_aggregates_old = length(aggregate_to_nodes_old) + n_aggregates = 0 + n_data = 0 + for i in 1:n_aggregates_old + nnodes = aggregate_to_nodes_old.ptrs[i+1] - aggregate_to_nodes_old.ptrs[i] + if nnodes < 2 + continue + end + n_aggregates += 1 + n_data += nnodes + end + + # If there are no singleton aggregates, stop and return old array + if n_aggregates == n_aggregates_old + return aggregate_to_nodes_old + end + + # Else copy non-singleton aggregates to new array + aggregate_to_nodes_ptrs = zeros(Int, n_aggregates+1) + aggregate_to_nodes_data = zeros(typeof_nodes, n_data) + aggregate_to_nodes_ptrs[1] = 1 + agg = 1 + for i in 1:n_aggregates_old + nnodes = aggregate_to_nodes_old.ptrs[i+1] - aggregate_to_nodes_old.ptrs[i] + if nnodes < 2 + continue + end + aggregate_to_nodes_ptrs[agg+1] = aggregate_to_nodes_ptrs[agg] + nnodes + pini = aggregate_to_nodes_ptrs[agg] + pend = aggregate_to_nodes_ptrs[agg+1]-1 + pini_old = aggregate_to_nodes_old.ptrs[i] + pend_old = aggregate_to_nodes_old.ptrs[i+1]-1 + aggregate_to_nodes_data[pini:pend] = aggregate_to_nodes_old.data[pini_old:pend_old] + agg += 1 + end + aggregate_to_nodes = jagged_array(aggregate_to_nodes_data,aggregate_to_nodes_ptrs) + aggregate_to_nodes +end + function tentative_prolongator_for_laplace(P0,B) n_nullspace_vecs = length(B) if n_nullspace_vecs != 1 @@ -207,6 +279,107 @@ function tentative_prolongator_for_laplace(P0,B) P0,Bc end +function tentative_prolongator_with_block_size(aggregate_to_nodes::JaggedArray,B, block_size) + # A draft for the scalar case is commented below + ## TODO assumes CSC + # Algorithm 7 in https://mediatum.ub.tum.de/download/1229321/1229321.pdf + + if length(B) < 1 + error("Null space must contain at least one null vector.") + end + + n_aggregates = length(aggregate_to_nodes.ptrs)-1 + n_B = length(B) + n_dofs = length(B[1]) + n_dofs_c = n_aggregates * n_B + Bc = [Vector{Float64}(undef,n_dofs_c) for _ in 1:n_B] + + # Build P0 colptr + P0_colptr = zeros(Int, n_dofs_c + 1) + P0_colptr[1] = 1 + for i_agg in 1:n_aggregates + pini = aggregate_to_nodes.ptrs[i_agg] + pend = aggregate_to_nodes.ptrs[i_agg+1]-1 + n_nodes = length(pini:pend) + for b in 1:n_B + col = (i_agg - 1) * n_B + b + P0_colptr[col+1] += P0_colptr[col] + n_nodes * block_size + end + end + + # Build P0 rowvals + nnz = length(aggregate_to_nodes.data) * 
block_size * n_B + P0_rowval = zeros(Int, nnz) + for i_agg in 1:n_aggregates + pini = aggregate_to_nodes.ptrs[i_agg] + pend = aggregate_to_nodes.ptrs[i_agg+1]-1 + i_nodes = aggregate_to_nodes.data[pini:pend] + for b in 1:n_B + rval_ini = P0_colptr[(i_agg-1)*n_B+b] + for i_node in i_nodes + rval_end = rval_ini+block_size-1 + P0_rowval[rval_ini:rval_end] = node_to_dofs(i_node, block_size) + rval_ini = rval_end + 1 + end + end + end + + P0 = SparseMatrixCSC( + n_dofs, + n_dofs_c, + P0_colptr, + P0_rowval, + ones(nnz)) + + # Fill with nullspace vectors + for i_agg in 1:n_aggregates + for b in 1:n_B + col = (i_agg-1) * n_B + b + pini = P0.colptr[col] + pend = P0.colptr[col+1]-1 + rowids = P0.rowval[pini:pend] + P0.nzval[pini:pend] = B[b][rowids] + end + end + + # Compute QR decomposition for nullspace in each aggregate + for i_agg in 1:n_aggregates + # Copy values to Bi + pini = aggregate_to_nodes.ptrs[i_agg] + pend = aggregate_to_nodes.ptrs[i_agg+1]-1 + n_i = length(pini:pend) * block_size + Bi = zeros(n_i, n_B) + P0cols = (i_agg-1)*n_B+1 : i_agg*n_B + for (b, col) in enumerate(P0cols) + pini = P0.colptr[col] + pend = P0.colptr[col+1]-1 + Bi[:,b] = P0.nzval[pini:pend] + end + + # Compute thin QR decomposition + # TODO: check if QR decomposition is possible if n_i >= n_B, raise an error + F= qr(Bi) + Qi = F.Q * Matrix(I,(n_i, n_B)) + Qi = Qi[:,1:n_B] + Ri = F.R + + # Build global tentative prolongator + for (b, col) in enumerate(P0cols) + pini = P0.colptr[col] + pend = P0.colptr[col+1]-1 + P0.nzval[pini:pend] = Qi[:,b] + end + + # Build coarse null space + Bcrows = P0cols + for b in 1:n_B + Bc[b][Bcrows] = Ri[:, b] + end + end + + P0, Bc +end + function generic_tentative_prolongator(P0::SparseArrays.AbstractSparseMatrixCSC,B) error("not implemented yet") # A draft for the scalar case is commented below @@ -247,19 +420,42 @@ end function smoothed_prolongator(A,P0,diagA=dense_diag(A);approximate_omega) # TODO the performance of this one can be improved invDiagA = 1 ./ diagA - omega = approximate_omega(invDiagA,A) Dinv = PartitionedArrays.sparse_diag_matrix(invDiagA,(axes(A,1),axes(A,1))) + omega = approximate_omega(invDiagA,A,Dinv*A) P = (I-omega*Dinv*A)*P0 P end -function omega_for_1d_laplace(invD,A) +function omega_for_1d_laplace(invD,A, DinvA) # in general this will be (4/3)/ρ(D^-1*A) # with a cheap approximation (e.g., power method) of the spectral radius ρ(D^-1*A) # ρ(D^-1*A) == 2 for 1d Laplace problem 2/3 end +function lambda_generic(invD,A,DinvA) + ω = 4/3 + # Perform a few iterations of Power method to estimate lambda + # (Remark 3.5.2. 
in https://mediatum.ub.tum.de/download/1229321/1229321.pdf) + ρ = spectral_radius(DinvA, 20) + ω/ρ +end + +function spectral_radius(A, x, num_iterations:: Int) + # Choose diagonal vector as initial guess + y = zeros(size(A,1)) + # Power iteration + for i in 1:num_iterations + mul!(y, A, x) + y_norm = norm(y) + x = y/y_norm + end + # Compute spectral radius using Rayleigh coefficient + y = mul!(y, A, x) + ρ = (y' * x) / (x' * x) + abs(ρ) +end + function enhance_coarse_partition(A,Ac,Bc,R,P,cache,repartition_threshold) Ac,Bc,R,P,cache,repartition_threshold end @@ -308,21 +504,19 @@ end function smoothed_aggregation_with_block_size(; epsilon = 0, - approximate_omega = omega_for_1d_laplace, - tentative_prolongator = tentative_prolongator_for_laplace, + approximate_omega = lambda_generic, + tentative_prolongator = tentative_prolongator_with_block_size, repartition_threshold = 2000, block_size = 1, ) function coarsen(A,B) # build strength graph - G = strength_graph(A, block_size=block_size, theta=epsilon) + G = strength_graph(A, block_size=block_size, epsilon=epsilon) diagG = dense_diag(G) node_to_aggregate, node_aggregates = aggregate(G,diagG;epsilon) - # compute dof_to_aggregate, dof_aggregates - n_nullspace_vecs = length(B) - P0 = constant_prolongator(dof_to_aggregate, dof_aggregates,n_nullspace_vecs) - P0,Bc = tentative_prolongator(P0,B) # changes needed here - P = smoothed_prolongator(A,P0,diagG;approximate_omega) #changes needed here + aggregate_to_nodes = collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) + Pc,Bc,perm = tentative_prolongator(aggregate_to_nodes,B, block_size) + P = smoothed_prolongator(A,Pc,diagG;approximate_omega) R = transpose(P) Ac,cache = rap(R,A,P;reuse=true) Ac,Bc,R,P,cache,repartition_threshold = enhance_coarse_partition(A,Ac,Bc,R,P,cache,repartition_threshold)#maybe changes here @@ -343,7 +537,7 @@ function getblock!(B,A,ids_i,ids_j) end end -function strength_graph(A::AbstractSparseMatrix, block_size::Integer; theta = 0) +function strength_graph(A::AbstractSparseMatrix, block_size::Integer; epsilon = 0) if block_size < 1 error("Block size must be equal to or larger than 1.") @@ -361,50 +555,102 @@ function strength_graph(A::AbstractSparseMatrix, block_size::Integer; theta = 0) return A end - if theta < 0 - error("Expected a positive theta.") + if epsilon < 0 + error("Expected a positive epsilon.") end n_dofs = A.m - nnodes = Integer(n_dofs/block_size) + nnodes = div(n_dofs, block_size) B = zeros(block_size,block_size) diag_norms = zeros(nnodes) + + # Count nonzero values + nnz = 0 - I = Int[] - J = Int[] - V = Float64[] + # If epsilon <= 1, all diagonal entries are in the graph + if epsilon <= 1 + nnz += nnodes + end + # Calculate norms of diagonal values first for i_node in 1:nnodes - i_dofs = ((i_node-1)*block_size+1) : i_node*block_size + i_dofs = node_to_dofs(i_node, block_size) getblock!(B,A,i_dofs,i_dofs) diag_norms[i_node] = norm(B) - if theta <= 1 - push!(I, i_node) - push!(J, i_node) - push!(V, 1.0) - end end for j_node in 1:nnodes for i_node in 1:nnodes if j_node != i_node - i_dofs = ((i_node-1)*block_size+1) : i_node*block_size - j_dofs = ((j_node-1)*block_size+1) : j_node*block_size + i_dofs = node_to_dofs(i_node, block_size) + j_dofs = node_to_dofs(j_node, block_size) getblock!(B,A,i_dofs,j_dofs) # Calculate strength according to https://github.com/pyamg/pyamg/blob/e1fe54c93be1029c02ddcf84c2338a607b088703/pyamg/strength.py#L275 - if norm(B) >= theta * sqrt(diag_norms[i_node] * diag_norms[j_node]) - push!(I, i_node) - push!(J, j_node) - 
push!(V, 1.0) + if norm(B) >= epsilon * sqrt(diag_norms[i_node] * diag_norms[j_node]) + nnz += 1 end end end end - + + # Allocate data structures for sparsematrix + I = zeros(Int, nnz) + J = zeros(Int, nnz) + V = zeros(nnz) + + i_nz = 1 + + for j_node in 1:nnodes + for i_node in 1:nnodes + if j_node == i_node + # Diagonal entries are always in graph if epsilon <= 1 + if epsilon <= 1 + I[i_nz] = i_node + J[i_nz] = j_node + V[i_nz] = 1.0 + i_nz += 1 + end + else + i_dofs = node_to_dofs(i_node, block_size) + j_dofs = node_to_dofs(j_node, block_size) + getblock!(B,A,i_dofs,j_dofs) + # Calculate strength according to https://github.com/pyamg/pyamg/blob/e1fe54c93be1029c02ddcf84c2338a607b088703/pyamg/strength.py#L275 + if norm(B) >= epsilon * sqrt(diag_norms[i_node] * diag_norms[j_node]) + I[i_nz] = i_node + J[i_nz] = j_node + V[i_nz] = 1.0 + i_nz += 1 + end + end + end + end + G = sparse(I, J, V, nnodes, nnodes) G end +function node_to_dofs(i_node, block_size) + # Convert single node in graph to range of dofs + ((i_node-1)*block_size+1) : i_node*block_size +end + +function dofs_to_node(dofs, block_size) + # Convert range of dofs to single node in graph + div(dofs[end], block_size) +end + +function amg_level_params_linear_elasticity(; + pre_smoother = additive_schwarz(gauss_seidel(;iters=1);iters=1), + coarsening = smoothed_aggregation(approximate_omega = lambda_generic, + tentative_prolongator = tentative_prolongator_with_block_size), + cycle = v_cycle, + pos_smoother = pre_smoother, + ) + + level_params = (;pre_smoother,pos_smoother,coarsening,cycle) + level_params +end + function amg_level_params(; pre_smoother = additive_schwarz(gauss_seidel(;iters=1);iters=1), coarsening = smoothed_aggregation(;), diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index fbd0c626..1ed77203 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -1,5 +1,8 @@ module AMGTests +# just for debugging +using Pkg +Pkg.activate("PartitionedSolvers") using PartitionedArrays using PartitionedArrays: laplace_matrix using PartitionedSolvers @@ -10,13 +13,12 @@ using SparseArrays # Test strength graph computation # First with Psparse matrix -nrows = 18 -ngrid = 9 -nrowgrid = 3 -ncols = nrows +ndofs = 18 +nnodes = 9 +nrows = 3 p = 4 ranks = DebugArray(LinearIndices((p,))) -row_partition = uniform_partition(ranks, nrows) +row_partition = uniform_partition(ranks, ndofs) IJV = map(row_partition) do row_indices I, J, V = Int[], Int[], Float64[] @@ -25,24 +27,24 @@ IJV = map(row_partition) do row_indices push!(I, i) push!(J, i) - grid_point = 0 + node = 0 if (i % 2) == 1 # 1st dimension push!(V, -1) push!(I, i) push!(J, i+1) - grid_point = div((i+1),2) + node = div((i+1),2) else # 2nd dimension push!(V, -1) push!(I, i) push!(J, i-1) - grid_point = div(i,2) + node = div(i,2) end - north = grid_point - nrowgrid - south = grid_point + nrowgrid - east = grid_point - 1 - west = grid_point + 1 + north = node - nrows + south = node + nrows + east = node - 1 + west = node + 1 if north >= 1 push!(V, -1) push!(I, i) @@ -51,7 +53,7 @@ IJV = map(row_partition) do row_indices push!(I, i) push!(J, 2*north) end - if south <= ngrid + if south <= nnodes push!(V, -1) push!(I, i) push!(J, 2*south -1) @@ -59,7 +61,7 @@ IJV = map(row_partition) do row_indices push!(I, i) push!(J, 2*south) end - if grid_point % nrowgrid != 1 + if node % nrows != 1 push!(V, -1) push!(I, i) push!(J, 2*east-1) @@ -67,7 +69,7 @@ IJV = map(row_partition) do row_indices push!(I, i) push!(J, 
2*east) end - if grid_point % nrowgrid != 0 + if node % nrows != 0 push!(V, -1) push!(I, i) push!(J, 2*west-1) @@ -83,12 +85,12 @@ I,J,V = tuple_of_arrays(IJV) col_partition = row_partition A = psparse(I,J,V,row_partition, col_partition) |> fetch # TODO: implement and test psparse matrix -#R = PartitionedSolvers.strength_graph(A, 2) + # Now with CSC sparse matrix # Test sample matrix with block size 2 -theta = 0.02 -A = centralize(A) +epsilon = 0.02 +A_test = centralize(A) # build solution I = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, @@ -104,26 +106,101 @@ J = [1, 2, 4, 6, 8, 9] V = ones(length(I)) -solution = sparse(I, J, V, ngrid, ngrid) -R = PartitionedSolvers.strength_graph(A, 2, theta=theta) -@test solution ≈ R +solution = sparse(I, J, V, nnodes, nnodes) +G_test = PartitionedSolvers.strength_graph(A_test, 2, epsilon=epsilon) +@test solution ≈ G_test # Another test with 3 dims M = rand([-2.0, -1, 1, 2], (3, 3)) M = sparse(M) A = blockdiag(M, M, M) solution = sparse([1, 2, 3], [1, 2, 3], fill(1.0, 3), 3, 3) -R = PartitionedSolvers.strength_graph(A, 3, theta=theta) -@test solution ≈ R +G = PartitionedSolvers.strength_graph(A, 3, epsilon=epsilon) +@test solution ≈ G # Test with minimal matrix size -R = PartitionedSolvers.strength_graph(M, 3, theta=theta) +G = PartitionedSolvers.strength_graph(M, 3, epsilon=epsilon) solution = sparse([1], [1], 1.0, 1, 1) -@test solution ≈ R +@test solution ≈ G + +# Test tentative prolongator +function random_nullspace(ndofs, n_B) + B = Array{Array{Float64, 1}, 1}(undef, n_B) + for i = 1:n_B + B[i] = rand(ndofs) + end + B +end +# Test tentative prolongator with laplace matrix +nodes_per_dir = (90,90) +block_size = 3 +A = laplace_matrix(nodes_per_dir) +G = PartitionedSolvers.strength_graph(A, block_size, epsilon=epsilon) +diagG = dense_diag(G) +B = random_nullspace(size(A, 1), block_size) +node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G,diagG;epsilon) +aggregate_to_nodes_old = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) +aggregate_to_nodes = PartitionedSolvers.remove_singleton_aggregates(aggregate_to_nodes_old) +# Assert there are no singleton aggregates +@assert length(aggregate_to_nodes_old) == length(aggregate_to_nodes) +Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B, block_size) +@test Pc * stack(Bc) ≈ stack(B) + + +# Test spectral radius estimation +n_its = [2, 4, 6, 8, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100] +n_trials = 10 +errors_powm = zeros(n_trials, length(n_its)) +time_powm = zeros(n_trials,length(n_its)) +errors_spectrad = zeros(n_trials,length(n_its)) +time_spectrad = zeros(n_trials,length(n_its)) +using IterativeSolvers +using Plots +using LinearMaps +using Statistics +A = laplace_matrix((100,100)) +diagA = dense_diag(A) +invD = 1 ./ diagA +x0 = rand(size(A,2)) +M = invD .* A +Fmap = LinearMap(M) +exp = 2.0 +for (i, n_it) in enumerate(n_its) + for t in 1:n_trials + # Power method + tic = time_ns() + λ, x = powm!(Fmap, x0, maxiter = n_it) + toc = time_ns() + time_powm[t,i] = toc - tic + errors_powm[t,i] = abs((λ-exp)/exp) + + # own implementation + tic = time_ns() + ρ = PartitionedSolvers.spectral_radius(M,x0, n_it) + toc = time_ns() + time_spectrad[t,i] = toc-tic + errors_spectrad[t,i] = abs((ρ-exp)/exp) + end +end +avg_time_powm = median(time_powm, dims=1) +avg_time_spectrad = median(time_spectrad, dims=1) +avg_error_powm = median(errors_powm, dims=1) +avg_error_spectrad = median(errors_spectrad, dims=1) +avg_time_powm = 
avg_time_powm ./ 10^9 +avg_time_spectrad = avg_time_spectrad ./ 10^9 +p1 = plot(n_its, [avg_error_powm' avg_error_spectrad'], label = ["powm" "spectrad"], + marker=[:c :x], ms=2) +plot!(p1, xlabel="#iterations", ylabel="avg rel error", xticks = n_its) +p2 = plot(n_its, [avg_time_powm' avg_time_spectrad'], label = ["powm" "spectrad"], + marker=[:c :x], ms=2) +plot!(p2, xlabel="#iterations", ylabel="avg runtime (s)", xticks = n_its) +p = plot(p1, p2, layout=(2,1), suptitle="Convergence of powm and spectral_radius") +savefig(p, "C:/Users/gelie/Home/ComputationalScience/GSoC/powm_convergence.png") +display(p) # First with a sequential matrix -nodes_per_dir = (100,100) +#= nodes_per_dir = (100,100) A = laplace_matrix(nodes_per_dir) using Random Random.seed!(1) @@ -238,5 +315,5 @@ x = similar(b,axes(A,2)) x .= 0 Pl = setup(amg(),x,A,b) _, history = cg!(x,A,b;Pl,log=true) -display(history) +display(history) =# end \ No newline at end of file From e7ea591a15d87daeccda33e10e9f497d2a43b4b1 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Fri, 19 Jul 2024 08:52:12 +0200 Subject: [PATCH 03/18] Add IterativeSolvers dependency --- PartitionedSolvers/src/PartitionedSolvers.jl | 1 + 1 file changed, 1 insertion(+) diff --git a/PartitionedSolvers/src/PartitionedSolvers.jl b/PartitionedSolvers/src/PartitionedSolvers.jl index 1c7899dc..bf08d25a 100644 --- a/PartitionedSolvers/src/PartitionedSolvers.jl +++ b/PartitionedSolvers/src/PartitionedSolvers.jl @@ -3,6 +3,7 @@ module PartitionedSolvers using PartitionedArrays using SparseArrays using LinearAlgebra +using IterativeSolvers export setup export solve! From ae89c8d6405cc86fba38c6c3c92ec912a550c2f5 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Wed, 24 Jul 2024 10:06:09 +0200 Subject: [PATCH 04/18] Add spectral radius estimation --- PartitionedSolvers/src/amg.jl | 18 ++-- PartitionedSolvers/test/amg_tests.jl | 137 +++++++++++++++++---------- 2 files changed, 101 insertions(+), 54 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 6f848413..a47b333a 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -348,6 +348,11 @@ function tentative_prolongator_with_block_size(aggregate_to_nodes::JaggedArray,B pini = aggregate_to_nodes.ptrs[i_agg] pend = aggregate_to_nodes.ptrs[i_agg+1]-1 n_i = length(pini:pend) * block_size + + if n_i < n_B + error("Singleton node aggregate: Case not implemented.") + end + Bi = zeros(n_i, n_B) P0cols = (i_agg-1)*n_B+1 : i_agg*n_B for (b, col) in enumerate(P0cols) @@ -357,7 +362,6 @@ function tentative_prolongator_with_block_size(aggregate_to_nodes::JaggedArray,B end # Compute thin QR decomposition - # TODO: check if QR decomposition is possible if n_i >= n_B, raise an error F= qr(Bi) Qi = F.Q * Matrix(I,(n_i, n_B)) Qi = Qi[:,1:n_B] @@ -437,15 +441,16 @@ function lambda_generic(invD,A,DinvA) ω = 4/3 # Perform a few iterations of Power method to estimate lambda # (Remark 3.5.2. 
in https://mediatum.ub.tum.de/download/1229321/1229321.pdf) - ρ = spectral_radius(DinvA, 20) + x0 = rand(size(DinvA,2)) + ρ = spectral_radius(DinvA, x0, maxiter=20) ω/ρ end function spectral_radius(A, x, num_iterations:: Int) # Choose diagonal vector as initial guess - y = zeros(size(A,1)) + y = similar(x) # Power iteration - for i in 1:num_iterations + for _ in 1:num_iterations mul!(y, A, x) y_norm = norm(y) x = y/y_norm @@ -453,7 +458,7 @@ function spectral_radius(A, x, num_iterations:: Int) # Compute spectral radius using Rayleigh coefficient y = mul!(y, A, x) ρ = (y' * x) / (x' * x) - abs(ρ) + abs(ρ), x end function enhance_coarse_partition(A,Ac,Bc,R,P,cache,repartition_threshold) @@ -516,7 +521,8 @@ function smoothed_aggregation_with_block_size(; node_to_aggregate, node_aggregates = aggregate(G,diagG;epsilon) aggregate_to_nodes = collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) Pc,Bc,perm = tentative_prolongator(aggregate_to_nodes,B, block_size) - P = smoothed_prolongator(A,Pc,diagG;approximate_omega) + diagA = dense_diag(A) + P = smoothed_prolongator(A,Pc,diagA;approximate_omega) R = transpose(P) Ac,cache = rap(R,A,P;reuse=true) Ac,Bc,R,P,cache,repartition_threshold = enhance_coarse_partition(A,Ac,Bc,R,P,cache,repartition_threshold)#maybe changes here diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index c4fea87c..c4d46524 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -7,8 +7,11 @@ using PartitionedArrays using PartitionedArrays: laplace_matrix using PartitionedSolvers using LinearAlgebra -using Test +using IterativeSolvers using IterativeSolvers: cg! +using Plots +using Statistics +using Test using SparseArrays # Test strength graph computation @@ -133,71 +136,109 @@ function random_nullspace(ndofs, n_B) end # Test tentative prolongator with laplace matrix -nodes_per_dir = (90,90) block_size = 3 -A = laplace_matrix(nodes_per_dir) +parts_per_dir = (2,2,2) +p = prod(parts_per_dir) +ranks = DebugArray(LinearIndices((p,))) +nodes_per_dir = map(i->block_size*i,parts_per_dir) +args = laplacian_fdm(nodes_per_dir,parts_per_dir,ranks) +A_dist = psparse(args...) 
|> fetch +A = centralize(A_dist) +println("dims A: $(size(A))") G = PartitionedSolvers.strength_graph(A, block_size, epsilon=epsilon) diagG = dense_diag(G) B = random_nullspace(size(A, 1), block_size) node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G,diagG;epsilon) -aggregate_to_nodes_old = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) -aggregate_to_nodes = PartitionedSolvers.remove_singleton_aggregates(aggregate_to_nodes_old) -# Assert there are no singleton aggregates -@assert length(aggregate_to_nodes_old) == length(aggregate_to_nodes) +aggregate_to_nodes = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B, block_size) @test Pc * stack(Bc) ≈ stack(B) +# Test spectral radius sequential & parallel +diagA = dense_diag(A_dist) +invD = 1 ./ diagA +Dinv = PartitionedArrays.sparse_diag_matrix(invD,(axes(A_dist,1),axes(A_dist,1))) +M = Dinv * A_dist +exp = eigmax(Array(centralize(M))) +cols = axes(M, 2) +x0 = prand(partition(cols)) +x0_seq = collect(x0) +l, x = PartitionedSolvers.spectral_radius(M, x0, 10) +lseq, x = PartitionedSolvers.spectral_radius(centralize(M), x0_seq, 10) +@test l ≈ lseq +@test abs((l-exp)/exp) < 2*10^-1 # Test spectral radius estimation -n_its = [2, 4, 6, 8, 10, 15, 20, 30, 40, 50, 60, 70, 80, 90, 100] -n_trials = 10 -errors_powm = zeros(n_trials, length(n_its)) -time_powm = zeros(n_trials,length(n_its)) -errors_spectrad = zeros(n_trials,length(n_its)) -time_spectrad = zeros(n_trials,length(n_its)) -using IterativeSolvers -using Plots -using LinearMaps -using Statistics -A = laplace_matrix((100,100)) -diagA = dense_diag(A) -invD = 1 ./ diagA -x0 = rand(size(A,2)) -M = invD .* A -Fmap = LinearMap(M) -exp = 2.0 -for (i, n_it) in enumerate(n_its) +#= maxiter = 5000 +nnodes = [8, 16, 32] +exp = [1.9396926207859075, 1.9829730996838997, 1.9954719225730886] +msizes = zeros(Int, length(nnodes)) +parts_per_dir = (2,2,2) +p = prod(parts_per_dir) +ranks = DebugArray(LinearIndices((p,))) +n_trials = 3 +A_dims = zeros(length(nnodes)) +errors_powm = zeros(length(nnodes), n_trials, maxiter) +time_powm = zeros(length(nnodes), n_trials) +errors_spectrad = zeros(length(nnodes), n_trials, maxiter) +time_spectrad = zeros(length(nnodes), n_trials) + +for (n_i, n) in enumerate(nnodes) + nodes_per_dir = (n, n, n) + args = laplacian_fdm(nodes_per_dir,parts_per_dir,ranks) + A = psparse(args...) 
|> fetch |> centralize + println("dims A: $(size(A))") + msizes[n_i] = size(A,1) + diagA = dense_diag(A) + invD = 1 ./ diagA + M = invD .* A + #M_dense = Array(M) + #exp[n_i] = max(abs(eigmax(M_dense)), abs(eigmin(M_dense))) for t in 1:n_trials - # Power method + x0 = rand(size(A,2)) + lprev, x = powm!(M, x0, maxiter=1) tic = time_ns() - λ, x = powm!(Fmap, x0, maxiter = n_it) + for i in 1:maxiter + # Power method + l, x = powm!(M, x, maxiter = 1) + errors_powm[n_i, t, i] = abs(l-lprev) + lprev = l + end toc = time_ns() - time_powm[t,i] = toc - tic - errors_powm[t,i] = abs((λ-exp)/exp) - - # own implementation + time_powm[n_i,t] = (toc - tic)/10^9 + ρprev, x = PartitionedSolvers.spectral_radius(M,x0, 1) tic = time_ns() - ρ = PartitionedSolvers.spectral_radius(M,x0, n_it) + for i in 1:maxiter + # own implementation + ρ, x = PartitionedSolvers.spectral_radius(M,x, 1) + errors_spectrad[n_i, t, i] = abs(ρ-ρprev) + ρprev = ρ + end toc = time_ns() - time_spectrad[t,i] = toc-tic - errors_spectrad[t,i] = abs((ρ-exp)/exp) + time_spectrad[n_i, t] = (toc-tic)/10^9 end end -avg_time_powm = median(time_powm, dims=1) -avg_time_spectrad = median(time_spectrad, dims=1) -avg_error_powm = median(errors_powm, dims=1) -avg_error_spectrad = median(errors_spectrad, dims=1) -avg_time_powm = avg_time_powm ./ 10^9 -avg_time_spectrad = avg_time_spectrad ./ 10^9 -p1 = plot(n_its, [avg_error_powm' avg_error_spectrad'], label = ["powm" "spectrad"], - marker=[:c :x], ms=2) -plot!(p1, xlabel="#iterations", ylabel="avg rel error", xticks = n_its) -p2 = plot(n_its, [avg_time_powm' avg_time_spectrad'], label = ["powm" "spectrad"], - marker=[:c :x], ms=2) -plot!(p2, xlabel="#iterations", ylabel="avg runtime (s)", xticks = n_its) +avg_time_powm = median(time_powm, dims=2) +avg_time_spectrad = median(time_spectrad, dims=2) +avg_error_powm = median(errors_powm, dims=2) +avg_error_spectrad = median(errors_spectrad, dims=2) + +p1 = plot() +plot!(p1, [1], [0], color="black", label="powm") +plot!(p1, [1], [0], color="black", ls=:dash, label="spectrad") +for (n_i, n) in enumerate(msizes) + plot!(p1, 1:maxiter, avg_error_powm[n_i,:,:]', label = "($(n),$(n))", color= n_i) + plot!(p1, 1:maxiter, avg_error_spectrad[n_i,:,:]', label = "", color=n_i, ls=:dash) +end + +p2 = plot(msizes, [avg_time_powm avg_time_spectrad], label = ["powm" "spectrad"], color="black", marker=[:c :x]) +yticks=[10^-16, 10^-14, 10^-12, 10^-10, 10^-8, 10^-6, 10^-4, 10^-2, 10^0] +xticks=[10^0, 10^1, 10^2, 10^3] +plot!(p1, xlabel="#iterations (k)", ylabel="|λ(k) - exp|", legend=:outertopleft, xscale=:log, + xticks=xticks, yscale=:log, ylim=(10^-16, 1), yticks=yticks) +plot!(p2, xlabel="size of matrix", ylabel="avg runtime (s)", xscale=:log10) p = plot(p1, p2, layout=(2,1), suptitle="Convergence of powm and spectral_radius") -savefig(p, "C:/Users/gelie/Home/ComputationalScience/GSoC/powm_convergence.png") -display(p) +savefig(p, "C:/Users/gelie/Home/ComputationalScience/GSoC/powm_l-lprev_k$(maxiter)_m$(msizes[end]).png") +display(p) =# # First with a sequential matrix #= nodes_per_dir = (100,100) From b8c4470d4e503396cae398678de90e139e39540f Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Fri, 26 Jul 2024 11:36:10 +0200 Subject: [PATCH 05/18] Add parallel implementation of strength_graph and collect_nodes_in_aggregates --- PartitionedSolvers/src/amg.jl | 85 ++++++++++++++++++++++++++-- PartitionedSolvers/test/amg_tests.jl | 74 +++++++++++++++++++----- 2 files changed, 141 insertions(+), 18 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl 
b/PartitionedSolvers/src/amg.jl index a47b333a..69366971 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -197,6 +197,22 @@ function constant_prolongator(node_to_aggregate::PVector,aggregates::PRange,n_nu P0 end +function collect_nodes_in_aggregate(node_to_aggregate::PVector,aggregates::PRange) + aggregate_to_nodes_partition = map(own_values(node_to_aggregate), partition(aggregates)) do own_node_to_global_aggregate, my_aggregates + # convert to local ids + global_to_local_aggregate = global_to_local(my_aggregates) + local_aggregates = global_to_local_aggregate[my_aggregates] + @assert all(local_aggregates .!= 0) + n_local_aggregates = length(local_aggregates) + own_node_to_local_aggregate = map(own_node_to_global_aggregate) do global_aggregate + global_to_local_aggregate[global_aggregate] + end + local_aggregate_to_local_nodes = collect_nodes_in_aggregate(own_node_to_local_aggregate, 1:n_local_aggregates) + local_aggregate_to_local_nodes + end + PVector(aggregate_to_nodes_partition, partition(aggregates)) +end + function collect_nodes_in_aggregate(node_to_aggregate,aggregates) typeof_aggregate = eltype(node_to_aggregate) nnodes = length(node_to_aggregate) @@ -279,6 +295,25 @@ function tentative_prolongator_for_laplace(P0,B) P0,Bc end +function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, block_size) + # B vector of pvectors + own_values_B = map(own_values, B) + P0_partition, Bc_partition = map(own_values(aggregate_to_nodes), own_values_B...) do own_aggregate_to_local_nodes, local_dof_to_b... + P0_own_own, local_coarse_dof_to_Bc = tentative_prolongator_with_block_size(own_aggregate_to_local_nodes, local_dof_to_b, block_size) + # create P0 sparse matrix as in strength graph + # partition in B + # return tuple here + end |> tuple_of_arrays + # partition for the coarse dofs + P0 = PSparseMatrix(P0_partition, dof_partition, coarse_dof_partition) + Bc = map(Bc_partition) do bic_partition + + PVector(bic_partition, coarse_dof_partition) + end + P0, Bc +end + + function tentative_prolongator_with_block_size(aggregate_to_nodes::JaggedArray,B, block_size) # A draft for the scalar case is commented below ## TODO assumes CSC @@ -516,13 +551,13 @@ function smoothed_aggregation_with_block_size(; ) function coarsen(A,B) # build strength graph - G = strength_graph(A, block_size=block_size, epsilon=epsilon) + G = strength_graph(A, block_size=block_size, epsilon=epsilon) # TODO parallel diagG = dense_diag(G) node_to_aggregate, node_aggregates = aggregate(G,diagG;epsilon) - aggregate_to_nodes = collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) - Pc,Bc,perm = tentative_prolongator(aggregate_to_nodes,B, block_size) + aggregate_to_nodes = collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) # TODO: provide parallel version + Pc,Bc,perm = tentative_prolongator(aggregate_to_nodes,B, block_size) # TODO parallel diagA = dense_diag(A) - P = smoothed_prolongator(A,Pc,diagA;approximate_omega) + P = smoothed_prolongator(A,Pc,diagA;approximate_omega) # the given approximate omega should work in parallel R = transpose(P) Ac,cache = rap(R,A,P;reuse=true) Ac,Bc,R,P,cache,repartition_threshold = enhance_coarse_partition(A,Ac,Bc,R,P,cache,repartition_threshold)#maybe changes here @@ -543,6 +578,48 @@ function getblock!(B,A,ids_i,ids_j) end end +function strength_graph(A::PSparseMatrix, block_size::Integer; epsilon = 0) + dof_partition = partition(axes(A,1)) + node_partition = map(dof_partition) do my_dofs + n_local_dofs = length(my_dofs) + 
@assert n_local_dofs % block_size == 0 + @assert own_length(my_dofs) == n_local_dofs + n_own_nodes = div(n_local_dofs, block_size) + own_to_global_node = zeros(Int, n_own_nodes) + for local_node in 1:n_own_nodes + local_dof = (local_node - 1) * block_size + 1 + global_dof = my_dofs[local_dof] + global_node = div(global_dof -1, block_size) +1 + own_to_global_node[local_node] = global_node + end + n_global_dofs = global_length(my_dofs) + n_global_nodes = div(n_global_dofs, block_size) + @assert n_global_dofs % block_size == 0 + own_nodes = OwnIndices(n_global_nodes, part_id(my_dofs), own_to_global_node) + ghost_nodes = GhostIndices(n_global_nodes, Int[], Int32[]) + my_nodes = OwnAndGhostIndices(own_nodes, ghost_nodes) + my_nodes + end + + G_partition = map(own_own_values(A), node_partition) do A_own_own, my_nodes + G_own_own = strength_graph(A_own_own, block_size, epsilon=epsilon) + n_own_nodes = own_length(my_nodes) + n_ghost_nodes = ghost_length(my_nodes) + @assert n_ghost_nodes == 0 + @assert size(G_own_own, 1) == n_own_nodes + @assert size(G_own_own, 2) == size(G_own_own, 1) + G_own_ghost = sparse(Int[], Int[], eltype(G_own_own)[], n_own_nodes, n_ghost_nodes) + G_ghost_own = sparse(Int[], Int[], eltype(G_own_own)[], n_ghost_nodes, n_own_nodes) + G_ghost_ghost = sparse(Int[], Int[], eltype(G_own_own)[], n_ghost_nodes, n_ghost_nodes) + blocks = PartitionedArrays.split_matrix_blocks(G_own_own, G_own_ghost, G_ghost_own, G_ghost_ghost) + perm = PartitionedArrays.local_permutation(my_nodes) + PartitionedArrays.split_matrix(blocks, perm, perm) + end + + G = PSparseMatrix(G_partition, node_partition, node_partition, true) + G +end + function strength_graph(A::AbstractSparseMatrix, block_size::Integer; epsilon = 0) if block_size < 1 diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index c4d46524..47d7cf99 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -87,13 +87,6 @@ end I,J,V = tuple_of_arrays(IJV) col_partition = row_partition A = psparse(I,J,V,row_partition, col_partition) |> fetch -# TODO: implement and test psparse matrix - - -# Now with CSC sparse matrix -# Test sample matrix with block size 2 -epsilon = 0.02 -A_test = centralize(A) # build solution I = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, @@ -108,10 +101,15 @@ J = [1, 2, 4, 5, 7, 8, 9, 6, 8, 9] V = ones(length(I)) - solution = sparse(I, J, V, nnodes, nnodes) -G_test = PartitionedSolvers.strength_graph(A_test, 2, epsilon=epsilon) -@test solution ≈ G_test + +epsilon = 0.02 + +# Test with CSC sparse matrix +# Test sample matrix with block size 2 +A_seq = centralize(A) +G_seq = PartitionedSolvers.strength_graph(A_seq, 2, epsilon=epsilon) +@test solution ≈ G_seq # Another test with 3 dims M = rand([-2.0, -1, 1, 2], (3, 3)) @@ -126,6 +124,52 @@ G = PartitionedSolvers.strength_graph(M, 3, epsilon=epsilon) solution = sparse([1], [1], 1.0, 1, 1) @test solution ≈ G +# Test strength_graph with psparse matrix +block_size = 3 +parts_per_dir = (2,2,2) +p = prod(parts_per_dir) +ranks = DebugArray(LinearIndices((p,))) +nodes_per_dir = map(i->block_size*i,parts_per_dir) +args = laplacian_fdm(nodes_per_dir,parts_per_dir,ranks) +A_dist = psparse(args...) 
|> fetch +A_seq = centralize(A_dist) +G_seq = PartitionedSolvers.strength_graph(A_seq, block_size, epsilon=epsilon) +G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon) +diff = G_seq - centralize(G_dist) +println("Difference between sequential and parallel strength_graph:") +display(diff) + +# Test sequential collect nodes in aggregate +diagG = dense_diag(centralize(G_dist)) +node_to_aggregate_seq, node_aggregates_seq = PartitionedSolvers.aggregate(centralize(G_dist),diagG;epsilon) +aggregate_to_nodes_seq = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate_seq, node_aggregates_seq) +@test length(aggregate_to_nodes_seq.data) == length(node_to_aggregate_seq) +for i_agg in node_aggregates_seq + pini = aggregate_to_nodes_seq.ptrs[i_agg] + pend = aggregate_to_nodes_seq.ptrs[i_agg+1]-1 + nodes = aggregate_to_nodes_seq.data[pini:pend] + @test all(node_to_aggregate_seq[nodes] .== i_agg) +end + +# Test parallel collect_nodes_in_aggregate +diagG = dense_diag(G_dist) +node_to_aggregate_dist, node_aggregates_dist = PartitionedSolvers.aggregate(G_dist,diagG;epsilon) +aggregate_to_nodes_dist = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate_dist, node_aggregates_dist) +map(partition(aggregate_to_nodes_dist), partition(node_to_aggregate_dist), partition(node_aggregates_dist)) do my_aggregate_to_nodes, my_node_to_aggregate, my_aggregates + @test length(my_aggregate_to_nodes.data) == length(my_node_to_aggregate) + global_to_local_aggregate = global_to_local(my_aggregates) + local_aggregates = global_to_local_aggregate[my_aggregates] + own_node_to_local_aggregate = map(my_node_to_aggregate) do global_aggregate + global_to_local_aggregate[global_aggregate] + end + for i_agg in local_aggregates + pini = my_aggregate_to_nodes.ptrs[i_agg] + pend = my_aggregate_to_nodes.ptrs[i_agg+1]-1 + nodes = my_aggregate_to_nodes.data[pini:pend] + @test all(own_node_to_local_aggregate[nodes] .== i_agg) + end +end + # Test tentative prolongator function random_nullspace(ndofs, n_B) B = Array{Array{Float64, 1}, 1}(undef, n_B) @@ -144,7 +188,6 @@ nodes_per_dir = map(i->block_size*i,parts_per_dir) args = laplacian_fdm(nodes_per_dir,parts_per_dir,ranks) A_dist = psparse(args...) 
|> fetch A = centralize(A_dist) -println("dims A: $(size(A))") G = PartitionedSolvers.strength_graph(A, block_size, epsilon=epsilon) diagG = dense_diag(G) B = random_nullspace(size(A, 1), block_size) @@ -153,6 +196,9 @@ aggregate_to_nodes = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggre Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B, block_size) @test Pc * stack(Bc) ≈ stack(B) +# Test tentative prolongator with parallel matrix + + # Test spectral radius sequential & parallel diagA = dense_diag(A_dist) invD = 1 ./ diagA @@ -233,13 +279,13 @@ end p2 = plot(msizes, [avg_time_powm avg_time_spectrad], label = ["powm" "spectrad"], color="black", marker=[:c :x]) yticks=[10^-16, 10^-14, 10^-12, 10^-10, 10^-8, 10^-6, 10^-4, 10^-2, 10^0] xticks=[10^0, 10^1, 10^2, 10^3] -plot!(p1, xlabel="#iterations (k)", ylabel="|λ(k) - exp|", legend=:outertopleft, xscale=:log, +plot!(p1, xlabel="#iterations (k)", ylabel="|λ(k) - λ(k-1)|", legend=:outertopleft, xscale=:log, xticks=xticks, yscale=:log, ylim=(10^-16, 1), yticks=yticks) plot!(p2, xlabel="size of matrix", ylabel="avg runtime (s)", xscale=:log10) p = plot(p1, p2, layout=(2,1), suptitle="Convergence of powm and spectral_radius") savefig(p, "C:/Users/gelie/Home/ComputationalScience/GSoC/powm_l-lprev_k$(maxiter)_m$(msizes[end]).png") -display(p) =# - +display(p) + =# # First with a sequential matrix #= nodes_per_dir = (100,100) A = laplace_matrix(nodes_per_dir) From d2299465bc6c6937cb9bb6e833d44a270f645055 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Wed, 7 Aug 2024 14:32:50 +0200 Subject: [PATCH 06/18] Add parallel implementation of tentative prolongator --- PartitionedSolvers/src/amg.jl | 63 +++++++++++++++++++++------- PartitionedSolvers/test/amg_tests.jl | 32 +++++++++++--- 2 files changed, 76 insertions(+), 19 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 69366971..03522187 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -298,19 +298,56 @@ end function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, block_size) # B vector of pvectors own_values_B = map(own_values, B) - P0_partition, Bc_partition = map(own_values(aggregate_to_nodes), own_values_B...) do own_aggregate_to_local_nodes, local_dof_to_b... - P0_own_own, local_coarse_dof_to_Bc = tentative_prolongator_with_block_size(own_aggregate_to_local_nodes, local_dof_to_b, block_size) - # create P0 sparse matrix as in strength graph - # partition in B - # return tuple here + n_B = length(B) + P0_partition, Bc_partition, coarse_dof_to_Bc... = map(aggregate_to_nodes.index_partition, B[1].index_partition, local_values(aggregate_to_nodes), own_values_B...) do local_aggregate_local_indices, local_dof_local_indices, local_aggregate_to_local_nodes, own_dof_to_b... 
+ P0_own_own, local_coarse_dof_to_Bc = tentative_prolongator_with_block_size(local_aggregate_to_local_nodes, own_dof_to_b, block_size) + # Create P0 partition + n_global_aggregates = global_length(local_aggregate_local_indices) + n_own_aggregates = own_length(local_aggregate_local_indices) + n_local_aggregates = local_length(local_aggregate_local_indices) + n_ghost_aggregates = ghost_length(local_aggregate_local_indices) + n_own_dofs = length(own_dof_to_b[1]) + n_own_coarse_dofs = n_own_aggregates * n_B + n_global_coarse_dofs = n_global_aggregates * n_B + @assert n_own_aggregates == n_local_aggregates + @assert n_ghost_aggregates == 0 + @assert size(P0_own_own, 1) == n_own_dofs + @assert size(P0_own_own, 2) == n_own_coarse_dofs + @assert length(local_coarse_dof_to_Bc) == n_B + @assert length(local_coarse_dof_to_Bc[1]) == n_own_coarse_dofs + P0_own_ghost = sparse(Int[], Int[], eltype(P0_own_own)[], n_own_aggregates, n_ghost_aggregates) + P0_ghost_own = sparse(Int[], Int[], eltype(P0_own_own)[], n_ghost_aggregates, n_own_aggregates) + P0_ghost_ghost = sparse(Int[], Int[], eltype(P0_own_own)[], n_ghost_aggregates, n_ghost_aggregates) + blocks = PartitionedArrays.split_matrix_blocks(P0_own_own, P0_own_ghost, P0_ghost_own, P0_ghost_ghost) + perm_aggs = PartitionedArrays.local_permutation(local_aggregate_local_indices) + # Turn permutation of aggregates to permutation of null vectors + perm_cols = zeros(Int, size(P0_own_own,2)) + for (i, agg) in enumerate(perm_aggs) + cols = (agg - 1) * n_B + 1 : agg * n_B + inds = (i - 1) * n_B + 1 : i * n_B + perm_cols[inds] = cols + end + perm_rows = PartitionedArrays.local_permutation(local_dof_local_indices) + P0_partition = PartitionedArrays.split_matrix(blocks, perm_rows, perm_cols) + # Partition of Bc + own_to_global_coarse_dofs = zeros(Int, n_own_coarse_dofs) + for (i, agg) in enumerate(local_to_global(local_aggregate_local_indices)) + inds = (i-1) * n_B + 1 : i * n_B + global_coarse_dofs = (agg-1) * n_B + 1 : agg * n_B + own_to_global_coarse_dofs[inds] = global_coarse_dofs + end + own_coarse_dofs = OwnIndices(n_global_coarse_dofs, part_id(local_dof_local_indices), own_to_global_coarse_dofs) + ghost_coarse_dofs = GhostIndices(n_global_coarse_dofs, Int[], Int32[]) + my_coarse_dofs = OwnAndGhostIndices(own_coarse_dofs, ghost_coarse_dofs) + P0_partition, my_coarse_dofs, local_coarse_dof_to_Bc... 
end |> tuple_of_arrays # partition for the coarse dofs - P0 = PSparseMatrix(P0_partition, dof_partition, coarse_dof_partition) - Bc = map(Bc_partition) do bic_partition - - PVector(bic_partition, coarse_dof_partition) - end - P0, Bc + P0 = PSparseMatrix(P0_partition, B[1].index_partition, Bc_partition, true) + Bc = Array{PVector}(undef, n_B) + for (i,b) in enumerate(coarse_dof_to_Bc) + Bc[i] = PVector(b, Bc_partition) + end + P0, Bc end @@ -918,6 +955,4 @@ function amg_finalize!(setup) end coarse_solver_setup = setup.coarse_level.coarse_solver_setup finalize!(coarse_solver_setup) - nothing -end - +end \ No newline at end of file diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index 47d7cf99..a7444b64 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -136,8 +136,6 @@ A_seq = centralize(A_dist) G_seq = PartitionedSolvers.strength_graph(A_seq, block_size, epsilon=epsilon) G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon) diff = G_seq - centralize(G_dist) -println("Difference between sequential and parallel strength_graph:") -display(diff) # Test sequential collect nodes in aggregate diagG = dense_diag(centralize(G_dist)) @@ -171,7 +169,7 @@ map(partition(aggregate_to_nodes_dist), partition(node_to_aggregate_dist), parti end # Test tentative prolongator -function random_nullspace(ndofs, n_B) +function random_nullspace(ndofs::Int, n_B) B = Array{Array{Float64, 1}, 1}(undef, n_B) for i = 1:n_B B[i] = rand(ndofs) @@ -179,6 +177,14 @@ function random_nullspace(ndofs, n_B) B end +function random_nullspace(index_partition::AbstractArray, n_B) + B = Array{PVector}(undef, n_B) + for i = 1:n_B + B[i] = prand(index_partition) + end + B +end + # Test tentative prolongator with laplace matrix block_size = 3 parts_per_dir = (2,2,2) @@ -197,6 +203,21 @@ Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_n @test Pc * stack(Bc) ≈ stack(B) # Test tentative prolongator with parallel matrix +G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon) +diagG = dense_diag(G_dist) +node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G_dist,diagG;epsilon) +aggregate_to_nodes = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) +B_dist = random_nullspace(partition(axes(A_dist,1)), block_size) +Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B_dist, block_size) +Bc_matrix = zeros(size(Pc,2), length(Bc)) +for (i,b) in enumerate(Bc) + Bc_matrix[:,i] = collect(b) +end +B_matrix = zeros(size(Pc,1), length(Bc)) +for (i,b) in enumerate(B) + B_matrix[:,i] = collect(b) +end +@test centralize(Pc) * Bc_matrix ≈ B_matrix # TODO: test fails # Test spectral radius sequential & parallel @@ -285,9 +306,10 @@ plot!(p2, xlabel="size of matrix", ylabel="avg runtime (s)", xscale=:log10) p = plot(p1, p2, layout=(2,1), suptitle="Convergence of powm and spectral_radius") savefig(p, "C:/Users/gelie/Home/ComputationalScience/GSoC/powm_l-lprev_k$(maxiter)_m$(msizes[end]).png") display(p) - =# + + # First with a sequential matrix -#= nodes_per_dir = (100,100) +nodes_per_dir = (100,100) A = laplace_matrix(nodes_per_dir) using Random Random.seed!(1) From 7cf1469400bbb58aae83a5d5138b703d923acc49 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Fri, 9 Aug 2024 09:50:15 +0200 Subject: [PATCH 07/18] Add tests for parallel tentative prolongator --- PartitionedSolvers/src/amg.jl | 4 ++-- 
PartitionedSolvers/test/amg_tests.jl | 10 +++++++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 03522187..9ff28005 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -338,8 +338,8 @@ function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, bl end own_coarse_dofs = OwnIndices(n_global_coarse_dofs, part_id(local_dof_local_indices), own_to_global_coarse_dofs) ghost_coarse_dofs = GhostIndices(n_global_coarse_dofs, Int[], Int32[]) - my_coarse_dofs = OwnAndGhostIndices(own_coarse_dofs, ghost_coarse_dofs) - P0_partition, my_coarse_dofs, local_coarse_dof_to_Bc... + Bc_partition = OwnAndGhostIndices(own_coarse_dofs, ghost_coarse_dofs) + P0_partition, Bc_partition, local_coarse_dof_to_Bc... end |> tuple_of_arrays # partition for the coarse dofs P0 = PSparseMatrix(P0_partition, B[1].index_partition, Bc_partition, true) diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index a7444b64..2ea00328 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -205,19 +205,23 @@ Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_n # Test tentative prolongator with parallel matrix G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon) diagG = dense_diag(G_dist) +n_B = block_size node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G_dist,diagG;epsilon) aggregate_to_nodes = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) -B_dist = random_nullspace(partition(axes(A_dist,1)), block_size) +B_dist = random_nullspace(partition(axes(A_dist,1)), n_B) Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B_dist, block_size) +for i in 1:n_B + @test isa(Bc[i], PVector) +end Bc_matrix = zeros(size(Pc,2), length(Bc)) for (i,b) in enumerate(Bc) Bc_matrix[:,i] = collect(b) end B_matrix = zeros(size(Pc,1), length(Bc)) -for (i,b) in enumerate(B) +for (i,b) in enumerate(B_dist) B_matrix[:,i] = collect(b) end -@test centralize(Pc) * Bc_matrix ≈ B_matrix # TODO: test fails +@test centralize(Pc) * Bc_matrix ≈ B_matrix # Test spectral radius sequential & parallel From e06c74e1d8970e0dcc5d561aa96655e3a8df56e3 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Fri, 9 Aug 2024 10:12:50 +0200 Subject: [PATCH 08/18] Remove plots of power method convergence --- PartitionedSolvers/src/amg.jl | 2 - PartitionedSolvers/test/amg_tests.jl | 83 ++-------------------------- 2 files changed, 4 insertions(+), 81 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 9ff28005..6ca6cc85 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -296,7 +296,6 @@ function tentative_prolongator_for_laplace(P0,B) end function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, block_size) - # B vector of pvectors own_values_B = map(own_values, B) n_B = length(B) P0_partition, Bc_partition, coarse_dof_to_Bc... = map(aggregate_to_nodes.index_partition, B[1].index_partition, local_values(aggregate_to_nodes), own_values_B...) do local_aggregate_local_indices, local_dof_local_indices, local_aggregate_to_local_nodes, own_dof_to_b... 
@@ -341,7 +340,6 @@ function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, bl Bc_partition = OwnAndGhostIndices(own_coarse_dofs, ghost_coarse_dofs) P0_partition, Bc_partition, local_coarse_dof_to_Bc... end |> tuple_of_arrays - # partition for the coarse dofs P0 = PSparseMatrix(P0_partition, B[1].index_partition, Bc_partition, true) Bc = Array{PVector}(undef, n_B) for (i,b) in enumerate(coarse_dof_to_Bc) diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index 2ea00328..558ffec5 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -9,7 +9,6 @@ using PartitionedSolvers using LinearAlgebra using IterativeSolvers using IterativeSolvers: cg! -using Plots using Statistics using Test using SparseArrays @@ -223,7 +222,6 @@ for (i,b) in enumerate(B_dist) end @test centralize(Pc) * Bc_matrix ≈ B_matrix - # Test spectral radius sequential & parallel diagA = dense_diag(A_dist) invD = 1 ./ diagA @@ -238,79 +236,6 @@ lseq, x = PartitionedSolvers.spectral_radius(centralize(M), x0_seq, 10) @test l ≈ lseq @test abs((l-exp)/exp) < 2*10^-1 -# Test spectral radius estimation -#= maxiter = 5000 -nnodes = [8, 16, 32] -exp = [1.9396926207859075, 1.9829730996838997, 1.9954719225730886] -msizes = zeros(Int, length(nnodes)) -parts_per_dir = (2,2,2) -p = prod(parts_per_dir) -ranks = DebugArray(LinearIndices((p,))) -n_trials = 3 -A_dims = zeros(length(nnodes)) -errors_powm = zeros(length(nnodes), n_trials, maxiter) -time_powm = zeros(length(nnodes), n_trials) -errors_spectrad = zeros(length(nnodes), n_trials, maxiter) -time_spectrad = zeros(length(nnodes), n_trials) - -for (n_i, n) in enumerate(nnodes) - nodes_per_dir = (n, n, n) - args = laplacian_fdm(nodes_per_dir,parts_per_dir,ranks) - A = psparse(args...) 
|> fetch |> centralize - println("dims A: $(size(A))") - msizes[n_i] = size(A,1) - diagA = dense_diag(A) - invD = 1 ./ diagA - M = invD .* A - #M_dense = Array(M) - #exp[n_i] = max(abs(eigmax(M_dense)), abs(eigmin(M_dense))) - for t in 1:n_trials - x0 = rand(size(A,2)) - lprev, x = powm!(M, x0, maxiter=1) - tic = time_ns() - for i in 1:maxiter - # Power method - l, x = powm!(M, x, maxiter = 1) - errors_powm[n_i, t, i] = abs(l-lprev) - lprev = l - end - toc = time_ns() - time_powm[n_i,t] = (toc - tic)/10^9 - ρprev, x = PartitionedSolvers.spectral_radius(M,x0, 1) - tic = time_ns() - for i in 1:maxiter - # own implementation - ρ, x = PartitionedSolvers.spectral_radius(M,x, 1) - errors_spectrad[n_i, t, i] = abs(ρ-ρprev) - ρprev = ρ - end - toc = time_ns() - time_spectrad[n_i, t] = (toc-tic)/10^9 - end -end -avg_time_powm = median(time_powm, dims=2) -avg_time_spectrad = median(time_spectrad, dims=2) -avg_error_powm = median(errors_powm, dims=2) -avg_error_spectrad = median(errors_spectrad, dims=2) - -p1 = plot() -plot!(p1, [1], [0], color="black", label="powm") -plot!(p1, [1], [0], color="black", ls=:dash, label="spectrad") -for (n_i, n) in enumerate(msizes) - plot!(p1, 1:maxiter, avg_error_powm[n_i,:,:]', label = "($(n),$(n))", color= n_i) - plot!(p1, 1:maxiter, avg_error_spectrad[n_i,:,:]', label = "", color=n_i, ls=:dash) -end - -p2 = plot(msizes, [avg_time_powm avg_time_spectrad], label = ["powm" "spectrad"], color="black", marker=[:c :x]) -yticks=[10^-16, 10^-14, 10^-12, 10^-10, 10^-8, 10^-6, 10^-4, 10^-2, 10^0] -xticks=[10^0, 10^1, 10^2, 10^3] -plot!(p1, xlabel="#iterations (k)", ylabel="|λ(k) - λ(k-1)|", legend=:outertopleft, xscale=:log, - xticks=xticks, yscale=:log, ylim=(10^-16, 1), yticks=yticks) -plot!(p2, xlabel="size of matrix", ylabel="avg runtime (s)", xscale=:log10) -p = plot(p1, p2, layout=(2,1), suptitle="Convergence of powm and spectral_radius") -savefig(p, "C:/Users/gelie/Home/ComputationalScience/GSoC/powm_l-lprev_k$(maxiter)_m$(msizes[end]).png") -display(p) - # First with a sequential matrix nodes_per_dir = (100,100) @@ -334,7 +259,7 @@ amg_statistics(S) |> display # Non-default options level_params = amg_level_params(; - pre_smoother = jacobi(;iters=10,omega=2/3), + pre_smoother = PartitionedSolvers.jacobi(;iters=10,omega=2/3), cycle = v_cycle, ) @@ -361,7 +286,7 @@ finalize!(S) # Now as a preconditioner level_params = amg_level_params(; - pre_smoother = gauss_seidel(;iters=1), + pre_smoother = PartitionedSolvers.gauss_seidel(;iters=1), ) fine_params = amg_fine_params(;level_params) @@ -409,7 +334,7 @@ solve!(y,S,b) finalize!(S) level_params = amg_level_params(; - pre_smoother = jacobi(;iters=1,omega=2/3), + pre_smoother = PartitionedSolvers.jacobi(;iters=1,omega=2/3), coarsening = smoothed_aggregation(;repartition_threshold=10000000) ) @@ -434,5 +359,5 @@ x = similar(b,axes(A,2)) x .= 0 Pl = setup(amg(),x,A,b) _, history = cg!(x,A,b;Pl,log=true) -display(history) =# +display(history) end \ No newline at end of file From e2a8e86dcf344d53adf7917cafafc7dc8eec4fba Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Tue, 13 Aug 2024 09:11:28 +0200 Subject: [PATCH 09/18] Use linear elasticity matrix for testing --- PartitionedSolvers/test/amg_tests.jl | 28 ++++++++++++---------------- 1 file changed, 12 insertions(+), 16 deletions(-) diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index 558ffec5..8208eaa9 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -3,7 +3,7 @@ module AMGTests # just for 
debugging using Pkg Pkg.activate("PartitionedSolvers") -using PartitionedArrays +using PartitionedArrays using PartitionedArrays: laplace_matrix using PartitionedSolvers using LinearAlgebra @@ -123,18 +123,22 @@ G = PartitionedSolvers.strength_graph(M, 3, epsilon=epsilon) solution = sparse([1], [1], 1.0, 1, 1) @test solution ≈ G -# Test strength_graph with psparse matrix +# Create Psparse Test matrix (Linear Elasticity) block_size = 3 parts_per_dir = (2,2,2) p = prod(parts_per_dir) ranks = DebugArray(LinearIndices((p,))) nodes_per_dir = map(i->block_size*i,parts_per_dir) -args = laplacian_fdm(nodes_per_dir,parts_per_dir,ranks) -A_dist = psparse(args...) |> fetch +args = PartitionedArrays.linear_elasticity_fem(nodes_per_dir,parts_per_dir,ranks) +A_dist = psparse(args...) |> fetch A_seq = centralize(A_dist) +display(A_seq) + +# Test strength graph with sequential and parallel linear elasticity matrix G_seq = PartitionedSolvers.strength_graph(A_seq, block_size, epsilon=epsilon) G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon) diff = G_seq - centralize(G_dist) +display(diff) # Test sequential collect nodes in aggregate diagG = dense_diag(centralize(G_dist)) @@ -185,18 +189,10 @@ function random_nullspace(index_partition::AbstractArray, n_B) end # Test tentative prolongator with laplace matrix -block_size = 3 -parts_per_dir = (2,2,2) -p = prod(parts_per_dir) -ranks = DebugArray(LinearIndices((p,))) -nodes_per_dir = map(i->block_size*i,parts_per_dir) -args = laplacian_fdm(nodes_per_dir,parts_per_dir,ranks) -A_dist = psparse(args...) |> fetch -A = centralize(A_dist) -G = PartitionedSolvers.strength_graph(A, block_size, epsilon=epsilon) -diagG = dense_diag(G) -B = random_nullspace(size(A, 1), block_size) -node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G,diagG;epsilon) +G_seq = PartitionedSolvers.strength_graph(A_seq, block_size, epsilon=epsilon) +diagG = dense_diag(G_seq) +B = random_nullspace(size(A_seq, 1), block_size) +node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G_seq,diagG;epsilon) aggregate_to_nodes = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B, block_size) @test Pc * stack(Bc) ≈ stack(B) From 2008787b8d5bc25cabf0c88ca9f933e825b374e0 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Tue, 13 Aug 2024 11:36:39 +0200 Subject: [PATCH 10/18] Use near nullspace for linear elasticity for testing --- PartitionedSolvers/test/amg_tests.jl | 71 ++++++++++++---------------- 1 file changed, 31 insertions(+), 40 deletions(-) diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index 8208eaa9..c3c960a6 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -3,8 +3,7 @@ module AMGTests # just for debugging using Pkg Pkg.activate("PartitionedSolvers") -using PartitionedArrays -using PartitionedArrays: laplace_matrix +using PartitionedArrays using PartitionedSolvers using LinearAlgebra using IterativeSolvers @@ -123,22 +122,26 @@ G = PartitionedSolvers.strength_graph(M, 3, epsilon=epsilon) solution = sparse([1], [1], 1.0, 1, 1) @test solution ≈ G -# Create Psparse Test matrix (Linear Elasticity) -block_size = 3 +# Create Test matrices (Linear Elasticity) parts_per_dir = (2,2,2) +block_size = length(parts_per_dir) p = prod(parts_per_dir) ranks = DebugArray(LinearIndices((p,))) nodes_per_dir = map(i->block_size*i,parts_per_dir) args = 
PartitionedArrays.linear_elasticity_fem(nodes_per_dir,parts_per_dir,ranks) A_dist = psparse(args...) |> fetch A_seq = centralize(A_dist) -display(A_seq) # Test strength graph with sequential and parallel linear elasticity matrix G_seq = PartitionedSolvers.strength_graph(A_seq, block_size, epsilon=epsilon) G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon) -diff = G_seq - centralize(G_dist) -display(diff) +G_dist_centralized = centralize(G_dist) +diff = G_seq - G_dist_centralized +diff_nnz = nnz(diff) +println("$(round(diff_nnz/(G_seq.m^2); digits=3))% of total entries are different between sequential and parallel G.") +println("G_dist has $(round(1-nnz(G_dist_centralized)/nnz(G_seq);digits=2))% fewer nnz entries than G_seq.") +println("(G_dist nnz entries: $(nnz(G_dist_centralized)), G_seq nnz entries: $(nnz(G_seq)))") +@test isa(G_dist, PSparseMatrix) # Test sequential collect nodes in aggregate diagG = dense_diag(centralize(G_dist)) @@ -171,40 +174,25 @@ map(partition(aggregate_to_nodes_dist), partition(node_to_aggregate_dist), parti end end -# Test tentative prolongator -function random_nullspace(ndofs::Int, n_B) - B = Array{Array{Float64, 1}, 1}(undef, n_B) - for i = 1:n_B - B[i] = rand(ndofs) - end - B -end - -function random_nullspace(index_partition::AbstractArray, n_B) - B = Array{PVector}(undef, n_B) - for i = 1:n_B - B[i] = prand(index_partition) - end - B -end +# Create nullspace vectors for tentative prolongator +x_dist = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir,ranks) +B_dist = near_nullspace_linear_elasticity(x_dist, partition(axes(A_dist,2))) +B_seq = [collect(b) for b in B_dist] -# Test tentative prolongator with laplace matrix +# Test tentative prolongator with sequential linear elasticity matrix G_seq = PartitionedSolvers.strength_graph(A_seq, block_size, epsilon=epsilon) -diagG = dense_diag(G_seq) -B = random_nullspace(size(A_seq, 1), block_size) -node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G_seq,diagG;epsilon) +node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G_seq, dense_diag(G_seq);epsilon) aggregate_to_nodes = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) -Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B, block_size) -@test Pc * stack(Bc) ≈ stack(B) +Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B_seq, block_size) +@test Pc * stack(Bc) ≈ stack(B_seq) + # Test tentative prolongator with parallel matrix G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon) -diagG = dense_diag(G_dist) -n_B = block_size -node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G_dist,diagG;epsilon) +node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G_dist,dense_diag(G_dist);epsilon) aggregate_to_nodes = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) -B_dist = random_nullspace(partition(axes(A_dist,1)), n_B) Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B_dist, block_size) +n_B = length(B_dist) for i in 1:n_B @test isa(Bc[i], PVector) end @@ -218,6 +206,7 @@ for (i,b) in enumerate(B_dist) end @test centralize(Pc) * Bc_matrix ≈ B_matrix + # Test spectral radius sequential & parallel diagA = dense_diag(A_dist) invD = 1 ./ diagA @@ -232,10 +221,11 @@ lseq, x = PartitionedSolvers.spectral_radius(centralize(M), x0_seq, 10) @test l ≈ lseq @test abs((l-exp)/exp) < 2*10^-1 - -# 
First with a sequential matrix +#= +# Test amg +# first with a sequential matrix nodes_per_dir = (100,100) -A = laplace_matrix(nodes_per_dir) +A = PartitionedArrays.laplace_matrix(nodes_per_dir) using Random Random.seed!(1) x = rand(size(A,2)) @@ -304,7 +294,7 @@ np = prod(parts_per_dir) parts = DebugArray(LinearIndices((np,))) nodes_per_dir = (100,100) -A = laplace_matrix(nodes_per_dir,parts_per_dir,parts) +A = PartitionedArrays.laplace_matrix(nodes_per_dir,parts_per_dir,parts) x = pones(partition(axes(A,2))) b = A*x @@ -346,9 +336,9 @@ cg!(y,A,b;Pl,verbose=true) nodes_per_dir = (40,40,40) parts_per_dir = (2,2,1) -nparts = prod(parts_per_dir) -parts = LinearIndices((nparts,)) -A = laplace_matrix(nodes_per_dir,parts_per_dir,parts) +np = prod(parts_per_dir) +parts = LinearIndices((np,)) +A = PartitionedArrays.laplace_matrix(nodes_per_dir,parts_per_dir,parts) x_exact = pones(partition(axes(A,2))) b = A*x_exact x = similar(b,axes(A,2)) @@ -356,4 +346,5 @@ x .= 0 Pl = setup(amg(),x,A,b) _, history = cg!(x,A,b;Pl,log=true) display(history) +=# end \ No newline at end of file From b2ee910b8d64f97f0e2621b617957d7568c1dc28 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Tue, 13 Aug 2024 11:37:27 +0200 Subject: [PATCH 11/18] Fix typo in node_coordinates_unit_cube --- src/PartitionedArrays.jl | 2 +- src/gallery.jl | 2 +- test/gallery_tests.jl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/PartitionedArrays.jl b/src/PartitionedArrays.jl index 5575613c..6c7d1542 100644 --- a/src/PartitionedArrays.jl +++ b/src/PartitionedArrays.jl @@ -180,7 +180,7 @@ include("p_timer.jl") export laplacian_fdm export laplacian_fem export linear_elasticity_fem -export node_coorinates_unit_cube +export node_coordinates_unit_cube export near_nullspace_linear_elasticity include("gallery.jl") diff --git a/src/gallery.jl b/src/gallery.jl index 231dcce1..59a9c51d 100644 --- a/src/gallery.jl +++ b/src/gallery.jl @@ -415,7 +415,7 @@ function node_to_dof_partition(node_partition,D) dof_partition end -function node_coorinates_unit_cube( +function node_coordinates_unit_cube( nodes_per_dir, # free (== interior) nodes parts_per_dir, parts, diff --git a/test/gallery_tests.jl b/test/gallery_tests.jl index 07fb9a13..dbd393bb 100644 --- a/test/gallery_tests.jl +++ b/test/gallery_tests.jl @@ -41,7 +41,7 @@ function gallery_tests(distribute,parts_per_dir) Y = A*pones(axes(A,2)) @test isa(y,PVector) - x = node_coorinates_unit_cube(nodes_per_dir,parts_per_dir,ranks) + x = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir,ranks) B = near_nullspace_linear_elasticity(x) @test isa(B[1],PVector) B = near_nullspace_linear_elasticity(x,partition(axes(A,2))) From e80f15ce062b0a1323f0720847dc94b8604d3e45 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Wed, 14 Aug 2024 10:01:28 +0200 Subject: [PATCH 12/18] Delete remove_singleton_aggregates --- PartitionedSolvers/src/amg.jl | 46 +--------------------------- PartitionedSolvers/test/amg_tests.jl | 32 ++++++++----------- 2 files changed, 13 insertions(+), 65 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 6ca6cc85..008443a3 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -245,47 +245,6 @@ function collect_nodes_in_aggregate(node_to_aggregate,aggregates) aggregate_to_nodes end -function remove_singleton_aggregates(aggregate_to_nodes_old) - typeof_nodes = eltype(aggregate_to_nodes_old[1]) - n_aggregates_old = length(aggregate_to_nodes_old) - n_aggregates = 0 - n_data = 0 - for i in 
1:n_aggregates_old - nnodes = aggregate_to_nodes_old.ptrs[i+1] - aggregate_to_nodes_old.ptrs[i] - if nnodes < 2 - continue - end - n_aggregates += 1 - n_data += nnodes - end - - # If there are no singleton aggregates, stop and return old array - if n_aggregates == n_aggregates_old - return aggregate_to_nodes_old - end - - # Else copy non-singleton aggregates to new array - aggregate_to_nodes_ptrs = zeros(Int, n_aggregates+1) - aggregate_to_nodes_data = zeros(typeof_nodes, n_data) - aggregate_to_nodes_ptrs[1] = 1 - agg = 1 - for i in 1:n_aggregates_old - nnodes = aggregate_to_nodes_old.ptrs[i+1] - aggregate_to_nodes_old.ptrs[i] - if nnodes < 2 - continue - end - aggregate_to_nodes_ptrs[agg+1] = aggregate_to_nodes_ptrs[agg] + nnodes - pini = aggregate_to_nodes_ptrs[agg] - pend = aggregate_to_nodes_ptrs[agg+1]-1 - pini_old = aggregate_to_nodes_old.ptrs[i] - pend_old = aggregate_to_nodes_old.ptrs[i+1]-1 - aggregate_to_nodes_data[pini:pend] = aggregate_to_nodes_old.data[pini_old:pend_old] - agg += 1 - end - aggregate_to_nodes = jagged_array(aggregate_to_nodes_data,aggregate_to_nodes_ptrs) - aggregate_to_nodes -end - function tentative_prolongator_for_laplace(P0,B) n_nullspace_vecs = length(B) if n_nullspace_vecs != 1 @@ -341,10 +300,7 @@ function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, bl P0_partition, Bc_partition, local_coarse_dof_to_Bc... end |> tuple_of_arrays P0 = PSparseMatrix(P0_partition, B[1].index_partition, Bc_partition, true) - Bc = Array{PVector}(undef, n_B) - for (i,b) in enumerate(coarse_dof_to_Bc) - Bc[i] = PVector(b, Bc_partition) - end + Bc = [PVector(b, Bc_partition) for b in coarse_dof_to_Bc] P0, Bc end diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index c3c960a6..7f91238b 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -13,7 +13,7 @@ using Test using SparseArrays # Test strength graph computation -# First with Psparse matrix +# First with Psparse matrix and known solution ndofs = 18 nnodes = 9 nrows = 3 @@ -83,8 +83,7 @@ IJV = map(row_partition) do row_indices end I,J,V = tuple_of_arrays(IJV) -col_partition = row_partition -A = psparse(I,J,V,row_partition, col_partition) |> fetch +A = psparse(I,J,V,row_partition, row_partition) |> fetch # build solution I = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5, @@ -104,12 +103,12 @@ solution = sparse(I, J, V, nnodes, nnodes) epsilon = 0.02 # Test with CSC sparse matrix -# Test sample matrix with block size 2 +# Test sample matrix with known solution A_seq = centralize(A) G_seq = PartitionedSolvers.strength_graph(A_seq, 2, epsilon=epsilon) @test solution ≈ G_seq -# Another test with 3 dims +# Another test with blocksize 3 M = rand([-2.0, -1, 1, 2], (3, 3)) M = sparse(M) A = blockdiag(M, M, M) @@ -122,7 +121,7 @@ G = PartitionedSolvers.strength_graph(M, 3, epsilon=epsilon) solution = sparse([1], [1], 1.0, 1, 1) @test solution ≈ G -# Create Test matrices (Linear Elasticity) +# Create Test matrix for Linear Elasticity parts_per_dir = (2,2,2) block_size = length(parts_per_dir) p = prod(parts_per_dir) @@ -138,7 +137,7 @@ G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon) G_dist_centralized = centralize(G_dist) diff = G_seq - G_dist_centralized diff_nnz = nnz(diff) -println("$(round(diff_nnz/(G_seq.m^2); digits=3))% of total entries are different between sequential and parallel G.") +println("Test strength graph: \n$(round(diff_nnz/(G_seq.m^2); digits=3))% of total entries are 
different between sequential and parallel G.") println("G_dist has $(round(1-nnz(G_dist_centralized)/nnz(G_seq);digits=2))% fewer nnz entries than G_seq.") println("(G_dist nnz entries: $(nnz(G_dist_centralized)), G_seq nnz entries: $(nnz(G_seq)))") @test isa(G_dist, PSparseMatrix) @@ -180,18 +179,11 @@ B_dist = near_nullspace_linear_elasticity(x_dist, partition(axes(A_dist,2))) B_seq = [collect(b) for b in B_dist] # Test tentative prolongator with sequential linear elasticity matrix -G_seq = PartitionedSolvers.strength_graph(A_seq, block_size, epsilon=epsilon) -node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G_seq, dense_diag(G_seq);epsilon) -aggregate_to_nodes = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) -Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B_seq, block_size) +Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes_seq,B_seq, block_size) @test Pc * stack(Bc) ≈ stack(B_seq) - # Test tentative prolongator with parallel matrix -G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon) -node_to_aggregate, node_aggregates = PartitionedSolvers.aggregate(G_dist,dense_diag(G_dist);epsilon) -aggregate_to_nodes = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) -Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes,B_dist, block_size) +Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes_dist,B_dist, block_size) n_B = length(B_dist) for i in 1:n_B @test isa(Bc[i], PVector) @@ -216,10 +208,10 @@ exp = eigmax(Array(centralize(M))) cols = axes(M, 2) x0 = prand(partition(cols)) x0_seq = collect(x0) -l, x = PartitionedSolvers.spectral_radius(M, x0, 10) -lseq, x = PartitionedSolvers.spectral_radius(centralize(M), x0_seq, 10) -@test l ≈ lseq -@test abs((l-exp)/exp) < 2*10^-1 +l_dist, x = PartitionedSolvers.spectral_radius(M, x0, 10) +l_seq, x = PartitionedSolvers.spectral_radius(centralize(M), x0_seq, 10) +@test l_dist ≈ l_seq +@test abs((l_dist-exp)/exp) < 2*10^-1 #= # Test amg From 5fc4e956dee26394f1cad136a78307e3d98314b1 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Wed, 14 Aug 2024 15:33:47 +0200 Subject: [PATCH 13/18] Test amg for sequential linear elasticity --- PartitionedSolvers/src/PartitionedSolvers.jl | 1 + PartitionedSolvers/src/amg.jl | 10 ++-- PartitionedSolvers/test/amg_tests.jl | 62 +++++++++++++++++++- 3 files changed, 66 insertions(+), 7 deletions(-) diff --git a/PartitionedSolvers/src/PartitionedSolvers.jl b/PartitionedSolvers/src/PartitionedSolvers.jl index bf08d25a..768adf7f 100644 --- a/PartitionedSolvers/src/PartitionedSolvers.jl +++ b/PartitionedSolvers/src/PartitionedSolvers.jl @@ -29,6 +29,7 @@ export smoothed_aggregation export v_cycle export w_cycle export amg_level_params +export amg_level_params_linear_elasticity export amg_fine_params export amg_coarse_params export amg_statistics diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 008443a3..84866844 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -468,7 +468,7 @@ function lambda_generic(invD,A,DinvA) # Perform a few iterations of Power method to estimate lambda # (Remark 3.5.2. 
in https://mediatum.ub.tum.de/download/1229321/1229321.pdf) x0 = rand(size(DinvA,2)) - ρ = spectral_radius(DinvA, x0, maxiter=20) + ρ, x = spectral_radius(DinvA, x0, 20) ω/ρ end @@ -542,11 +542,11 @@ function smoothed_aggregation_with_block_size(; ) function coarsen(A,B) # build strength graph - G = strength_graph(A, block_size=block_size, epsilon=epsilon) # TODO parallel + G = strength_graph(A, block_size, epsilon=epsilon) # TODO parallel diagG = dense_diag(G) node_to_aggregate, node_aggregates = aggregate(G,diagG;epsilon) - aggregate_to_nodes = collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) # TODO: provide parallel version - Pc,Bc,perm = tentative_prolongator(aggregate_to_nodes,B, block_size) # TODO parallel + aggregate_to_nodes = collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) + Pc,Bc = tentative_prolongator(aggregate_to_nodes,B, block_size) diagA = dense_diag(A) P = smoothed_prolongator(A,Pc,diagA;approximate_omega) # the given approximate omega should work in parallel R = transpose(P) @@ -715,7 +715,7 @@ end function amg_level_params_linear_elasticity(; pre_smoother = additive_schwarz(gauss_seidel(;iters=1);iters=1), - coarsening = smoothed_aggregation(approximate_omega = lambda_generic, + coarsening = smoothed_aggregation_with_block_size(approximate_omega = lambda_generic, tentative_prolongator = tentative_prolongator_with_block_size), cycle = v_cycle, pos_smoother = pre_smoother, diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index 7f91238b..7efad6db 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -213,7 +213,7 @@ l_seq, x = PartitionedSolvers.spectral_radius(centralize(M), x0_seq, 10) @test l_dist ≈ l_seq @test abs((l_dist-exp)/exp) < 2*10^-1 -#= + # Test amg # first with a sequential matrix nodes_per_dir = (100,100) @@ -279,6 +279,37 @@ solve!(y,S,b) update!(S,2*A) solve!(y,S,b) +# Now for linear elasticity (sequential) + + +level_params = amg_level_params_linear_elasticity() +fine_params = amg_fine_params(;level_params) +solver = amg(;fine_params) + +parts_per_dir = (2,2,2) +block_size = length(parts_per_dir) +p = prod(parts_per_dir) +ranks = DebugArray(LinearIndices((p,))) +nodes_per_dir = map(i->block_size*i,parts_per_dir) +args = PartitionedArrays.linear_elasticity_fem(nodes_per_dir,parts_per_dir,ranks) +A_dist = psparse(args...) |> fetch +A = centralize(A_dist) + +x = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir, ranks) +B_dist = near_nullspace_linear_elasticity(x, partition(axes(A_dist,2))) +B = [collect(b) for b in 1:length(B_dist)] +x_exact = rand(size(A,2)) +b = A*x_exact +y = similar(x_exact) +y .= 0 + +Pl = setup(solver,y,A,b) +cg!(y,A,b;Pl,verbose=true) +println("Linear Elasticity norm of error: $(norm(y-x_exact))") +@test y ≈ x_exact + + + # Now in parallel parts_per_dir = (2,2) @@ -338,5 +369,32 @@ x .= 0 Pl = setup(amg(),x,A,b) _, history = cg!(x,A,b;Pl,log=true) display(history) -=# + +# Now for linear elasticity + +level_params = amg_level_params_linear_elasticity() +fine_params = amg_fine_params(;level_params) +solver = amg(;fine_params) + +parts_per_dir = (2,2,2) +block_size = length(parts_per_dir) +p = prod(parts_per_dir) +ranks = DebugArray(LinearIndices((p,))) +nodes_per_dir = map(i->block_size*i,parts_per_dir) +args = PartitionedArrays.linear_elasticity_fem(nodes_per_dir,parts_per_dir,ranks) +A = psparse(args...) 
|> fetch + +x = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir, ranks) +B = near_nullspace_linear_elasticity(x, partition(axes(A,2))) +x_exact = pones(partition(axes(A,2))) +b = A*x_exact # TODO: bounds error +x = similar(b,axes(A,2)) +x .= 0 + +Pl = setup(solver,x,A,b) +cg!(x,A,b;Pl,verbose=true) +diff = collect(x) - collect(x_exact) +display(diff) +@test x ≈ x_exact + end \ No newline at end of file From 706885dc53f9155973fbbda6608829d999c0346b Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Wed, 21 Aug 2024 09:10:50 +0200 Subject: [PATCH 14/18] Fix bug in tentative_prolongator_with_block_size --- PartitionedSolvers/src/amg.jl | 48 ++++++++++++++++++-------- PartitionedSolvers/test/amg_tests.jl | 50 ++++++++++++++-------------- 2 files changed, 60 insertions(+), 38 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 84866844..75ad080d 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -257,6 +257,7 @@ end function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, block_size) own_values_B = map(own_values, B) n_B = length(B) + global_aggregate_to_owner = global_to_owner(aggregate_to_nodes.index_partition) P0_partition, Bc_partition, coarse_dof_to_Bc... = map(aggregate_to_nodes.index_partition, B[1].index_partition, local_values(aggregate_to_nodes), own_values_B...) do local_aggregate_local_indices, local_dof_local_indices, local_aggregate_to_local_nodes, own_dof_to_b... P0_own_own, local_coarse_dof_to_Bc = tentative_prolongator_with_block_size(local_aggregate_to_local_nodes, own_dof_to_b, block_size) # Create P0 partition @@ -285,7 +286,16 @@ function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, bl inds = (i - 1) * n_B + 1 : i * n_B perm_cols[inds] = cols end - perm_rows = PartitionedArrays.local_permutation(local_dof_local_indices) + perm_rows_with_ghost = PartitionedArrays.local_permutation(local_dof_local_indices) + # Filter only own rows + perm_rows = zeros(Int, own_length(local_dof_local_indices)) + ptr = 1 + for ind in perm_rows_with_ghost + if ind in own_to_local(local_dof_local_indices) + perm_rows[ptr] = ind + ptr += 1 + end + end P0_partition = PartitionedArrays.split_matrix(blocks, perm_rows, perm_cols) # Partition of Bc own_to_global_coarse_dofs = zeros(Int, n_own_coarse_dofs) @@ -294,9 +304,14 @@ function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, bl global_coarse_dofs = (agg-1) * n_B + 1 : agg * n_B own_to_global_coarse_dofs[inds] = global_coarse_dofs end - own_coarse_dofs = OwnIndices(n_global_coarse_dofs, part_id(local_dof_local_indices), own_to_global_coarse_dofs) + global_coarse_dofs_to_owner = zeros(Int, n_global_coarse_dofs) + for global_agg in 1:n_global_aggregates + inds = (global_agg-1) * n_B + 1 : global_agg * n_B + global_coarse_dofs_to_owner[inds] .= global_aggregate_to_owner[global_agg] + end + own_coarse_dofs = OwnIndices(n_global_coarse_dofs, part_id(local_aggregate_local_indices), own_to_global_coarse_dofs) #todo ghost_coarse_dofs = GhostIndices(n_global_coarse_dofs, Int[], Int32[]) - Bc_partition = OwnAndGhostIndices(own_coarse_dofs, ghost_coarse_dofs) + Bc_partition = OwnAndGhostIndices(own_coarse_dofs, ghost_coarse_dofs, global_coarse_dofs_to_owner) P0_partition, Bc_partition, local_coarse_dof_to_Bc... 
end |> tuple_of_arrays P0 = PSparseMatrix(P0_partition, B[1].index_partition, Bc_partition, true) @@ -306,8 +321,6 @@ end function tentative_prolongator_with_block_size(aggregate_to_nodes::JaggedArray,B, block_size) - # A draft for the scalar case is commented below - ## TODO assumes CSC # Algorithm 7 in https://mediatum.ub.tum.de/download/1229321/1229321.pdf if length(B) < 1 @@ -316,7 +329,7 @@ function tentative_prolongator_with_block_size(aggregate_to_nodes::JaggedArray,B n_aggregates = length(aggregate_to_nodes.ptrs)-1 n_B = length(B) - n_dofs = length(B[1]) + n_dofs = length(aggregate_to_nodes.data)*block_size n_dofs_c = n_aggregates * n_B Bc = [Vector{Float64}(undef,n_dofs_c) for _ in 1:n_B] @@ -364,7 +377,7 @@ function tentative_prolongator_with_block_size(aggregate_to_nodes::JaggedArray,B pini = P0.colptr[col] pend = P0.colptr[col+1]-1 rowids = P0.rowval[pini:pend] - P0.nzval[pini:pend] = B[b][rowids] + P0.nzval[pini:pend] = B[b][rowids] # fails end end @@ -452,7 +465,7 @@ function smoothed_prolongator(A,P0,diagA=dense_diag(A);approximate_omega) invDiagA = 1 ./ diagA Dinv = PartitionedArrays.sparse_diag_matrix(invDiagA,(axes(A,1),axes(A,1))) omega = approximate_omega(invDiagA,A,Dinv*A) - P = (I-omega*Dinv*A)*P0 + P = (I-omega*Dinv*A) * P0 #TODO: this gives an error P end @@ -463,7 +476,7 @@ function omega_for_1d_laplace(invD,A, DinvA) 2/3 end -function lambda_generic(invD,A,DinvA) +function lambda_generic(invD,A,DinvA::AbstractMatrix) ω = 4/3 # Perform a few iterations of Power method to estimate lambda # (Remark 3.5.2. in https://mediatum.ub.tum.de/download/1229321/1229321.pdf) @@ -472,6 +485,15 @@ function lambda_generic(invD,A,DinvA) ω/ρ end +function lambda_generic(invD,A,DinvA::PSparseMatrix) + ω = 4/3 + # Perform a few iterations of Power method to estimate lambda + # (Remark 3.5.2. 
in https://mediatum.ub.tum.de/download/1229321/1229321.pdf) + x0 = prand(partition(axes(DinvA,2))) + ρ, x = spectral_radius(DinvA, x0, 20) + ω/ρ +end + function spectral_radius(A, x, num_iterations:: Int) # Choose diagonal vector as initial guess y = similar(x) @@ -542,13 +564,13 @@ function smoothed_aggregation_with_block_size(; ) function coarsen(A,B) # build strength graph - G = strength_graph(A, block_size, epsilon=epsilon) # TODO parallel + G = strength_graph(A, block_size, epsilon=epsilon) diagG = dense_diag(G) node_to_aggregate, node_aggregates = aggregate(G,diagG;epsilon) aggregate_to_nodes = collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) Pc,Bc = tentative_prolongator(aggregate_to_nodes,B, block_size) diagA = dense_diag(A) - P = smoothed_prolongator(A,Pc,diagA;approximate_omega) # the given approximate omega should work in parallel + P = smoothed_prolongator(A, Pc, diagA; approximate_omega) R = transpose(P) Ac,cache = rap(R,A,P;reuse=true) Ac,Bc,R,P,cache,repartition_threshold = enhance_coarse_partition(A,Ac,Bc,R,P,cache,repartition_threshold)#maybe changes here @@ -713,10 +735,10 @@ function dofs_to_node(dofs, block_size) div(dofs[end], block_size) end -function amg_level_params_linear_elasticity(; +function amg_level_params_linear_elasticity(block_size; pre_smoother = additive_schwarz(gauss_seidel(;iters=1);iters=1), coarsening = smoothed_aggregation_with_block_size(approximate_omega = lambda_generic, - tentative_prolongator = tentative_prolongator_with_block_size), + tentative_prolongator = tentative_prolongator_with_block_size, block_size = block_size), cycle = v_cycle, pos_smoother = pre_smoother, ) diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index 7efad6db..6851548f 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -12,6 +12,7 @@ using Statistics using Test using SparseArrays + # Test strength graph computation # First with Psparse matrix and known solution ndofs = 18 @@ -175,7 +176,7 @@ end # Create nullspace vectors for tentative prolongator x_dist = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir,ranks) -B_dist = near_nullspace_linear_elasticity(x_dist, partition(axes(A_dist,2))) +B_dist = nullspace_linear_elasticity(x_dist, partition(axes(A_dist,2))) B_seq = [collect(b) for b in B_dist] # Test tentative prolongator with sequential linear elasticity matrix @@ -279,12 +280,9 @@ solve!(y,S,b) update!(S,2*A) solve!(y,S,b) -# Now for linear elasticity (sequential) -level_params = amg_level_params_linear_elasticity() -fine_params = amg_fine_params(;level_params) -solver = amg(;fine_params) +# Now for linear elasticity (sequential) parts_per_dir = (2,2,2) block_size = length(parts_per_dir) @@ -296,14 +294,18 @@ A_dist = psparse(args...) 
|> fetch A = centralize(A_dist) x = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir, ranks) -B_dist = near_nullspace_linear_elasticity(x, partition(axes(A_dist,2))) -B = [collect(b) for b in 1:length(B_dist)] +B_dist = nullspace_linear_elasticity(x, partition(axes(A_dist,2))) +B = [collect(b) for b in B_dist] x_exact = rand(size(A,2)) b = A*x_exact y = similar(x_exact) y .= 0 -Pl = setup(solver,y,A,b) +level_params = amg_level_params_linear_elasticity(block_size) +fine_params = amg_fine_params(;level_params) +solver = amg(;fine_params) + +Pl = setup(solver,y,A,b;nullspace=B) cg!(y,A,b;Pl,verbose=true) println("Linear Elasticity norm of error: $(norm(y-x_exact))") @test y ≈ x_exact @@ -370,31 +372,29 @@ Pl = setup(amg(),x,A,b) _, history = cg!(x,A,b;Pl,log=true) display(history) -# Now for linear elasticity -level_params = amg_level_params_linear_elasticity() -fine_params = amg_fine_params(;level_params) -solver = amg(;fine_params) +# Now for linear elasticity parts_per_dir = (2,2,2) block_size = length(parts_per_dir) p = prod(parts_per_dir) ranks = DebugArray(LinearIndices((p,))) nodes_per_dir = map(i->block_size*i,parts_per_dir) -args = PartitionedArrays.linear_elasticity_fem(nodes_per_dir,parts_per_dir,ranks) -A = psparse(args...) |> fetch - -x = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir, ranks) -B = near_nullspace_linear_elasticity(x, partition(axes(A,2))) +args = linear_elasticity_fem(nodes_per_dir,parts_per_dir,ranks) +A = psparse(args...) |> fetch x_exact = pones(partition(axes(A,2))) -b = A*x_exact # TODO: bounds error -x = similar(b,axes(A,2)) -x .= 0 +x = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir, ranks) +B = nullspace_linear_elasticity(x, partition(axes(A,2))) +b = A*x_exact +y = similar(b,axes(A,2)) +y .= 0 + +level_params = amg_level_params_linear_elasticity(block_size) +fine_params = amg_fine_params(;level_params) +solver = amg(;fine_params) -Pl = setup(solver,x,A,b) -cg!(x,A,b;Pl,verbose=true) -diff = collect(x) - collect(x_exact) -display(diff) -@test x ≈ x_exact +Pl = setup(solver,y,A,b;nullspace=B) +cg!(y,A,b;Pl,verbose=true) +@test y ≈ x_exact end \ No newline at end of file From 2946d79bd5dadaf35ddef0159646b2abbf0290e7 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Wed, 21 Aug 2024 09:40:54 +0200 Subject: [PATCH 15/18] Remove Pkg dependency --- PartitionedSolvers/test/amg_tests.jl | 3 --- 1 file changed, 3 deletions(-) diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index 6851548f..0d3c4b0e 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -1,8 +1,5 @@ module AMGTests -# just for debugging -using Pkg -Pkg.activate("PartitionedSolvers") using PartitionedArrays using PartitionedSolvers using LinearAlgebra From a0d569627135498ce4703e8c2f082212a9693057 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Wed, 21 Aug 2024 09:45:58 +0200 Subject: [PATCH 16/18] Remove Statistics dependency --- PartitionedSolvers/test/amg_tests.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl index 0d3c4b0e..a7328c8c 100644 --- a/PartitionedSolvers/test/amg_tests.jl +++ b/PartitionedSolvers/test/amg_tests.jl @@ -5,7 +5,6 @@ using PartitionedSolvers using LinearAlgebra using IterativeSolvers using IterativeSolvers: cg! 
-using Statistics using Test using SparseArrays From f2588744343867437753bf9f627b68a410acdbec Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Thu, 22 Aug 2024 17:09:43 +0200 Subject: [PATCH 17/18] Set default epsilon to 0.01 --- PartitionedSolvers/src/amg.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 75ad080d..37c40682 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -465,7 +465,7 @@ function smoothed_prolongator(A,P0,diagA=dense_diag(A);approximate_omega) invDiagA = 1 ./ diagA Dinv = PartitionedArrays.sparse_diag_matrix(invDiagA,(axes(A,1),axes(A,1))) omega = approximate_omega(invDiagA,A,Dinv*A) - P = (I-omega*Dinv*A) * P0 #TODO: this gives an error + P = (I-omega*Dinv*A) * P0 P end @@ -556,7 +556,7 @@ function smoothed_aggregation(; end function smoothed_aggregation_with_block_size(; - epsilon = 0, + epsilon = 0.01, approximate_omega = lambda_generic, tentative_prolongator = tentative_prolongator_with_block_size, repartition_threshold = 2000, @@ -564,7 +564,7 @@ function smoothed_aggregation_with_block_size(; ) function coarsen(A,B) # build strength graph - G = strength_graph(A, block_size, epsilon=epsilon) + G = strength_graph(A, block_size; epsilon) diagG = dense_diag(G) node_to_aggregate, node_aggregates = aggregate(G,diagG;epsilon) aggregate_to_nodes = collect_nodes_in_aggregate(node_to_aggregate, node_aggregates) @@ -591,7 +591,7 @@ function getblock!(B,A,ids_i,ids_j) end end -function strength_graph(A::PSparseMatrix, block_size::Integer; epsilon = 0) +function strength_graph(A::PSparseMatrix, block_size::Integer; epsilon) dof_partition = partition(axes(A,1)) node_partition = map(dof_partition) do my_dofs n_local_dofs = length(my_dofs) @@ -633,7 +633,7 @@ function strength_graph(A::PSparseMatrix, block_size::Integer; epsilon = 0) G end -function strength_graph(A::AbstractSparseMatrix, block_size::Integer; epsilon = 0) +function strength_graph(A::AbstractSparseMatrix, block_size::Integer; epsilon) if block_size < 1 error("Block size must be equal to or larger than 1.") @@ -651,8 +651,8 @@ function strength_graph(A::AbstractSparseMatrix, block_size::Integer; epsilon = return A end - if epsilon < 0 - error("Expected a positive epsilon.") + if epsilon <= 0 + error("Expected epsilon > 0.") end n_dofs = A.m @@ -737,7 +737,7 @@ end function amg_level_params_linear_elasticity(block_size; pre_smoother = additive_schwarz(gauss_seidel(;iters=1);iters=1), - coarsening = smoothed_aggregation_with_block_size(approximate_omega = lambda_generic, + coarsening = smoothed_aggregation_with_block_size(;approximate_omega = lambda_generic, tentative_prolongator = tentative_prolongator_with_block_size, block_size = block_size), cycle = v_cycle, pos_smoother = pre_smoother, From 6d9cad869dd407fdbea8ea55a20fffac03201679 Mon Sep 17 00:00:00 2001 From: Gelieza K Date: Fri, 23 Aug 2024 09:01:16 +0200 Subject: [PATCH 18/18] Fix bug in strength_graph --- PartitionedSolvers/src/amg.jl | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl index 37c40682..83a43f36 100644 --- a/PartitionedSolvers/src/amg.jl +++ b/PartitionedSolvers/src/amg.jl @@ -556,7 +556,7 @@ function smoothed_aggregation(; end function smoothed_aggregation_with_block_size(; - epsilon = 0.01, + epsilon = 0, approximate_omega = lambda_generic, tentative_prolongator = tentative_prolongator_with_block_size, 
repartition_threshold = 2000, @@ -651,8 +651,8 @@ function strength_graph(A::AbstractSparseMatrix, block_size::Integer; epsilon) return A end - if epsilon <= 0 - error("Expected epsilon > 0.") + if epsilon < 0 + error("Expected epsilon >= 0.") end n_dofs = A.m @@ -681,8 +681,12 @@ function strength_graph(A::AbstractSparseMatrix, block_size::Integer; epsilon) i_dofs = node_to_dofs(i_node, block_size) j_dofs = node_to_dofs(j_node, block_size) getblock!(B,A,i_dofs,j_dofs) + norm_B = norm(B) + if norm_B == 0 + continue + end # Calculate strength according to https://github.com/pyamg/pyamg/blob/e1fe54c93be1029c02ddcf84c2338a607b088703/pyamg/strength.py#L275 - if norm(B) >= epsilon * sqrt(diag_norms[i_node] * diag_norms[j_node]) + if norm_B >= epsilon * sqrt(diag_norms[i_node] * diag_norms[j_node]) nnz += 1 end end @@ -710,8 +714,12 @@ function strength_graph(A::AbstractSparseMatrix, block_size::Integer; epsilon) i_dofs = node_to_dofs(i_node, block_size) j_dofs = node_to_dofs(j_node, block_size) getblock!(B,A,i_dofs,j_dofs) + norm_B = norm(B) + if norm_B == 0 + continue + end # Calculate strength according to https://github.com/pyamg/pyamg/blob/e1fe54c93be1029c02ddcf84c2338a607b088703/pyamg/strength.py#L275 - if norm(B) >= epsilon * sqrt(diag_norms[i_node] * diag_norms[j_node]) + if norm_B >= epsilon * sqrt(diag_norms[i_node] * diag_norms[j_node]) I[i_nz] = i_node J[i_nz] = j_node V[i_nz] = 1.0
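The last hunks above apply the PyAMG-style block strength-of-connection test with the newly added zero-block guard. Below is a minimal standalone sketch of that criterion, assuming only LinearAlgebra; the function name and arguments are illustrative and are not part of amg.jl.

using LinearAlgebra

# Nodes i and j count as strongly connected when the norm of the coupling block
# A_ij reaches epsilon times the geometric mean of the two diagonal block norms.
# A zero coupling block is never connected, which is what the final patch enforces.
function strongly_connected(Aij::AbstractMatrix, Aii_norm::Real, Ajj_norm::Real, epsilon::Real)
    norm_B = norm(Aij)
    norm_B == 0 && return false
    norm_B >= epsilon * sqrt(Aii_norm * Ajj_norm)
end

strongly_connected([0.5 0.0; 0.0 0.5], 1.0, 1.0, 0.02) # true
strongly_connected(zeros(2, 2), 1.0, 1.0, 0.0)         # false even for epsilon = 0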