diff --git a/PartitionedSolvers/src/PartitionedSolvers.jl b/PartitionedSolvers/src/PartitionedSolvers.jl
index bf08d25a..768adf7f 100644
--- a/PartitionedSolvers/src/PartitionedSolvers.jl
+++ b/PartitionedSolvers/src/PartitionedSolvers.jl
@@ -29,6 +29,7 @@ export smoothed_aggregation
 export v_cycle
 export w_cycle
 export amg_level_params
+export amg_level_params_linear_elasticity
 export amg_fine_params
 export amg_coarse_params
 export amg_statistics
diff --git a/PartitionedSolvers/src/amg.jl b/PartitionedSolvers/src/amg.jl
index 499a5230..83a43f36 100644
--- a/PartitionedSolvers/src/amg.jl
+++ b/PartitionedSolvers/src/amg.jl
@@ -145,7 +145,6 @@ function constant_prolongator(node_to_aggregate,aggregates,n_nullspace_vecs)
     nnodes = length(node_to_aggregate)
     pending = typeof_aggregate(0)
     isolated = typeof_aggregate(-1)
-    nnodes = length(node_to_aggregate)
     naggregates = length(aggregates)
     aggregate_to_nodes_ptrs = zeros(Int,naggregates+1)
     for node in 1:nnodes
@@ -198,6 +197,53 @@ function constant_prolongator(node_to_aggregate::PVector,aggregates::PRange,n_nu
     P0
 end
 
+function collect_nodes_in_aggregate(node_to_aggregate::PVector,aggregates::PRange)
+    aggregate_to_nodes_partition = map(own_values(node_to_aggregate), partition(aggregates)) do own_node_to_global_aggregate, my_aggregates
+        # convert to local ids
+        global_to_local_aggregate = global_to_local(my_aggregates)
+        local_aggregates = global_to_local_aggregate[my_aggregates]
+        @assert all(local_aggregates .!= 0)
+        n_local_aggregates = length(local_aggregates)
+        own_node_to_local_aggregate = map(own_node_to_global_aggregate) do global_aggregate
+            global_to_local_aggregate[global_aggregate]
+        end
+        local_aggregate_to_local_nodes = collect_nodes_in_aggregate(own_node_to_local_aggregate, 1:n_local_aggregates)
+        local_aggregate_to_local_nodes
+    end
+    PVector(aggregate_to_nodes_partition, partition(aggregates))
+end
+
+function collect_nodes_in_aggregate(node_to_aggregate,aggregates)
+    typeof_aggregate = eltype(node_to_aggregate)
+    nnodes = length(node_to_aggregate)
+    pending = typeof_aggregate(0)
+    isolated = typeof_aggregate(-1)
+    naggregates = length(aggregates)
+    aggregate_to_nodes_ptrs = zeros(Int,naggregates+1)
+    for node in 1:nnodes
+        agg = node_to_aggregate[node]
+        if agg == pending || agg == isolated
+            continue
+        end
+        aggregate_to_nodes_ptrs[agg+1] += 1
+    end
+    length_to_ptrs!(aggregate_to_nodes_ptrs)
+    ndata = aggregate_to_nodes_ptrs[end]-1
+    aggregate_to_nodes_data = zeros(Int,ndata)
+    for node in 1:nnodes
+        agg = node_to_aggregate[node]
+        if agg == pending || agg == isolated
+            continue
+        end
+        p = aggregate_to_nodes_ptrs[agg]
+        aggregate_to_nodes_data[p] = node
+        aggregate_to_nodes_ptrs[agg] += 1
+    end
+    rewind_ptrs!(aggregate_to_nodes_ptrs)
+    aggregate_to_nodes = jagged_array(aggregate_to_nodes_data,aggregate_to_nodes_ptrs)
+    aggregate_to_nodes
+end
+
 function tentative_prolongator_for_laplace(P0,B)
     n_nullspace_vecs = length(B)
     if n_nullspace_vecs != 1
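[review note] collect_nodes_in_aggregate inverts node_to_aggregate with a two-pass counting pattern: count aggregate sizes, prefix-sum into pointers, fill the data array, then rewind the pointers. A minimal sketch of the invariant with plain-Julia stand-ins for the PartitionedArrays helpers length_to_ptrs!, rewind_ptrs! and jagged_array (toy data; aggregate ids 0 = pending and -1 = isolated are skipped):

    node_to_aggregate = [1, 2, 1, 0, 2, -1, 1]
    naggregates = 2
    counts = zeros(Int, naggregates)
    for agg in node_to_aggregate
        agg in (0, -1) && continue
        counts[agg] += 1
    end
    ptrs = [1; 1 .+ cumsum(counts)]    # what length_to_ptrs! builds in place
    data = zeros(Int, ptrs[end] - 1)
    next = copy(ptrs)                  # the fill pass mutates the pointers; rewind_ptrs! restores them
    for (node, agg) in enumerate(node_to_aggregate)
        agg in (0, -1) && continue
        data[next[agg]] = node
        next[agg] += 1
    end
    @assert data == [1, 3, 7, 2, 5]    # nodes of aggregate 1, then nodes of aggregate 2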
@@ -207,6 +253,175 @@ function tentative_prolongator_for_laplace(P0,B)
     P0,Bc
 end
 
+function tentative_prolongator_with_block_size(aggregate_to_nodes::PVector,B, block_size)
+    own_values_B = map(own_values, B)
+    n_B = length(B)
+    global_aggregate_to_owner = global_to_owner(aggregate_to_nodes.index_partition)
+    P0_partition, Bc_partition, coarse_dof_to_Bc... = map(aggregate_to_nodes.index_partition, B[1].index_partition, local_values(aggregate_to_nodes), own_values_B...) do local_aggregate_local_indices, local_dof_local_indices, local_aggregate_to_local_nodes, own_dof_to_b...
+        P0_own_own, local_coarse_dof_to_Bc = tentative_prolongator_with_block_size(local_aggregate_to_local_nodes, own_dof_to_b, block_size)
+        # Create P0 partition
+        n_global_aggregates = global_length(local_aggregate_local_indices)
+        n_own_aggregates = own_length(local_aggregate_local_indices)
+        n_local_aggregates = local_length(local_aggregate_local_indices)
+        n_ghost_aggregates = ghost_length(local_aggregate_local_indices)
+        n_own_dofs = length(own_dof_to_b[1])
+        n_own_coarse_dofs = n_own_aggregates * n_B
+        n_global_coarse_dofs = n_global_aggregates * n_B
+        @assert n_own_aggregates == n_local_aggregates
+        @assert n_ghost_aggregates == 0
+        @assert size(P0_own_own, 1) == n_own_dofs
+        @assert size(P0_own_own, 2) == n_own_coarse_dofs
+        @assert length(local_coarse_dof_to_Bc) == n_B
+        @assert length(local_coarse_dof_to_Bc[1]) == n_own_coarse_dofs
+        P0_own_ghost = sparse(Int[], Int[], eltype(P0_own_own)[], n_own_aggregates, n_ghost_aggregates)
+        P0_ghost_own = sparse(Int[], Int[], eltype(P0_own_own)[], n_ghost_aggregates, n_own_aggregates)
+        P0_ghost_ghost = sparse(Int[], Int[], eltype(P0_own_own)[], n_ghost_aggregates, n_ghost_aggregates)
+        blocks = PartitionedArrays.split_matrix_blocks(P0_own_own, P0_own_ghost, P0_ghost_own, P0_ghost_ghost)
+        perm_aggs = PartitionedArrays.local_permutation(local_aggregate_local_indices)
+        # Turn permutation of aggregates to permutation of null vectors
+        perm_cols = zeros(Int, size(P0_own_own,2))
+        for (i, agg) in enumerate(perm_aggs)
+            cols = (agg - 1) * n_B + 1 : agg * n_B
+            inds = (i - 1) * n_B + 1 : i * n_B
+            perm_cols[inds] = cols
+        end
+        perm_rows_with_ghost = PartitionedArrays.local_permutation(local_dof_local_indices)
+        # Filter only own rows
+        perm_rows = zeros(Int, own_length(local_dof_local_indices))
+        ptr = 1
+        for ind in perm_rows_with_ghost
+            if ind in own_to_local(local_dof_local_indices)
+                perm_rows[ptr] = ind
+                ptr += 1
+            end
+        end
+        P0_partition = PartitionedArrays.split_matrix(blocks, perm_rows, perm_cols)
+        # Partition of Bc
+        own_to_global_coarse_dofs = zeros(Int, n_own_coarse_dofs)
+        for (i, agg) in enumerate(local_to_global(local_aggregate_local_indices))
+            inds = (i-1) * n_B + 1 : i * n_B
+            global_coarse_dofs = (agg-1) * n_B + 1 : agg * n_B
+            own_to_global_coarse_dofs[inds] = global_coarse_dofs
+        end
+        global_coarse_dofs_to_owner = zeros(Int, n_global_coarse_dofs)
+        for global_agg in 1:n_global_aggregates
+            inds = (global_agg-1) * n_B + 1 : global_agg * n_B
+            global_coarse_dofs_to_owner[inds] .= global_aggregate_to_owner[global_agg]
+        end
+        own_coarse_dofs = OwnIndices(n_global_coarse_dofs, part_id(local_aggregate_local_indices), own_to_global_coarse_dofs) #todo
+        ghost_coarse_dofs = GhostIndices(n_global_coarse_dofs, Int[], Int32[])
+        Bc_partition = OwnAndGhostIndices(own_coarse_dofs, ghost_coarse_dofs, global_coarse_dofs_to_owner)
+        P0_partition, Bc_partition, local_coarse_dof_to_Bc...
+    end |> tuple_of_arrays
+    P0 = PSparseMatrix(P0_partition, B[1].index_partition, Bc_partition, true)
+    Bc = [PVector(b, Bc_partition) for b in coarse_dof_to_Bc]
+    P0, Bc
+end
+
+
+function tentative_prolongator_with_block_size(aggregate_to_nodes::JaggedArray,B, block_size)
+    # Algorithm 7 in https://mediatum.ub.tum.de/download/1229321/1229321.pdf
+
+    if length(B) < 1
+        error("Null space must contain at least one null vector.")
+    end
+
+    n_aggregates = length(aggregate_to_nodes.ptrs)-1
+    n_B = length(B)
+    n_dofs = length(aggregate_to_nodes.data)*block_size
+    n_dofs_c = n_aggregates * n_B
+    Bc = [Vector{Float64}(undef,n_dofs_c) for _ in 1:n_B]
+
+    # Build P0 colptr
+    P0_colptr = zeros(Int, n_dofs_c + 1)
+    P0_colptr[1] = 1
+    for i_agg in 1:n_aggregates
+        pini = aggregate_to_nodes.ptrs[i_agg]
+        pend = aggregate_to_nodes.ptrs[i_agg+1]-1
+        n_nodes = length(pini:pend)
+        for b in 1:n_B
+            col = (i_agg - 1) * n_B + b
+            P0_colptr[col+1] += P0_colptr[col] + n_nodes * block_size
+        end
+    end
+
+    # Build P0 rowvals
+    nnz = length(aggregate_to_nodes.data) * block_size * n_B
+    P0_rowval = zeros(Int, nnz)
+    for i_agg in 1:n_aggregates
+        pini = aggregate_to_nodes.ptrs[i_agg]
+        pend = aggregate_to_nodes.ptrs[i_agg+1]-1
+        i_nodes = aggregate_to_nodes.data[pini:pend]
+        for b in 1:n_B
+            rval_ini = P0_colptr[(i_agg-1)*n_B+b]
+            for i_node in i_nodes
+                rval_end = rval_ini+block_size-1
+                P0_rowval[rval_ini:rval_end] = node_to_dofs(i_node, block_size)
+                rval_ini = rval_end + 1
+            end
+        end
+    end
+
+    P0 = SparseMatrixCSC(
+        n_dofs,
+        n_dofs_c,
+        P0_colptr,
+        P0_rowval,
+        ones(nnz))
+
+    # Fill with nullspace vectors
+    for i_agg in 1:n_aggregates
+        for b in 1:n_B
+            col = (i_agg-1) * n_B + b
+            pini = P0.colptr[col]
+            pend = P0.colptr[col+1]-1
+            rowids = P0.rowval[pini:pend]
+            P0.nzval[pini:pend] = B[b][rowids]
+        end
+    end
+
+    # Compute QR decomposition for nullspace in each aggregate
+    for i_agg in 1:n_aggregates
+        # Copy values to Bi
+        pini = aggregate_to_nodes.ptrs[i_agg]
+        pend = aggregate_to_nodes.ptrs[i_agg+1]-1
+        n_i = length(pini:pend) * block_size
+
+        if n_i < n_B
+            error("Singleton node aggregate: Case not implemented.")
+        end
+
+        Bi = zeros(n_i, n_B)
+        P0cols = (i_agg-1)*n_B+1 : i_agg*n_B
+        for (b, col) in enumerate(P0cols)
+            pini = P0.colptr[col]
+            pend = P0.colptr[col+1]-1
+            Bi[:,b] = P0.nzval[pini:pend]
+        end
+
+        # Compute thin QR decomposition
+        F = qr(Bi)
+        Qi = F.Q * Matrix(I,(n_i, n_B))
+        Qi = Qi[:,1:n_B]
+        Ri = F.R
+
+        # Build global tentative prolongator
+        for (b, col) in enumerate(P0cols)
+            pini = P0.colptr[col]
+            pend = P0.colptr[col+1]-1
+            P0.nzval[pini:pend] = Qi[:,b]
+        end
+
+        # Build coarse null space
+        Bcrows = P0cols
+        for b in 1:n_B
+            Bc[b][Bcrows] = Ri[:, b]
+        end
+    end
+
+    P0, Bc
+end
+
 function generic_tentative_prolongator(P0::SparseArrays.AbstractSparseMatrixCSC,B)
     error("not implemented yet")
     # A draft for the scalar case is commented below
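[review note] The per-aggregate QR step above (Algorithm 7 in the cited thesis) factors the null space restricted to one aggregate as Bi = Qi*Ri: the thin Qi becomes that aggregate's column block of P0 and Ri the corresponding rows of the coarse null space Bc, which is exactly why P0 * Bc reproduces B. A self-contained toy check (hypothetical sizes):

    using LinearAlgebra

    n_i, n_B = 6, 3                  # one aggregate: 2 nodes, block_size = 3, 3 null vectors
    Bi = rand(n_i, n_B)              # null space restricted to the aggregate's dofs
    F = qr(Bi)
    Qi = F.Q * Matrix(I, n_i, n_B)   # thin Q factor, built as in the code above
    Ri = F.R
    @assert Qi * Ri ≈ Bi             # hence P0 * Bc ≈ B, re-checked in the tests below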
@@ -247,19 +462,53 @@ end
 function smoothed_prolongator(A,P0,diagA=dense_diag(A);approximate_omega)
     # TODO the performance of this one can be improved
     invDiagA = 1 ./ diagA
-    omega = approximate_omega(invDiagA,A)
     Dinv = PartitionedArrays.sparse_diag_matrix(invDiagA,(axes(A,1),axes(A,1)))
-    P = (I-omega*Dinv*A)*P0
+    DinvA = Dinv*A
+    omega = approximate_omega(invDiagA,A,DinvA)
+    P = (I-omega*DinvA) * P0
     P
 end
 
-function omega_for_1d_laplace(invD,A)
+function omega_for_1d_laplace(invD,A,DinvA)
     # in general this will be (4/3)/ρ(D^-1*A)
     # with a cheap approximation (e.g., power method) of the spectral radius ρ(D^-1*A)
    # ρ(D^-1*A) == 2 for 1d Laplace problem
     2/3
 end
 
+function lambda_generic(invD,A,DinvA::AbstractMatrix)
+    ω = 4/3
+    # Estimate the spectral radius ρ(D^-1*A) with a few power iterations
+    # (Remark 3.5.2. in https://mediatum.ub.tum.de/download/1229321/1229321.pdf)
+    x0 = rand(size(DinvA,2))
+    ρ, x = spectral_radius(DinvA, x0, 20)
+    ω/ρ
+end
+
+function lambda_generic(invD,A,DinvA::PSparseMatrix)
+    ω = 4/3
+    # Estimate the spectral radius ρ(D^-1*A) with a few power iterations
+    # (Remark 3.5.2. in https://mediatum.ub.tum.de/download/1229321/1229321.pdf)
+    x0 = prand(partition(axes(DinvA,2)))
+    ρ, x = spectral_radius(DinvA, x0, 20)
+    ω/ρ
+end
+
+function spectral_radius(A, x, num_iterations::Int)
+    # x is the caller-provided initial guess
+    y = similar(x)
+    # Power iteration
+    for _ in 1:num_iterations
+        mul!(y, A, x)
+        y_norm = norm(y)
+        x = y/y_norm
+    end
+    # Estimate the spectral radius with the Rayleigh quotient
+    y = mul!(y, A, x)
+    ρ = (y' * x) / (x' * x)
+    abs(ρ), x
+end
+
 function enhance_coarse_partition(A,Ac,Bc,R,P,cache,repartition_threshold)
     Ac,Bc,R,P,cache,repartition_threshold
 end
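[review note] lambda_generic returns ω = (4/3)/ρ(D^-1*A) with ρ estimated by the power iterations in spectral_radius. A toy check of that logic, using the fact the hunk itself states: for the 1d Laplacian ρ(D^-1*A) approaches 2, so the estimate should land near the hard-coded 2/3 of omega_for_1d_laplace:

    using LinearAlgebra

    function rho_power(DinvA, x, iters)
        for _ in 1:iters             # power iteration, as in spectral_radius
            x = DinvA * x
            x /= norm(x)
        end
        abs((DinvA * x)' * x / (x' * x))   # Rayleigh quotient
    end

    n = 50
    A = SymTridiagonal(fill(2.0, n), fill(-1.0, n - 1))  # 1d Laplace stencil
    ρ = rho_power(Matrix(A) / 2.0, rand(n), 20)          # D == 2I for this matrix
    @assert isapprox((4/3) / ρ, 2/3; rtol = 0.1)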
@@ -306,6 +555,206 @@ function smoothed_aggregation(;
     (coarsen, coarsen!)
 end
 
+function smoothed_aggregation_with_block_size(;
+    epsilon = 0,
+    approximate_omega = lambda_generic,
+    tentative_prolongator = tentative_prolongator_with_block_size,
+    repartition_threshold = 2000,
+    block_size = 1,
+    )
+    function coarsen(A,B)
+        # build strength graph
+        G = strength_graph(A, block_size; epsilon)
+        diagG = dense_diag(G)
+        node_to_aggregate, node_aggregates = aggregate(G,diagG;epsilon)
+        aggregate_to_nodes = collect_nodes_in_aggregate(node_to_aggregate, node_aggregates)
+        Pc,Bc = tentative_prolongator(aggregate_to_nodes,B, block_size)
+        diagA = dense_diag(A)
+        P = smoothed_prolongator(A, Pc, diagA; approximate_omega)
+        R = transpose(P)
+        Ac,cache = rap(R,A,P;reuse=true)
+        Ac,Bc,R,P,cache,repartition_threshold = enhance_coarse_partition(A,Ac,Bc,R,P,cache,repartition_threshold)
+        Ac,Bc,R,P,cache
+    end
+    function coarsen!(A,Ac,R,P,cache)
+        rap!(Ac,R,A,P,cache)
+        Ac,R,P,cache
+    end
+    (coarsen, coarsen!)
+end
+
+function getblock!(B,A,ids_i,ids_j)
+    for (j,J) in enumerate(ids_j)
+        for (i,I) in enumerate(ids_i)
+            B[i,j] = A[I,J]
+        end
+    end
+end
+
+function strength_graph(A::PSparseMatrix, block_size::Integer; epsilon)
+    dof_partition = partition(axes(A,1))
+    node_partition = map(dof_partition) do my_dofs
+        n_local_dofs = length(my_dofs)
+        @assert n_local_dofs % block_size == 0
+        @assert own_length(my_dofs) == n_local_dofs
+        n_own_nodes = div(n_local_dofs, block_size)
+        own_to_global_node = zeros(Int, n_own_nodes)
+        for local_node in 1:n_own_nodes
+            local_dof = (local_node - 1) * block_size + 1
+            global_dof = my_dofs[local_dof]
+            global_node = div(global_dof - 1, block_size) + 1
+            own_to_global_node[local_node] = global_node
+        end
+        n_global_dofs = global_length(my_dofs)
+        n_global_nodes = div(n_global_dofs, block_size)
+        @assert n_global_dofs % block_size == 0
+        own_nodes = OwnIndices(n_global_nodes, part_id(my_dofs), own_to_global_node)
+        ghost_nodes = GhostIndices(n_global_nodes, Int[], Int32[])
+        my_nodes = OwnAndGhostIndices(own_nodes, ghost_nodes)
+        my_nodes
+    end
+
+    G_partition = map(own_own_values(A), node_partition) do A_own_own, my_nodes
+        G_own_own = strength_graph(A_own_own, block_size, epsilon=epsilon)
+        n_own_nodes = own_length(my_nodes)
+        n_ghost_nodes = ghost_length(my_nodes)
+        @assert n_ghost_nodes == 0
+        @assert size(G_own_own, 1) == n_own_nodes
+        @assert size(G_own_own, 2) == size(G_own_own, 1)
+        G_own_ghost = sparse(Int[], Int[], eltype(G_own_own)[], n_own_nodes, n_ghost_nodes)
+        G_ghost_own = sparse(Int[], Int[], eltype(G_own_own)[], n_ghost_nodes, n_own_nodes)
+        G_ghost_ghost = sparse(Int[], Int[], eltype(G_own_own)[], n_ghost_nodes, n_ghost_nodes)
+        blocks = PartitionedArrays.split_matrix_blocks(G_own_own, G_own_ghost, G_ghost_own, G_ghost_ghost)
+        perm = PartitionedArrays.local_permutation(my_nodes)
+        PartitionedArrays.split_matrix(blocks, perm, perm)
+    end
+
+    G = PSparseMatrix(G_partition, node_partition, node_partition, true)
+    G
+end
+
+function strength_graph(A::AbstractSparseMatrix, block_size::Integer; epsilon)
+
+    if block_size < 1
+        error("Block size must be equal to or larger than 1.")
+    end
+
+    if size(A,1) != size(A,2)
+        error("Matrix must be square.")
+    end
+
+    if size(A,1) % block_size != 0
+        error("Matrix size must be multiple of block size.")
+    end
+
+    if epsilon < 0
+        error("Expected epsilon >= 0.")
+    end
+
+    if block_size == 1
+        return A
+    end
+
+    n_dofs = size(A,1)
+    nnodes = div(n_dofs, block_size)
+    B = zeros(block_size,block_size)
+    diag_norms = zeros(nnodes)
+
+    # Count nonzero values
+    nnz = 0
+
+    # If epsilon <= 1, all diagonal entries are in the graph
+    if epsilon <= 1
+        nnz += nnodes
+    end
+
+    # Calculate norms of diagonal values first
+    for i_node in 1:nnodes
+        i_dofs = node_to_dofs(i_node, block_size)
+        getblock!(B,A,i_dofs,i_dofs)
+        diag_norms[i_node] = norm(B)
+    end
+
+    for j_node in 1:nnodes
+        for i_node in 1:nnodes
+            if j_node != i_node
+                i_dofs = node_to_dofs(i_node, block_size)
+                j_dofs = node_to_dofs(j_node, block_size)
+                getblock!(B,A,i_dofs,j_dofs)
+                norm_B = norm(B)
+                if norm_B == 0
+                    continue
+                end
+                # Calculate strength according to https://github.com/pyamg/pyamg/blob/e1fe54c93be1029c02ddcf84c2338a607b088703/pyamg/strength.py#L275
+                if norm_B >= epsilon * sqrt(diag_norms[i_node] * diag_norms[j_node])
+                    nnz += 1
+                end
+            end
+        end
+    end
+
+    # Allocate data structures for sparse matrix
+    I = zeros(Int, nnz)
+    J = zeros(Int, nnz)
+    V = zeros(nnz)
+
+    i_nz = 1
+
+    for j_node in 1:nnodes
+        for i_node in 1:nnodes
+            if j_node == i_node
+                # Diagonal entries are always in graph if epsilon <= 1
+                if epsilon <= 1
+                    I[i_nz] = i_node
+                    J[i_nz] = j_node
+                    V[i_nz] = 1.0
+                    i_nz += 1
+                end
+            else
+                i_dofs = node_to_dofs(i_node, block_size)
+                j_dofs = node_to_dofs(j_node, block_size)
+                getblock!(B,A,i_dofs,j_dofs)
+                norm_B = norm(B)
+                if norm_B == 0
+                    continue
+                end
+                # Calculate strength according to https://github.com/pyamg/pyamg/blob/e1fe54c93be1029c02ddcf84c2338a607b088703/pyamg/strength.py#L275
+                if norm_B >= epsilon * sqrt(diag_norms[i_node] * diag_norms[j_node])
+                    I[i_nz] = i_node
+                    J[i_nz] = j_node
+                    V[i_nz] = 1.0
+                    i_nz += 1
+                end
+            end
+        end
+    end
+
+    G = sparse(I, J, V, nnodes, nnodes)
+    G
+end
+
+function node_to_dofs(i_node, block_size)
+    # Convert single node in graph to range of dofs
+    ((i_node-1)*block_size+1) : i_node*block_size
+end
+
+function dofs_to_node(dofs, block_size)
+    # Convert range of dofs to single node in graph
+    div(dofs[end], block_size)
+end
+
+function amg_level_params_linear_elasticity(block_size;
+    pre_smoother = additive_schwarz(gauss_seidel(;iters=1);iters=1),
+    coarsening = smoothed_aggregation_with_block_size(;approximate_omega = lambda_generic,
+        tentative_prolongator = tentative_prolongator_with_block_size, block_size = block_size),
+    cycle = v_cycle,
+    pos_smoother = pre_smoother,
+    )
+
+    level_params = (;pre_smoother,pos_smoother,coarsening,cycle)
+    level_params
+end
+
 function amg_level_params(;
     pre_smoother = additive_schwarz(gauss_seidel(;iters=1);iters=1),
     coarsening = smoothed_aggregation(;),
     cycle = v_cycle,
@@ -490,6 +939,4 @@ function amg_finalize!(setup)
     end
     coarse_solver_setup = setup.coarse_level.coarse_solver_setup
     finalize!(coarse_solver_setup)
-    nothing
-end
-
+end
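[review note] The coupling test in strength_graph follows the PyAMG criterion linked in the code: nodes i and j are connected iff ||A_ij||_F >= epsilon * sqrt(||A_ii||_F * ||A_jj||_F), where A_ij is the block_size x block_size block coupling the two nodes. A toy check with hypothetical 2x2 blocks, one strong coupling and one negligible one:

    using LinearAlgebra, SparseArrays

    A = sparse(Float64[ 4 0     -1  0;
                        0 4      0 -1e-8;
                       -1 0      4  0;
                        0 -1e-8  0  4])
    bs, ε = 2, 0.02
    dofs(node) = (node - 1) * bs + 1 : node * bs       # same mapping as node_to_dofs
    blocknorm(i, j) = norm(Matrix(A[dofs(i), dofs(j)]))

    # The (1,2) block has norm ≈ 1 and passes; a large enough ε would drop the edge.
    @assert blocknorm(1, 2) >= ε * sqrt(blocknorm(1, 1) * blocknorm(2, 2))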
diff --git a/PartitionedSolvers/test/amg_tests.jl b/PartitionedSolvers/test/amg_tests.jl
index 659ca455..a7328c8c 100644
--- a/PartitionedSolvers/test/amg_tests.jl
+++ b/PartitionedSolvers/test/amg_tests.jl
@@ -1,16 +1,220 @@
 module AMGTests
 
 using PartitionedArrays
-using PartitionedArrays: laplace_matrix
 using PartitionedSolvers
 using LinearAlgebra
-using Test
+using IterativeSolvers
 using IterativeSolvers: cg!
-import IterativeSolvers
+using Test
+using SparseArrays
+
+
+# Test strength graph computation
+# First with PSparseMatrix and known solution
+ndofs = 18
+nnodes = 9
+nrows = 3
+p = 4
+ranks = DebugArray(LinearIndices((p,)))
+row_partition = uniform_partition(ranks, ndofs)
+
+IJV = map(row_partition) do row_indices
+    I, J, V = Int[], Int[], Float64[]
+    for i in local_to_global(row_indices)
+        push!(V, 9)
+        push!(I, i)
+        push!(J, i)
+
+        node = 0
+        if (i % 2) == 1
+            # 1st dimension
+            push!(V, -1)
+            push!(I, i)
+            push!(J, i+1)
+            node = div((i+1),2)
+        else
+            # 2nd dimension
+            push!(V, -1)
+            push!(I, i)
+            push!(J, i-1)
+            node = div(i,2)
+        end
+        north = node - nrows
+        south = node + nrows
+        east = node - 1
+        west = node + 1
+        if north >= 1
+            push!(V, -1)
+            push!(I, i)
+            push!(J, 2*north - 1)
+            push!(V, -1)
+            push!(I, i)
+            push!(J, 2*north)
+        end
+        if south <= nnodes
+            push!(V, -1)
+            push!(I, i)
+            push!(J, 2*south - 1)
+            push!(V, -1)
+            push!(I, i)
+            push!(J, 2*south)
+        end
+        if node % nrows != 1
+            push!(V, -1)
+            push!(I, i)
+            push!(J, 2*east-1)
+            push!(V, -1)
+            push!(I, i)
+            push!(J, 2*east)
+        end
+        if node % nrows != 0
+            push!(V, -1)
+            push!(I, i)
+            push!(J, 2*west-1)
+            push!(V, -1)
+            push!(I, i)
+            push!(J, 2*west)
+        end
+    end
+    I,J,V
+end
+
+I,J,V = tuple_of_arrays(IJV)
+A = psparse(I,J,V,row_partition, row_partition) |> fetch
+
+# build solution
+I = [1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 5,
+     6, 6, 6, 6, 7, 7, 7, 8, 8, 8, 8, 9, 9, 9]
+J = [1, 2, 4,
+     1, 2, 3, 5,
+     2, 3, 6,
+     1, 4, 5, 7,
+     2, 4, 5, 6, 8,
+     3, 5, 6, 9,
+     4, 7, 8,
+     5, 7, 8, 9,
+     6, 8, 9]
+V = ones(length(I))
+solution = sparse(I, J, V, nnodes, nnodes)
+
+epsilon = 0.02
+
+# Test with CSC sparse matrix
+# Test sample matrix with known solution
+A_seq = centralize(A)
+G_seq = PartitionedSolvers.strength_graph(A_seq, 2, epsilon=epsilon)
+@test solution ≈ G_seq
+
+# Another test with block size 3
+M = rand([-2.0, -1, 1, 2], (3, 3))
+M = sparse(M)
+A = blockdiag(M, M, M)
+solution = sparse([1, 2, 3], [1, 2, 3], fill(1.0, 3), 3, 3)
+G = PartitionedSolvers.strength_graph(A, 3, epsilon=epsilon)
+@test solution ≈ G
+
+# Test with minimal matrix size
+G = PartitionedSolvers.strength_graph(M, 3, epsilon=epsilon)
+solution = sparse([1], [1], 1.0, 1, 1)
+@test solution ≈ G
+
+# Create test matrix for linear elasticity
+parts_per_dir = (2,2,2)
+block_size = length(parts_per_dir)
+p = prod(parts_per_dir)
+ranks = DebugArray(LinearIndices((p,)))
+nodes_per_dir = map(i->block_size*i,parts_per_dir)
+args = PartitionedArrays.linear_elasticity_fem(nodes_per_dir,parts_per_dir,ranks)
+A_dist = psparse(args...) |> fetch
+A_seq = centralize(A_dist)
+
+# Test strength graph with sequential and parallel linear elasticity matrix
+G_seq = PartitionedSolvers.strength_graph(A_seq, block_size, epsilon=epsilon)
+G_dist = PartitionedSolvers.strength_graph(A_dist, block_size, epsilon=epsilon)
+G_dist_centralized = centralize(G_dist)
+diff = G_seq - G_dist_centralized
+diff_nnz = nnz(diff)
+println("Test strength graph: \n$(round(100*diff_nnz/(G_seq.m^2); digits=3))% of total entries are different between sequential and parallel G.")
+println("G_dist has $(round(100*(1-nnz(G_dist_centralized)/nnz(G_seq)); digits=2))% fewer nnz entries than G_seq.")
+println("(G_dist nnz entries: $(nnz(G_dist_centralized)), G_seq nnz entries: $(nnz(G_seq)))")
+@test isa(G_dist, PSparseMatrix)
+
+# Test sequential collect_nodes_in_aggregate
+diagG = dense_diag(centralize(G_dist))
+node_to_aggregate_seq, node_aggregates_seq = PartitionedSolvers.aggregate(centralize(G_dist),diagG;epsilon)
+aggregate_to_nodes_seq = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate_seq, node_aggregates_seq)
+@test length(aggregate_to_nodes_seq.data) == length(node_to_aggregate_seq)
+for i_agg in node_aggregates_seq
+    pini = aggregate_to_nodes_seq.ptrs[i_agg]
+    pend = aggregate_to_nodes_seq.ptrs[i_agg+1]-1
+    nodes = aggregate_to_nodes_seq.data[pini:pend]
+    @test all(node_to_aggregate_seq[nodes] .== i_agg)
+end
+
+# Test parallel collect_nodes_in_aggregate
+diagG = dense_diag(G_dist)
+node_to_aggregate_dist, node_aggregates_dist = PartitionedSolvers.aggregate(G_dist,diagG;epsilon)
+aggregate_to_nodes_dist = PartitionedSolvers.collect_nodes_in_aggregate(node_to_aggregate_dist, node_aggregates_dist)
+map(partition(aggregate_to_nodes_dist), partition(node_to_aggregate_dist), partition(node_aggregates_dist)) do my_aggregate_to_nodes, my_node_to_aggregate, my_aggregates
+    @test length(my_aggregate_to_nodes.data) == length(my_node_to_aggregate)
+    global_to_local_aggregate = global_to_local(my_aggregates)
+    local_aggregates = global_to_local_aggregate[my_aggregates]
+    own_node_to_local_aggregate = map(my_node_to_aggregate) do global_aggregate
+        global_to_local_aggregate[global_aggregate]
+    end
+    for i_agg in local_aggregates
+        pini = my_aggregate_to_nodes.ptrs[i_agg]
+        pend = my_aggregate_to_nodes.ptrs[i_agg+1]-1
+        nodes = my_aggregate_to_nodes.data[pini:pend]
+        @test all(own_node_to_local_aggregate[nodes] .== i_agg)
+    end
+end
+
+# Create nullspace vectors for tentative prolongator
+x_dist = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir,ranks)
+B_dist = nullspace_linear_elasticity(x_dist, partition(axes(A_dist,2)))
+B_seq = [collect(b) for b in B_dist]
+
+# Test tentative prolongator with sequential linear elasticity matrix
+Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes_seq,B_seq, block_size)
+@test Pc * stack(Bc) ≈ stack(B_seq)
+
+# Test tentative prolongator with parallel matrix
+Pc, Bc = PartitionedSolvers.tentative_prolongator_with_block_size(aggregate_to_nodes_dist,B_dist, block_size)
+n_B = length(B_dist)
+for i in 1:n_B
+    @test isa(Bc[i], PVector)
+end
+Bc_matrix = zeros(size(Pc,2), length(Bc))
+for (i,b) in enumerate(Bc)
+    Bc_matrix[:,i] = collect(b)
+end
+B_matrix = zeros(size(Pc,1), length(Bc))
+for (i,b) in enumerate(B_dist)
+    B_matrix[:,i] = collect(b)
+end
+@test centralize(Pc) * Bc_matrix ≈ B_matrix
+
+
+# Test spectral radius, sequential and parallel
+diagA = dense_diag(A_dist)
+invD = 1 ./ diagA
+Dinv = PartitionedArrays.sparse_diag_matrix(invD,(axes(A_dist,1),axes(A_dist,1)))
+M = Dinv * A_dist
+ρ_exact = eigmax(Array(centralize(M)))
+cols = axes(M, 2)
+x0 = prand(partition(cols))
+x0_seq = collect(x0)
+l_dist, x = PartitionedSolvers.spectral_radius(M, x0, 10)
+l_seq, x = PartitionedSolvers.spectral_radius(centralize(M), x0_seq, 10)
+@test l_dist ≈ l_seq
+@test abs((l_dist-ρ_exact)/ρ_exact) < 0.2
 
-# First with a sequential matrix
+
+# Test amg
+# first with a sequential matrix
 nodes_per_dir = (100,100)
-A = laplace_matrix(nodes_per_dir)
+A = PartitionedArrays.laplace_matrix(nodes_per_dir)
 using Random
 Random.seed!(1)
 x = rand(size(A,2))
@@ -30,7 +234,7 @@ amg_statistics(S) |> display
 # Non-default options
 
 level_params = amg_level_params(;
-    pre_smoother = jacobi(;iters=10,omega=2/3),
+    pre_smoother = PartitionedSolvers.jacobi(;iters=10,omega=2/3),
     cycle = v_cycle,
 )
 
@@ -57,7 +261,7 @@ finalize!(S)
 # Now as a preconditioner
 
 level_params = amg_level_params(;
-    pre_smoother = gauss_seidel(;iters=1),
+    pre_smoother = PartitionedSolvers.gauss_seidel(;iters=1),
 )
 
 fine_params = amg_fine_params(;level_params)
@@ -72,6 +276,38 @@ solve!(y,S,b)
 update!(S,2*A)
 solve!(y,S,b)
 
+
+
+# Now for linear elasticity (sequential)
+
+parts_per_dir = (2,2,2)
+block_size = length(parts_per_dir)
+p = prod(parts_per_dir)
+ranks = DebugArray(LinearIndices((p,)))
+nodes_per_dir = map(i->block_size*i,parts_per_dir)
+args = PartitionedArrays.linear_elasticity_fem(nodes_per_dir,parts_per_dir,ranks)
+A_dist = psparse(args...) |> fetch
+A = centralize(A_dist)
+
+x = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir, ranks)
+B_dist = nullspace_linear_elasticity(x, partition(axes(A_dist,2)))
+B = [collect(b) for b in B_dist]
+x_exact = rand(size(A,2))
+b = A*x_exact
+y = similar(x_exact)
+y .= 0
+
+level_params = amg_level_params_linear_elasticity(block_size)
+fine_params = amg_fine_params(;level_params)
+solver = amg(;fine_params)
+
+Pl = setup(solver,y,A,b;nullspace=B)
+cg!(y,A,b;Pl,verbose=true)
+println("Linear elasticity norm of error: $(norm(y-x_exact))")
+@test y ≈ x_exact
+
+
+
 # Now in parallel
 
 parts_per_dir = (2,2)
@@ -79,7 +315,7 @@ parts_per_dir = (2,2)
 np = prod(parts_per_dir)
 parts = DebugArray(LinearIndices((np,)))
 
 nodes_per_dir = (100,100)
-A = laplace_matrix(nodes_per_dir,parts_per_dir,parts)
+A = PartitionedArrays.laplace_matrix(nodes_per_dir,parts_per_dir,parts)
 x = pones(partition(axes(A,2)))
 b = A*x
@@ -105,7 +341,7 @@ solve!(y,S,b)
 finalize!(S)
 
 level_params = amg_level_params(;
-    pre_smoother = jacobi(;iters=1,omega=2/3),
+    pre_smoother = PartitionedSolvers.jacobi(;iters=1,omega=2/3),
     coarsening = smoothed_aggregation(;repartition_threshold=10000000)
 )
 
@@ -121,9 +357,9 @@ cg!(y,A,b;Pl,verbose=true)
 
 nodes_per_dir = (40,40,40)
 parts_per_dir = (2,2,1)
-nparts = prod(parts_per_dir)
-parts = LinearIndices((nparts,))
-A = laplace_matrix(nodes_per_dir,parts_per_dir,parts)
+np = prod(parts_per_dir)
+parts = LinearIndices((np,))
+A = PartitionedArrays.laplace_matrix(nodes_per_dir,parts_per_dir,parts)
 x_exact = pones(partition(axes(A,2)))
 b = A*x_exact
 x = similar(b,axes(A,2))
@@ -132,4 +368,29 @@ Pl = setup(amg(),x,A,b)
 _, history = cg!(x,A,b;Pl,log=true)
 display(history)
 
-end
+
+# Now for linear elasticity (parallel)
+
+parts_per_dir = (2,2,2)
+block_size = length(parts_per_dir)
+p = prod(parts_per_dir)
+ranks = DebugArray(LinearIndices((p,)))
+nodes_per_dir = map(i->block_size*i,parts_per_dir)
+args = linear_elasticity_fem(nodes_per_dir,parts_per_dir,ranks)
+A = psparse(args...) |> fetch
+x_exact = pones(partition(axes(A,2)))
+x = node_coordinates_unit_cube(nodes_per_dir,parts_per_dir, ranks)
+B = nullspace_linear_elasticity(x, partition(axes(A,2)))
+b = A*x_exact
+y = similar(b,axes(A,2))
+y .= 0
+
+level_params = amg_level_params_linear_elasticity(block_size)
+fine_params = amg_fine_params(;level_params)
+solver = amg(;fine_params)
+
+Pl = setup(solver,y,A,b;nullspace=B)
+cg!(y,A,b;Pl,verbose=true)
+@test y ≈ x_exact
+
+end
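[review note] The elasticity tests rely on nullspace_linear_elasticity providing the near-null space B of 3d elasticity, i.e. the rigid-body modes (three translations, three rotations) evaluated at the node coordinates. A toy, hand-rolled version for a few hypothetical nodes, just to make the structure of B explicit:

    using LinearAlgebra

    coords = [rand(3) for _ in 1:4]      # 4 nodes, 12 dofs
    B = [zeros(12) for _ in 1:6]
    for (node, (x, y, z)) in enumerate(coords)
        d = 3 * (node - 1)
        B[1][d+1] = 1.0                  # translation in x
        B[2][d+2] = 1.0                  # translation in y
        B[3][d+3] = 1.0                  # translation in z
        B[4][d+1], B[4][d+2] = -y, x     # rotation about z
        B[5][d+2], B[5][d+3] = -z, y     # rotation about x
        B[6][d+1], B[6][d+3] = z, -x     # rotation about y
    end
    @assert rank(stack(B)) == 6          # six independent modes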