diff --git a/poisson2d/README.md b/poisson2d/README.md
index 14ae309..fa2d742 100644
--- a/poisson2d/README.md
+++ b/poisson2d/README.md
@@ -3,14 +3,48 @@
 This solver uses a simple Jacobi iteration to solve a Poisson equation. For details, see, for example,
 "Computational Physics" by Mark Newman, Chap 9. Compiler commands were:
 
-```gfortran sourcefile.f95 -Ofast -o program```
+```gfortran sourcefile.f95 -O3 -o program```
 
-```gcc sourcefile.c -Ofast -o program```
+```gcc sourcefile.c -O3 -o program```
 
 For Cython module (import it to run):
 
 ```python setup.py build_ext --inplace```
 
+
+There are various implementations using various strategies.
+All codes are named with a 1-1 correspondance, i.e. `c_i_*` has the same strategy as `fortran_i_*` files.
+The suffixed `_*` is only used if there are multiple alternatives that utilizes the same strategy between
+the different languages. For example the `c_4_1d` and `c_4_2d` are both using the same strategy for
+solving the Poisson equation, while the former uses a 1D array for the data while the latter uses a 2D
+array with some slight overhead of another pointer level.
+
+The two files `c_common.c` and `fortran_common.f90` are baseline programs used for printing
+out data and for parsing the command line arguments.
+
+All executables accept 2 arguments:
+
+    # N = 100
+    # N_ITER = 2000
+    ./c_1 100 2000
+    100             2000  5.658391137993336e+04  1.448613750000000e-03
+
+which will use a 2D array of size `100x100` and run for 2000 iterations.
+The output line consists of 4 values, 1) N, 2) N_ITER, 3) max-difference, 4) time per iteration.
+
+Some thoughts on the code at https://runningcrocodile.fi/articles/pythonperformance.html. Also, there was
+discussion about this problem at Discourse: https://fortran-lang.discourse.group/t/performance-c-vs-fortran/1461 
+with important contributions from numerous users (Mason, Beliavsky, septc, implicitall, nncarlson, han190 and pmk).
+
+The entire code base has been re-written to allow command-line arguments allowing to sample different
+matrix sizes with the same executable (user zerothi).
+
+To run a preset benchmark suite, simply execute `bash run.sh` which will run the currently
+implemented benchmarks with dimensions up to 600.
+
+
+## Old timings
+
 The grid is a 2-dimensional array of size MxM. The timings for different values of M are, in seconds:
 
 | Language            | M=100           | M=200                 | M=300                  |
@@ -25,7 +59,3 @@ The grid is a 2-dimensional array of size MxM. The timings for different values
 * For all of these results, the amount of iterations performed by the respective codes was approximately
 the same, with the exception of the 100x100 grid C codes, which did nearly a double amount of iterations
 compared to the rest, for some reason. The timings are on AMD Ryzen 5 3600 @3.6GHz, using WSL2 on Windows 10.
-
-Some thoughts on the code at https://runningcrocodile.fi/articles/pythonperformance.html . Also, there was
-discussion about this problem at Discourse: https://fortran-lang.discourse.group/t/performance-c-vs-fortran/1461 
-with important contributions from numerous users (Mason, Beliavsky, septc, implicitall, nncarlson, han190 and pmk)
diff --git a/poisson2d/c_1.c b/poisson2d/c_1.c
new file mode 100644
index 0000000..8430ee1
--- /dev/null
+++ b/poisson2d/c_1.c
@@ -0,0 +1,110 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+// COMMENT
+// I know, this is extremely bad practice.
+// But to accommodate argument sizes for the arguments
+// I felt this was ok.
+// It makes it much more versatile when testing
+// END COMMENT
+void swap(int M, double (*a)[M], double (*b)[M])
+{
+  double swp;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      swp = a[i][j];
+      a[i][j] = b[i][j];
+      b[i][j] = swp;
+    }
+  }
+}
+
+// See comment above
+double mmax(int M, double (*phi)[M], double (*phip)[M]) {
+  double max = 0.0;
+  double diff = 0.0;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      diff = fabs(phi[i][j]-phip[i][j]);
+      if (diff > max)
+        max = diff;
+    }
+  }
+  return max;
+}
+
+double rho(double x, double y) {
+  double s1 = 0.6;
+  double e1 = 0.8;
+  double s2 = 0.2;
+  double e2 = 0.4;
+
+  if (x > s1 && x < e1 && y > s1 && y < e1) {
+    return 1.0;
+  } else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+clock_t run(int M, int N_ITER) {
+
+  // COMMENT:
+  // I wouldn't even imagine any novice in C would
+  // do something like this. It really makes no sense :)
+  // END OF COMMENT
+  double (*phi)[M];
+  double (*phip)[M]; 
+  double (*rhoa)[M]; 
+  phi = malloc(sizeof(double[M][M]));
+  phip = malloc(sizeof(double[M][M]));
+  rhoa = malloc(sizeof(double[M][M]));
+
+  clock_t tic = clock();
+
+  // initialize matrices
+  memset(phip, 0, sizeof(phip[0][0])*M*M);
+  memset(phi, 0, sizeof(phi[0][0])*M*M);
+  
+  double delta = 1.0;
+  double a2 = pow(SPACE,2.0);
+  
+  int iter = 0;
+  while (iter < N_ITER ) {
+    iter += 1;
+    
+    for (int i=1; i < M-1; i++) {
+      for (int j=1; j < M-1; j++) {
+        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
+                      phi[i][j+1] + phi[i][j-1])/4.0 +
+	  a2/(4.0 * EPSILON0)*rho(i*SPACE,j*SPACE);
+      }
+    }
+    delta = mmax(M, phi, phip);
+    swap(M, phi, phip);
+    
+  }
+
+  clock_t toc = clock();
+
+  free(phi);
+  free(phip);
+  free(rhoa);
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+
+int main(int argc, char *argv[]) {
+
+  int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_2.c b/poisson2d/c_2.c
new file mode 100644
index 0000000..05a0c42
--- /dev/null
+++ b/poisson2d/c_2.c
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+// COMMENT
+// I know, this is extremely bad practice.
+// But to accommodate argument sizes for the arguments
+// I felt this was ok.
+// It makes it much more versatile when testing
+// END COMMENT
+void swap(int M, double a[M][M], double b[M][M]) {
+  double swp;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      swp = a[i][j];
+      a[i][j] = b[i][j];
+      b[i][j] = swp;
+    }
+  }
+}
+
+// See comment above
+double mmax(int M, double a[M][M], double b[M][M]) {
+  double max = 0.0;
+  double diff = 0.0;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      diff = fabs(a[i][j]-b[i][j]);
+      if (diff > max)
+        max = diff;
+    }
+  }
+  return max;
+}
+
+double rho(double x, double y) {
+  double s1 = 0.6;
+  double e1 = 0.8;
+  double s2 = 0.2;
+  double e2 = 0.4;
+
+  if (x > s1 && x < e1 && y > s1 && y < e1) {
+    return 1.0;
+  } else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+void run(int M, int N_ITER) {
+
+  double phi[M][M];
+  double phip[M][M]; 
+
+  clock_t tic = clock();
+  int iter = 0;
+
+  // initialize matrices
+  memset(&phip[0][0], 0, sizeof(double)*M*M);
+  memset(&phi[0][0], 0, sizeof(double)*M*M);
+  
+  double delta;
+  double a2 = pow(SPACE,2.0);
+  
+  while (iter < N_ITER ) {
+    iter += 1;
+    
+    for (int i=1; i < M-1; i++) {
+      for (int j=1; j < M-1; j++) {
+        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
+                      phi[i][j+1] + phi[i][j-1])/4.0 +
+	  a2/(4.0 * EPSILON0)*rho(i*SPACE,j*SPACE);
+      }
+    }
+    delta = mmax(M, phi, phip);
+    swap(M, phi, phip);
+    
+  }
+
+  clock_t toc = clock();
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+
+int main(int argc, char *argv[]) {
+
+  int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_3.c b/poisson2d/c_3.c
new file mode 100644
index 0000000..8806cd7
--- /dev/null
+++ b/poisson2d/c_3.c
@@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+// COMMENT
+// I know, this is extremely bad practice.
+// But to accommodate argument sizes for the arguments
+// I felt this was ok.
+// It makes it much more versatile when testing
+// END COMMENT
+void swap(int M, double a[M][M], double b[M][M]) {
+  double swp;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      swp = a[i][j];
+      a[i][j] = b[i][j];
+      b[i][j] = swp;
+    }
+  }
+}
+
+// See comment above
+double mmax(int M, double a[M][M], double b[M][M]) {
+  double max = 0.0;
+  double diff = 0.0;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      diff = fabs(a[i][j]-b[i][j]);
+      if (diff > max)
+        max = diff;
+    }
+  }
+  return max;
+}
+
+double rho(double x, double y) {
+  double s1 = 0.6;
+  double e1 = 0.8;
+  double s2 = 0.2;
+  double e2 = 0.4;
+
+  if (x > s1 && x < e1 && y > s1 && y < e1) {
+    return 1.0;
+  } else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+void run(int M, int N_ITER) {
+
+  double phi[M][M];
+  double phip[M][M]; 
+  double rhoa[M][M]; 
+
+  clock_t tic = clock();
+  int iter = 0;
+
+  // initialize matrices
+  memset(&phip[0][0], 0, sizeof(double)*M*M);
+  memset(&phi[0][0], 0, sizeof(double)*M*M);
+  for (int i=1; i<M-1; i++)  {
+    for (int j=1; j<M-1; j++) {
+      rhoa[i][j] =  rho(i*SPACE,j*SPACE);
+    }
+  }
+
+  double delta;
+  double a2 = pow(SPACE,2.0);
+  
+  while (iter < N_ITER ) {
+    iter += 1;
+    
+    for (int i=1; i < M-1; i++) {
+      for (int j=1; j < M-1; j++) {
+        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
+                      phi[i][j+1] + phi[i][j-1])/4.0 +
+	  a2/(4.0 * EPSILON0)*rhoa[i][j];
+      }
+    }
+    delta = mmax(M, phi, phip);
+    swap(M, phi, phip);
+    
+  }
+
+  clock_t toc = clock();
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+
+int main(int argc, char *argv[]) {
+
+  int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_4_1d.c b/poisson2d/c_4_1d.c
new file mode 100644
index 0000000..26e839c
--- /dev/null
+++ b/poisson2d/c_4_1d.c
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+#define IDX(i,j) (i)*M+j
+
+
+double rho(const double x, const double y) {
+  const double s1 = 0.6;
+  const double e1 = 0.8;
+  const double s2 = 0.2;
+  const double e2 = 0.4;
+
+  if (s1 < x && x < e1 && s1 < y && y < e1) {
+    return 1.0;
+  } else if ( s2 < x && x < e2 && s2 < y && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+double iterate(int M, double *restrict phi, double *restrict phinew, double *restrict rhoa) {
+  double delta = 0, err;
+  for (int i=1; i < M-1; i++) {
+    for (int j=1; j < M-1; j++) {
+      phinew[IDX(i,j)] = (phi[IDX(i+1,j)] + phi[IDX(i-1,j)] + phi[IDX(i,j+1)] + phi[IDX(i,j-1)] + rhoa[IDX(i,j)])*0.25;
+      err = fabs(phinew[IDX(i,j)] - phi[IDX(i,j)]);
+      if ( err > delta ) delta = err;
+    }
+  }
+  return delta;
+}
+
+void init_rho(int M, double *restrict rhoa, const double epsilon, const double a) {
+  const double a2 = a * a / epsilon;
+  for (int i=1; i<M-1; i++)  {
+    for (int j=1; j<M-1; j++) {
+      rhoa[IDX(i,j)] = rho(i*a,j*a) * a2;
+    }
+  }
+}
+
+
+void run(int M, int N_ITER) {
+
+  double *phi;
+  double *phip;
+  double *rhoa;
+  double *tmp;
+
+  // A real world program will definitely use malloc
+  phi = malloc(M*M*sizeof(double));
+  phip = malloc(M*M*sizeof(double));
+  rhoa = malloc(M*M*sizeof(double));
+
+  clock_t tic = clock();
+
+  memset(phi, 0, sizeof(double)*M*M);
+  memset(phip, 0, sizeof(double)*M*M);
+
+  // In C one tries to avoid using pow because
+  // it assumes floating point powers (integers are faster)
+  // So better do it directly
+  init_rho(M, rhoa, EPSILON0, SPACE);
+  
+  int iter = 0;
+  double delta;
+  while ( iter < N_ITER ) {
+    iter += 1;
+
+    delta = iterate(M, phi, phip, rhoa);
+
+    // swap pointers (no copies)
+    tmp = phi;
+    phi = phip;
+    phip = tmp;
+  }
+
+  clock_t toc = clock();
+
+  free(phi);
+  free(phip);
+  free(rhoa);
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+int main(int argc, char *argv[]) {
+ int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_4_2d.c b/poisson2d/c_4_2d.c
new file mode 100644
index 0000000..a6bbdd5
--- /dev/null
+++ b/poisson2d/c_4_2d.c
@@ -0,0 +1,127 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+double ** malloc_2d(int m, int n) {
+  if (m <= 0 || n <= 0)
+    return NULL;
+
+  // Allocate pointers for columns
+  double **array2D = malloc(m * sizeof(double *));
+  if (array2D == NULL) {
+    return NULL;
+  }
+
+  // Allocate actual data
+  array2D[0] = malloc(m * n * sizeof(double));
+  if (array2D[0] == NULL) {
+    free(array2D);
+    return NULL;
+  }
+
+  // assign pointers to make it 2D
+  for(int i = 1; i < m; i++) {
+    array2D[i] = array2D[0] + i * n ;
+  }
+
+  return array2D;
+}
+
+void free_2d(double **array2D) {
+  free(array2D[0]);
+  free(array2D);
+}
+
+
+
+double rho(const double x, const double y) {
+  const double s1 = 0.6;
+  const double e1 = 0.8;
+  const double s2 = 0.2;
+  const double e2 = 0.4;
+
+  if (s1 < x && x < e1 && s1 < y && y < e1) {
+    return 1.0;
+  } else if ( s2 < x && x < e2 && s2 < y && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+double iterate(int M, double *restrict*restrict phi, double *restrict*restrict phinew, double *restrict*restrict rhoa) {
+  double delta = 0, err;
+  for (int i=1; i < M-1; i++) {
+    for (int j=1; j < M-1; j++) {
+      phinew[i][j] = (phi[i+1][j] + phi[i-1][j] + phi[i][j+1] + phi[i][j-1] + rhoa[i][j])*0.25;
+      err = fabs(phinew[i][j] - phi[i][j]);
+      if ( err > delta ) delta = err;
+    }
+  }
+  return delta;
+}
+
+void init_rho(int M, double *restrict*restrict rhoa, const double epsilon, const double a) {
+  const double a2 = a * a / epsilon;
+  for (int i=1; i<M-1; i++)  {
+    for (int j=1; j<M-1; j++) {
+      rhoa[i][j] = rho(i*a,j*a) * a2;
+    }
+  }
+}
+
+
+void run(int M, int N_ITER) {
+
+  double **phi;
+  double **phip;
+  double **rhoa;
+  double **tmp;
+
+  // A real world program will definitely use malloc
+  phi = malloc_2d(M, M);
+  phip = malloc_2d(M, M);
+  rhoa = malloc_2d(M, M);
+
+  clock_t tic = clock();
+
+  memset(&phi[0][0], 0, sizeof(double)*M*M);
+  memset(&phip[0][0], 0, sizeof(double)*M*M);
+
+  // In C one tries to avoid using pow because
+  // it assumes floating point powers (integers are faster)
+  // So better do it directly
+  init_rho(M, rhoa, EPSILON0, SPACE);
+  
+  int iter = 0;
+  double delta;
+  while ( iter < N_ITER ) {
+    iter += 1;
+
+    delta = iterate(M, phi, phip, rhoa);
+
+    // swap pointers (no copies)
+    tmp = phi;
+    phi = phip;
+    phip = tmp;
+  }
+
+  clock_t toc = clock();
+
+  free_2d(phi);
+  free_2d(phip);
+  free_2d(rhoa);
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+int main(int argc, char *argv[]) {
+  int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_common.c b/poisson2d/c_common.c
new file mode 100644
index 0000000..caa4335
--- /dev/null
+++ b/poisson2d/c_common.c
@@ -0,0 +1,17 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+void parse_args(int argc, char **argv, int *N, int *N_iter) {
+  if ( argc < 3 ) {
+    printf("Not enough arguments [N N_ITER]\n");
+  }
+
+  *N = atoi(argv[1]);
+  *N_iter = atoi(argv[2]);
+}
+
+void write_timing(int N, int iter, double delta, clock_t timing) {
+  printf("%16d %16d %22.15e %22.15e\n", N, iter, delta, ((double)timing)/CLOCKS_PER_SEC/iter);
+}
diff --git a/poisson2d/c_common.h b/poisson2d/c_common.h
new file mode 100644
index 0000000..374921d
--- /dev/null
+++ b/poisson2d/c_common.h
@@ -0,0 +1,10 @@
+#ifndef _COMMON_HEADER_
+#define _COMMON_HEADER_
+
+#define EPSILON0 8.85e-12
+#define SPACE 0.01
+
+
+void parse_args(int, char **, int *, int *);
+void write_timing(int, int, double, clock_t);
+#endif
diff --git a/poisson2d/fortran_2.f90 b/poisson2d/fortran_2.f90
new file mode 100644
index 0000000..22c1bcd
--- /dev/null
+++ b/poisson2d/fortran_2.f90
@@ -0,0 +1,67 @@
+module poisson_m
+  use common_m
+  implicit none
+
+contains
+
+  pure real(dp) function rho(x,y)
+    real(dp), intent(in) :: x,y
+    if (x > 0.6_dp .and. x < 0.8_dp .and. y > 0.6_dp .and. y<0.8) then
+      rho = 1.0_dp
+    else if (x> 0.2_dp .and. x<0.4_dp .and. y>0.2_dp .and. y<0.4_dp) then
+      rho = -1.0_dp
+    else
+      rho = 0.0_dp
+    end if
+  end function rho
+
+  subroutine run(M, N_ITER)
+    integer, intent(in) :: M, N_ITER
+
+    integer :: i,j, iter
+    real(dp) :: tic, toc
+    real(dp) :: delta, phiprime(M,M), phi(M,M), a2, temp(M,M)
+
+    ! Since arrays are allocated on the stack
+    call cpu_time(tic)
+    
+    phiprime(:,:) = 0.0_dp
+    phi(:,:) = 0.0_dp
+
+    a2 = a**2
+
+    iter = 0
+    do while (iter < N_ITER )
+      iter = iter + 1
+      do j=2, M-1
+        do i=2, M-1
+          phiprime(i,j) = (phi(i+1,j) + phi(i-1,j) + phi(i,j+1) + phi(i,j-1))/4.0_dp &
+              + a2/4.0_dp/epsilon0*rho((i-1)*a,(j-1)*a)
+        end do
+      end do
+
+      delta = maxval(abs(phiprime - phi))
+      temp = phi
+      phi = phiprime 
+      phiprime = temp
+      
+    end do
+    call cpu_time(toc)
+
+    call write_timing(M, iter, delta, toc - tic)
+
+  end subroutine run
+
+end module poisson_m
+
+program poisson
+  use common_m, only : parse_args
+  use poisson_m, only: run
+  implicit none
+  integer :: M, N_ITER
+
+  call parse_args(M, N_ITER)
+
+  call run(M, N_iter)
+
+end program poisson
diff --git a/poisson2d/fortran_3.f90 b/poisson2d/fortran_3.f90
new file mode 100644
index 0000000..9b9153c
--- /dev/null
+++ b/poisson2d/fortran_3.f90
@@ -0,0 +1,74 @@
+module poisson_m
+  use common_m
+  implicit none
+
+contains
+
+  pure real(dp) function rho(x,y)
+    real(dp), intent(in) :: x,y
+    if (x > 0.6_dp .and. x < 0.8_dp .and. y > 0.6_dp .and. y<0.8) then
+      rho = 1.0_dp
+    else if (x> 0.2_dp .and. x<0.4_dp .and. y>0.2_dp .and. y<0.4_dp) then
+      rho = -1.0_dp
+    else
+      rho = 0.0_dp
+    end if
+  end function rho
+
+  subroutine run(M, N_ITER)
+    integer, intent(in) :: M, N_ITER
+
+    integer :: i,j, iter
+    real(dp) :: tic, toc
+    real(dp) :: delta, a2
+    real(dp) :: phiprime(M,M), phi(M,M), rhoa(M,M), temp(M,M)
+
+    ! Since arrays are allocated on the stack
+    call cpu_time(tic)
+    
+    phiprime(:,:) = 0.0_dp
+    phi(:,:) = 0.0_dp
+    do j=2, M-1
+      do i=2, M-1
+        rhoa(i,j) = rho((i-1)*a,(j-1)*a)
+      end do
+    end do
+
+    a2 = a**2
+
+    iter = 0
+    do while (iter < N_ITER )
+      
+      iter = iter + 1
+      do j=2, M-1
+        do i=2, M-1
+          phiprime(i,j) = (phi(i+1,j) + phi(i-1,j) + phi(i,j+1) + phi(i,j-1) + &
+              a2 / epsilon0 * rhoa(i,j))*0.25_dp
+        end do
+      end do
+
+      delta = maxval(abs(phiprime - phi))
+      temp = phi
+      phi = phiprime 
+      phiprime = temp
+      
+    end do
+    call cpu_time(toc)
+
+    call write_timing(M, iter, delta, toc - tic)
+
+  end subroutine run
+
+end module poisson_m
+
+program poisson
+  use common_m, only : parse_args
+  use poisson_m, only: run
+  implicit none
+  integer :: M, N_ITER
+
+  call parse_args(M, N_ITER)
+
+  call run(M, N_iter)
+
+end program poisson
diff --git a/poisson2d/fortran_4.f90 b/poisson2d/fortran_4.f90
new file mode 100644
index 0000000..4ecb6ed
--- /dev/null
+++ b/poisson2d/fortran_4.f90
@@ -0,0 +1,88 @@
+program poisson
+  use common_m
+
+  implicit none
+
+  integer :: M, N_ITER
+  integer :: i,j, iter
+  real(dp) :: tic, toc
+  real(dp) :: delta, a2
+
+  real(dp), allocatable :: rhoa(:,:)
+  real(dp), allocatable, target :: phiprime(:,:), phi(:,:)
+  real(dp), pointer :: p(:,:), pnew(:,:), tmp(:,:)
+
+  call parse_args(M, N_ITER)
+  
+  allocate(phi(M,M))
+  allocate(phiprime(M,M))
+  allocate(rhoa(M,M))
+
+  ! we don't time allocation/deallocation
+  call cpu_time(tic)
+
+  ! Fortran doesn't care too much about pow
+  ! since it figures out if the exponent is an integer or not
+  a2 = a*a / epsilon0
+  do j=2, M-1
+    do i=2, M-1
+      rhoa(i,j) = rho((i-1)*a,(j-1)*a) * a2
+    end do
+  end do
+
+  phi(:,:) = 0.0_dp
+  phiprime(:,:) = 0.0_dp
+  p => phi
+  pnew => phiprime
+  iter = 0
+  do while ( iter < N_ITER )
+    iter = iter + 1
+
+    call iterate(rhoa, p, pnew, delta)
+
+    ! Easer to swap pointer than copying stuff around
+    ! In fortran we could even use pointers
+    ! But we can just call
+    tmp => p
+    p => pnew
+    pnew => tmp
+
+  end do
+  
+  call cpu_time(toc)
+
+  deallocate(phi, phiprime, rhoa)
+
+  call write_timing(M, iter, delta, toc - tic)
+
+contains
+
+  subroutine iterate(rhoa, p, pnew, delta)
+    real(dp), intent(in) :: p(:,:), rhoa(:,:)
+    real(dp), intent(out) :: pnew(:,:)
+    real(dp), intent(out) :: delta
+
+    integer :: i, j
+
+    delta = 0._dp
+    do j=2, M-1
+      do i=2, M-1
+        pnew(i,j) = (p(i+1,j) + p(i-1,j) + p(i,j+1) + p(i,j-1) + rhoa(i,j)) * 0.25_dp
+        delta = max(delta, abs(pnew(i,j) - p(i,j)))
+      end do
+    end do
+
+  end subroutine iterate
+
+  pure real(dp) function rho(x,y)
+    real(dp), intent(in) :: x,y
+    if ( 0.6_dp < x .and. x < 0.8_dp .and. 0.6_dp < y .and. y < 0.8_dp ) then
+      rho = 1.0_dp
+    else if ( 0.2_dp < x .and. x < 0.4_dp .and. 0.2_dp < y .and. y < 0.4_dp ) then
+      rho = -1.0_dp
+    else
+      rho = 0.0_dp
+    end if
+  end function rho
+
+end program poisson
diff --git a/poisson2d/fortran_common.f90 b/poisson2d/fortran_common.f90
new file mode 100644
index 0000000..eaf46fd
--- /dev/null
+++ b/poisson2d/fortran_common.f90
@@ -0,0 +1,43 @@
+!< Module for handling common things such as input arguments and timings
+module common_m
+
+  implicit none
+
+  public
+  integer, parameter :: sp = kind(0.)
+  integer, parameter :: dp = kind(0.d0)
+
+  real(dp), parameter :: a = 0.01_dp
+  real(dp), parameter :: epsilon0 = 8.85E-12_dp
+
+contains
+
+  subroutine parse_args(N, N_ITER)
+    integer, intent(out) :: N, N_ITER
+
+    character(len=128) :: input
+    integer :: length
+
+    if ( command_argument_count() < 2 ) then
+      stop 'Not enough arguments [N N_ITER]'
+    end if
+
+    call get_command_argument(1,input,length)
+    read(input,*) N
+    call get_command_argument(2,input,length)
+    read(input,*) N_ITER
+
+  end subroutine parse_args
+
+  subroutine write_timing(N, iter, delta, timing)
+    integer, intent(in) :: N, iter
+    real(dp), intent(in) :: delta, timing
+
+    write(*,'(2(i16,tr1),2(e22.15,tr1))') N, iter, delta, timing/real(iter,dp)
+
+  end subroutine write_timing
+
+end module common_m
+  
+
+  
diff --git a/poisson2d/naive.c b/poisson2d/naive.c
deleted file mode 100644
index 9d72f22..0000000
--- a/poisson2d/naive.c
+++ /dev/null
@@ -1,90 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <time.h>
-#define M 300
-void swap(double (*a)[M], double (*b)[M], int n)
-{
-  double swp;
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      swp = a[i][j];
-      a[i][j] = b[i][j];
-      b[i][j] = swp;
-    }
-  }
-}
-double mmax(double (*phi)[M], double (*phip)[M], int n)
-{
-  double max = 0.0;
-  double diff = 0.0;
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      diff = fabs(phi[i][j]-phip[i][j]);
-      if (diff > max)
-        max = diff;
-    }
-  }
-  return max;
-}
-double rho(double x, double y)
-{
-  double s1 = 0.6;
-  double e1 = 0.8;
-	double s2 = 0.2;
-	double e2 = 0.4;
-
-	if (x > s1 && x < e1 && y > s1 && y < e1) {
-		return 1.0;
-	} else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
-		return -1.0;
-	} else {
-		return 0.0;
-	}
-}
-void run(double toler, double a)
-{
-  double epsilon0 = 8.85e-12;
-  double a2;
-
-  double (*phi)[M];
-  double (*phip)[M]; 
-  double (*rhoa)[M]; 
-  phi = malloc(sizeof(double[M][M]));
-  phip = malloc(sizeof(double[M][M]));
-  rhoa = malloc(sizeof(double[M][M]));
-
-  int iter = 0;
-
-  memset(phip, 0, sizeof(phip[0][0])*M*M);
-  memset(phi, 0, sizeof(phi[0][0])*M*M);
-  
-  double delta = 1.0;
-  a2 = pow(a,2.0);
-  while (delta > toler) {
-    iter += 1;
-    
-    for (int i=1; i < M-1; i++) {
-      for (int j=1; j < M-1; j++) {
-        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
-                      phi[i][j+1] + phi[i][j-1])/4.0 +
-                      a2/(4.0 * epsilon0)*rho(i*a,j*a);
-      }
-    }
-    delta = mmax(phi, phip, M);
-    swap(phi, phip, M);
-
-  }
-  printf("iters %d", iter);
-
-}
-
-int main(int argc, char *argv[])
-{
-  clock_t start = clock();
-  run(1e-6, 0.01);
-  clock_t end = clock();
-  double total = ((double)(end - start)) / CLOCKS_PER_SEC;
-  printf("Execution time: %f\n",total);
-}
diff --git a/poisson2d/naive.f90 b/poisson2d/naive.f90
deleted file mode 100644
index 3c34692..0000000
--- a/poisson2d/naive.f90
+++ /dev/null
@@ -1,51 +0,0 @@
-module rhofunc
-implicit none
-public
-integer, parameter :: dp=kind(0.d0)
-contains
-    pure real(dp) function rho(x,y)
-        real(dp), intent(in) :: x,y
-        if (x > 0.6_dp .and. x < 0.8_dp .and. y > 0.6_dp .and. y<0.8) then
-            rho = 1.0_dp
-        else if (x> 0.2_dp .and. x<0.4_dp .and. y>0.2_dp .and. y<0.4_dp) then
-            rho = -1.0_dp
-        else
-            rho = 0.0_dp
-        end if
-    end function
-
-end module
-
-program poisson
-use rhofunc, only: rho
-implicit none
-integer, parameter :: dp=kind(0.d0), M=300
-integer            :: i,j, iter
-real(dp),parameter :: epsilon0=8.85E-12_dp, target=1E-6_dp, a=0.01
-real(dp)           :: delta, b, e, phiprime(M,M), phi(M,M), a2, rhoarr(M,M), temp(M,M)
-
-
-delta = 1.0_dp
-iter = 0
-call cpu_time(b)
-phiprime(:,:) = 0.0_dp
-phi(:,:) = 0.0_dp
-
-do while (delta > target )
-    iter = iter + 1
-    a2 = a**2.0_dp
-    do i=2, M-1
-        do j=2, M-1
-            phiprime(i,j) = (phi(i+1,j) + phi(i-1,j) + phi(i,j+1) + phi(i,j-1))/4.0_dp &
-            + a2/4.0_dp/epsilon0*rho(i*a,j*a)
-        end do
-    end do
-    delta = maxval(abs(phiprime - phi))
-    temp = phi
-    phi = phiprime 
-    phiprime = temp
-
-end do
-call cpu_time(e)
-print *, e-b, iter
-end program
\ No newline at end of file
diff --git a/poisson2d/optimized.c b/poisson2d/optimized.c
deleted file mode 100644
index 0dddfec..0000000
--- a/poisson2d/optimized.c
+++ /dev/null
@@ -1,92 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <time.h>
-#define M 300
-void swap(double (*a)[M], double (*b)[M], int n)
-{
-  double swp;
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      a[i][j] = b[i][j];
-    }
-  }
-}
-double mmax(double (*phi)[M], double (*phip)[M], int n)
-{
-  double max = 0.0;
-  double diff = 0.0;
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      diff = fabs(phi[i][j]-phip[i][j]);
-      if (diff > max)
-        max = diff;
-    }
-  }
-  return max;
-}
-double rho(double x, double y)
-{
-  double s1 = 0.6;
-  double e1 = 0.8;
-	double s2 = 0.2;
-	double e2 = 0.4;
-
-	if (x > s1 && x < e1 && y > s1 && y < e1) {
-		return 1.0;
-	} else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
-		return -1.0;
-	} else {
-		return 0.0;
-	}
-}
-void run(double toler, double a)
-{
-  double epsilon0 = 8.85e-12;
-  double a2;
-
-  double (*phi)[M];
-  double (*phip)[M]; 
-  double (*rhoa)[M]; 
-  phi = malloc(sizeof(double[M][M]));
-  phip = malloc(sizeof(double[M][M]));
-  rhoa = malloc(sizeof(double[M][M]));
-
-  int iter = 0;
-
-  memset(phip, 0, sizeof(phip[0][0])*M*M);
-  memset(phi, 0, sizeof(phi[0][0])*M*M);
-  for (int i=1; i<M-1; i++)  {
-    for (int j=1; j<M-1; j++) {
-      rhoa[i][j] =  rho(i*a,j*a);
-    }
-  }
-  double delta = 1.0;
-  a2 = pow(a,2.0);
-  while (delta > toler) {
-    iter += 1;
-    
-    for (int i=1; i < M-1; i++) {
-      for (int j=1; j < M-1; j++) {
-        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
-                      phi[i][j+1] + phi[i][j-1])/4.0 +
-                      a2/(4.0 * epsilon0)*rhoa[i][j];
-      }
-    }
-    delta = mmax(phi, phip, M);
-    swap(phi, phip, M);
-
-  }
-  printf("iters %d", iter);
-
-}
-
-int main(int argc, char *argv[])
-{
-  clock_t start = clock();
-  run(1e-6, 0.01);
-  clock_t end = clock();
-  double total = ((double)(end - start)) / CLOCKS_PER_SEC;
-  printf("Execution time: %f\n",total);
-}
diff --git a/poisson2d/optimized.f90 b/poisson2d/optimized.f90
deleted file mode 100644
index 3bc1045..0000000
--- a/poisson2d/optimized.f90
+++ /dev/null
@@ -1,55 +0,0 @@
-module rhofunc
-implicit none
-public
-integer, parameter :: dp=kind(0.d0)
-contains
-    pure real(dp) function rho(x,y)
-        real(dp), intent(in) :: x,y
-        if (x > 0.6_dp .and. x < 0.8_dp .and. y > 0.6_dp .and. y<0.8) then
-            rho = 1.0_dp
-        else if (x> 0.2_dp .and. x<0.4_dp .and. y>0.2_dp .and. y<0.4_dp) then
-            rho = -1.0_dp
-        else
-            rho = 0.0_dp
-        end if
-    end function
-
-end module
-
-program poisson
-use rhofunc, only: rho
-implicit none
-integer, parameter :: dp=kind(0.d0), M=300
-integer            :: i,j, iter
-real(dp),parameter :: epsilon0=8.85E-12_dp, target=1E-6_dp, a=0.01
-real(dp)           :: delta, b, e, phiprime(M,M), phi(M,M), a2, rhoarr(M,M)
-
-
-delta = 1.0_dp
-iter = 0
-call cpu_time(b)
-phiprime(:,:) = 0.0_dp
-phi(:,:) = 0.0_dp
-do i=1, M
-    do j=1, M
-        rhoarr(i,j) = rho(i*a,j*a)
-    end do
-end do
-
-do while (delta > target )
-    iter = iter + 1
-    a2 = a**2.0_dp
-    do i=2, M-1
-        do j=2, M-1
-            phiprime(i,j) = (phi(i+1,j) + phi(i-1,j) + phi(i,j+1) + phi(i,j-1))/4.0_dp &
-            + a2/4.0_dp/epsilon0*rhoarr(i,j)
-        end do
-    end do
-    delta = maxval(abs(phiprime - phi))
-    phi = phiprime 
-
-
-end do
-call cpu_time(e)
-print *, e-b, iter
-end program
\ No newline at end of file
diff --git a/poisson2d/run.sh b/poisson2d/run.sh
new file mode 100644
index 0000000..234dae4
--- /dev/null
+++ b/poisson2d/run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# This may not apply to your distribution
+# change or comment out at will, but remark
+# the stack-size.
+ulimit -s unlimited
+
+OPTS="-O3 -march=native"
+FC=gfortran
+CC=gcc
+
+$CC $OPTS -c c_common.c
+for c in 1 2 3 4_1d 4_2d
+do
+    echo "Compiling and running: c_${c}"
+    $CC $OPTS -o c_${c} c_${c}.c c_common.o
+
+    # Now run for
+    {
+	for N in 100 200 300 ; do
+	    ./c_${c} $N 10000
+	done
+	for N in 400 500 600; do
+	    ./c_${c} $N 4000
+	done
+    } > c_${c}.data
+done
+
+$FC $OPTS -c fortran_common.f90
+for f in 2 3 4
+do
+    echo "Compiling and running: fortran_${f}"
+    $FC $OPTS -o fortran_${f} fortran_${f}.f90 fortran_common.o
+
+    # Now run for
+    {
+	for N in 100 200 300 ; do
+	    ./fortran_${f} $N 10000
+	done
+	for N in 400 500 600; do
+	    ./fortran_${f} $N 4000
+	done
+    } > fortran_${f}.data
+done