From 33d5b68c1d1aa8246494f94e0cc0e2af46ccf80a Mon Sep 17 00:00:00 2001
From: Nick Papior <nickpapior@gmail.com>
Date: Fri, 2 Jul 2021 21:43:15 +0200
Subject: [PATCH] mnt: re-arranged codes and added many new versions

Benchmarks should not rely on convergence, rather a fixed
number of iterations is more appropriate.

The problem of inconsistent iterations between the C and Fortran versions
was also fixed because of an indexing error when calculating the cell
coordinates (C is 0-based, F is 1-based). So they now run the same number
of iterations, regardless of M.
Note, this issue still persists in 2 and 3, but not in 4, I can't seem
to figure out why.

Moved timings about to make them more comparable.

All different benchmarks write out the same format output to more easily
compare them.

The later versions are more realistic since they use allocate/malloc
which is more appropriate.

Heavily updated the README.md file to explain what is going on.
---
 poisson2d/README.md          |  42 ++++++++++--
 poisson2d/c_1.c              | 110 ++++++++++++++++++++++++++++++
 poisson2d/c_2.c              |  98 +++++++++++++++++++++++++++
 poisson2d/c_3.c              | 104 ++++++++++++++++++++++++++++
 poisson2d/c_4_1d.c           |  98 +++++++++++++++++++++++++++
 poisson2d/c_4_2d.c           | 127 +++++++++++++++++++++++++++++++++++
 poisson2d/c_common.c         |  17 +++++
 poisson2d/c_common.h         |  10 +++
 poisson2d/fortran_2.f90      |  67 ++++++++++++++++++
 poisson2d/fortran_3.f90      |  74 ++++++++++++++++++++
 poisson2d/fortran_4.f90      |  88 ++++++++++++++++++++++++
 poisson2d/fortran_common.f90 |  43 ++++++++++++
 poisson2d/naive.c            |  90 -------------------------
 poisson2d/naive.f90          |  51 --------------
 poisson2d/optimized.c        |  92 -------------------------
 poisson2d/optimized.f90      |  55 ---------------
 poisson2d/run.sh             |  44 ++++++++++++
 17 files changed, 916 insertions(+), 294 deletions(-)
 create mode 100644 poisson2d/c_1.c
 create mode 100644 poisson2d/c_2.c
 create mode 100644 poisson2d/c_3.c
 create mode 100644 poisson2d/c_4_1d.c
 create mode 100644 poisson2d/c_4_2d.c
 create mode 100644 poisson2d/c_common.c
 create mode 100644 poisson2d/c_common.h
 create mode 100644 poisson2d/fortran_2.f90
 create mode 100644 poisson2d/fortran_3.f90
 create mode 100644 poisson2d/fortran_4.f90
 create mode 100644 poisson2d/fortran_common.f90
 delete mode 100644 poisson2d/naive.c
 delete mode 100644 poisson2d/naive.f90
 delete mode 100644 poisson2d/optimized.c
 delete mode 100644 poisson2d/optimized.f90
 create mode 100644 poisson2d/run.sh

diff --git a/poisson2d/README.md b/poisson2d/README.md
index 14ae309..fa2d742 100644
--- a/poisson2d/README.md
+++ b/poisson2d/README.md
@@ -3,14 +3,48 @@
 This solver uses a simple Jacobi iteration to solve a Poisson equation. For details, see, for example,
 "Computational Physics" by Mark Newman, Chap 9. Compiler commands were:
 
-```gfortran sourcefile.f95 -Ofast -o program```
+```gfortran sourcefile.f95 -O3 -o program```
 
-```gcc sourcefile.c -Ofast -o program```
+```gcc sourcefile.c -O3 -o program```
 
 For Cython module (import it to run):
 
 ```python setup.py build_ext --inplace```
 
+
+There are various implementations using various strategies.
+All codes are named with a 1-1 correspondance, i.e. `c_i_*` has the same strategy as `fortran_i_*` files.
+The suffixed `_*` is only used if there are multiple alternatives that utilizes the same strategy between
+the different languages. For example the `c_4_1d` and `c_4_2d` are both using the same strategy for
+solving the Poisson equation, while the former uses a 1D array for the data while the latter uses a 2D
+array with some slight overhead of another pointer level.
+
+The two files `c_common.c` and `fortran_common.f90` are baseline programs used for printing
+out data and for parsing the command line arguments.
+
+All executables accept 2 arguments:
+
+    # N = 100
+    # N_ITER = 2000
+    ./c_1 100 2000
+    100             2000  5.658391137993336e+04  1.448613750000000e-03
+
+which will use a 2D array of size `100x100` and run for 2000 iterations.
+The output line consists of 4 values, 1) N, 2) N_ITER, 3) max-difference, 4) time per iteration.
+
+Some thoughts on the code at https://runningcrocodile.fi/articles/pythonperformance.html. Also, there was
+discussion about this problem at Discourse: https://fortran-lang.discourse.group/t/performance-c-vs-fortran/1461 
+with important contributions from numerous users (Mason, Beliavsky, septc, implicitall, nncarlson, han190 and pmk).
+
+The entire code base has been re-written to allow command-line arguments allowing to sample different
+matrix sizes with the same executable (user zerothi).
+
+To run a preset benchmark suite, simply execute `bash run.sh` which will run the currently
+implemented benchmarks with dimensions up to 600.
+
+
+## Old timings
+
 The grid is a 2-dimensional array of size MxM. The timings for different values of M are, in seconds:
 
 | Language            | M=100           | M=200                 | M=300                  |
@@ -25,7 +59,3 @@ The grid is a 2-dimensional array of size MxM. The timings for different values
 * For all of these results, the amount of iterations performed by the respective codes was approximately
 the same, with the exception of the 100x100 grid C codes, which did nearly a double amount of iterations
 compared to the rest, for some reason. The timings are on AMD Ryzen 5 3600 @3.6GHz, using WSL2 on Windows 10.
-
-Some thoughts on the code at https://runningcrocodile.fi/articles/pythonperformance.html . Also, there was
-discussion about this problem at Discourse: https://fortran-lang.discourse.group/t/performance-c-vs-fortran/1461 
-with important contributions from numerous users (Mason, Beliavsky, septc, implicitall, nncarlson, han190 and pmk)
diff --git a/poisson2d/c_1.c b/poisson2d/c_1.c
new file mode 100644
index 0000000..8430ee1
--- /dev/null
+++ b/poisson2d/c_1.c
@@ -0,0 +1,110 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+// COMMENT
+// I know, this is extremely bad practice.
+// But to accommodate argument sizes for the arguments
+// I felt this was ok.
+// It makes it much more versatile when testing
+// END COMMENT
+void swap(int M, double (*a)[M], double (*b)[M])
+{
+  double swp;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      swp = a[i][j];
+      a[i][j] = b[i][j];
+      b[i][j] = swp;
+    }
+  }
+}
+
+// See comment above
+double mmax(int M, double (*phi)[M], double (*phip)[M]) {
+  double max = 0.0;
+  double diff = 0.0;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      diff = fabs(phi[i][j]-phip[i][j]);
+      if (diff > max)
+        max = diff;
+    }
+  }
+  return max;
+}
+
+double rho(double x, double y) {
+  double s1 = 0.6;
+  double e1 = 0.8;
+  double s2 = 0.2;
+  double e2 = 0.4;
+
+  if (x > s1 && x < e1 && y > s1 && y < e1) {
+    return 1.0;
+  } else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+clock_t run(int M, int N_ITER) {
+
+  // COMMENT:
+  // I wouldn't even imagine any novice in C would
+  // do something like this. It really makes no sense :)
+  // END OF COMMENT
+  double (*phi)[M];
+  double (*phip)[M]; 
+  double (*rhoa)[M]; 
+  phi = malloc(sizeof(double[M][M]));
+  phip = malloc(sizeof(double[M][M]));
+  rhoa = malloc(sizeof(double[M][M]));
+
+  clock_t tic = clock();
+
+  // initialize matrices
+  memset(phip, 0, sizeof(phip[0][0])*M*M);
+  memset(phi, 0, sizeof(phi[0][0])*M*M);
+  
+  double delta = 1.0;
+  double a2 = pow(SPACE,2.0);
+  
+  int iter = 0;
+  while (iter < N_ITER ) {
+    iter += 1;
+    
+    for (int i=1; i < M-1; i++) {
+      for (int j=1; j < M-1; j++) {
+        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
+                      phi[i][j+1] + phi[i][j-1])/4.0 +
+	  a2/(4.0 * EPSILON0)*rho(i*SPACE,j*SPACE);
+      }
+    }
+    delta = mmax(M, phi, phip);
+    swap(M, phi, phip);
+    
+  }
+
+  clock_t toc = clock();
+
+  free(phi);
+  free(phip);
+  free(rhoa);
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+
+int main(int argc, char *argv[]) {
+
+  int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_2.c b/poisson2d/c_2.c
new file mode 100644
index 0000000..05a0c42
--- /dev/null
+++ b/poisson2d/c_2.c
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+// COMMENT
+// I know, this is extremely bad practice.
+// But to accommodate argument sizes for the arguments
+// I felt this was ok.
+// It makes it much more versatile when testing
+// END COMMENT
+void swap(int M, double a[M][M], double b[M][M]) {
+  double swp;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      swp = a[i][j];
+      a[i][j] = b[i][j];
+      b[i][j] = swp;
+    }
+  }
+}
+
+// See comment above
+double mmax(int M, double a[M][M], double b[M][M]) {
+  double max = 0.0;
+  double diff = 0.0;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      diff = fabs(a[i][j]-b[i][j]);
+      if (diff > max)
+        max = diff;
+    }
+  }
+  return max;
+}
+
+double rho(double x, double y) {
+  double s1 = 0.6;
+  double e1 = 0.8;
+  double s2 = 0.2;
+  double e2 = 0.4;
+
+  if (x > s1 && x < e1 && y > s1 && y < e1) {
+    return 1.0;
+  } else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+void run(int M, int N_ITER) {
+
+  double phi[M][M];
+  double phip[M][M]; 
+
+  clock_t tic = clock();
+  int iter = 0;
+
+  // initialize matrices
+  memset(&phip[0][0], 0, sizeof(double)*M*M);
+  memset(&phi[0][0], 0, sizeof(double)*M*M);
+  
+  double delta;
+  double a2 = pow(SPACE,2.0);
+  
+  while (iter < N_ITER ) {
+    iter += 1;
+    
+    for (int i=1; i < M-1; i++) {
+      for (int j=1; j < M-1; j++) {
+        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
+                      phi[i][j+1] + phi[i][j-1])/4.0 +
+	  a2/(4.0 * EPSILON0)*rho(i*SPACE,j*SPACE);
+      }
+    }
+    delta = mmax(M, phi, phip);
+    swap(M, phi, phip);
+    
+  }
+
+  clock_t toc = clock();
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+
+int main(int argc, char *argv[]) {
+
+  int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_3.c b/poisson2d/c_3.c
new file mode 100644
index 0000000..8806cd7
--- /dev/null
+++ b/poisson2d/c_3.c
@@ -0,0 +1,104 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+// COMMENT
+// I know, this is extremely bad practice.
+// But to accommodate argument sizes for the arguments
+// I felt this was ok.
+// It makes it much more versatile when testing
+// END COMMENT
+void swap(int M, double a[M][M], double b[M][M]) {
+  double swp;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      swp = a[i][j];
+      a[i][j] = b[i][j];
+      b[i][j] = swp;
+    }
+  }
+}
+
+// See comment above
+double mmax(int M, double a[M][M], double b[M][M]) {
+  double max = 0.0;
+  double diff = 0.0;
+  for (int i = 0; i < M; i++) {
+    for (int j = 0; j < M; j++) {
+      diff = fabs(a[i][j]-b[i][j]);
+      if (diff > max)
+        max = diff;
+    }
+  }
+  return max;
+}
+
+double rho(double x, double y) {
+  double s1 = 0.6;
+  double e1 = 0.8;
+  double s2 = 0.2;
+  double e2 = 0.4;
+
+  if (x > s1 && x < e1 && y > s1 && y < e1) {
+    return 1.0;
+  } else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+void run(int M, int N_ITER) {
+
+  double phi[M][M];
+  double phip[M][M]; 
+  double rhoa[M][M]; 
+
+  clock_t tic = clock();
+  int iter = 0;
+
+  // initialize matrices
+  memset(&phip[0][0], 0, sizeof(double)*M*M);
+  memset(&phi[0][0], 0, sizeof(double)*M*M);
+  for (int i=1; i<M-1; i++)  {
+    for (int j=1; j<M-1; j++) {
+      rhoa[i][j] =  rho(i*SPACE,j*SPACE);
+    }
+  }
+
+  double delta;
+  double a2 = pow(SPACE,2.0);
+  
+  while (iter < N_ITER ) {
+    iter += 1;
+    
+    for (int i=1; i < M-1; i++) {
+      for (int j=1; j < M-1; j++) {
+        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
+                      phi[i][j+1] + phi[i][j-1])/4.0 +
+	  a2/(4.0 * EPSILON0)*rhoa[i][j];
+      }
+    }
+    delta = mmax(M, phi, phip);
+    swap(M, phi, phip);
+    
+  }
+
+  clock_t toc = clock();
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+
+int main(int argc, char *argv[]) {
+
+  int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_4_1d.c b/poisson2d/c_4_1d.c
new file mode 100644
index 0000000..26e839c
--- /dev/null
+++ b/poisson2d/c_4_1d.c
@@ -0,0 +1,98 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+#define IDX(i,j) (i)*M+j
+
+
+double rho(const double x, const double y) {
+  const double s1 = 0.6;
+  const double e1 = 0.8;
+  const double s2 = 0.2;
+  const double e2 = 0.4;
+
+  if (s1 < x && x < e1 && s1 < y && y < e1) {
+    return 1.0;
+  } else if ( s2 < x && x < e2 && s2 < y && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+double iterate(int M, double *restrict phi, double *restrict phinew, double *restrict rhoa) {
+  double delta = 0, err;
+  for (int i=1; i < M-1; i++) {
+    for (int j=1; j < M-1; j++) {
+      phinew[IDX(i,j)] = (phi[IDX(i+1,j)] + phi[IDX(i-1,j)] + phi[IDX(i,j+1)] + phi[IDX(i,j-1)] + rhoa[IDX(i,j)])*0.25;
+      err = fabs(phinew[IDX(i,j)] - phi[IDX(i,j)]);
+      if ( err > delta ) delta = err;
+    }
+  }
+  return delta;
+}
+
+void init_rho(int M, double *restrict rhoa, const double epsilon, const double a) {
+  const double a2 = a * a / epsilon;
+  for (int i=1; i<M-1; i++)  {
+    for (int j=1; j<M-1; j++) {
+      rhoa[IDX(i,j)] = rho(i*a,j*a) * a2;
+    }
+  }
+}
+
+
+void run(int M, int N_ITER) {
+
+  double *phi;
+  double *phip;
+  double *rhoa;
+  double *tmp;
+
+  // A real world program will definitely use malloc
+  phi = malloc(M*M*sizeof(double));
+  phip = malloc(M*M*sizeof(double));
+  rhoa = malloc(M*M*sizeof(double));
+
+  clock_t tic = clock();
+
+  memset(phi, 0, sizeof(double)*M*M);
+  memset(phip, 0, sizeof(double)*M*M);
+
+  // In C one tries to avoid using pow because
+  // it assumes floating point powers (integers are faster)
+  // So better do it directly
+  init_rho(M, rhoa, EPSILON0, SPACE);
+  
+  int iter = 0;
+  double delta;
+  while ( iter < N_ITER ) {
+    iter += 1;
+
+    delta = iterate(M, phi, phip, rhoa);
+
+    // swap pointers (no copies)
+    tmp = phi;
+    phi = phip;
+    phip = tmp;
+  }
+
+  clock_t toc = clock();
+
+  free(phi);
+  free(phip);
+  free(rhoa);
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+int main(int argc, char *argv[]) {
+ int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_4_2d.c b/poisson2d/c_4_2d.c
new file mode 100644
index 0000000..a6bbdd5
--- /dev/null
+++ b/poisson2d/c_4_2d.c
@@ -0,0 +1,127 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#include "c_common.h"
+
+double ** malloc_2d(int m, int n) {
+  if (m <= 0 || n <= 0)
+    return NULL;
+
+  // Allocate pointers for columns
+  double **array2D = malloc(m * sizeof(double *));
+  if (array2D == NULL) {
+    return NULL;
+  }
+
+  // Allocate actual data
+  array2D[0] = malloc(m * n * sizeof(double));
+  if (array2D[0] == NULL) {
+    free(array2D);
+    return NULL;
+  }
+
+  // assign pointers to make it 2D
+  for(int i = 1; i < m; i++) {
+    array2D[i] = array2D[0] + i * n ;
+  }
+
+  return array2D;
+}
+
+void free_2d(double **array2D) {
+  free(array2D[0]);
+  free(array2D);
+}
+
+
+
+double rho(const double x, const double y) {
+  const double s1 = 0.6;
+  const double e1 = 0.8;
+  const double s2 = 0.2;
+  const double e2 = 0.4;
+
+  if (s1 < x && x < e1 && s1 < y && y < e1) {
+    return 1.0;
+  } else if ( s2 < x && x < e2 && s2 < y && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+double iterate(int M, double *restrict*restrict phi, double *restrict*restrict phinew, double *restrict*restrict rhoa) {
+  double delta = 0, err;
+  for (int i=1; i < M-1; i++) {
+    for (int j=1; j < M-1; j++) {
+      phinew[i][j] = (phi[i+1][j] + phi[i-1][j] + phi[i][j+1] + phi[i][j-1] + rhoa[i][j])*0.25;
+      err = fabs(phinew[i][j] - phi[i][j]);
+      if ( err > delta ) delta = err;
+    }
+  }
+  return delta;
+}
+
+void init_rho(int M, double *restrict*restrict rhoa, const double epsilon, const double a) {
+  const double a2 = a * a / epsilon;
+  for (int i=1; i<M-1; i++)  {
+    for (int j=1; j<M-1; j++) {
+      rhoa[i][j] = rho(i*a,j*a) * a2;
+    }
+  }
+}
+
+
+void run(int M, int N_ITER) {
+
+  double **phi;
+  double **phip;
+  double **rhoa;
+  double **tmp;
+
+  // A real world program will definitely use malloc
+  phi = malloc_2d(M, M);
+  phip = malloc_2d(M, M);
+  rhoa = malloc_2d(M, M);
+
+  clock_t tic = clock();
+
+  memset(&phi[0][0], 0, sizeof(double)*M*M);
+  memset(&phip[0][0], 0, sizeof(double)*M*M);
+
+  // In C one tries to avoid using pow because
+  // it assumes floating point powers (integers are faster)
+  // So better do it directly
+  init_rho(M, rhoa, EPSILON0, SPACE);
+  
+  int iter = 0;
+  double delta;
+  while ( iter < N_ITER ) {
+    iter += 1;
+
+    delta = iterate(M, phi, phip, rhoa);
+
+    // swap pointers (no copies)
+    tmp = phi;
+    phi = phip;
+    phip = tmp;
+  }
+
+  clock_t toc = clock();
+
+  free_2d(phi);
+  free_2d(phip);
+  free_2d(rhoa);
+
+  write_timing(M, iter, delta, toc - tic);
+}
+
+int main(int argc, char *argv[]) {
+  int N, N_ITER;
+
+  // Retrieve arguments
+  parse_args(argc, argv, &N, &N_ITER);
+  run(N, N_ITER);
+}
diff --git a/poisson2d/c_common.c b/poisson2d/c_common.c
new file mode 100644
index 0000000..caa4335
--- /dev/null
+++ b/poisson2d/c_common.c
@@ -0,0 +1,17 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <time.h>
+
+void parse_args(int argc, char **argv, int *N, int *N_iter) {
+  if ( argc < 3 ) {
+    printf("Not enough arguments [N N_ITER]\n");
+  }
+
+  *N = atoi(argv[1]);
+  *N_iter = atoi(argv[2]);
+}
+
+void write_timing(int N, int iter, double delta, clock_t timing) {
+  printf("%16d %16d %22.15e %22.15e\n", N, iter, delta, ((double)timing)/CLOCKS_PER_SEC/iter);
+}
diff --git a/poisson2d/c_common.h b/poisson2d/c_common.h
new file mode 100644
index 0000000..374921d
--- /dev/null
+++ b/poisson2d/c_common.h
@@ -0,0 +1,10 @@
+#ifndef _COMMON_HEADER_
+#define _COMMON_HEADER_
+
+#define EPSILON0 8.85e-12
+#define SPACE 0.01
+
+
+void parse_args(int, char **, int *, int *);
+void write_timing(int, int, double, clock_t);
+#endif
diff --git a/poisson2d/fortran_2.f90 b/poisson2d/fortran_2.f90
new file mode 100644
index 0000000..22c1bcd
--- /dev/null
+++ b/poisson2d/fortran_2.f90
@@ -0,0 +1,67 @@
+module poisson_m
+  use common_m
+  implicit none
+
+contains
+
+  pure real(dp) function rho(x,y)
+    real(dp), intent(in) :: x,y
+    if (x > 0.6_dp .and. x < 0.8_dp .and. y > 0.6_dp .and. y<0.8) then
+      rho = 1.0_dp
+    else if (x> 0.2_dp .and. x<0.4_dp .and. y>0.2_dp .and. y<0.4_dp) then
+      rho = -1.0_dp
+    else
+      rho = 0.0_dp
+    end if
+  end function rho
+
+  subroutine run(M, N_ITER)
+    integer, intent(in) :: M, N_ITER
+
+    integer :: i,j, iter
+    real(dp) :: tic, toc
+    real(dp) :: delta, phiprime(M,M), phi(M,M), a2, temp(M,M)
+
+    ! Since arrays are allocated on the stack
+    call cpu_time(tic)
+    
+    phiprime(:,:) = 0.0_dp
+    phi(:,:) = 0.0_dp
+
+    a2 = a**2
+
+    iter = 0
+    do while (iter < N_ITER )
+      iter = iter + 1
+      do j=2, M-1
+        do i=2, M-1
+          phiprime(i,j) = (phi(i+1,j) + phi(i-1,j) + phi(i,j+1) + phi(i,j-1))/4.0_dp &
+              + a2/4.0_dp/epsilon0*rho((i-1)*a,(j-1)*a)
+        end do
+      end do
+
+      delta = maxval(abs(phiprime - phi))
+      temp = phi
+      phi = phiprime 
+      phiprime = temp
+      
+    end do
+    call cpu_time(toc)
+
+    call write_timing(M, iter, delta, toc - tic)
+
+  end subroutine run
+
+end module poisson_m
+
+program poisson
+  use common_m, only : parse_args
+  use poisson_m, only: run
+  implicit none
+  integer :: M, N_ITER
+
+  call parse_args(M, N_ITER)
+
+  call run(M, N_iter)
+
+end program poisson
diff --git a/poisson2d/fortran_3.f90 b/poisson2d/fortran_3.f90
new file mode 100644
index 0000000..9b9153c
--- /dev/null
+++ b/poisson2d/fortran_3.f90
@@ -0,0 +1,74 @@
+module poisson_m
+  use common_m
+  implicit none
+
+contains
+
+  pure real(dp) function rho(x,y)
+    real(dp), intent(in) :: x,y
+    if (x > 0.6_dp .and. x < 0.8_dp .and. y > 0.6_dp .and. y<0.8) then
+      rho = 1.0_dp
+    else if (x> 0.2_dp .and. x<0.4_dp .and. y>0.2_dp .and. y<0.4_dp) then
+      rho = -1.0_dp
+    else
+      rho = 0.0_dp
+    end if
+  end function rho
+
+  subroutine run(M, N_ITER)
+    integer, intent(in) :: M, N_ITER
+
+    integer :: i,j, iter
+    real(dp) :: tic, toc
+    real(dp) :: delta, a2
+    real(dp) :: phiprime(M,M), phi(M,M), rhoa(M,M), temp(M,M)
+
+    ! Since arrays are allocated on the stack
+    call cpu_time(tic)
+    
+    phiprime(:,:) = 0.0_dp
+    phi(:,:) = 0.0_dp
+    do j=2, M-1
+      do i=2, M-1
+        rhoa(i,j) = rho((i-1)*a,(j-1)*a)
+      end do
+    end do
+
+    a2 = a**2
+
+    iter = 0
+    do while (iter < N_ITER )
+      
+      iter = iter + 1
+      do j=2, M-1
+        do i=2, M-1
+          phiprime(i,j) = (phi(i+1,j) + phi(i-1,j) + phi(i,j+1) + phi(i,j-1) + &
+              a2 / epsilon0 * rhoa(i,j))*0.25_dp
+        end do
+      end do
+
+      delta = maxval(abs(phiprime - phi))
+      temp = phi
+      phi = phiprime 
+      phiprime = temp
+      
+    end do
+    call cpu_time(toc)
+
+    call write_timing(M, iter, delta, toc - tic)
+
+  end subroutine run
+
+end module poisson_m
+
+program poisson
+  use common_m, only : parse_args
+  use poisson_m, only: run
+  implicit none
+  integer :: M, N_ITER
+
+  call parse_args(M, N_ITER)
+
+  call run(M, N_iter)
+
+end program poisson
diff --git a/poisson2d/fortran_4.f90 b/poisson2d/fortran_4.f90
new file mode 100644
index 0000000..4ecb6ed
--- /dev/null
+++ b/poisson2d/fortran_4.f90
@@ -0,0 +1,88 @@
+program poisson
+  use common_m
+
+  implicit none
+
+  integer :: M, N_ITER
+  integer :: i,j, iter
+  real(dp) :: tic, toc
+  real(dp) :: delta, a2
+
+  real(dp), allocatable :: rhoa(:,:)
+  real(dp), allocatable, target :: phiprime(:,:), phi(:,:)
+  real(dp), pointer :: p(:,:), pnew(:,:), tmp(:,:)
+
+  call parse_args(M, N_ITER)
+  
+  allocate(phi(M,M))
+  allocate(phiprime(M,M))
+  allocate(rhoa(M,M))
+
+  ! we don't time allocation/deallocation
+  call cpu_time(tic)
+
+  ! Fortran doesn't care too much about pow
+  ! since it figures out if the exponent is an integer or not
+  a2 = a*a / epsilon0
+  do j=2, M-1
+    do i=2, M-1
+      rhoa(i,j) = rho((i-1)*a,(j-1)*a) * a2
+    end do
+  end do
+
+  phi(:,:) = 0.0_dp
+  phiprime(:,:) = 0.0_dp
+  p => phi
+  pnew => phiprime
+  iter = 0
+  do while ( iter < N_ITER )
+    iter = iter + 1
+
+    call iterate(rhoa, p, pnew, delta)
+
+    ! Easer to swap pointer than copying stuff around
+    ! In fortran we could even use pointers
+    ! But we can just call
+    tmp => p
+    p => pnew
+    pnew => tmp
+
+  end do
+  
+  call cpu_time(toc)
+
+  deallocate(phi, phiprime, rhoa)
+
+  call write_timing(M, iter, delta, toc - tic)
+
+contains
+
+  subroutine iterate(rhoa, p, pnew, delta)
+    real(dp), intent(in) :: p(:,:), rhoa(:,:)
+    real(dp), intent(out) :: pnew(:,:)
+    real(dp), intent(out) :: delta
+
+    integer :: i, j
+
+    delta = 0._dp
+    do j=2, M-1
+      do i=2, M-1
+        pnew(i,j) = (p(i+1,j) + p(i-1,j) + p(i,j+1) + p(i,j-1) + rhoa(i,j)) * 0.25_dp
+        delta = max(delta, abs(pnew(i,j) - p(i,j)))
+      end do
+    end do
+
+  end subroutine iterate
+
+  pure real(dp) function rho(x,y)
+    real(dp), intent(in) :: x,y
+    if ( 0.6_dp < x .and. x < 0.8_dp .and. 0.6_dp < y .and. y < 0.8_dp ) then
+      rho = 1.0_dp
+    else if ( 0.2_dp < x .and. x < 0.4_dp .and. 0.2_dp < y .and. y < 0.4_dp ) then
+      rho = -1.0_dp
+    else
+      rho = 0.0_dp
+    end if
+  end function rho
+
+end program poisson
diff --git a/poisson2d/fortran_common.f90 b/poisson2d/fortran_common.f90
new file mode 100644
index 0000000..eaf46fd
--- /dev/null
+++ b/poisson2d/fortran_common.f90
@@ -0,0 +1,43 @@
+!< Module for handling common things such as input arguments and timings
+module common_m
+
+  implicit none
+
+  public
+  integer, parameter :: sp = kind(0.)
+  integer, parameter :: dp = kind(0.d0)
+
+  real(dp), parameter :: a = 0.01_dp
+  real(dp), parameter :: epsilon0 = 8.85E-12_dp
+
+contains
+
+  subroutine parse_args(N, N_ITER)
+    integer, intent(out) :: N, N_ITER
+
+    character(len=128) :: input
+    integer :: length
+
+    if ( command_argument_count() < 2 ) then
+      stop 'Not enough arguments [N N_ITER]'
+    end if
+
+    call get_command_argument(1,input,length)
+    read(input,*) N
+    call get_command_argument(2,input,length)
+    read(input,*) N_ITER
+
+  end subroutine parse_args
+
+  subroutine write_timing(N, iter, delta, timing)
+    integer, intent(in) :: N, iter
+    real(dp), intent(in) :: delta, timing
+
+    write(*,'(2(i16,tr1),2(e22.15,tr1))') N, iter, delta, timing/real(iter,dp)
+
+  end subroutine write_timing
+
+end module common_m
+  
+
+  
diff --git a/poisson2d/naive.c b/poisson2d/naive.c
deleted file mode 100644
index 9d72f22..0000000
--- a/poisson2d/naive.c
+++ /dev/null
@@ -1,90 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <time.h>
-#define M 300
-void swap(double (*a)[M], double (*b)[M], int n)
-{
-  double swp;
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      swp = a[i][j];
-      a[i][j] = b[i][j];
-      b[i][j] = swp;
-    }
-  }
-}
-double mmax(double (*phi)[M], double (*phip)[M], int n)
-{
-  double max = 0.0;
-  double diff = 0.0;
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      diff = fabs(phi[i][j]-phip[i][j]);
-      if (diff > max)
-        max = diff;
-    }
-  }
-  return max;
-}
-double rho(double x, double y)
-{
-  double s1 = 0.6;
-  double e1 = 0.8;
-	double s2 = 0.2;
-	double e2 = 0.4;
-
-	if (x > s1 && x < e1 && y > s1 && y < e1) {
-		return 1.0;
-	} else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
-		return -1.0;
-	} else {
-		return 0.0;
-	}
-}
-void run(double toler, double a)
-{
-  double epsilon0 = 8.85e-12;
-  double a2;
-
-  double (*phi)[M];
-  double (*phip)[M]; 
-  double (*rhoa)[M]; 
-  phi = malloc(sizeof(double[M][M]));
-  phip = malloc(sizeof(double[M][M]));
-  rhoa = malloc(sizeof(double[M][M]));
-
-  int iter = 0;
-
-  memset(phip, 0, sizeof(phip[0][0])*M*M);
-  memset(phi, 0, sizeof(phi[0][0])*M*M);
-  
-  double delta = 1.0;
-  a2 = pow(a,2.0);
-  while (delta > toler) {
-    iter += 1;
-    
-    for (int i=1; i < M-1; i++) {
-      for (int j=1; j < M-1; j++) {
-        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
-                      phi[i][j+1] + phi[i][j-1])/4.0 +
-                      a2/(4.0 * epsilon0)*rho(i*a,j*a);
-      }
-    }
-    delta = mmax(phi, phip, M);
-    swap(phi, phip, M);
-
-  }
-  printf("iters %d", iter);
-
-}
-
-int main(int argc, char *argv[])
-{
-  clock_t start = clock();
-  run(1e-6, 0.01);
-  clock_t end = clock();
-  double total = ((double)(end - start)) / CLOCKS_PER_SEC;
-  printf("Execution time: %f\n",total);
-}
diff --git a/poisson2d/naive.f90 b/poisson2d/naive.f90
deleted file mode 100644
index 3c34692..0000000
--- a/poisson2d/naive.f90
+++ /dev/null
@@ -1,51 +0,0 @@
-module rhofunc
-implicit none
-public
-integer, parameter :: dp=kind(0.d0)
-contains
-    pure real(dp) function rho(x,y)
-        real(dp), intent(in) :: x,y
-        if (x > 0.6_dp .and. x < 0.8_dp .and. y > 0.6_dp .and. y<0.8) then
-            rho = 1.0_dp
-        else if (x> 0.2_dp .and. x<0.4_dp .and. y>0.2_dp .and. y<0.4_dp) then
-            rho = -1.0_dp
-        else
-            rho = 0.0_dp
-        end if
-    end function
-
-end module
-
-program poisson
-use rhofunc, only: rho
-implicit none
-integer, parameter :: dp=kind(0.d0), M=300
-integer            :: i,j, iter
-real(dp),parameter :: epsilon0=8.85E-12_dp, target=1E-6_dp, a=0.01
-real(dp)           :: delta, b, e, phiprime(M,M), phi(M,M), a2, rhoarr(M,M), temp(M,M)
-
-
-delta = 1.0_dp
-iter = 0
-call cpu_time(b)
-phiprime(:,:) = 0.0_dp
-phi(:,:) = 0.0_dp
-
-do while (delta > target )
-    iter = iter + 1
-    a2 = a**2.0_dp
-    do i=2, M-1
-        do j=2, M-1
-            phiprime(i,j) = (phi(i+1,j) + phi(i-1,j) + phi(i,j+1) + phi(i,j-1))/4.0_dp &
-            + a2/4.0_dp/epsilon0*rho(i*a,j*a)
-        end do
-    end do
-    delta = maxval(abs(phiprime - phi))
-    temp = phi
-    phi = phiprime 
-    phiprime = temp
-
-end do
-call cpu_time(e)
-print *, e-b, iter
-end program
\ No newline at end of file
diff --git a/poisson2d/optimized.c b/poisson2d/optimized.c
deleted file mode 100644
index 0dddfec..0000000
--- a/poisson2d/optimized.c
+++ /dev/null
@@ -1,92 +0,0 @@
-#include <stdio.h>
-#include <stdlib.h>
-#include <string.h>
-#include <math.h>
-#include <time.h>
-#define M 300
-void swap(double (*a)[M], double (*b)[M], int n)
-{
-  double swp;
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      a[i][j] = b[i][j];
-    }
-  }
-}
-double mmax(double (*phi)[M], double (*phip)[M], int n)
-{
-  double max = 0.0;
-  double diff = 0.0;
-  for (int i = 0; i < n; i++) {
-    for (int j = 0; j < n; j++) {
-      diff = fabs(phi[i][j]-phip[i][j]);
-      if (diff > max)
-        max = diff;
-    }
-  }
-  return max;
-}
-double rho(double x, double y)
-{
-  double s1 = 0.6;
-  double e1 = 0.8;
-	double s2 = 0.2;
-	double e2 = 0.4;
-
-	if (x > s1 && x < e1 && y > s1 && y < e1) {
-		return 1.0;
-	} else if (x > s2 && x < e2 && y > s2 && y < e2 ) {
-		return -1.0;
-	} else {
-		return 0.0;
-	}
-}
-void run(double toler, double a)
-{
-  double epsilon0 = 8.85e-12;
-  double a2;
-
-  double (*phi)[M];
-  double (*phip)[M]; 
-  double (*rhoa)[M]; 
-  phi = malloc(sizeof(double[M][M]));
-  phip = malloc(sizeof(double[M][M]));
-  rhoa = malloc(sizeof(double[M][M]));
-
-  int iter = 0;
-
-  memset(phip, 0, sizeof(phip[0][0])*M*M);
-  memset(phi, 0, sizeof(phi[0][0])*M*M);
-  for (int i=1; i<M-1; i++)  {
-    for (int j=1; j<M-1; j++) {
-      rhoa[i][j] =  rho(i*a,j*a);
-    }
-  }
-  double delta = 1.0;
-  a2 = pow(a,2.0);
-  while (delta > toler) {
-    iter += 1;
-    
-    for (int i=1; i < M-1; i++) {
-      for (int j=1; j < M-1; j++) {
-        phip[i][j] = (phi[i+1][j] + phi[i-1][j] +
-                      phi[i][j+1] + phi[i][j-1])/4.0 +
-                      a2/(4.0 * epsilon0)*rhoa[i][j];
-      }
-    }
-    delta = mmax(phi, phip, M);
-    swap(phi, phip, M);
-
-  }
-  printf("iters %d", iter);
-
-}
-
-int main(int argc, char *argv[])
-{
-  clock_t start = clock();
-  run(1e-6, 0.01);
-  clock_t end = clock();
-  double total = ((double)(end - start)) / CLOCKS_PER_SEC;
-  printf("Execution time: %f\n",total);
-}
diff --git a/poisson2d/optimized.f90 b/poisson2d/optimized.f90
deleted file mode 100644
index 3bc1045..0000000
--- a/poisson2d/optimized.f90
+++ /dev/null
@@ -1,55 +0,0 @@
-module rhofunc
-implicit none
-public
-integer, parameter :: dp=kind(0.d0)
-contains
-    pure real(dp) function rho(x,y)
-        real(dp), intent(in) :: x,y
-        if (x > 0.6_dp .and. x < 0.8_dp .and. y > 0.6_dp .and. y<0.8) then
-            rho = 1.0_dp
-        else if (x> 0.2_dp .and. x<0.4_dp .and. y>0.2_dp .and. y<0.4_dp) then
-            rho = -1.0_dp
-        else
-            rho = 0.0_dp
-        end if
-    end function
-
-end module
-
-program poisson
-use rhofunc, only: rho
-implicit none
-integer, parameter :: dp=kind(0.d0), M=300
-integer            :: i,j, iter
-real(dp),parameter :: epsilon0=8.85E-12_dp, target=1E-6_dp, a=0.01
-real(dp)           :: delta, b, e, phiprime(M,M), phi(M,M), a2, rhoarr(M,M)
-
-
-delta = 1.0_dp
-iter = 0
-call cpu_time(b)
-phiprime(:,:) = 0.0_dp
-phi(:,:) = 0.0_dp
-do i=1, M
-    do j=1, M
-        rhoarr(i,j) = rho(i*a,j*a)
-    end do
-end do
-
-do while (delta > target )
-    iter = iter + 1
-    a2 = a**2.0_dp
-    do i=2, M-1
-        do j=2, M-1
-            phiprime(i,j) = (phi(i+1,j) + phi(i-1,j) + phi(i,j+1) + phi(i,j-1))/4.0_dp &
-            + a2/4.0_dp/epsilon0*rhoarr(i,j)
-        end do
-    end do
-    delta = maxval(abs(phiprime - phi))
-    phi = phiprime 
-
-
-end do
-call cpu_time(e)
-print *, e-b, iter
-end program
\ No newline at end of file
diff --git a/poisson2d/run.sh b/poisson2d/run.sh
new file mode 100644
index 0000000..234dae4
--- /dev/null
+++ b/poisson2d/run.sh
@@ -0,0 +1,44 @@
+#!/bin/bash
+
+# This may not apply to your distribution
+# change or comment out at will, but remark
+# the stack-size.
+ulimit -s unlimited
+
+OPTS="-O3 -march=native"
+FC=gfortran
+CC=gcc
+
+$CC $OPTS -c c_common.c
+for c in 1 2 3 4_1d 4_2d
+do
+    echo "Compiling and running: c_${c}"
+    $CC $OPTS -o c_${c} c_${c}.c c_common.o
+
+    # Now run for
+    {
+	for N in 100 200 300 ; do
+	    ./c_${c} $N 10000
+	done
+	for N in 400 500 600; do
+	    ./c_${c} $N 4000
+	done
+    } > c_${c}.data
+done
+
+$FC $OPTS -c fortran_common.f90
+for f in 2 3 4
+do
+    echo "Compiling and running: fortran_${f}"
+    $FC $OPTS -o fortran_${f} fortran_${f}.f90 fortran_common.o
+
+    # Now run for
+    {
+	for N in 100 200 300 ; do
+	    ./fortran_${f} $N 10000
+	done
+	for N in 400 500 600; do
+	    ./fortran_${f} $N 4000
+	done
+    } > fortran_${f}.data
+done