diff --git a/.isort.cfg b/.isort.cfg
index baa39c2b5d..0b4d779478 100644
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -1,2 +1,2 @@
 [settings]
-known_third_party =pytest,setuptools,sklearn,torch,tqdm
+known_third_party =matplotlib,pandas,pytest,seaborn,setuptools,sklearn,torch,tqdm
diff --git a/README.md b/README.md
index 8df077c532..5ed886ac86 100644
--- a/README.md
+++ b/README.md
@@ -30,10 +30,10 @@ Flexible Transformers, defined by interoperable and optimized building blocks th
 
 [ ] Waay more tests, find more invariants depending on the blocks
 [ ] Benchmark:
-    [ ] add at least something basic to check training
-    [ ] measure throughput and memory
+    [x] add at least something basic to check training
+    [x] measure throughput and memory
     [ ] autogenerate text report
-    [ ] autogenerate curves
+    [x] autogenerate curves
 
 ## Architecture, code
 [x] Remove the AttrDict dependency
@@ -80,6 +80,14 @@ Models live in `xformers/models`. As a general rule, one should try to write the
 
 These live in `xformers/benchmarks`.
 
 Sweeping over different attention settings to log max memory use and runtime can for instance be done by invoking `python3 benchmarks/benchmark_attention.py`. Specifying a subset to test is done through command line arguments, for instance `python3 benchmarks/benchmark_attention.py --causal True --attentions random --activations gelu -fp16 True`.
+
+Some examples, generated on CPU:
+
+![Memory use per attention mechanism](docs/plots/memory_vs_attention.png)
+
+![Runtime per attention mechanism](docs/plots/runtime_vs_attention.png)
+
+
 ## Bibliography
 DRAFT, needs a proper citation format, ..
diff --git a/benchmarks/benchmark_attention.py b/benchmarks/benchmark_attention.py
index c03ba31efc..a080483658 100644
--- a/benchmarks/benchmark_attention.py
+++ b/benchmarks/benchmark_attention.py
@@ -1,8 +1,11 @@
 import argparse
 import json
 import time
-from typing import Dict, Optional
+from typing import Any, Dict, List, Optional
 
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
 import torch
 import torch.nn.functional as F
 from sklearn.model_selection import ParameterGrid
@@ -20,6 +23,8 @@
 
 # Credits: Sean Naren
 
+_use_cuda = torch.cuda.is_available()
+
 
 def _train_for_several_steps(
     block: xFormerEncoderBlock,
@@ -36,8 +41,9 @@ def _train_for_several_steps(
     # and this makes it bad for tests
     optim = torch.optim.SGD(block.parameters(), lr=lr, momentum=0.9)
 
-    torch.cuda.reset_peak_memory_stats()
-    torch.cuda.synchronize()
+    if _use_cuda:
+        torch.cuda.reset_peak_memory_stats()
+        torch.cuda.synchronize()
 
     start_time = time.time()
     for _ in range(num_steps):
@@ -55,8 +61,11 @@ def _train_for_several_steps(
         torch.nn.utils.clip_grad_norm_(block.parameters(), clip_norm, norm_type)
         optim.step()
 
-    torch.cuda.synchronize()
-    max_memory = torch.cuda.max_memory_allocated() / 2 ** 20
+    if _use_cuda:
+        torch.cuda.synchronize()
+        max_memory = torch.cuda.max_memory_allocated() / 2 ** 20
+    else:
+        max_memory = -1
 
     run_time = time.time() - start_time
     return {"run_time": run_time, "max_memory": round(max_memory, 1)}
@@ -98,9 +107,7 @@ def test_xformer_encoder_block(
         sequence_length=sequence_length,
         embed_dim=embed_dim,
         dropout=dropout,
-    )
-
-    block.to(device)
+    ).to(device)
 
     return benchmark_model(
         num_steps=num_steps,
@@ -168,6 +175,43 @@ def instantiate_xformer(
     return block
 
 
+def plot(args, results: List[Dict[str, Any]]):
+    df = pd.DataFrame(results)
+
+    HEADS = args.heads[-1]  # plot a single slice of the sweep: the last value of each swept argument
+    AMP = args.pytorch_amp if args.pytorch_amp is not None else True  # scalar, a list here would break the comparison below
+    EMB = args.embedding_dim[-1]
+    CAUSAL = args.causal if args.causal is not None else True
+    BATCH_SIZE = args.batch_size[-1]
+    ACTIVATION = args.activations[-1]
+
+    df_filtered = df[
+        (df["activation"] == ACTIVATION)
+        & (df["heads"] == HEADS)
+        & (df["autocast"] == AMP)
+        & (df["embed_dim"] == EMB)
+        & (df["causal"] == CAUSAL)
+        & (df["batch_size"] == BATCH_SIZE)
+    ]
+
+    sns.barplot(
+        x="sequence_length", y="max_memory", hue="attention_name", data=df_filtered
+    )
+    plt.xlabel("Sequence length")
+    plt.ylabel("Max memory used (MB)")
+    plt.title("Memory use")
+    plt.savefig("memory_vs_attention.png")
+    plt.clf()
+
+    sns.barplot(
+        x="sequence_length", y="run_time", hue="attention_name", data=df_filtered
+    )
+    plt.xlabel("Sequence length")
+    plt.ylabel("Run time (s)")
+    plt.title("Runtime")
+    plt.savefig("runtime_vs_attention.png")
+
+
 if __name__ == "__main__":
     # Get the user requests
     parser = argparse.ArgumentParser(
@@ -186,17 +230,19 @@ def instantiate_xformer(
         "-sl", "--sequence_length", nargs="+", default=[128, 512, 768], type=int
     )
     parser.add_argument("-bs", "--batch_size", nargs="+", default=[8, 16, 32], type=int)
+    parser.add_argument("-hd", "--heads", nargs="+", default=[8, 16], type=int)
     parser.add_argument(
         "-fp16", "--pytorch_amp", action="store", default=None, type=bool
     )
     parser.add_argument("-causal", "--causal", action="store", default=None, type=bool)
+    parser.add_argument("-plot", "--plot", action="store_true", default=False)
 
     args = parser.parse_args()
 
     # Setup the test configs
     constants = {
-        "device": torch.device("cuda"),
+        "device": torch.device("cuda") if _use_cuda else torch.device("cpu"),
         "num_warmup": 5,
         "num_steps": 10,
         "dropout": 0.0,
@@ -209,7 +255,7 @@ def instantiate_xformer(
         if args.pytorch_amp is not None
         else [False, True],
         "causal": [args.causal] if args.causal is not None else [False, True],
-        "heads": [8, 16],
+        "heads": args.heads,
         "activation": args.activations,
         "attention_name": args.attentions,
         "feedforward_name": list(FEEDFORWARD_REGISTRY.keys()),
@@ -233,3 +279,7 @@ def instantiate_xformer(
             grid_outputs.append(results)
 
     print(json.dumps(grid_outputs, sort_keys=True, indent=4))
+
+    # Optional plots
+    if args.plot:
+        plot(args, grid_outputs)
diff --git a/docs/plots/memory_vs_attention.png b/docs/plots/memory_vs_attention.png
new file mode 100644
index 0000000000..bb81386f5b
Binary files /dev/null and b/docs/plots/memory_vs_attention.png differ
diff --git a/docs/plots/runtime_vs_attention.png b/docs/plots/runtime_vs_attention.png
new file mode 100644
index 0000000000..87d7acdfbe
Binary files /dev/null and b/docs/plots/runtime_vs_attention.png differ
diff --git a/requirements-benchmark.txt b/requirements-benchmark.txt
index f0f353ac0e..ee9cff8035 100644
--- a/requirements-benchmark.txt
+++ b/requirements-benchmark.txt
@@ -3,3 +3,5 @@
 torch >= 1.5.1
 scikit-learn == 0.24.1
 tqdm == 4.59.0
+pandas == 1.2.4
+seaborn == 0.11.1
\ No newline at end of file
diff --git a/requirements-test.txt b/requirements-test.txt
index 298f5708a1..8044c9ab80 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,7 @@
 # Get core deps.
 -r requirements.txt
+-r requirements-benchmark.txt
+
 
 # Tools for static checking.
 black == 20.8b1
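For reference, a minimal end-to-end invocation of the plotting path added above. The flags (`--attentions`, `--activations`, `-hd`, `-plot`) are the ones defined in the patched argparse setup; the values picked here are just an illustration, following the README example:

```bash
# Run a reduced sweep (falling back to CPU via _use_cuda when no GPU is present)
# and write memory_vs_attention.png and runtime_vs_attention.png to the working
# directory, per the savefig() calls in plot().
python3 benchmarks/benchmark_attention.py --attentions random --activations gelu -hd 8 -plot
```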