diff --git a/.isort.cfg b/.isort.cfg
index baa39c2b5d..0b4d779478 100644
--- a/.isort.cfg
+++ b/.isort.cfg
@@ -1,2 +1,2 @@
 [settings]
-known_third_party =pytest,setuptools,sklearn,torch,tqdm
+known_third_party =matplotlib,pandas,pytest,seaborn,setuptools,sklearn,torch,tqdm
diff --git a/README.md b/README.md
index 8df077c532..5ed886ac86 100644
--- a/README.md
+++ b/README.md
@@ -30,10 +30,10 @@ Flexible Transformers, defined by interoperable and optimized building blocks th
 
 [ ] Waay more tests, find more invariants depending on the blocks
 [ ] Benchmark:
-    [ ] add at least something basic to check training
-    [ ] measure throughput and memory
+    [x] add at least something basic to check training
+    [x] measure throughput and memory
     [ ] autogenerate text report
-    [ ] autogenerate curves
+    [x] autogenerate curves
 
 ## Architecture, code
 [x] Remove the AttrDict dependency
@@ -80,6 +80,14 @@ Models live in `xformers/models`. As a general rule, one should try to write the
 
 These live in `xformers/benchmarks`.
 
 Sweeping over different attention settings to log max memory use and runtime can for instance be done by invoking `python3 benchmarks/benchmark_attention.py`. Specifying a subset to test is done through command line arguments, for instance `python3 benchmarks/benchmark_attention.py --causal True --attentions random --activations gelu -fp16 True`.
+
+Some examples, generated on CPU:
+
+![Memory use per attention mechanism](docs/plots/memory_vs_attention.png)
+
+![Runtime per attention mechanism](docs/plots/runtime_vs_attention.png)
+
+
 ## Bibliography
 DRAFT, needs a proper citation format, ..
diff --git a/benchmarks/benchmark_attention.py b/benchmarks/benchmark_attention.py
index c03ba31efc..a080483658 100644
--- a/benchmarks/benchmark_attention.py
+++ b/benchmarks/benchmark_attention.py
@@ -1,8 +1,11 @@
 import argparse
 import json
 import time
-from typing import Dict, Optional
+from typing import Any, Dict, List, Optional
 
+import matplotlib.pyplot as plt
+import pandas as pd
+import seaborn as sns
 import torch
 import torch.nn.functional as F
 from sklearn.model_selection import ParameterGrid
@@ -20,6 +23,8 @@
 
 # Credits: Sean Naren
 
+_use_cuda = torch.cuda.is_available()
+
 
 def _train_for_several_steps(
     block: xFormerEncoderBlock,
@@ -36,8 +41,9 @@ def _train_for_several_steps(
     # and this makes it bad for tests
     optim = torch.optim.SGD(block.parameters(), lr=lr, momentum=0.9)
 
-    torch.cuda.reset_peak_memory_stats()
-    torch.cuda.synchronize()
+    if _use_cuda:
+        torch.cuda.reset_peak_memory_stats()
+        torch.cuda.synchronize()
 
     start_time = time.time()
     for _ in range(num_steps):
@@ -55,8 +61,11 @@ def _train_for_several_steps(
         torch.nn.utils.clip_grad_norm_(block.parameters(), clip_norm, norm_type)
         optim.step()
 
-    torch.cuda.synchronize()
-    max_memory = torch.cuda.max_memory_allocated() / 2 ** 20
+    if _use_cuda:
+        torch.cuda.synchronize()
+        max_memory = torch.cuda.max_memory_allocated() / 2 ** 20
+    else:
+        max_memory = -1
 
     run_time = time.time() - start_time
     return {"run_time": run_time, "max_memory": round(max_memory, 1)}
@@ -98,9 +107,7 @@ def test_xformer_encoder_block(
         sequence_length=sequence_length,
         embed_dim=embed_dim,
         dropout=dropout,
-    )
-
-    block.to(device)
+    ).to(device)
 
     return benchmark_model(
         num_steps=num_steps,
@@ -168,6 +175,43 @@ def instantiate_xformer(
     return block
 
 
+def plot(args, results: List[Dict[str, Any]]):
+    df = pd.DataFrame(results)
+
+    HEADS = args.heads[-1]  # plot a single slice of the sweep: the last value of each swept argument
+    AMP = args.pytorch_amp if args.pytorch_amp is not None else True  # scalar, a list here would break the comparison below
+    EMB = args.embedding_dim[-1]
+    CAUSAL = args.causal if args.causal is not None else True
+    BATCH_SIZE = args.batch_size[-1]
+    ACTIVATION = args.activations[-1]
+
+    df_filtered = df[
+        (df["activation"] == ACTIVATION)
+        & (df["heads"] == HEADS)
+        & (df["autocast"] == AMP)
+        & (df["embed_dim"] == EMB)
+        & (df["causal"] == CAUSAL)
+        & (df["batch_size"] == BATCH_SIZE)
+    ]
+
+    sns.barplot(
+        x="sequence_length", y="max_memory", hue="attention_name", data=df_filtered
+    )
+    plt.xlabel("Sequence length")
+    plt.ylabel("Max memory used (MB)")
+    plt.title("Memory use")
+    plt.savefig("memory_vs_attention.png")
+    plt.clf()
+
+    sns.barplot(
+        x="sequence_length", y="run_time", hue="attention_name", data=df_filtered
+    )
+    plt.xlabel("Sequence length")
+    plt.ylabel("Run time (s)")
+    plt.title("Runtime")
+    plt.savefig("runtime_vs_attention.png")
+
+
 if __name__ == "__main__":
     # Get the user requests
     parser = argparse.ArgumentParser(
@@ -186,17 +230,19 @@ def instantiate_xformer(
         "-sl", "--sequence_length", nargs="+", default=[128, 512, 768], type=int
     )
     parser.add_argument("-bs", "--batch_size", nargs="+", default=[8, 16, 32], type=int)
+    parser.add_argument("-hd", "--heads", nargs="+", default=[8, 16], type=int)
     parser.add_argument(
         "-fp16", "--pytorch_amp", action="store", default=None, type=bool
     )
     parser.add_argument("-causal", "--causal", action="store", default=None, type=bool)
+    parser.add_argument("-plot", "--plot", action="store_true", default=False)
 
     args = parser.parse_args()
 
     # Setup the test configs
     constants = {
-        "device": torch.device("cuda"),
+        "device": torch.device("cuda") if _use_cuda else torch.device("cpu"),
         "num_warmup": 5,
         "num_steps": 10,
         "dropout": 0.0,
@@ -209,7 +255,7 @@ def instantiate_xformer(
         if args.pytorch_amp is not None
         else [False, True],
         "causal": [args.causal] if args.causal is not None else [False, True],
-        "heads": [8, 16],
+        "heads": args.heads,
         "activation": args.activations,
         "attention_name": args.attentions,
         "feedforward_name": list(FEEDFORWARD_REGISTRY.keys()),
@@ -233,3 +279,7 @@ def instantiate_xformer(
             grid_outputs.append(results)
 
     print(json.dumps(grid_outputs, sort_keys=True, indent=4))
+
+    # Optional plots
+    if args.plot:
+        plot(args, grid_outputs)
diff --git a/docs/plots/memory_vs_attention.png b/docs/plots/memory_vs_attention.png
new file mode 100644
index 0000000000..bb81386f5b
Binary files /dev/null and b/docs/plots/memory_vs_attention.png differ
diff --git a/docs/plots/runtime_vs_attention.png b/docs/plots/runtime_vs_attention.png
new file mode 100644
index 0000000000..87d7acdfbe
Binary files /dev/null and b/docs/plots/runtime_vs_attention.png differ
diff --git a/requirements-benchmark.txt b/requirements-benchmark.txt
index f0f353ac0e..ee9cff8035 100644
--- a/requirements-benchmark.txt
+++ b/requirements-benchmark.txt
@@ -3,3 +3,5 @@
 torch >= 1.5.1
 scikit-learn == 0.24.1
 tqdm == 4.59.0
+pandas == 1.2.4
+seaborn == 0.11.1
\ No newline at end of file
diff --git a/requirements-test.txt b/requirements-test.txt
index 298f5708a1..8044c9ab80 100644
--- a/requirements-test.txt
+++ b/requirements-test.txt
@@ -1,5 +1,7 @@
 # Get core deps.
 -r requirements.txt
+-r requirements-benchmark.txt
+
 
 # Tools for static checking.
 black == 20.8b1
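For reference, a minimal end-to-end invocation of the plotting path added above. The flags (`--attentions`, `--activations`, `-hd`, `-plot`) are the ones defined in the patched argparse setup; the values picked here are just an illustration, following the README example:

```bash
# Run a reduced sweep (falling back to CPU via _use_cuda when no GPU is present)
# and write memory_vs_attention.png and runtime_vs_attention.png to the working
# directory, per the savefig() calls in plot().
python3 benchmarks/benchmark_attention.py --attentions random --activations gelu -hd 8 -plot
```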